fix(): correct training and inference code to use gpt4 tokenizer
AmitMY committed Jun 29, 2024
1 parent 17243d7 commit fd657a2
Showing 11 changed files with 125 additions and 45 deletions.
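In essence, the commit retires the locally trained `tokenizers` BPE model and tokenizes spoken-language text with OpenAI's `cl100k_base` encoding (the GPT-4 tokenizer) via `tiktoken`, both during data preparation and at inference. A minimal before/after sketch of the two tokenization paths (the example sentence and the local `tokenizer.json` path are illustrative only):

```python
import os

import tiktoken
from tokenizers import Tokenizer

text = "My name is John."

# After this commit: the fixed GPT-4 vocabulary, no tokenizer training step.
gpt4 = tiktoken.get_encoding("cl100k_base")
print([gpt4.decode([t]) for t in gpt4.encode(text)])  # e.g. ['My', ' name', ' is', ' John', '.']

# Before this commit: a BPE tokenizer trained on the SignBank targets and
# shipped next to the model as tokenizer.json (guarded, since the file may
# no longer exist).
if os.path.exists("tokenizer.json"):
    print(Tokenizer.from_file("tokenizer.json").encode(text).tokens)
```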
20 changes: 11 additions & 9 deletions README.md
@@ -36,7 +36,7 @@ conda activate sockeye

cd signwriting_translation

MODEL_DIR=/shares/volk.cl.uzh/amoryo/checkpoints/signwriting-translation
MODEL_DIR=/shares/iict-sp2.ebling.cl.uzh/amoryo/checkpoints/signwriting-translation
DATA_DIR=/home/amoryo/sign-language/signwriting-translation/parallel
DIRECTION="spoken-to-signed"

@@ -47,21 +47,21 @@ sbatch prepare_data.sh
# (Train without factors)
sbatch train_sockeye_model.sh \
--data_dir="$DATA_DIR/$DIRECTION" \
--model_dir="$MODEL_DIR/$DIRECTION/no-factors" \
--model_dir="$MODEL_DIR/$DIRECTION/no-factors-gpt" \
--optimized_metric="signwriting-similarity" \
--partition lowprio
# (Train with factors)
sbatch train_sockeye_model.sh \
--data_dir="$DATA_DIR/$DIRECTION" \
--model_dir="$MODEL_DIR/$DIRECTION/target-factors-v4" \
--model_dir="$MODEL_DIR/$DIRECTION/target-factors-gpt" \
--optimized_metric="signwriting-similarity" \
--use_target_factors=true \
--partition lowprio
# (Fine tune model on cleaned data)
sbatch train_sockeye_model.sh \
--data_dir="$DATA_DIR-clean/$DIRECTION" \
--model_dir="$MODEL_DIR/$DIRECTION/target-factors-tuned" \
--base_model_dir="$MODEL_DIR/$DIRECTION/target-factors-v4" \
--model_dir="$MODEL_DIR/$DIRECTION/target-factors-gpt-tuned" \
--base_model_dir="$MODEL_DIR/$DIRECTION/target-factors-gpt" \
--optimized_metric="signwriting-similarity" \
--use_target_factors=true \
--partition lowprio
@@ -72,7 +72,7 @@ cat "$MODEL_DIR/$DIRECTION/target-factors/model/metrics" | grep "signwriting-sim

# 3. Test it yourself
python -m signwriting_translation.bin \
--model="$MODEL_DIR/$DIRECTION/target-factors-tuned/model" \
--model="$MODEL_DIR/$DIRECTION/target-factors-gpt/model" \
--spoken-language="en" \
--signed-language="ase" \
--input="My name is John."
@@ -82,17 +82,19 @@ python -m signwriting_translation.bin \

```bash
# Copy the model files to a new directory
SE_MODEL_PATH="$MODEL_DIR/$DIRECTION/target-factors-tuned"
HF_MODEL_PATH="$MODEL_DIR/$DIRECTION/huggingface/target-factors-tuned"
SE_MODEL_PATH="$MODEL_DIR/$DIRECTION/target-factors-gpt-tuned"
HF_MODEL_PATH="$MODEL_DIR/$DIRECTION/huggingface/target-factors-gpt-tuned"

rm -r "$HF_MODEL_PATH"
mkdir -p "$HF_MODEL_PATH"
cp tokenizer.json "$HF_MODEL_PATH/tokenizer.json"
cp "$SE_MODEL_PATH/model/params.best" "$HF_MODEL_PATH/params.best"
cp "$SE_MODEL_PATH/model/version" "$HF_MODEL_PATH/version"
cp "$SE_MODEL_PATH/model/metrics" "$HF_MODEL_PATH/metrics"
cp "$SE_MODEL_PATH/model/config" "$HF_MODEL_PATH/config"
cp "$SE_MODEL_PATH/model/args.yaml" "$HF_MODEL_PATH/args.yaml"
cp "$SE_MODEL_PATH/model/vocab."* "$HF_MODEL_PATH"
# if tokenizer exists
! [ -f "tokenizer.json" ] && cp tokenizer.json "$HF_MODEL_PATH/tokenizer.json"

# Upload to HuggingFace
huggingface-cli login
12 changes: 7 additions & 5 deletions signwriting_translation/bin.py
@@ -8,7 +8,8 @@

from signwriting.tokenizer import SignWritingTokenizer
from sockeye.inference import TranslatorOutput
from tokenizers import Tokenizer

from signwriting_translation.tokenizer import tokenize_spoken_text

sw_tokenizer = SignWritingTokenizer()

@@ -36,9 +37,9 @@ def load_sockeye_translator(model_path: str, log_timing: bool = False):
if log_timing:
print("Loaded sockeye translator in", time.time() - now, "seconds")

tokenizer = Tokenizer.from_file(str(Path(model_path) / 'tokenizer.json'))
tokenizer_path = str(Path(model_path) / 'tokenizer.json')

return translator, tokenizer
return translator, tokenizer_path


def translate(translator, texts: List[str], log_timing: bool = False):
@@ -70,8 +71,9 @@ def signwriting_to_text():
# pylint: disable=unused-variable
args = get_args()

translator, tokenizer = load_sockeye_translator(args.model)
tokenized_text = " ".join(tokenizer.encode(args.input).tokens)
translator, tokenizer_path = load_sockeye_translator(args.model)
# tokenized_text = " ".join(tokenizer.encode(args.input).tokens)
tokenized_text = tokenize_spoken_text(args.input) # , tokenizer_path)
model_input = f"${args.spoken_language} ${args.signed_language} {tokenized_text}"
outputs = translate(translator, [model_input])
print(outputs[0])
4 changes: 2 additions & 2 deletions signwriting_translation/create_parallel_data.py
@@ -7,7 +7,7 @@
from signwriting.tokenizer import SignWritingTokenizer, normalize_signwriting
from tqdm import tqdm

from spoken_language_tokenizer import tokenize_spoken_text
from tokenizer import tokenize_spoken_text

csv.field_size_limit(int(1e6))

@@ -34,7 +34,7 @@ def load_csv(data_path: Path):
CLEAN_DIRECTIONS = {
"spoken-to-signed": {
"more": 1,
"cleaned": 1,
"cleaned": 2, # de emphasize fingerspelling
},
"signed-to-spoken": {
"more": 1,
26 changes: 16 additions & 10 deletions signwriting_translation/graph_metrics.py
@@ -1,22 +1,25 @@
import os
from collections import defaultdict

import matplotlib.pyplot as plt

MODELS_DIR = "/shares/volk.cl.uzh/amoryo/checkpoints/signwriting-translation/"
MODELS_DIR = "/shares/iict-sp2.ebling.cl.uzh/amoryo/checkpoints/signwriting-translation/"
DIRECTION = "spoken-to-signed"
MODELS = [
"no-factors",
"target-factors",
"target-factors-v2",
"target-factors-v4",
"target-factors-tuned"
"no-factors-gpt",
"target-factors-gpt",
"target-factors-gpt-tuned",
"target-factors-gpt-12",
]

if __name__ == "__main__":
models_metrics = defaultdict(lambda: defaultdict(list))

for model_name in MODELS:
metrics_file = f"{MODELS_DIR}{DIRECTION}/{model_name}/model/metrics"
model_path = f"{MODELS_DIR}{DIRECTION}/{model_name}"
if not os.path.exists(f"{model_path}/model/metrics"):
continue
metrics_file = f"{model_path}/model/metrics"
with open(metrics_file, 'r', encoding="utf-8") as f:
lines = f.readlines()
for line in lines:
@@ -27,13 +30,16 @@
except ValueError:
pass

for metric in ['chrf', 'signwriting-similarity']:
print(next(iter(models_metrics.values())).keys())

for metric in ['chrf-val', 'signwriting-similarity-val', 'signwriting-clip-val',
'perplexity-train', 'perplexity-val', 'decode-walltime-val', 'max-gpu-memory']:
plt.figure(figsize=(10, 5))

plt.grid(axis='y', linestyle='--', linewidth=0.5)
for model_name, metrics in models_metrics.items():
plt.plot(metrics[f"{metric}-val"], label=model_name)
if metric == 'signwriting-similarity':
plt.plot(metrics[metric], label=model_name)
if metric == 'signwriting-similarity-val':
plt.ylim(0.35, None)
plt.legend(loc='lower right')
plt.savefig(f"{metric}.png")
Empty file.
@@ -1,4 +1,4 @@
import argparse
import os
from functools import lru_cache
from typing import List

@@ -9,7 +9,12 @@

@lru_cache(maxsize=None)
def load_tokenizer(tokenizer_file: str = 'tokenizer.json'):
return Tokenizer.from_file(tokenizer_file)
# check if tokenizer_file exists
if os.path.exists(tokenizer_file):
return Tokenizer.from_file(tokenizer_file)

from transformers import GPT2TokenizerFast
return GPT2TokenizerFast.from_pretrained('Xenova/gpt-4')


@lru_cache(maxsize=int(1e7))
@@ -22,7 +27,9 @@ def tokenize_spoken_text(text: str, tokenizer_file: str = 'tokenizer.json'):
def train(files: List[str], target_file: str):
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = normalizers.NFKD()
# Take the pre tokenizer setting from GPT-4, https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
# Take the pre tokenizer setting from GPT-4o,
# https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py#L101 # TODO
# https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
pre_tokenizers.Split(pattern=tokenizers.Regex(
"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"),
@@ -31,12 +38,14 @@ def train(files: List[str], target_file: str):
pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
])
tokenizer.decoder = decoders.ByteLevel()
trainer = trainers.BpeTrainer(vocab_size=8000)
trainer = trainers.BpeTrainer(vocab_size=16000)
tokenizer.train(files=files, trainer=trainer)

tokenizer.save(target_file)


raise Exception("Do not use this tokenizer file")

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--files", nargs='+', type=str, help="Files to train tokenizer on")
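As an aside, the two loaders touched here should agree on plain ASCII input: the new `tokenizer.py` marks spaces with "Ġ" by hand after decoding `tiktoken` ids, while the `GPT2TokenizerFast` fallback loaded from `Xenova/gpt-4` emits that byte-level marker natively. A small comparison sketch (not part of the commit, and only expected to line up for ASCII-only text):

```python
import tiktoken
from transformers import GPT2TokenizerFast

text = "My name is John."

# tiktoken route: decode each id, then mark spaces with the byte-level "Ġ".
enc = tiktoken.get_encoding("cl100k_base")
via_tiktoken = [enc.decode([t]).replace(" ", "Ġ") for t in enc.encode(text)]

# transformers fallback route: the byte-level BPE tokenizer produces "Ġ" itself.
hf_tok = GPT2TokenizerFast.from_pretrained("Xenova/gpt-4")
via_transformers = hf_tok.tokenize(text)

print(via_tiktoken)
print(via_transformers)  # expected to match via_tiktoken for ASCII-only text
```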
@@ -26,7 +26,7 @@ def print_factored_prediction(factors_file_template: str):
print(tokenizer.tokens_to_text((" ".join(symbols)).split(" ")))


print_prediction("/home/amoryo/sign-language/signwriting-translation/parallel/spoken-to-signed/test/target.txt")
# print_prediction("/home/amoryo/sign-language/signwriting-translation/parallel/spoken-to-signed/test/target.txt")
print_factored_prediction(
# pylint: disable=line-too-long
"/shares/volk.cl.uzh/amoryo/checkpoints/signwriting-translation/spoken-to-signed/target-factors/model/decode.output.{}.00332")
"/shares/iict-sp2.ebling.cl.uzh/amoryo/checkpoints/signwriting-translation/spoken-to-signed/target-factors-gpt-tuned/model/decode.output.{}.00248")
31 changes: 31 additions & 0 deletions signwriting_translation/old/model_test.py
@@ -0,0 +1,31 @@
from signwriting_translation.bin import load_sockeye_translator, translate
from signwriting_translation.tokenizer import tokenize_spoken_text

spoken_language = "en"
signed_language = "ase"
texts = [
"Hello world",
"World",
"Hello",
"How are you?"
"Hello, how are you?",
"Hi",
"goodbye world",
"My name is Amit.",
"test",
"Test",
"testing",
"Testing",
"Washington D.C.",
"Washington, D.C.",
"Washington DC",
]
translator, spoken_tokenizer = load_sockeye_translator(
"/shares/iict-sp2.ebling.cl.uzh/amoryo/checkpoints/signwriting-translation/spoken-to-signed/target-factors-gpt-tuned/model",
log_timing=True)
tokenized_texts = [tokenize_spoken_text(text) for text in texts]
model_inputs = [f"${spoken_language} ${signed_language} {tokenized_text}" for tokenized_text in tokenized_texts]

for translation in translate(translator, model_inputs, log_timing=True):
print(translation)
print("M500x500S38800464x496")
8 changes: 4 additions & 4 deletions signwriting_translation/prepare_data.sh
@@ -23,10 +23,10 @@ git clone https://github.com/sign-language-processing/signbank-plus.git "$SIGNBA
[ ! -d "$SIGNBANK_DIR/data/parallel/cleaned" ] && \
python "$SIGNBANK_DIR/signbank_plus/prep_nmt.py"

# Train a tokenizer
python spoken_language_tokenizer.py \
--files $SIGNBANK_DIR/data/parallel/cleaned/train.target $SIGNBANK_DIR/data/parallel/more/train.target \
--output="tokenizer.json"
## Train a tokenizer
#python spoken_language_tokenizer.py \
# --files $SIGNBANK_DIR/data/parallel/cleaned/train.target $SIGNBANK_DIR/data/parallel/more/train.target \
# --output="tokenizer.json"

# Prepare the parallel corpus (with source/target-factors)
python create_parallel_data.py \
23 changes: 23 additions & 0 deletions signwriting_translation/tokenizer.py
@@ -0,0 +1,23 @@
from functools import lru_cache

import tiktoken


@lru_cache()
def load_tokenizer():
return tiktoken.get_encoding("cl100k_base")


def tokenize_spoken_text(text: str):
tokenizer = load_tokenizer()
encoding = tokenizer.encode(text)

tokens_text = [tokenizer.decode([t]) for t in encoding]
# replace space with special token:
tokens_text = [t.replace(" ", "Ġ") for t in tokens_text]

return " ".join(tokens_text)


if __name__ == "__main__":
print(tokenize_spoken_text("Hello world, my name is amit."))
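For reference, a rough round-trip sketch over the "Ġ" convention used above, plus the model-input format that bin.py and model_test.py build around it (assuming plain ASCII input, since decoding single token ids can split multi-byte characters):

```python
from signwriting_translation.tokenizer import tokenize_spoken_text

text = "My name is John."
tokenized = tokenize_spoken_text(text)
print(tokenized)  # space-separated tokens, original spaces shown as "Ġ"

# Undo the spacing convention to recover the input.
recovered = "".join(tokenized.split(" ")).replace("Ġ", " ")
assert recovered == text

# The translator then receives language tags plus the tokenized text.
model_input = f"$en $ase {tokenized}"
print(model_input)
```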
25 changes: 16 additions & 9 deletions signwriting_translation/train_sockeye_model.sh
@@ -1,7 +1,7 @@
#!/bin/bash

#SBATCH --job-name=train-sockeye
#SBATCH --time=48:00:00
#SBATCH --time=24:00:00
#SBATCH --mem=16G
#SBATCH --output=train-%j.out

@@ -85,19 +85,19 @@ function find_vocabulary_factors_files() {
}

function vocabulary_files() {
local base_model_dir=$1
local base_vocab_dir=$1
local type=$2 # e.g., "src" or "trg"
local type_short=$3 # e.g., "src" or "trg"
local use_factors=$4 # Pass 'true' or 'false' to use factors

if [ -z "$base_model_dir" ]; then
if [ -z "$base_vocab_dir" ]; then
return
fi

echo "--${type}-vocab $base_model_dir/model/vocab.${type_short}.0.json "
echo "--${type}-vocab $base_vocab_dir/vocab.${type_short}.0.json "

if [[ "$use_factors" == "true" ]]; then
echo "--${type}-factor-vocabs $(find_vocabulary_factors_files $base_model_dir/model $type_short)"
echo "--${type}-factor-vocabs $(find_vocabulary_factors_files $base_vocab_dir $type_short)"
fi
}

@@ -107,17 +107,20 @@ max_seq_len=2048
[ "$use_target_factors" == "true" ] && max_seq_len=512

# Prepare data
if [ -n "$base_model_dir" ]; then
vocab_dir="$base_model_dir/model"
fi

TRAIN_DATA_DIR="$model_dir/train_data"
[ ! -f "$TRAIN_DATA_DIR/data.version" ] && \
python -m sockeye.prepare_data \
--max-seq-len $max_seq_len:$max_seq_len \
$(vocabulary_files "$base_model_dir" "source" "src" $use_source_factors) \
$(vocabulary_files "$vocab_dir" "source" "src" $use_source_factors) \
$(translation_files "source" "source" "$data_dir/train" $use_source_factors) \
$(vocabulary_files "$base_model_dir" "target" "trg" $use_target_factors) \
$(vocabulary_files "$vocab_dir" "target" "trg" $use_target_factors) \
$(translation_files "target" "target" "$data_dir/train" $use_target_factors) \
--output $TRAIN_DATA_DIR \

cp tokenizer.json $model_dir/tokenizer.json

MODEL_DIR="$model_dir/model"
rm -rf $MODEL_DIR
@@ -152,7 +155,8 @@ python -m sockeye.train \
--learning-rate-reduce-factor 0.7 \
--decode-and-evaluate 1000 \
--checkpoint-interval 1000 \
--max-num-checkpoint-not-improved 50 \
--max-num-checkpoint-not-improved 100 \
--min-num-epochs 10 \
--embed-dropout 0.5 \
--transformer-dropout-prepost 0.2 \
--transformer-dropout-act 0.2 \
Expand All @@ -162,3 +166,6 @@ python -m sockeye.train \
--no-bucketing \
$extra_arguments \
--output $MODEL_DIR

[ -f tokenizer.json ] &&
cp tokenizer.json $MODEL_DIR/tokenizer.json
