From e4dbf7bb1e3fc626111c3256916c4f3e4fcb5916 Mon Sep 17 00:00:00 2001
From: Amit Moryossef
Date: Mon, 3 Jun 2024 15:11:42 +0200
Subject: [PATCH] feat(data): add dgs corpus

---
 README.md                       |  3 +-
 sign_gpt/datasets/dgs_corpus.py | 84 +++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 sign_gpt/datasets/dgs_corpus.py

diff --git a/README.md b/README.md
index 218aa30..7ca48d4 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@
 conda create --name sign_gpt python=3.11 -y
 conda activate sign_gpt
 pip install git+https://github.com/sign-language-processing/datasets.git
-pip install mediapipe gdown
+pip install mediapipe gdown lxml
 ```
 
 Generate the data:
@@ -52,4 +52,5 @@
 python -m sign_gpt.datasets.rwth_phoenix2014_t
 python -m sign_gpt.datasets.dicta_sign
 python -m sign_gpt.datasets.dgs_types
+python -m sign_gpt.datasets.dgs_corpus
 ```
\ No newline at end of file
diff --git a/sign_gpt/datasets/dgs_corpus.py b/sign_gpt/datasets/dgs_corpus.py
new file mode 100644
index 0000000..a3415ee
--- /dev/null
+++ b/sign_gpt/datasets/dgs_corpus.py
@@ -0,0 +1,84 @@
+import gzip
+import json
+from pathlib import Path
+
+import tensorflow_datasets as tfds
+from sign_language_datasets.datasets.dgs_corpus import DgsCorpusConfig
+from tqdm import tqdm
+
+from sign_gpt.language_utils.i18n import i18n
+
+DATASET_NAME = "dgs_corpus"
+DATA_PATH = Path(f"processed/{DATASET_NAME}")
+DATA_PATH.mkdir(parents=True, exist_ok=True)
+
+config = DgsCorpusConfig(name="only-annotations-sentence-level-uzh", version="1.0.0",
+                         include_video=False, include_pose=None, data_type="sentence",
+                         split="3.0.0-uzh-sentence")
+dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))
+
+TASKS = {
+    "gloss_to_text": "Given a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus, translate it into a natural {spoken_language} sentence.\nInput: {gloss}\nOutput: {text}",
+    "text_to_gloss": "Given a {spoken_language} sentence, convert it into a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus.\nInput: {text}\nOutput: {gloss}",
+}
+
+
+def build_gloss_text(glosses: list[str],
+                     gloss_starts: list[int],
+                     mouthings: list[str],
+                     mouthings_start: list[int]) -> str:
+    text = ""
+    mouthing_index = 0
+    for gloss, gloss_start in zip(glosses, gloss_starts):
+        text += " " + gloss
+        gloss_mouthings = []
+        while mouthing_index < len(mouthings) and mouthings_start[mouthing_index] <= gloss_start:
+            gloss_mouthings.append(mouthings[mouthing_index])
+            mouthing_index += 1
+        if len(gloss_mouthings) > 0:
+            text += f" ({' '.join(gloss_mouthings)})"
+    return text.strip()
+
+
+for split, split_data in dataset.items():
+    split_files = {
+        task: gzip.open(DATA_PATH / f"{task}.{split}.jsonl.gz", "wt", encoding="utf-8")
+        for task in TASKS
+    }
+
+    for datum in tqdm(split_data):
+        sentence = datum['sentence']
+
+        glosses = sentence['glosses']
+        german_glosses = [g.numpy().decode('utf-8') for g in glosses['Gebärde']]
+        english_glosses = [g.numpy().decode('utf-8') for g in glosses['Sign']]
+        gloss_start_times = [int(t.numpy()) for t in glosses['start']]
+        gloss_end_times = [int(t.numpy()) for t in glosses['end']]
+
+        mouthings = sentence["mouthings"]
+        mouthings_text = [m.numpy().decode('utf-8') for m in mouthings['mouthing']]
+        mouthings_start_times = [int(t.numpy()) for t in mouthings['start']]
+        mouthings_end_times = [int(t.numpy()) for t in mouthings['end']]
+
+        params_list = [
+            {
+                "gloss": build_gloss_text(german_glosses, gloss_start_times, mouthings_text, mouthings_start_times),
+                "text": sentence['german'].numpy().decode('utf-8'),
+                "signed_language": i18n("signed_languages", "gsg"),
+                "spoken_language": i18n("languages", "de"),
+            },
+            {
+                "gloss": build_gloss_text(english_glosses, gloss_start_times, mouthings_text, mouthings_start_times),
+                "text": sentence['english'].numpy().decode('utf-8'),
+                "signed_language": i18n("signed_languages", "gsg"),
+                "spoken_language": i18n("languages", "en"),
+            }
+        ]
+
+        for params in params_list:
+            for task, file in split_files.items():
+                instruction_text = TASKS[task].format(**params)
+                file.write(json.dumps({"text": instruction_text}) + "\n")
+
+    for file in split_files.values():
+        file.close()
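
For reviewers: a minimal usage sketch of `build_gloss_text`, with hypothetical gloss and mouthing annotations (the identifiers mimic DGS Corpus gloss names but are not taken from the data). Each mouthing attaches to the first gloss whose start time is not earlier than the mouthing's start:

```python
# Hypothetical annotation data, for illustration only; times are in the
# same unit the corpus uses for 'start' values.
glosses = ["ICH1", "HAUS1A", "GEHEN1"]
gloss_starts = [0, 450, 900]
mouthings = ["ich", "haus"]
mouthings_start = [0, 430]

# "ich" (start 0 <= 0) attaches to ICH1; "haus" (start 430 <= 450)
# attaches to HAUS1A; no mouthing remains for GEHEN1.
print(build_gloss_text(glosses, gloss_starts, mouthings, mouthings_start))
# ICH1 (ich) HAUS1A (haus) GEHEN1
```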
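
Each output line is a JSON object with a single `text` field holding the filled-in task template. A sketch of what one `gloss_to_text` line could look like, assuming the script's `TASKS` dict is in scope and using illustrative (unverified) `i18n` values and a made-up sentence pair:

```python
import json

# Assumed i18n lookups ("German Sign Language", "German") and a
# hypothetical gloss/translation pair, purely for illustration.
params = {
    "gloss": "ICH1 (ich) HAUS1A (haus) GEHEN1",
    "text": "Ich gehe nach Hause.",
    "signed_language": "German Sign Language",
    "spoken_language": "German",
}
instruction_text = TASKS["gloss_to_text"].format(**params)
print(json.dumps({"text": instruction_text}))
# {"text": "Given a sequence of German Sign Language, German glosses and
#  (mouthings) ...\nInput: ICH1 (ich) HAUS1A (haus) GEHEN1\nOutput: Ich gehe nach Hause."}
```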
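
Downstream consumers can stream the gzipped JSONL back without unpacking it. A sketch, assuming the default output location used by the script and that the TFDS split mapping contains a `train` split:

```python
import gzip
import json

# Path follows the script's naming scheme:
# processed/dgs_corpus/{task}.{split}.jsonl.gz
path = "processed/dgs_corpus/gloss_to_text.train.jsonl.gz"
with gzip.open(path, "rt", encoding="utf-8") as f:
    for line in f:
        example = json.loads(line)
        print(example["text"])
        break  # show only the first example
```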