
Commit

feat(data): add dgs corpus
AmitMY committed Jun 3, 2024
1 parent fa8c81a commit e4dbf7b
Showing 2 changed files with 86 additions and 1 deletion.
README.md: 3 changes (2 additions & 1 deletion)

@@ -44,12 +44,13 @@ conda create --name sign_gpt python=3.11 -y
 conda activate sign_gpt
 
 pip install git+https://github.com/sign-language-processing/datasets.git
-pip install mediapipe gdown
+pip install mediapipe gdown lxml
 ```
 
 Generate the data:
 ```bash
 python -m sign_gpt.datasets.rwth_phoenix2014_t
 python -m sign_gpt.datasets.dicta_sign
 python -m sign_gpt.datasets.dgs_types
+python -m sign_gpt.datasets.dgs_corpus
 ```
sign_gpt/datasets/dgs_corpus.py: 84 changes (84 additions & 0 deletions, new file)

@@ -0,0 +1,84 @@
import gzip
import json
from pathlib import Path

import tensorflow_datasets as tfds
from sign_language_datasets.datasets.dgs_corpus import DgsCorpusConfig
from tqdm import tqdm

from sign_gpt.language_utils.i18n import i18n

DATASET_NAME = "dgs_corpus"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = DgsCorpusConfig(name="only-annotations-sentence-level-uzh", version="1.0.0",
include_video=False, include_pose=None, data_type="sentence",
split="3.0.0-uzh-sentence")
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
"gloss_to_text": "Given a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus, translate it into a natural {spoken_language} sentence.\nInput: {gloss}\nOutput: {text}",
"text_to_gloss": "Given a {spoken_language} sentence, convert it into a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus.\nInput: {text}\nOutput: {gloss}",
}
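# For illustration only; the exact strings returned by i18n are assumptions
# (here "German Sign Language" and "German"), and the gloss/text values are
# made up. TASKS["gloss_to_text"].format(gloss="ICH1 MÖGEN1 (mag)",
# text="Ich mag das.", signed_language="German Sign Language",
# spoken_language="German") would yield:
#   Given a sequence of German Sign Language, German glosses and (mouthings)
#   following the style and conventions of The DGS Corpus, translate it into
#   a natural German sentence.
#   Input: ICH1 MÖGEN1 (mag)
#   Output: Ich mag das.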


def build_gloss_text(glosses: list[str],
                     gloss_starts: list[int],
                     mouthings: list[str],
                     mouthings_start: list[int]) -> str:
    """Interleave glosses with their mouthings, ordered by start time.

    Each mouthing is attached, in parentheses, to the first gloss whose
    start time is at or after the mouthing's start time; mouthings that
    start after the last gloss are dropped.
    """
    text = ""
    mouthing_index = 0
    for gloss, gloss_start in zip(glosses, gloss_starts):
        text += " " + gloss
        # Collect every not-yet-consumed mouthing that started at or before this gloss.
        gloss_mouthings = []
        while mouthing_index < len(mouthings) and mouthings_start[mouthing_index] <= gloss_start:
            gloss_mouthings.append(mouthings[mouthing_index])
            mouthing_index += 1
        if len(gloss_mouthings) > 0:
            text += f" ({' '.join(gloss_mouthings)})"
    return text.strip()
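# Hypothetical worked example (made-up values):
#   build_gloss_text(["ICH1", "MÖGEN1"], [0, 800], ["mag"], [400])
#   returns "ICH1 MÖGEN1 (mag)": the mouthing "mag" (start 400) attaches to
#   MÖGEN1, the first gloss whose start time (800) is at or after 400.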


for split, split_data in dataset.items():
    # One gzipped JSONL output file per task, per split.
    split_files = {
        task: gzip.open(DATA_PATH / f"{task}.{split}.jsonl.gz", "wt", encoding="utf-8")
        for task in TASKS
    }

    for datum in tqdm(split_data):
        sentence = datum['sentence']

        # Decode gloss labels (German 'Gebärde' and English 'Sign' tiers) and their times.
        glosses = sentence['glosses']
        german_glosses = [g.numpy().decode('utf-8') for g in glosses['Gebärde']]
        english_glosses = [g.numpy().decode('utf-8') for g in glosses['Sign']]
        gloss_start_times = [int(t.numpy()) for t in glosses['start']]
        gloss_end_times = [int(t.numpy()) for t in glosses['end']]  # currently unused

        mouthings = sentence["mouthings"]
        mouthings_text = [m.numpy().decode('utf-8') for m in mouthings['mouthing']]
        mouthings_start_times = [int(t.numpy()) for t in mouthings['start']]
        mouthings_end_times = [int(t.numpy()) for t in mouthings['end']]  # currently unused

        # Build one German and one English instance per sentence.
        params_list = [
            {
                "gloss": build_gloss_text(german_glosses, gloss_start_times, mouthings_text, mouthings_start_times),
                "text": sentence['german'].numpy().decode('utf-8'),
                "signed_language": i18n("signed_languages", "gsg"),
                "spoken_language": i18n("languages", "de"),
            },
            {
                "gloss": build_gloss_text(english_glosses, gloss_start_times, mouthings_text, mouthings_start_times),
                "text": sentence['english'].numpy().decode('utf-8'),
                "signed_language": i18n("signed_languages", "gsg"),
                "spoken_language": i18n("languages", "en"),
            }
        ]

        for params in params_list:
            for task, file in split_files.items():
                instruction_text = TASKS[task].format(**params)
                file.write(json.dumps({"text": instruction_text}) + "\n")

    for file in split_files.values():
        file.close()
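A minimal sketch for spot-checking the generated files, assuming a `train` split exists (the actual split names depend on the dataset's split configuration):

```python
import gzip
import json

# Hypothetical path: task and split names follow the pattern used above.
path = "processed/dgs_corpus/gloss_to_text.train.jsonl.gz"

with gzip.open(path, "rt", encoding="utf-8") as f:
    for i, line in enumerate(f):
        print(json.loads(line)["text"])  # one instruction-formatted example per line
        if i >= 2:  # only show the first three examples
            break
```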
