feat(datasets): remove template from source data
AmitMY committed Jun 20, 2024
1 parent 0940012 commit d452f76
Showing 30 changed files with 115 additions and 32 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -56,4 +56,14 @@ python -m sign_gpt.custom_datasets.dicta_sign
python -m sign_gpt.custom_datasets.dgs_types
python -m sign_gpt.custom_datasets.dgs_corpus
python -m sign_gpt.custom_datasets.signbank_plus
```
```

### Crawl Idea

We have very large web crawls (such as CommonCrawl) that can be used to collect data from websites and books.
We can vectorize all videos.
We have strong, capable language models that can help us create data.
The idea: crawl the web, feed each document to a language model, and have it generate a system prompt
together with the relevant inputs and outputs grounded in the document. We then compile the results into "CrawlInstruct".

Videos that include captions are always converted into a translation task.
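
A minimal sketch of the pipeline, assuming a record schema matching the `system`/`messages` format introduced in this commit; the `crawl_to_instruct` helper, its prompt, and the `generate` callable are illustrative assumptions, not existing code:

```python
import json


def crawl_to_instruct(document_text: str, generate) -> dict:
    """Hypothetical sketch: `generate` stands in for any LLM text-completion call."""
    prompt = (
        "From the following document, derive a task. Answer as JSON with keys "
        "'system' (a system prompt), 'input', and 'output', all grounded in the document.\n\n"
        + document_text
    )
    extracted = json.loads(generate(prompt))
    # Compile into the same {"system", "messages"} record format used by the datasets below.
    return {
        "system": extracted["system"],
        "messages": [{"input": extracted["input"], "output": extracted["output"]}],
    }
```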
Binary file modified processed/dgs_corpus/gloss_to_text.test.jsonl.gz
Binary file modified processed/dgs_corpus/gloss_to_text.train.jsonl.gz
Binary file modified processed/dgs_corpus/gloss_to_text.validation.jsonl.gz
Binary file modified processed/dgs_corpus/text_to_gloss.test.jsonl.gz
Binary file modified processed/dgs_corpus/text_to_gloss.train.jsonl.gz
Binary file modified processed/dgs_corpus/text_to_gloss.validation.jsonl.gz
Binary file modified processed/dgs_types/gloss_to_hamnosys.train.jsonl.gz
Binary file modified processed/dgs_types/hamnosys_to_gloss.train.jsonl.gz
Binary file modified processed/dicta_sign/hamnosys_to_text.train.jsonl.gz
Binary file modified processed/dicta_sign/text_to_hamnosys.train.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/gloss_to_text.test.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/gloss_to_text.train.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/gloss_to_text.validation.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/text_to_gloss.test.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/text_to_gloss.train.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/text_to_gloss.validation.jsonl.gz
Binary file modified processed/signbank_plus/signwriting_to_text.test.jsonl.gz
Binary file modified processed/signbank_plus/signwriting_to_text.train.jsonl.gz
Binary file modified processed/signbank_plus/signwriting_to_text.validation.jsonl.gz
Binary file modified processed/signbank_plus/text_to_signwriting.test.jsonl.gz
Binary file modified processed/signbank_plus/text_to_signwriting.train.jsonl.gz
Binary file modified processed/signbank_plus/text_to_signwriting.validation.jsonl.gz
13 changes: 13 additions & 0 deletions sign_gpt/custom_datasets/dataset_utils.py
@@ -0,0 +1,13 @@
from copy import deepcopy


def format_task(task, params):
    """Recursively fill every string value in a nested task template via str.format(**params)."""
    task = deepcopy(task)
    for key, value in task.items():
        if isinstance(value, str):
            task[key] = value.format(**params)
        elif isinstance(value, list):
            task[key] = [format_task(v, params) for v in value]
        elif isinstance(value, dict):
            task[key] = format_task(value, params)
    return task
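
A usage sketch (template and parameter values invented for illustration) showing how `format_task` recursively fills a nested template:

```python
from sign_gpt.custom_datasets.dataset_utils import format_task

task = {
    "system": "Translate {signed_language} glosses into {spoken_language}.",
    "messages": [{"input": "{gloss}", "output": "{text}"}],
}
params = {
    "signed_language": "German Sign Language",
    "spoken_language": "German",
    "gloss": "HEUTE REGEN",
    "text": "Heute regnet es.",
}

print(format_task(task, params))
# {'system': 'Translate German Sign Language glosses into German.',
#  'messages': [{'input': 'HEUTE REGEN', 'output': 'Heute regnet es.'}]}
```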
22 changes: 17 additions & 5 deletions sign_gpt/custom_datasets/dgs_corpus.py
@@ -6,10 +6,11 @@
from sign_language_datasets.datasets.dgs_corpus import DgsCorpusConfig
from tqdm import tqdm

from sign_gpt.custom_datasets.dataset_utils import format_task
from sign_gpt.language_utils.i18n import i18n

DATASET_NAME = "dgs_corpus"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = DgsCorpusConfig(name="only-annotations-sentence-level-uzh", version="1.0.0",
@@ -18,8 +19,20 @@
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
    "gloss_to_text": "Given a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus, translate it into a natural {spoken_language} sentence.\nInput: {gloss}\nOutput: {text}",
    "text_to_gloss": "Given a {spoken_language} sentence, convert it into a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus.\nInput: {text}\nOutput: {gloss}",
    "gloss_to_text": {
        "system": "Given a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus, translate it into a natural {spoken_language} sentence.",
        "messages": [{
            "input": "{gloss}",
            "output": "{text}",
        }]
    },
    "text_to_gloss": {
        "system": "Given a {spoken_language} sentence, convert it into a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus.",
        "messages": [{
            "input": "{text}",
            "output": "{gloss}",
        }]
    }
}


@@ -77,8 +90,7 @@ def build_gloss_text(glosses: list[str],

        for params in params_list:
            for task, file in split_files.items():
                instruction_text = TASKS[task].format(**params)
                file.write(json.dumps({"text": instruction_text}) + "\n")
                file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

    for file in split_files.values():
        file.close()
22 changes: 17 additions & 5 deletions sign_gpt/custom_datasets/dgs_types.py
@@ -5,19 +5,32 @@
import tensorflow_datasets as tfds
from sign_language_datasets.datasets.config import SignDatasetConfig

from sign_gpt.custom_datasets.dataset_utils import format_task
from sign_gpt.language_utils.i18n import i18n

DATASET_NAME = "dgs_types"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = SignDatasetConfig(name="only-annotations", version="1.0.0",
                           include_video=False, process_video=False, include_pose=None)
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
    "hamnosys_to_gloss": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into a {spoken_language} gloss according to the DGS Corpus.\nInput: {hamnosys}\nOutput: {gloss}",
    "gloss_to_hamnosys": "Given a {spoken_language} gloss according to the DGS Corpus, translate it into HamNoSys notation in {signed_language}.\nInput: {gloss}\nOutput: {hamnosys}",
    "hamnosys_to_gloss": {
        "system": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into a {spoken_language} gloss according to the DGS Corpus.",
        "messages": [{
            "input": "{hamnosys}",
            "output": "{gloss}",
        }]
    },
    "gloss_to_hamnosys": {
        "system": "Given a {spoken_language} gloss according to the DGS Corpus, translate it into HamNoSys notation in {signed_language}.",
        "messages": [{
            "input": "{gloss}",
            "output": "{hamnosys}",
        }]
    }
}

for split, split_data in dataset.items():
@@ -37,8 +50,7 @@
        }

        for task, file in split_files.items():
            instruction_text = TASKS[task].format(**params)
            file.write(json.dumps({"text": instruction_text}) + "\n")
            file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

    for file in split_files.values():
        file.close()
22 changes: 17 additions & 5 deletions sign_gpt/custom_datasets/dicta_sign.py
@@ -5,19 +5,32 @@
import tensorflow_datasets as tfds
from sign_language_datasets.datasets.config import SignDatasetConfig

from sign_gpt.custom_datasets.dataset_utils import format_task
from sign_gpt.language_utils.i18n import i18n
from sign_gpt.language_utils.info import sign_language_by_abbreviation

DATASET_NAME = "dicta_sign"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = SignDatasetConfig(name="only-annotations", version="1.0.0", include_video=False, include_pose=None)
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
    "hamnosys_to_text": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into {spoken_language} text.\nInput: {hamnosys}\nOutput: {text}",
    "text_to_hamnosys": "Given a sequence of {spoken_language} text, translate it into HamNoSys notation in {signed_language}.\nInput: {text}\nOutput: {hamnosys}",
    "hamnosys_to_text": {
        "system": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into {spoken_language} text.",
        "messages": [{
            "input": "{hamnosys}",
            "output": "{text}",
        }]
    },
    "text_to_hamnosys": {
        "system": "Given a sequence of {spoken_language} text, translate it into HamNoSys notation in {signed_language}.",
        "messages": [{
            "input": "{text}",
            "output": "{hamnosys}",
        }]
    }
}

for split, split_data in dataset.items():
@@ -38,8 +51,7 @@
"gloss": datum['gloss'].numpy().decode('utf-8'),
}
for task, file in split_files.items():
instruction_text = TASKS[task].format(**params)
file.write(json.dumps({"text": instruction_text}) + "\n")
file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

for file in split_files.values():
file.close()
25 changes: 19 additions & 6 deletions sign_gpt/custom_datasets/rwth_phoenix2014_t.py
@@ -1,20 +1,34 @@
import json
import gzip
import json
from pathlib import Path

import tensorflow_datasets as tfds
from sign_language_datasets.datasets.config import SignDatasetConfig

from sign_gpt.custom_datasets.dataset_utils import format_task

DATASET_NAME = "rwth_phoenix2014_t"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = SignDatasetConfig(name="only-annotations", version="3.0.0", include_video=False)
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
    "gloss_to_text": "Given a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset, convert it into a natural German sentence, lowercased.\nInput: {gloss}\nOutput: {text}",
    "text_to_gloss": "Given a German sentence, convert it into a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset.\nInput: {text}\nOutput: {gloss}",
    "gloss_to_text": {
        "system": "Given a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset, convert it into a natural German sentence, lowercased.",
        "messages": [{
            "input": "{gloss}",
            "output": "{text}",
        }]
    },
    "text_to_gloss": {
        "system": "Given a German sentence, convert it into a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset.",
        "messages": [{
            "input": "{text}",
            "output": "{gloss}",
        }]
    }
}

for split, split_data in dataset.items():
@@ -29,8 +43,7 @@
"text": datum['text'].numpy().decode('utf-8')
}
for task, file in split_files.items():
instruction_text = TASKS[task].format(**params)
file.write(json.dumps({"text": instruction_text}) + "\n")
file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

for file in split_files.values():
file.close()
23 changes: 17 additions & 6 deletions sign_gpt/custom_datasets/signbank_plus.py
@@ -6,12 +6,13 @@

from tqdm import tqdm

from sign_gpt.custom_datasets.dataset_utils import format_task
from sign_gpt.language_utils.i18n import i18n

csv.field_size_limit(2 ** 20) # Increase limit to 1MB (2^20 characters)

DATASET_NAME = "signbank_plus"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

data_dir = Path("/Users/amitmoryossef/dev/sign-language-processing/signbank-annotation/signbank-plus/data/parallel")
@@ -20,7 +21,6 @@

paths = {
    "train": {
        "expanded": data_dir / "expanded" / "train.csv",
        "cleaned": data_dir / "cleaned" / "train.csv",
        "more": data_dir / "more" / "train.csv",
    },
@@ -33,8 +33,20 @@
}

TASKS = {
    "signwriting_to_text": "Given a sequence of SignWriting in {signed_language} ({data_type}), translate it into {spoken_language}.\nInput: {signwriting}\nOutput: {text}",
    "text_to_signwriting": "Given {spoken_language} text, translate it into {signed_language} using SignWriting ({data_type}).\nInput: {text}\nOutput: {signwriting}",
    "signwriting_to_text": {
        "system": "Given a sequence of SignWriting in {signed_language}, translate it into {spoken_language}.",
        "messages": [{
            "input": "{signwriting}",
            "output": "{text}",
        }]
    },
    "text_to_signwriting": {
        "system": "Given {spoken_language} text, translate it into {signed_language} using SignWriting.",
        "messages": [{
            "input": "{text}",
            "output": "{signwriting}",
        }]
    }
}

for split, split_data in paths.items():
@@ -58,8 +70,7 @@
"signwriting": ' '.join(signs)
}
for task, file in split_files.items():
instruction_text = TASKS[task].format(**params)
file.write(json.dumps({"text": instruction_text}) + "\n")
file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

for file in split_files.values():
file.close()
8 changes: 4 additions & 4 deletions sign_gpt/train_lora.py
@@ -12,10 +12,10 @@


def prep_llama3_instruction(datum):
    text = datum['text']
    text = "<|start_header_id|>system<|end_header_id|>\n\n" + text + "<|eot_id|>"
    text = text.replace("\nInput: ", "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n")
    text = text.replace("\nOutput: ", "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")
    text = "<|start_header_id|>system<|end_header_id|>\n\n" + datum['system'] + "<|eot_id|>"
    for message in datum['messages']:
        text += "<|start_header_id|>user<|end_header_id|>\n\n" + message['input'] + "<|eot_id|>"
        text += "<|start_header_id|>assistant<|end_header_id|>\n\n" + message['output'] + "<|eot_id|>"
    return {"text": text}
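
For illustration, a record in the new `system`/`messages` schema (sample values invented here) renders to the following Llama 3 chat string:

```python
datum = {
    "system": "Translate glosses into German.",
    "messages": [{"input": "HEUTE REGEN", "output": "Heute regnet es."}],
}
print(prep_llama3_instruction(datum)["text"])
# <|start_header_id|>system<|end_header_id|>
#
# Translate glosses into German.<|eot_id|><|start_header_id|>user<|end_header_id|>
#
# HEUTE REGEN<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
# Heute regnet es.<|eot_id|>
```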

