diff --git a/README.md b/README.md
index d77e779..441b1d6 100644
--- a/README.md
+++ b/README.md
@@ -56,4 +56,14 @@ python -m sign_gpt.custom_datasets.dicta_sign
 python -m sign_gpt.custom_datasets.dgs_types
 python -m sign_gpt.custom_datasets.dgs_corpus
 python -m sign_gpt.custom_datasets.signbank_plus
-```
\ No newline at end of file
+```
+
+### Crawl Idea
+
+We have very large web crawls (such as CommonCrawl) that can be used to collect data from websites and books.
+We can vectorize all videos.
+We have very strong and capable language models that can help us create data.
+So the idea would be: crawl the web, feed each document to a language model, and have it generate a system prompt
+together with the relevant inputs and outputs from that document. We then compile these into "CrawlInstruct".
+
+Videos that include captions are always converted to a translation task.
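The "Crawl Idea" section added above describes a pipeline only in prose. Below is a minimal sketch of what that loop could look like, assuming a hypothetical document iterator and LLM client; `iter_crawl_documents`, `language_model`, and the extraction prompt wording are all placeholders, not APIs from this repository:

```python
import json

# Hypothetical extraction prompt; the exact wording is an assumption.
EXTRACTION_PROMPT = (
    "Read the following document. Produce a JSON object with a 'system' prompt "
    "and a list of 'messages', each containing an 'input' and 'output' drawn "
    "from the document.\n\nDocument:\n{document}"
)


def build_crawl_instruct(documents, language_model, out_path="crawl_instruct.jsonl"):
    """Compile LLM-extracted tasks from crawled documents into a JSONL file."""
    with open(out_path, "w", encoding="utf-8") as f:
        for document in documents:
            # Ask the model to extract a task from the raw document text.
            response = language_model(EXTRACTION_PROMPT.format(document=document))
            task = json.loads(response)  # expected: {"system": ..., "messages": [...]}
            f.write(json.dumps(task) + "\n")
```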
diff --git a/processed/dgs_corpus/gloss_to_text.test.jsonl.gz b/processed/dgs_corpus/gloss_to_text.test.jsonl.gz
index f77645d..80738e5 100644
Binary files a/processed/dgs_corpus/gloss_to_text.test.jsonl.gz and b/processed/dgs_corpus/gloss_to_text.test.jsonl.gz differ
diff --git a/processed/dgs_corpus/gloss_to_text.train.jsonl.gz b/processed/dgs_corpus/gloss_to_text.train.jsonl.gz
index 3626a35..146a46a 100644
Binary files a/processed/dgs_corpus/gloss_to_text.train.jsonl.gz and b/processed/dgs_corpus/gloss_to_text.train.jsonl.gz differ
diff --git a/processed/dgs_corpus/gloss_to_text.validation.jsonl.gz b/processed/dgs_corpus/gloss_to_text.validation.jsonl.gz
index 194bc4e..31f06f4 100644
Binary files a/processed/dgs_corpus/gloss_to_text.validation.jsonl.gz and b/processed/dgs_corpus/gloss_to_text.validation.jsonl.gz differ
diff --git a/processed/dgs_corpus/text_to_gloss.test.jsonl.gz b/processed/dgs_corpus/text_to_gloss.test.jsonl.gz
index 0a26a52..97ed025 100644
Binary files a/processed/dgs_corpus/text_to_gloss.test.jsonl.gz and b/processed/dgs_corpus/text_to_gloss.test.jsonl.gz differ
diff --git a/processed/dgs_corpus/text_to_gloss.train.jsonl.gz b/processed/dgs_corpus/text_to_gloss.train.jsonl.gz
index 40b97fd..361b116 100644
Binary files a/processed/dgs_corpus/text_to_gloss.train.jsonl.gz and b/processed/dgs_corpus/text_to_gloss.train.jsonl.gz differ
diff --git a/processed/dgs_corpus/text_to_gloss.validation.jsonl.gz b/processed/dgs_corpus/text_to_gloss.validation.jsonl.gz
index 3dbb2ce..cc5f87d 100644
Binary files a/processed/dgs_corpus/text_to_gloss.validation.jsonl.gz and b/processed/dgs_corpus/text_to_gloss.validation.jsonl.gz differ
diff --git a/processed/dgs_types/gloss_to_hamnosys.train.jsonl.gz b/processed/dgs_types/gloss_to_hamnosys.train.jsonl.gz
index 49c143b..ce1afad 100644
Binary files a/processed/dgs_types/gloss_to_hamnosys.train.jsonl.gz and b/processed/dgs_types/gloss_to_hamnosys.train.jsonl.gz differ
diff --git a/processed/dgs_types/hamnosys_to_gloss.train.jsonl.gz b/processed/dgs_types/hamnosys_to_gloss.train.jsonl.gz
index 0a9f760..781458e 100644
Binary files a/processed/dgs_types/hamnosys_to_gloss.train.jsonl.gz and b/processed/dgs_types/hamnosys_to_gloss.train.jsonl.gz differ
diff --git a/processed/dicta_sign/hamnosys_to_text.train.jsonl.gz b/processed/dicta_sign/hamnosys_to_text.train.jsonl.gz
index ceb8269..7b32c59 100644
Binary files a/processed/dicta_sign/hamnosys_to_text.train.jsonl.gz and b/processed/dicta_sign/hamnosys_to_text.train.jsonl.gz differ
diff --git a/processed/dicta_sign/text_to_hamnosys.train.jsonl.gz b/processed/dicta_sign/text_to_hamnosys.train.jsonl.gz
index 73b732e..d0a391d 100644
Binary files a/processed/dicta_sign/text_to_hamnosys.train.jsonl.gz and b/processed/dicta_sign/text_to_hamnosys.train.jsonl.gz differ
diff --git a/processed/rwth_phoenix2014_t/gloss_to_text.test.jsonl.gz b/processed/rwth_phoenix2014_t/gloss_to_text.test.jsonl.gz
index fabd09f..c54c4c4 100644
Binary files a/processed/rwth_phoenix2014_t/gloss_to_text.test.jsonl.gz and b/processed/rwth_phoenix2014_t/gloss_to_text.test.jsonl.gz differ
diff --git a/processed/rwth_phoenix2014_t/gloss_to_text.train.jsonl.gz b/processed/rwth_phoenix2014_t/gloss_to_text.train.jsonl.gz
index 1d7e0a4..a8e8a52 100644
Binary files a/processed/rwth_phoenix2014_t/gloss_to_text.train.jsonl.gz and b/processed/rwth_phoenix2014_t/gloss_to_text.train.jsonl.gz differ
diff --git a/processed/rwth_phoenix2014_t/gloss_to_text.validation.jsonl.gz b/processed/rwth_phoenix2014_t/gloss_to_text.validation.jsonl.gz
index cf2c6fb..91ffe34 100644
Binary files a/processed/rwth_phoenix2014_t/gloss_to_text.validation.jsonl.gz and b/processed/rwth_phoenix2014_t/gloss_to_text.validation.jsonl.gz differ
diff --git a/processed/rwth_phoenix2014_t/text_to_gloss.test.jsonl.gz b/processed/rwth_phoenix2014_t/text_to_gloss.test.jsonl.gz
index 68dd404..9cd7944 100644
Binary files a/processed/rwth_phoenix2014_t/text_to_gloss.test.jsonl.gz and b/processed/rwth_phoenix2014_t/text_to_gloss.test.jsonl.gz differ
diff --git a/processed/rwth_phoenix2014_t/text_to_gloss.train.jsonl.gz b/processed/rwth_phoenix2014_t/text_to_gloss.train.jsonl.gz
index c940e58..ec8f485 100644
Binary files a/processed/rwth_phoenix2014_t/text_to_gloss.train.jsonl.gz and b/processed/rwth_phoenix2014_t/text_to_gloss.train.jsonl.gz differ
diff --git a/processed/rwth_phoenix2014_t/text_to_gloss.validation.jsonl.gz b/processed/rwth_phoenix2014_t/text_to_gloss.validation.jsonl.gz
index abf7ae2..50d6235 100644
Binary files a/processed/rwth_phoenix2014_t/text_to_gloss.validation.jsonl.gz and b/processed/rwth_phoenix2014_t/text_to_gloss.validation.jsonl.gz differ
diff --git a/processed/signbank_plus/signwriting_to_text.test.jsonl.gz b/processed/signbank_plus/signwriting_to_text.test.jsonl.gz
index ba7d49f..d96c8cd 100644
Binary files a/processed/signbank_plus/signwriting_to_text.test.jsonl.gz and b/processed/signbank_plus/signwriting_to_text.test.jsonl.gz differ
diff --git a/processed/signbank_plus/signwriting_to_text.train.jsonl.gz b/processed/signbank_plus/signwriting_to_text.train.jsonl.gz
index bde489c..ac5fb34 100644
Binary files a/processed/signbank_plus/signwriting_to_text.train.jsonl.gz and b/processed/signbank_plus/signwriting_to_text.train.jsonl.gz differ
diff --git a/processed/signbank_plus/signwriting_to_text.validation.jsonl.gz b/processed/signbank_plus/signwriting_to_text.validation.jsonl.gz
index b6bc705..e8e24a1 100644
Binary files a/processed/signbank_plus/signwriting_to_text.validation.jsonl.gz and b/processed/signbank_plus/signwriting_to_text.validation.jsonl.gz differ
diff --git a/processed/signbank_plus/text_to_signwriting.test.jsonl.gz b/processed/signbank_plus/text_to_signwriting.test.jsonl.gz
index 401f290..60f4285 100644
Binary files a/processed/signbank_plus/text_to_signwriting.test.jsonl.gz and b/processed/signbank_plus/text_to_signwriting.test.jsonl.gz differ
diff --git a/processed/signbank_plus/text_to_signwriting.train.jsonl.gz b/processed/signbank_plus/text_to_signwriting.train.jsonl.gz
index cdbd31f..8aa2204 100644
Binary files a/processed/signbank_plus/text_to_signwriting.train.jsonl.gz and b/processed/signbank_plus/text_to_signwriting.train.jsonl.gz differ
diff --git a/processed/signbank_plus/text_to_signwriting.validation.jsonl.gz b/processed/signbank_plus/text_to_signwriting.validation.jsonl.gz
index ae7fc5d..126f5fa 100644
Binary files a/processed/signbank_plus/text_to_signwriting.validation.jsonl.gz and b/processed/signbank_plus/text_to_signwriting.validation.jsonl.gz differ
diff --git a/sign_gpt/custom_datasets/dataset_utils.py b/sign_gpt/custom_datasets/dataset_utils.py
new file mode 100644
index 0000000..f3a40c9
--- /dev/null
+++ b/sign_gpt/custom_datasets/dataset_utils.py
@@ -0,0 +1,13 @@
+from copy import deepcopy
+
+
+def format_task(task, params):
+    task = deepcopy(task)
+    for key, value in task.items():
+        if isinstance(value, str):
+            task[key] = value.format(**params)
+        elif isinstance(value, list):
+            task[key] = [format_task(v, params) for v in value]
+        elif isinstance(value, dict):
+            task[key] = format_task(value, params)
+    return task
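To make the new helper concrete: `format_task` recursively fills every `{placeholder}` string in a nested task template, leaving the dict/list structure intact. The sample template and values below are invented for illustration:

```python
from sign_gpt.custom_datasets.dataset_utils import format_task

task = {
    "system": "Translate {signed_language} glosses into {spoken_language}.",
    "messages": [{"input": "{gloss}", "output": "{text}"}],
}
params = {
    "signed_language": "German Sign Language",
    "spoken_language": "German",
    "gloss": "HEUTE REGEN",
    "text": "Heute regnet es.",
}

# The original template is deep-copied, so it can be reused across examples.
# Prints:
# {'system': 'Translate German Sign Language glosses into German.',
#  'messages': [{'input': 'HEUTE REGEN', 'output': 'Heute regnet es.'}]}
print(format_task(task, params))
```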
diff --git a/sign_gpt/custom_datasets/dgs_corpus.py b/sign_gpt/custom_datasets/dgs_corpus.py
index a3415ee..1f508e2 100644
--- a/sign_gpt/custom_datasets/dgs_corpus.py
+++ b/sign_gpt/custom_datasets/dgs_corpus.py
@@ -6,10 +6,11 @@
 from sign_language_datasets.datasets.dgs_corpus import DgsCorpusConfig
 from tqdm import tqdm
 
+from sign_gpt.custom_datasets.dataset_utils import format_task
 from sign_gpt.language_utils.i18n import i18n
 
 DATASET_NAME = "dgs_corpus"
-DATA_PATH = Path(f"processed/{DATASET_NAME}")
+DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
 DATA_PATH.mkdir(parents=True, exist_ok=True)
 
 config = DgsCorpusConfig(name="only-annotations-sentence-level-uzh", version="1.0.0",
@@ -18,8 +19,20 @@
 dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))
 
 TASKS = {
-    "gloss_to_text": "Given a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus, translate it into a natural {spoken_language} sentence.\nInput: {gloss}\nOutput: {text}",
-    "text_to_gloss": "Given a {spoken_language} sentence, convert it into a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus.\nInput: {text}\nOutput: {gloss}",
+    "gloss_to_text": {
+        "system": "Given a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus, translate it into a natural {spoken_language} sentence.",
+        "messages": [{
+            "input": "{gloss}",
+            "output": "{text}",
+        }]
+    },
+    "text_to_gloss": {
+        "system": "Given a {spoken_language} sentence, convert it into a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus.",
+        "messages": [{
+            "input": "{text}",
+            "output": "{gloss}",
+        }]
+    }
 }
 
 
@@ -77,8 +90,7 @@ def build_gloss_text(glosses: list[str],
 
     for params in params_list:
         for task, file in split_files.items():
-            instruction_text = TASKS[task].format(**params)
-            file.write(json.dumps({"text": instruction_text}) + "\n")
+            file.write(json.dumps(format_task(TASKS[task], params)) + "\n")
 
     for file in split_files.values():
         file.close()
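With this change, every line of the regenerated `.jsonl.gz` splits (the binary files above) holds a `{"system": ..., "messages": [...]}` object instead of a flat `{"text": ...}` record. A minimal sketch for inspecting one split, using only the standard library; the path is just one example of the shared layout:

```python
import gzip
import json
from pathlib import Path

# Any of the processed splits follows the same one-JSON-object-per-line layout.
path = Path("processed/dgs_corpus/gloss_to_text.train.jsonl.gz")

with gzip.open(path, "rt", encoding="utf-8") as f:
    for line in f:
        datum = json.loads(line)
        print(datum["system"])
        for message in datum["messages"]:
            print(message["input"], "->", message["output"])
        break  # inspect just the first record
```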
diff --git a/sign_gpt/custom_datasets/dgs_types.py b/sign_gpt/custom_datasets/dgs_types.py
index a802b5a..55451b1 100644
--- a/sign_gpt/custom_datasets/dgs_types.py
+++ b/sign_gpt/custom_datasets/dgs_types.py
@@ -5,10 +5,11 @@
 import tensorflow_datasets as tfds
 from sign_language_datasets.datasets.config import SignDatasetConfig
 
+from sign_gpt.custom_datasets.dataset_utils import format_task
 from sign_gpt.language_utils.i18n import i18n
 
 DATASET_NAME = "dgs_types"
-DATA_PATH = Path(f"processed/{DATASET_NAME}")
+DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
 DATA_PATH.mkdir(parents=True, exist_ok=True)
 
 config = SignDatasetConfig(name="only-annotations", version="1.0.0",
@@ -16,8 +17,20 @@
 dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))
 
 TASKS = {
-    "hamnosys_to_gloss": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into a {spoken_language} gloss according to the DGS Corpus.\nInput: {hamnosys}\nOutput: {gloss}",
-    "gloss_to_hamnosys": "Given a {spoken_language} gloss according to the DGS Corpus, translate it into HamNoSys notation in {signed_language}.\nInput: {gloss}\nOutput: {hamnosys}",
+    "hamnosys_to_gloss": {
+        "system": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into a {spoken_language} gloss according to the DGS Corpus.",
+        "messages": [{
+            "input": "{hamnosys}",
+            "output": "{gloss}",
+        }]
+    },
+    "gloss_to_hamnosys": {
+        "system": "Given a {spoken_language} gloss according to the DGS Corpus, translate it into HamNoSys notation in {signed_language}.",
+        "messages": [{
+            "input": "{gloss}",
+            "output": "{hamnosys}",
+        }]
+    }
 }
 
 for split, split_data in dataset.items():
@@ -37,8 +50,7 @@
         }
 
         for task, file in split_files.items():
-            instruction_text = TASKS[task].format(**params)
-            file.write(json.dumps({"text": instruction_text}) + "\n")
+            file.write(json.dumps(format_task(TASKS[task], params)) + "\n")
 
     for file in split_files.values():
         file.close()
diff --git a/sign_gpt/custom_datasets/dicta_sign.py b/sign_gpt/custom_datasets/dicta_sign.py
index 0d1b333..5acb20c 100644
--- a/sign_gpt/custom_datasets/dicta_sign.py
+++ b/sign_gpt/custom_datasets/dicta_sign.py
@@ -5,19 +5,32 @@
 import tensorflow_datasets as tfds
 from sign_language_datasets.datasets.config import SignDatasetConfig
 
+from sign_gpt.custom_datasets.dataset_utils import format_task
 from sign_gpt.language_utils.i18n import i18n
 from sign_gpt.language_utils.info import sign_language_by_abbreviation
 
 DATASET_NAME = "dicta_sign"
-DATA_PATH = Path(f"processed/{DATASET_NAME}")
+DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
 DATA_PATH.mkdir(parents=True, exist_ok=True)
 
 config = SignDatasetConfig(name="only-annotations", version="1.0.0", include_video=False, include_pose=None)
 dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))
 
 TASKS = {
-    "hamnosys_to_text": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into {spoken_language} text.\nInput: {hamnosys}\nOutput: {text}",
-    "text_to_hamnosys": "Given a sequence of {spoken_language} text, translate it into HamNoSys notation in {signed_language}.\nInput: {text}\nOutput: {hamnosys}",
+    "hamnosys_to_text": {
+        "system": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into {spoken_language} text.",
+        "messages": [{
+            "input": "{hamnosys}",
+            "output": "{text}",
+        }]
+    },
+    "text_to_hamnosys": {
+        "system": "Given a sequence of {spoken_language} text, translate it into HamNoSys notation in {signed_language}.",
+        "messages": [{
+            "input": "{text}",
+            "output": "{hamnosys}",
+        }]
+    }
 }
 
 for split, split_data in dataset.items():
@@ -38,8 +51,7 @@
             "gloss": datum['gloss'].numpy().decode('utf-8'),
         }
         for task, file in split_files.items():
-            instruction_text = TASKS[task].format(**params)
-            file.write(json.dumps({"text": instruction_text}) + "\n")
+            file.write(json.dumps(format_task(TASKS[task], params)) + "\n")
 
     for file in split_files.values():
         file.close()
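One detail shared by all of these dataset scripts: `DATA_PATH` is now anchored to the repository root via `__file__` rather than resolved against the current working directory, so each script writes to the same `processed/` directory no matter where it is launched from. A standalone illustration (not repository code; it assumes the file lives under `sign_gpt/custom_datasets/`):

```python
from pathlib import Path

# Before: resolved relative to whatever directory the script is run from.
cwd_relative = Path("processed/dicta_sign")

# After: three .parent hops climb from sign_gpt/custom_datasets/<script>.py
# up to the repository root, making the output location stable.
repo_root = Path(__file__).parent.parent.parent
anchored = repo_root / "processed" / "dicta_sign"
```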
"\n") + file.write(json.dumps(format_task(TASKS[task], params)) + "\n") for file in split_files.values(): file.close() diff --git a/sign_gpt/custom_datasets/rwth_phoenix2014_t.py b/sign_gpt/custom_datasets/rwth_phoenix2014_t.py index b217c33..b9cfb0b 100644 --- a/sign_gpt/custom_datasets/rwth_phoenix2014_t.py +++ b/sign_gpt/custom_datasets/rwth_phoenix2014_t.py @@ -1,20 +1,34 @@ -import json import gzip +import json from pathlib import Path import tensorflow_datasets as tfds from sign_language_datasets.datasets.config import SignDatasetConfig +from sign_gpt.custom_datasets.dataset_utils import format_task + DATASET_NAME = "rwth_phoenix2014_t" -DATA_PATH = Path(f"processed/{DATASET_NAME}") +DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME DATA_PATH.mkdir(parents=True, exist_ok=True) config = SignDatasetConfig(name="only-annotations", version="3.0.0", include_video=False) dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config)) TASKS = { - "gloss_to_text": "Given a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset, convert it into a natural German sentence, lowercased.\nInput: {gloss}\nOutput: {text}", - "text_to_gloss": "Given a German sentence, convert it into a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset.\nInput: {text}\nOutput: {gloss}", + "gloss_to_text": { + "system": "Given a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset, convert it into a natural German sentence, lowercased.", + "messages": [{ + "input": "{gloss}", + "output": "{text}", + }] + }, + "text_to_gloss": { + "system": "Given a German sentence, convert it into a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset.", + "messages": [{ + "input": "{text}", + "output": "{gloss}", + }] + } } for split, split_data in dataset.items(): @@ -29,8 +43,7 @@ "text": datum['text'].numpy().decode('utf-8') } for task, file in split_files.items(): - instruction_text = TASKS[task].format(**params) - file.write(json.dumps({"text": instruction_text}) + "\n") + file.write(json.dumps(format_task(TASKS[task], params)) + "\n") for file in split_files.values(): file.close() diff --git a/sign_gpt/custom_datasets/signbank_plus.py b/sign_gpt/custom_datasets/signbank_plus.py index 8728942..4f8508f 100644 --- a/sign_gpt/custom_datasets/signbank_plus.py +++ b/sign_gpt/custom_datasets/signbank_plus.py @@ -6,12 +6,13 @@ from tqdm import tqdm +from sign_gpt.custom_datasets.dataset_utils import format_task from sign_gpt.language_utils.i18n import i18n csv.field_size_limit(2 ** 20) # Increase limit to 1MB (2^20 characters) DATASET_NAME = "signbank_plus" -DATA_PATH = Path(f"processed/{DATASET_NAME}") +DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME DATA_PATH.mkdir(parents=True, exist_ok=True) data_dir = Path("/Users/amitmoryossef/dev/sign-language-processing/signbank-annotation/signbank-plus/data/parallel") @@ -20,7 +21,6 @@ paths = { "train": { - "expanded": data_dir / "expanded" / "train.csv", "cleaned": data_dir / "cleaned" / "train.csv", "more": data_dir / "more" / "train.csv", }, @@ -33,8 +33,20 @@ } TASKS = { - "signwriting_to_text": "Given a sequence of SignWriting in {signed_language} ({data_type}), translate it into {spoken_language}.\nInput: {signwriting}\nOutput: {text}", - 
"text_to_signwriting": "Given {spoken_language} text, translate it into {signed_language} using SignWriting ({data_type}).\nInput: {text}\nOutput: {signwriting}", + "signwriting_to_text": { + "system": "Given a sequence of SignWriting in {signed_language}, translate it into {spoken_language}.", + "messages": [{ + "input": "{signwriting}", + "output": "{text}", + }] + }, + "text_to_signwriting": { + "system": "Given {spoken_language} text, translate it into {signed_language} using SignWriting.", + "messages": [{ + "input": "{text}", + "output": "{signwriting}", + }] + } } for split, split_data in paths.items(): @@ -58,8 +70,7 @@ "signwriting": ' '.join(signs) } for task, file in split_files.items(): - instruction_text = TASKS[task].format(**params) - file.write(json.dumps({"text": instruction_text}) + "\n") + file.write(json.dumps(format_task(TASKS[task], params)) + "\n") for file in split_files.values(): file.close() diff --git a/sign_gpt/train_lora.py b/sign_gpt/train_lora.py index 636272c..0ebfdb1 100644 --- a/sign_gpt/train_lora.py +++ b/sign_gpt/train_lora.py @@ -12,10 +12,10 @@ def prep_llama3_instruction(datum): - text = datum['text'] - text = "<|start_header_id|>system<|end_header_id|>\n\n" + text + "<|eot_id|>" - text = text.replace("\nInput: ", "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n") - text = text.replace("\nOutput: ", "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n") + text = "<|start_header_id|>system<|end_header_id|>\n\n" + datum['system'] + "<|eot_id|>" + for message in datum['messages']: + text += "<|start_header_id|>user<|end_header_id|>\n\n" + message['input'] + "<|eot_id|>" + text += "<|start_header_id|>assistant<|end_header_id|>\n\n" + message['output'] + "<|eot_id|>" return {"text": text}