feat(datasets): remove template from source data
AmitMY committed Jun 20, 2024
1 parent 0940012 commit d452f76
Showing 30 changed files with 115 additions and 32 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -56,4 +56,14 @@ python -m sign_gpt.custom_datasets.dicta_sign
python -m sign_gpt.custom_datasets.dgs_types
python -m sign_gpt.custom_datasets.dgs_corpus
python -m sign_gpt.custom_datasets.signbank_plus
```
```

### Crawl Idea

We have very large web crawls (such as CommonCrawl) that can be used to collect data from websites and books.
We can vectorize all videos.
We have strong, capable language models that can help us create data.
The idea: crawl the web, feed each document to a language model, and have it generate a system prompt
together with the relevant inputs and outputs grounded in the document. We then compile the results into "CrawlInstruct".

Videos that include captions are always converted into a translation task.
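
A minimal sketch of the pipeline, assuming a record schema matching the `system`/`messages` format introduced in this commit; the `crawl_to_instruct` helper, its prompt, and the `generate` callable are illustrative assumptions, not existing code:

```python
import json


def crawl_to_instruct(document_text: str, generate) -> dict:
    """Hypothetical sketch: `generate` stands in for any LLM text-completion call."""
    prompt = (
        "From the following document, derive a task. Answer as JSON with keys "
        "'system' (a system prompt), 'input', and 'output', all grounded in the document.\n\n"
        + document_text
    )
    extracted = json.loads(generate(prompt))
    # Compile into the same {"system", "messages"} record format used by the datasets below.
    return {
        "system": extracted["system"],
        "messages": [{"input": extracted["input"], "output": extracted["output"]}],
    }
```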
Binary file modified processed/dgs_corpus/gloss_to_text.test.jsonl.gz
Binary file modified processed/dgs_corpus/gloss_to_text.train.jsonl.gz
Binary file modified processed/dgs_corpus/gloss_to_text.validation.jsonl.gz
Binary file modified processed/dgs_corpus/text_to_gloss.test.jsonl.gz
Binary file modified processed/dgs_corpus/text_to_gloss.train.jsonl.gz
Binary file modified processed/dgs_corpus/text_to_gloss.validation.jsonl.gz
Binary file modified processed/dgs_types/gloss_to_hamnosys.train.jsonl.gz
Binary file modified processed/dgs_types/hamnosys_to_gloss.train.jsonl.gz
Binary file modified processed/dicta_sign/hamnosys_to_text.train.jsonl.gz
Binary file modified processed/dicta_sign/text_to_hamnosys.train.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/gloss_to_text.test.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/gloss_to_text.train.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/gloss_to_text.validation.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/text_to_gloss.test.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/text_to_gloss.train.jsonl.gz
Binary file modified processed/rwth_phoenix2014_t/text_to_gloss.validation.jsonl.gz
Binary file modified processed/signbank_plus/signwriting_to_text.test.jsonl.gz
Binary file modified processed/signbank_plus/signwriting_to_text.train.jsonl.gz
Binary file modified processed/signbank_plus/signwriting_to_text.validation.jsonl.gz
Binary file modified processed/signbank_plus/text_to_signwriting.test.jsonl.gz
Binary file modified processed/signbank_plus/text_to_signwriting.train.jsonl.gz
Binary file modified processed/signbank_plus/text_to_signwriting.validation.jsonl.gz
13 changes: 13 additions & 0 deletions sign_gpt/custom_datasets/dataset_utils.py
@@ -0,0 +1,13 @@
from copy import deepcopy


def format_task(task, params):
    """Recursively fill every string value in a nested task template via str.format(**params)."""
    task = deepcopy(task)
    for key, value in task.items():
        if isinstance(value, str):
            task[key] = value.format(**params)
        elif isinstance(value, list):
            task[key] = [format_task(v, params) for v in value]
        elif isinstance(value, dict):
            task[key] = format_task(value, params)
    return task
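
A usage sketch (template and parameter values invented for illustration) showing how `format_task` recursively fills a nested template:

```python
from sign_gpt.custom_datasets.dataset_utils import format_task

task = {
    "system": "Translate {signed_language} glosses into {spoken_language}.",
    "messages": [{"input": "{gloss}", "output": "{text}"}],
}
params = {
    "signed_language": "German Sign Language",
    "spoken_language": "German",
    "gloss": "HEUTE REGEN",
    "text": "Heute regnet es.",
}

print(format_task(task, params))
# {'system': 'Translate German Sign Language glosses into German.',
#  'messages': [{'input': 'HEUTE REGEN', 'output': 'Heute regnet es.'}]}
```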
22 changes: 17 additions & 5 deletions sign_gpt/custom_datasets/dgs_corpus.py
@@ -6,10 +6,11 @@
from sign_language_datasets.datasets.dgs_corpus import DgsCorpusConfig
from tqdm import tqdm

from sign_gpt.custom_datasets.dataset_utils import format_task
from sign_gpt.language_utils.i18n import i18n

DATASET_NAME = "dgs_corpus"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = DgsCorpusConfig(name="only-annotations-sentence-level-uzh", version="1.0.0",
@@ -18,8 +19,20 @@
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
    "gloss_to_text": "Given a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus, translate it into a natural {spoken_language} sentence.\nInput: {gloss}\nOutput: {text}",
    "text_to_gloss": "Given a {spoken_language} sentence, convert it into a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus.\nInput: {text}\nOutput: {gloss}",
    "gloss_to_text": {
        "system": "Given a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus, translate it into a natural {spoken_language} sentence.",
        "messages": [{
            "input": "{gloss}",
            "output": "{text}",
        }]
    },
    "text_to_gloss": {
        "system": "Given a {spoken_language} sentence, convert it into a sequence of {signed_language}, {spoken_language} glosses and (mouthings) following the style and conventions of The DGS Corpus.",
        "messages": [{
            "input": "{text}",
            "output": "{gloss}",
        }]
    }
}


@@ -77,8 +90,7 @@ def build_gloss_text(glosses: list[str],

        for params in params_list:
            for task, file in split_files.items():
                instruction_text = TASKS[task].format(**params)
                file.write(json.dumps({"text": instruction_text}) + "\n")
                file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

    for file in split_files.values():
        file.close()
22 changes: 17 additions & 5 deletions sign_gpt/custom_datasets/dgs_types.py
@@ -5,19 +5,32 @@
import tensorflow_datasets as tfds
from sign_language_datasets.datasets.config import SignDatasetConfig

from sign_gpt.custom_datasets.dataset_utils import format_task
from sign_gpt.language_utils.i18n import i18n

DATASET_NAME = "dgs_types"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = SignDatasetConfig(name="only-annotations", version="1.0.0",
                           include_video=False, process_video=False, include_pose=None)
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
    "hamnosys_to_gloss": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into a {spoken_language} gloss according to the DGS Corpus.\nInput: {hamnosys}\nOutput: {gloss}",
    "gloss_to_hamnosys": "Given a {spoken_language} gloss according to the DGS Corpus, translate it into HamNoSys notation in {signed_language}.\nInput: {gloss}\nOutput: {hamnosys}",
    "hamnosys_to_gloss": {
        "system": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into a {spoken_language} gloss according to the DGS Corpus.",
        "messages": [{
            "input": "{hamnosys}",
            "output": "{gloss}",
        }]
    },
    "gloss_to_hamnosys": {
        "system": "Given a {spoken_language} gloss according to the DGS Corpus, translate it into HamNoSys notation in {signed_language}.",
        "messages": [{
            "input": "{gloss}",
            "output": "{hamnosys}",
        }]
    }
}

for split, split_data in dataset.items():
@@ -37,8 +50,7 @@
        }

        for task, file in split_files.items():
            instruction_text = TASKS[task].format(**params)
            file.write(json.dumps({"text": instruction_text}) + "\n")
            file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

    for file in split_files.values():
        file.close()
22 changes: 17 additions & 5 deletions sign_gpt/custom_datasets/dicta_sign.py
@@ -5,19 +5,32 @@
import tensorflow_datasets as tfds
from sign_language_datasets.datasets.config import SignDatasetConfig

from sign_gpt.custom_datasets.dataset_utils import format_task
from sign_gpt.language_utils.i18n import i18n
from sign_gpt.language_utils.info import sign_language_by_abbreviation

DATASET_NAME = "dicta_sign"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = SignDatasetConfig(name="only-annotations", version="1.0.0", include_video=False, include_pose=None)
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
    "hamnosys_to_text": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into {spoken_language} text.\nInput: {hamnosys}\nOutput: {text}",
    "text_to_hamnosys": "Given a sequence of {spoken_language} text, translate it into HamNoSys notation in {signed_language}.\nInput: {text}\nOutput: {hamnosys}",
    "hamnosys_to_text": {
        "system": "Given a sequence of HamNoSys notation for a sign in {signed_language}, translate it into {spoken_language} text.",
        "messages": [{
            "input": "{hamnosys}",
            "output": "{text}",
        }]
    },
    "text_to_hamnosys": {
        "system": "Given a sequence of {spoken_language} text, translate it into HamNoSys notation in {signed_language}.",
        "messages": [{
            "input": "{text}",
            "output": "{hamnosys}",
        }]
    }
}

for split, split_data in dataset.items():
@@ -38,8 +51,7 @@
"gloss": datum['gloss'].numpy().decode('utf-8'),
}
for task, file in split_files.items():
instruction_text = TASKS[task].format(**params)
file.write(json.dumps({"text": instruction_text}) + "\n")
file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

for file in split_files.values():
file.close()
25 changes: 19 additions & 6 deletions sign_gpt/custom_datasets/rwth_phoenix2014_t.py
@@ -1,20 +1,34 @@
import json
import gzip
import json
from pathlib import Path

import tensorflow_datasets as tfds
from sign_language_datasets.datasets.config import SignDatasetConfig

from sign_gpt.custom_datasets.dataset_utils import format_task

DATASET_NAME = "rwth_phoenix2014_t"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

config = SignDatasetConfig(name="only-annotations", version="3.0.0", include_video=False)
dataset = tfds.load(name=DATASET_NAME, builder_kwargs=dict(config=config))

TASKS = {
    "gloss_to_text": "Given a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset, convert it into a natural German sentence, lowercased.\nInput: {gloss}\nOutput: {text}",
    "text_to_gloss": "Given a German sentence, convert it into a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset.\nInput: {text}\nOutput: {gloss}",
    "gloss_to_text": {
        "system": "Given a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset, convert it into a natural German sentence, lowercased.",
        "messages": [{
            "input": "{gloss}",
            "output": "{text}",
        }]
    },
    "text_to_gloss": {
        "system": "Given a German sentence, convert it into a sequence of German Sign Language glosses following the style and conventions of the RWTH-PHOENIX-Weather 2014 T dataset.",
        "messages": [{
            "input": "{text}",
            "output": "{gloss}",
        }]
    }
}

for split, split_data in dataset.items():
@@ -29,8 +43,7 @@
"text": datum['text'].numpy().decode('utf-8')
}
for task, file in split_files.items():
instruction_text = TASKS[task].format(**params)
file.write(json.dumps({"text": instruction_text}) + "\n")
file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

for file in split_files.values():
file.close()
23 changes: 17 additions & 6 deletions sign_gpt/custom_datasets/signbank_plus.py
@@ -6,12 +6,13 @@

from tqdm import tqdm

from sign_gpt.custom_datasets.dataset_utils import format_task
from sign_gpt.language_utils.i18n import i18n

csv.field_size_limit(2 ** 20) # Increase limit to 1MB (2^20 characters)

DATASET_NAME = "signbank_plus"
DATA_PATH = Path(f"processed/{DATASET_NAME}")
DATA_PATH = Path(__file__).parent.parent.parent / "processed" / DATASET_NAME
DATA_PATH.mkdir(parents=True, exist_ok=True)

data_dir = Path("/Users/amitmoryossef/dev/sign-language-processing/signbank-annotation/signbank-plus/data/parallel")
@@ -20,7 +21,6 @@

paths = {
    "train": {
        "expanded": data_dir / "expanded" / "train.csv",
        "cleaned": data_dir / "cleaned" / "train.csv",
        "more": data_dir / "more" / "train.csv",
    },
@@ -33,8 +33,20 @@
}

TASKS = {
    "signwriting_to_text": "Given a sequence of SignWriting in {signed_language} ({data_type}), translate it into {spoken_language}.\nInput: {signwriting}\nOutput: {text}",
    "text_to_signwriting": "Given {spoken_language} text, translate it into {signed_language} using SignWriting ({data_type}).\nInput: {text}\nOutput: {signwriting}",
    "signwriting_to_text": {
        "system": "Given a sequence of SignWriting in {signed_language}, translate it into {spoken_language}.",
        "messages": [{
            "input": "{signwriting}",
            "output": "{text}",
        }]
    },
    "text_to_signwriting": {
        "system": "Given {spoken_language} text, translate it into {signed_language} using SignWriting.",
        "messages": [{
            "input": "{text}",
            "output": "{signwriting}",
        }]
    }
}

for split, split_data in paths.items():
@@ -58,8 +70,7 @@
"signwriting": ' '.join(signs)
}
for task, file in split_files.items():
instruction_text = TASKS[task].format(**params)
file.write(json.dumps({"text": instruction_text}) + "\n")
file.write(json.dumps(format_task(TASKS[task], params)) + "\n")

for file in split_files.values():
file.close()
8 changes: 4 additions & 4 deletions sign_gpt/train_lora.py
@@ -12,10 +12,10 @@


def prep_llama3_instruction(datum):
    text = datum['text']
    text = "<|start_header_id|>system<|end_header_id|>\n\n" + text + "<|eot_id|>"
    text = text.replace("\nInput: ", "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n")
    text = text.replace("\nOutput: ", "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")
    text = "<|start_header_id|>system<|end_header_id|>\n\n" + datum['system'] + "<|eot_id|>"
    for message in datum['messages']:
        text += "<|start_header_id|>user<|end_header_id|>\n\n" + message['input'] + "<|eot_id|>"
        text += "<|start_header_id|>assistant<|end_header_id|>\n\n" + message['output'] + "<|eot_id|>"
    return {"text": text}
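
For illustration, a record in the new `system`/`messages` schema (sample values invented here) renders to the following Llama 3 chat string:

```python
datum = {
    "system": "Translate glosses into German.",
    "messages": [{"input": "HEUTE REGEN", "output": "Heute regnet es."}],
}
print(prep_llama3_instruction(datum)["text"])
# <|start_header_id|>system<|end_header_id|>
#
# Translate glosses into German.<|eot_id|><|start_header_id|>user<|end_header_id|>
#
# HEUTE REGEN<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
# Heute regnet es.<|eot_id|>
```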

