Modalities · lhahn-iis · Jan 22, 2024 · Jan 29, 2024 · Jan 30, 2024 · Jan 30, 2024
diff --git a/benchmarks/dataloader/README.md b/benchmarks/dataloader/README.md
@@ -0,0 +1,68 @@
+# Benchmarking of Dataset Implementations
+
+## Motivation
+We want to include a storage-efficient, fast, and generic dataset implementation in this repository.
+Previous work and ideas were based on MegatronLM and its dataset implementation.
+
+Unfortunately, its usage is rather opaque and regularly causes unexpected side effects.
+Those problems are hard to trace, as we are not the original authors of the code.
+
+Therefore, we want to provide our own implementation, which comes with all the above-mentioned benefits.
+Most importantly, it should be at least as fast as MegatronLM's implementation.
+
+
+## Benchmark Overview
+
+We want to evaluate multiple aspects of the dataset implementations:
+* preparation speed - All datasets need some initial steps like tokenization and indexing.
+* initialization speed - Instantiating the respective `Dataset` object in code.
+* iteration speed - Accessing all elements of the respective dataset, once sequentially and once in random order.
+
+
+## Used Example Dataset
+
+The experiments were conducted on a small sample of openwebtext. The data is provided in `.jsonl` format.
+The relevant content of each line is stored under the `"text"` key and is text-only.
+A dataset with X samples consists of the first X lines of the full openwebtext data,
+ as it can be obtained from Hugging Face.
+
+
+## Experimental Setup
+
+We relied on the functions provided in `launch_benchmark.sh`. The measurements can be reproduced by running e.g.:
+
+```shell
+. launch_benchmark.sh
+
+INPUT_DIR=<path-to-your-example-dataset.jsonl>
+
+echo "MegatronLM:"
+measure_megatronLM_iteration
+echo "Modalities:"
+measure_modalities_iteration
+```
+
+> For launching the preparation of MegatronLM's dataset, refer to:
+> https://github.com/OpenGPTX/opengptx_data/tree/docs/modalities-vs-megatronlm-dl and look at the `launch_benchmark.sh`
+> script.
+
+
+## Results
+
+
+| Evaluation Aspect    | Implementation | Required Time | # Samples in Data |
+|----------------------|----------------|---------------|-------------------|
+| preparation speed    | MegatronLM     | `0m16,965s`   | `20000(OWT)`      |
+| preparation speed    | Modalities     | `0m18,952s`   | `20000(OWT)`      |
+| preparation speed    | MegatronLM     | `2m11,856s`   | `200000(OWT)`     |
+| preparation speed    | Modalities     | `1m42,943s`   | `200000(OWT)`     |
+| initialization speed | MegatronLM     | `19.3 msec`   | `20000(OWT)`      |
+| initialization speed | Modalities     | `5.85 msec`   | `20000(OWT)`      |
+| initialization speed | MegatronLM     | `180 msec`    | `200000(OWT)`     |
+| initialization speed | Modalities     | `58 msec`     | `200000(OWT)`     |
+| iteration speed      | MegatronLM     | `52.4 msec`   | `20000(OWT)`      |
+| iteration speed      | Modalities     | `66.8 msec`   | `20000(OWT)`      |
+| iteration speed      | MegatronLM     | `426 msec`    | `200000(OWT)`     |
+| iteration speed      | Modalities     | `545 msec`    | `200000(OWT)`     |
+
+
diff --git a/benchmarks/dataloader/launch_benchmark.sh b/benchmarks/dataloader/launch_benchmark.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+
+
+
+INPUT_DIR="/tmp/i-do-not-exist.jsonl"
+
+
+# Time the full Modalities data preparation (index creation followed by packing)
+# for the .jsonl file referenced by INPUT_DIR.
+# Existing .idx/.pbin files next to the input are removed before each step.
+measure_modalities_preparation() {
+    time (
+        # abort the timed subshell on the first failing step
+        set -e
+        # quoted, so that an empty INPUT_DIR fails this check instead of passing it
+        test -f "$INPUT_DIR"
+        # strip only a trailing ".jsonl": for a path without that suffix the target
+        # must never resolve to the input file itself and get deleted
+        rm -f "${INPUT_DIR%.jsonl}.idx"
+        modalities create_memmap_index "$INPUT_DIR" &> /dev/null
+        echo "finished memmap index creation"
+        rm -f "${INPUT_DIR%.jsonl}.pbin"
+        modalities create_packed_data "$INPUT_DIR" &> /dev/null
+        echo "finished memmap packing"
+    )
+}
+
+
+measure_modalities_initialization() {
+  input_file=${INPUT_DIR/.jsonl/.pbin}
+  python -m timeit -n 50 -r 5 -s "
+import sys, io
+null_device = io.StringIO()
+from modalities.dataloader.dataset import PackedMemMapDatasetMegatron
+from pathlib import Path
+p = Path(\"${input_file}\")
+  " -- "
+sys.stdout = null_device  # deactivate stdout to avoid getting spammed
+PackedMemMapDatasetMegatron(raw_data_path=p, block_size=1024, sample_key=\"sample\")
+sys.stdout = sys.__stdout__  # reactivate stdout for timeit
+"
+}
+
+measure_megatronLM_initialization() {
+  input_file="${INPUT_DIR/.jsonl/.megLM.bin_text_document}"
+  python -m timeit -n 50 -r 5 -s "
+import sys, io
+null_device = io.StringIO()
+from modalities.dataloader.open_gptx_dataset.mmap_dataset import MMapIndexedDataset
+p = \"${input_file}\"
+  " -- "
+sys.stdout = null_device  # deactivate stdout to avoid getting spammed
+MMapIndexedDataset(p)
+sys.stdout = sys.__stdout__  # reactivate stdout for timeit
+"
+}
+
+measure_modalities_iteration() {
+  input_file=${INPUT_DIR/.jsonl/.pbin}
+  python -m timeit -n 5 -r 3 -s "
+import random, sys, io
+null_device = io.StringIO()
+from modalities.dataloader.dataset import PackedMemMapDatasetMegatron
+from pathlib import Path
+p = Path(\"${input_file}\")
+sys.stdout = null_device  # deactivate stdout to avoid getting spammed
+dataset = PackedMemMapDatasetMegatron(raw_data_path=p, block_size=1024, sample_key=\"sample\")
+random_indices = random.sample(range(len(dataset)), len(dataset))
+sys.stdout = sys.__stdout__  # reactivate stdout for timeit
+  " -- "
+list(dataset)  # sequential access
+for i in random_indices:
+  dataset[i]
+"
+}
+
+
+measure_megatronLM_iteration() {
+  input_file="${INPUT_DIR/.jsonl/.megLM.bin_text_document}"
+  python -m timeit -n 5 -r 3 -s "
+import random, sys, io
+null_device = io.StringIO()
+from modalities.dataloader.open_gptx_dataset.mmap_dataset import MMapIndexedDataset
+p = \"${input_file}\"
+sys.stdout = null_device  # deactivate stdout to avoid getting spammed
+dataset = MMapIndexedDataset(p)
+random_indices = random.sample(range(len(dataset)), len(dataset))
+sys.stdout = sys.__stdout__  # reactivate stdout for timeit
+  " -- "
+list(dataset)  # sequential access
+for i in random_indices:
+  dataset[i]
+"
+}
+
+
+echo "MegatronLM:"
+measure_megatronLM_iteration
+echo "Modalities:"
+measure_modalities_iteration
diff --git a/src/modalities/__main__.py b/src/modalities/__main__.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import logging
+import os
 from pathlib import Path
 from typing import Dict, List, Tuple
 
@@ -137,15 +138,26 @@ def entry_point_create_memmap_index(src_path, index_path):
     default=".text",
     help="jq pattern to extract the data from the json line.",
 )
-def entry_point_create_packed_data(src_path, dst_path, index_path, tokenizer_type, tokenizer_file, jq_pattern):
+@click.option(
+    "--num-cpus",
+    type=int,
+    show_default=True,
+    default=os.cpu_count(),
+    help="Specify the number of tokenization workers. Default is the number of available CPUs.",
+)
+def entry_point_create_packed_data(
+    src_path, dst_path, index_path, tokenizer_type, tokenizer_file, jq_pattern, num_cpus
+):
     # TODO: if we want to use alternative entrypoints together with the ResolverRegistry,
     #  we can currently not rely on the existing class resolver.
     #  This is based on its connection to the overall `AppConfig`.
     #  One would requires an object of it to instantiate the ResolverRegistry.
     #  This could get resolved by implementing on own ResolverRegistry for each entrypoint or adapting the existing
     #  ResolverRegistry to work dynamically with any type-hinted config object from config.py.
     tokenizer = tokenizer_type.value(tokenizer_file=str(tokenizer_file))
-    generator = PackedDataGenerator(src_path, index_path=index_path, tokenizer=tokenizer, jq_pattern=jq_pattern)
+    generator = PackedDataGenerator(
+        src_path, index_path=index_path, tokenizer=tokenizer, jq_pattern=jq_pattern, number_of_processes=num_cpus
+    )
     generator.run(dst_path)
 
 

diff --git a/src/modalities/constants.py b/src/modalities/constants.py
@@ -0,0 +1,2 @@
+# Not relying on "utf8" after encountering encoding issues when using OpenGPT-X data.
+# "iso-8859-1" maps every possible byte to exactly one character, so decoding never fails.
+DEFAULT_ENCODING = "iso-8859-1"
diff --git a/src/modalities/dataloader/create_index.py b/src/modalities/dataloader/create_index.py
@@ -6,16 +6,17 @@
 import warnings
 from pathlib import Path
 
-import numpy as np
 from tqdm import tqdm
 
+from modalities.constants import DEFAULT_ENCODING
+
 
 # TODO: benchmark against pyspark
 class IndexGenerator:
     def __init__(self, src_file: Path, chunksize: int = 4096, drop_faulty_entries: bool = False):
         """
         Reads in a JSON file as a binary file, iterates character by character und builds up
-        the sample index (char-wisestart and end position for each JSON sample) via "\n" character positions.
+        the sample index (char-wise start and end position for each JSON sample) via "\n" character positions.
 
         :param src_file: Path to a jsonl-file.
         :param chunksize: defines the size of byte chunks that are processed via a producer-consumer approach.
@@ -59,16 +60,16 @@ def queue_generator():
         def process_line(last_index: int, curr_index: int):
             segment_len = curr_index - last_index
             try:  # check if line is a valid json
-                line = np.memmap(self.src_file, mode="r", offset=last_index, shape=(segment_len,)).view("S1").tolist()
-                line = [c.decode("utf8") for c in line]
-                line = "".join(line)
-                json.loads(line)
+                f = self.src_file.open(encoding=DEFAULT_ENCODING)
+                f.seek(last_index)
+                decoded_line = f.read(segment_len)
+                json.loads(decoded_line)
                 self._index_map.append((last_index, segment_len))
             except Exception as low_level_err:
                 if self.drop_faulty_entries:
                     warnings.warn(f"faulty line at {last_index}-{curr_index}, skipping...")
                 else:
-                    warnings.warn(f"faulty line: {line=}")
+                    warnings.warn(f"faulty line: {decoded_line}")
                     err = ValueError(f"faulty line at {last_index}-{curr_index}")
                     err.__cause__ = low_level_err
                     self._exception_buffer.append(err)
@@ -78,15 +79,15 @@ def process_line(last_index: int, curr_index: int):
         for chunk_idx, chunk in tqdm(enumerate(queue_generator()), desc="Processed Chunks", total=self.num_chunks):
             for char_index, c in enumerate(chunk):
                 curr_index = chunk_idx * self.chunksize + char_index
-                if c == ord("\n"):
+                if c == "\n":
                     process_line(last_index, curr_index)
                     last_index = curr_index + 1
         # prevents automatically added "\n"-chars at the end of files getting interpreted as own sample
         if curr_index >= last_index:
             process_line(last_index, curr_index + 1)
 
     def _reader_thread(self):
-        with open(self.src_file, "rb") as fin:
+        with open(self.src_file, "r", encoding=DEFAULT_ENCODING) as fin:
             while True:
                 chunk = fin.read(self.chunksize)
                 if self._exception_buffer:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Not relying on "utf8" after encountering encoding issues when using OpenGPT-X Data.
		DEFAULT_ENCODING = "iso-8859-1"