Add num_proc=4 to the dataset map calls in preprocess_dataset

tomaarsen · Jan 9, 2024 · 1ba9b9a
1 parent 9cdc18a
commit 1ba9b9a
Showing 1 changed file with 3 additions and 0 deletions.
diff --git a/span_marker/trainer.py b/span_marker/trainer.py
@@ -203,6 +203,7 @@ def preprocess_dataset(
             input_columns=("tokens", "ner_tags"),
             desc=f"Label normalizing the {dataset_name} dataset",
             batched=True,
+            num_proc=4,
         )
 
         # Setting model card data based on training data
@@ -230,6 +231,7 @@ def preprocess_dataset(
                 remove_columns=set(dataset.column_names) - set(self.OPTIONAL_COLUMNS),
                 desc=f"Tokenizing the {dataset_name} dataset",
                 fn_kwargs={"return_num_words": is_evaluate},
+                num_proc=4,
             )
         # If "document_id" AND "sentence_id" exist in the training dataset
         if {"document_id", "sentence_id"} <= set(dataset.column_names):
@@ -265,6 +267,7 @@ def preprocess_dataset(
                 "model_max_length": tokenizer.model_max_length,
                 "marker_max_length": self.model.config.marker_max_length,
             },
+            num_proc=4,
         )
         new_length = len(dataset)
         logger.info(