Skip to content

Commit

Permalink
Add num_proc=4 to maps
Browse files: browse the repository at this point in the history
  • Loading branch information
tomaarsen committed Jan 9, 2024
1 parent 9cdc18a commit 1ba9b9a
Showing 1 changed file with 3 additions and 0 deletions.
3 changes: 3 additions & 0 deletions span_marker/trainer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -203,6 +203,7 @@ def preprocess_dataset(
input_columns=("tokens", "ner_tags"),
desc=f"Label normalizing the {dataset_name} dataset",
batched=True,
num_proc=4,
)

# Setting model card data based on training data
Expand Down Expand Up @@ -230,6 +231,7 @@ def preprocess_dataset(
remove_columns=set(dataset.column_names) - set(self.OPTIONAL_COLUMNS),
desc=f"Tokenizing the {dataset_name} dataset",
fn_kwargs={"return_num_words": is_evaluate},
num_proc=4,
)
# If "document_id" AND "sentence_id" exist in the training dataset
if {"document_id", "sentence_id"} <= set(dataset.column_names):
Expand Down Expand Up @@ -265,6 +267,7 @@ def preprocess_dataset(
"model_max_length": tokenizer.model_max_length,
"marker_max_length": self.model.config.marker_max_length,
},
num_proc=4,
)
new_length = len(dataset)
logger.info(
Expand Down

0 comments on commit 1ba9b9a

Please sign in to comment.