From 6042b13bc08eb12e4c9b73b236285657af0e152d Mon Sep 17 00:00:00 2001
From: pgmpablo157321
Date: Thu, 7 Sep 2023 17:29:12 -0500
Subject: [PATCH] Add HPC 3.0 missing files

Add the hpc_3.0.0 RCP data files consumed by rcp_checker for the
cosmoflow, deepcam and oc20 benchmarks, plus the
scripts/verify_for_v3.0_hpc.sh wrapper that runs the package checker,
result summarizer and repo checker against a submission directory.
---
 .../rcp_checker/hpc_3.0.0/rcps_cosmoflow.json |  82 ++++++++++++
 .../rcp_checker/hpc_3.0.0/rcps_deepcam.json   | 119 ++++++++++++++++++
 .../rcp_checker/hpc_3.0.0/rcps_oc20.json      |  41 ++++++
 scripts/verify_for_v3.0_hpc.sh                |  18 +++
 4 files changed, 260 insertions(+)
 create mode 100644 mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json
 create mode 100644 mlperf_logging/rcp_checker/hpc_3.0.0/rcps_deepcam.json
 create mode 100644 mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json
 create mode 100755 scripts/verify_for_v3.0_hpc.sh

diff --git a/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json
new file mode 100644
index 00000000..54949099
--- /dev/null
+++ b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json
@@ -0,0 +1,82 @@
+{
+  "cosmoflow_ref_32": {
+    "Benchmark": "cosmoflow",
+    "BS": 32,
+    "Epochs to converge": [ 8, 8, 9, 8, 6, 8, 6, 6, 7, 8, 8, 8, 9, 8, 7, 8, 6, 9, 8, 8, 8, 8, 9, 8, 9, 8, 8, 8, 8, 8, 8, 8, 8 ],
+    "Hyperparams": {
+      "global_batch_size": 32,
+      "opt_name": "SGD",
+      "opt_base_learning_rate": 0.001,
+      "opt_learning_rate_warmup_epochs": 2,
+      "opt_learning_rate_warmup_factor": 4.0,
+      "opt_learning_rate_decay_boundary_epochs": [ 4, 6 ],
+      "opt_learning_rate_decay_factor": [ 0.25, 0.0625 ],
+      "dropout": 0.5,
+      "opt_weight_decay": 0.0
+    }
+  },
+  "cosmoflow_ref_64": {
+    "Benchmark": "cosmoflow",
+    "BS": 64,
+    "Epochs to converge": [ 19, 18, 20, 17, 18, 19, 18, 18, 18, 18, 18, 18, 18, 19, 18, 18, 18, 18, 18, 20 ],
+    "Hyperparams": {
+      "global_batch_size": 64,
+      "opt_name": "SGD",
+      "opt_base_learning_rate": 0.001,
+      "opt_learning_rate_warmup_epochs": 4,
+      "opt_learning_rate_warmup_factor": 1.0,
+      "opt_learning_rate_decay_boundary_epochs": [ 16, 32 ],
+      "opt_learning_rate_decay_factor": 0.25,
+      "dropout": 0.0,
+      "opt_weight_decay": 0.01
+    }
+  },
+  "cosmoflow_ref_128": {
+    "Benchmark": "cosmoflow",
+    "BS": 128,
+    "Epochs to converge": [ 16, 18, 18, 18, 18, 17, 18, 18, 18, 18, 18, 18, 17, 18, 18, 18, 19, 18, 18, 18 ],
+    "Hyperparams": {
+      "global_batch_size": 128,
+      "opt_name": "SGD",
+      "opt_base_learning_rate": 0.004,
+      "opt_learning_rate_warmup_epochs": 4,
+      "opt_learning_rate_warmup_factor": 4.0,
+      "opt_learning_rate_decay_boundary_epochs": [ 16, 32 ],
+      "opt_learning_rate_decay_factor": 0.25,
+      "dropout": 0.5,
+      "opt_weight_decay": 0.0
+    }
+  },
+  "cosmoflow_ref_512": {
+    "Benchmark": "cosmoflow",
+    "BS": 512,
+    "Epochs to converge": [ 22, 23, 23, 24, 24, 21, 23, 23, 23, 23, 24, 23, 23, 21, 23, 23, 21, 23, 23, 23 ],
+    "Hyperparams": {
+      "global_batch_size": 512,
+      "opt_name": "SGD",
+      "opt_base_learning_rate": 0.006,
+      "opt_learning_rate_warmup_epochs": 6,
+      "opt_learning_rate_warmup_factor": 4.0,
+      "opt_learning_rate_decay_boundary_epochs": [ 19, 21, 22, 23 ],
+      "opt_learning_rate_decay_factor": [ 0.5, 0.25, 0.125, 0.0625 ],
+      "dropout": 0.0,
+      "opt_weight_decay": 0.01
+    }
+  },
+  "cosmoflow_ref_1024": {
+    "Benchmark": "cosmoflow",
+    "BS": 1024,
+    "Epochs to converge": [ 42, 38, 42, 40, 40, 39, 43, 37, 39, 37, 43, 39, 38, 42, 40, 42, 42, 38, 36, 43 ],
+    "Hyperparams": {
+      "global_batch_size": 1024,
+      "opt_name": "SGD",
+      "opt_base_learning_rate": 0.012,
+      "opt_learning_rate_warmup_epochs": 0,
+      "opt_learning_rate_warmup_factor": 1.0,
+      "opt_learning_rate_decay_boundary_epochs": [ 32, 64 ],
+      "opt_learning_rate_decay_factor": 0.25,
+      "dropout": 0.5,
+      "opt_weight_decay": 0.0
+    }
+  }
+}
diff --git a/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_deepcam.json b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_deepcam.json
new file mode 100644
index 00000000..bcc32625
--- /dev/null
+++ b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_deepcam.json
@@ -0,0 +1,119 @@
+{
+  "deepcam_ref_128": {
+    "Benchmark": "deepcam",
+    "BS": 128,
+    "Epochs to converge": [ 7, 7, 7, 7, 7, 7, 7, 8, 8, 6, 7, 8, 7, 6, 6, 7, 7, 6, 7, 7 ],
+    "Hyperparams": {
+      "batchnorm_groupsize": 1,
+      "global_batch_size": 128,
+      "gradient_accumulation_frequency": 1,
+      "num_workers": 32,
+      "opt_betas": [ 0.9, 0.999 ],
+      "opt_bias_correction": true,
+      "opt_eps": 1e-06,
+      "opt_grad_averaging": true,
+      "opt_lr": 0.00155,
+      "opt_max_grad_norm": 1.0,
+      "opt_weight_decay": 0.01,
+      "opt_name": "LAMB",
+      "scheduler_decay_rate": 0.1,
+      "scheduler_lr_warmup_factor": 1.0,
+      "scheduler_lr_warmup_steps": 0,
+      "scheduler_t_max": 9000,
+      "scheduler_eta_min": 0.0,
+      "scheduler_type": "cosine_annealing"
+    }
+  },
+  "deepcam_ref_256": {
+    "Benchmark": "deepcam",
+    "BS": 256,
+    "Epochs to converge": [ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 ],
+    "Hyperparams": {
+      "batchnorm_groupsize": 1,
+      "global_batch_size": 256,
+      "gradient_accumulation_frequency": 1,
+      "opt_betas": [ 0.9, 0.999 ],
+      "opt_bias_correction": true,
+      "opt_eps": 1e-06,
+      "opt_grad_averaging": true,
+      "opt_lr": 0.002,
+      "opt_max_grad_norm": 1.0,
+      "opt_weight_decay": 0.01,
+      "opt_name": "LAMB",
+      "scheduler_decay_rate": 0.1,
+      "scheduler_lr_warmup_factor": 1.0,
+      "scheduler_lr_warmup_steps": 0,
+      "scheduler_milestones": [ 4096, 8192 ],
+      "scheduler_type": "multistep"
+    }
+  },
+  "deepcam_ref_512": {
+    "Benchmark": "deepcam",
+    "BS": 512,
+    "Epochs to converge": [ 11, 11, 11, 11, 11, 10, 16, 12, 11, 11, 11, 10, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 11, 13, 10, 11, 13, 11, 11, 11, 10, 14, 10, 10, 10 ],
+    "Hyperparams": {
+      "batchnorm_groupsize": 1,
+      "global_batch_size": 512,
+      "gradient_accumulation_frequency": 1,
+      "opt_betas": [ 0.9, 0.999 ],
+      "opt_bias_correction": true,
+      "opt_eps": 1e-06,
+      "opt_grad_averaging": true,
+      "opt_lr": 0.004,
+      "opt_max_grad_norm": 1.0,
+      "opt_weight_decay": 0.01,
+      "opt_name": "LAMB",
+      "scheduler_decay_rate": 0.1,
+      "scheduler_lr_warmup_factor": 1.0,
+      "scheduler_lr_warmup_steps": 100,
+      "scheduler_milestones": [ 2048, 4096 ],
+      "scheduler_type": "multistep"
+    }
+  },
+  "deepcam_ref_1024": {
+    "Benchmark": "deepcam",
+    "BS": 1024,
+    "Epochs to converge": [ 13, 13, 12, 13, 13, 13, 14, 13, 13, 13 ],
+    "Hyperparams": {
+      "batchnorm_groupsize": 1,
+      "global_batch_size": 1024,
+      "gradient_accumulation_frequency": 1,
+      "opt_betas": [ 0.9, 0.999 ],
+      "opt_bias_correction": true,
+      "opt_eps": 1e-06,
+      "opt_grad_averaging": true,
+      "opt_lr": 0.004,
+      "opt_max_grad_norm": 1.0,
+      "opt_weight_decay": 0.01,
+      "opt_name": "LAMB",
+      "scheduler_decay_rate": 0.1,
+      "scheduler_lr_warmup_factor": 1.0,
+      "scheduler_lr_warmup_steps": 200,
+      "scheduler_milestones": [ 1100, 4096 ],
+      "scheduler_type": "multistep"
+    }
+  },
+  "deepcam_ref_2048": {
+    "Benchmark": "deepcam",
+    "BS": 2048,
+    "Epochs to converge": [ 23, 23, 22, 23, 22, 26, 22, 22, 24, 22 ],
+    "Hyperparams": {
+      "batchnorm_groupsize": 1,
+      "global_batch_size": 2048,
+      "gradient_accumulation_frequency": 1,
+      "opt_betas": [ 0.9, 0.999 ],
+      "opt_bias_correction": true,
+      "opt_eps": 1e-06,
+      "opt_grad_averaging": true,
+      "opt_lr": 0.0055,
+      "opt_max_grad_norm": 1.0,
+      "opt_weight_decay": 0.01,
+      "opt_name": "LAMB",
+      "scheduler_decay_rate": 0.1,
+      "scheduler_lr_warmup_factor": 1.0,
+      "scheduler_lr_warmup_steps": 400,
+      "scheduler_milestones": [ 800 ],
+      "scheduler_type": "multistep"
+    }
+  }
+}
diff --git a/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json
new file mode 100644
index 00000000..d42987a7
--- /dev/null
+++ b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json
@@ -0,0 +1,41 @@
+{
+  "oc20_ref_256": {
+    "Benchmark": "oc20",
+    "BS": 256,
+    "Epochs to converge": [ 20, 18, 20, 20, 22, 18, 19, 19, 21, 21 ],
+    "Hyperparams": {
+      "global_batch_size": 256,
+      "opt_base_learning_rate": 0.0004,
+      "opt_learning_rate_warmup_steps": 31252,
+      "opt_learning_rate_warmup_factor": 0.2,
+      "opt_learning_rate_decay_boundary_steps": [ 125008, 187512, 250016 ],
+      "opt_learning_rate_decay_factor": 0.1
+    }
+  },
+  "oc20_ref_1024": {
+    "Benchmark": "oc20",
+    "BS": 1024,
+    "Epochs to converge": [ 23, 25, 22, 25, 25, 25, 25, 24, 23, 25 ],
+    "Hyperparams": {
+      "global_batch_size": 1024,
+      "opt_base_learning_rate": 0.0012,
+      "opt_learning_rate_warmup_steps": 7816,
+      "opt_learning_rate_warmup_factor": 0.2,
+      "opt_learning_rate_decay_boundary_steps": [ 31264, 46896 ],
+      "opt_learning_rate_decay_factor": 0.1
+    }
+  },
+  "oc20_ref_2048": {
+    "Benchmark": "oc20",
+    "BS": 2048,
+    "Epochs to converge": [ 33, 32, 33, 33, 33, 34, 33, 33, 30, 33 ],
+    "Hyperparams": {
+      "global_batch_size": 2048,
+      "opt_base_learning_rate": 0.0016,
+      "opt_learning_rate_warmup_steps": 3908,
+      "opt_learning_rate_warmup_factor": 0.2,
+      "opt_learning_rate_decay_boundary_steps": [ 23448, 31264 ],
+      "opt_learning_rate_decay_factor": 0.1
+    }
+  }
+}
diff --git a/scripts/verify_for_v3.0_hpc.sh b/scripts/verify_for_v3.0_hpc.sh
new file mode 100755
index 00000000..9206ef56
--- /dev/null
+++ b/scripts/verify_for_v3.0_hpc.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Usage: verify_for_v3.0_hpc.sh <submission_dir>
+set -e
+
+# Optional package checker params (e.g. rcp_bypass, rcp_bert_train_samples) are read from the
+# package_checker_params file at the top level of the submission dir; each line gets a "--" prefix.
+PACKAGE_CHECKER_PARAMS=""
+PACKAGE_CHECKER_PARAMS_FILE="$1/package_checker_params"
+if test -f "$PACKAGE_CHECKER_PARAMS_FILE"; then
+  while IFS= read -r line || [ -n "$line" ]
+  do  # the "|| [ -n ... ]" test above keeps a final line that has no trailing newline
+    PACKAGE_CHECKER_PARAMS="$PACKAGE_CHECKER_PARAMS --$line"
+  done < "$PACKAGE_CHECKER_PARAMS_FILE"
+fi
+# $PACKAGE_CHECKER_PARAMS is deliberately left unquoted so every "--param" becomes its own argument.
+python3 -m mlperf_logging.package_checker "$1" hpc 3.0.0 $PACKAGE_CHECKER_PARAMS
+python3 -m mlperf_logging.result_summarizer "$1" hpc 3.0.0
+python3 -m mlperf_logging.repo_checker "$1" hpc 3.0.0