Add GNN training benchmark #359

Merged 4 commits on Mar 28, 2024
12 changes: 12 additions & 0 deletions mlperf_logging/benchmark_meta.py
@@ -15,6 +15,7 @@
'ncf': 10,
'rnnt': 10,
'unet3d': 40,
+ 'gnn' : 10,
},

'hpc' : {
@@ -108,6 +109,17 @@
'rnnt',
'unet3d',
'stable_diffusion'
],
+ '4.0': [
+ 'bert',
+ 'dlrm_dcnv2',
+ 'gpt3',
+ 'resnet',
+ 'ssd',
+ 'rnnt',
+ 'unet3d',
+ 'stable_diffusion',
+ 'gnn'
+ ]
},

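For context, benchmark_meta.py maps each benchmark to the number of result files a submission must provide and lists the benchmarks allowed per ruleset. A minimal lookup sketch follows; the dict and function names below are illustrative assumptions, not the actual benchmark_meta.py API.

# Illustrative sketch only; the real tables live in mlperf_logging/benchmark_meta.py,
# and RESULT_COUNTS / ALLOWED_BENCHMARKS / required_result_files are assumed names.
RESULT_COUNTS = {
    'training': {'unet3d': 40, 'gnn': 10},  # abridged
}
ALLOWED_BENCHMARKS = {
    'training': {
        '4.0': ['bert', 'dlrm_dcnv2', 'gpt3', 'resnet', 'ssd',
                'rnnt', 'unet3d', 'stable_diffusion', 'gnn'],
    },
}

def required_result_files(usage, benchmark):
    # How many result logs a submission must contain for this benchmark.
    return RESULT_COUNTS[usage][benchmark]

assert required_result_files('training', 'gnn') == 10
assert 'gnn' in ALLOWED_BENCHMARKS['training']['4.0']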
4 changes: 2 additions & 2 deletions mlperf_logging/compliance_checker/mlp_parser/__init__.py
@@ -27,6 +27,6 @@ def parse_file(filename, ruleset='0.6.0'):
elif ruleset == '3.1.0':
return parse_file_310(filename)
elif ruleset == '4.0.0':
- return parse_file_400(filename)
+ return parse_file_400(filename)
else:
- raise Exception(f'Ruleset "{ruleset}" is not supported')
+ raise Exception(f'Ruleset "{ruleset}" is not supported')
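A brief usage sketch for the dispatch above; the return shape shown here (parsed log lines plus parse errors) is an assumption about parse_file, and the log file name is a placeholder.

from mlperf_logging.compliance_checker.mlp_parser import parse_file

# Ruleset '4.0.0' is routed to parse_file_400; any other string raises.
loglines, errors = parse_file('result_0.txt', ruleset='4.0.0')  # placeholder file name

try:
    parse_file('result_0.txt', ruleset='9.9.9')
except Exception as e:
    print(e)  # Ruleset "9.9.9" is not supported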
training_4.0.0/closed_common.yaml
@@ -2,10 +2,10 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] "
POST: " enqueue_config('training_4.0.0/closed_{}.yaml'.format(v['value'])) "

- KEY:
NAME: gradient_accumulation_steps
Review comment:
We actually do not have this key in the reference implementation (mllog.constants.GRADIENT_ACCUMULATION_STEPS). Should we also include it in the reference branch? Or is it okay for us to remove it directly from the closed_common.yaml checker?

Contributor Author:
Let's include gradient accumulation in the reference, as we should not be modifying the common yaml.

Review comment:
Gradient accumulation is included in the reference.

REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "
CHECK: " v['value'] > 0 "
training_4.0.0/closed_gnn.yaml (new file)
@@ -0,0 +1,21 @@
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: opt_name
Review comment:
In the reference code, the optimizer name is not reported, but we do have a SEED exported at this line. Should we also report the optimizer name in the reference code? And should we check whether the seed is included in the log as well?

Contributor Author:
We should report the optimizer name in the reference. The seed check is part of the package_checker, and we don't need to include it here.

Review comment:
Added the optimizer name in the reference.

REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adam' "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0.0"

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0"
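As a rough illustration, reference log events like the following would satisfy the keys checked above. This is a hedged sketch using the mllog helper; the concrete values (batch size, learning rate, accuracy, epoch) are placeholders rather than prescribed hyperparameters.

from mlperf_logging import mllog

mllogger = mllog.get_mllogger()

# One event per key required by the closed GNN rules (values are placeholders).
mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE, value=4096)
mllogger.event(key=mllog.constants.OPT_NAME, value='adam')
mllogger.event(key=mllog.constants.OPT_BASE_LR, value=0.001)

# eval_accuracy must carry epoch_num metadata and eventually reach at least 0.72.
mllogger.event(key=mllog.constants.EVAL_ACCURACY, value=0.7253,
               metadata={mllog.constants.EPOCH_NUM: 1.0})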
training_4.0.0/open_common.yaml
@@ -2,5 +2,5 @@
- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] "
POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
training_4.0.0/open_gnn.yaml (new file)
@@ -0,0 +1,7 @@

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] < 1.0"
1 change: 1 addition & 0 deletions mlperf_logging/mllog/constants.py
@@ -50,6 +50,7 @@
UNET3D = "unet3d"
BERT = "bert"
GPT3 = "gpt3"
GNN = "gnn"

# Constant values - model info
ADAGRAD = "adagrad"
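The new constant lets the reference report the benchmark name symbolically; a short hedged sketch:

from mlperf_logging import mllog

mllogger = mllog.get_mllogger()
# Report the benchmark name using the newly added constant.
mllogger.event(key=mllog.constants.SUBMISSION_BENCHMARK, value=mllog.constants.GNN)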
1 change: 1 addition & 0 deletions mlperf_logging/rcp_checker/rcp_checker.py
@@ -28,6 +28,7 @@
'unet3d' : 40,
'rnnt': 10,
'stable_diffusion': 10,
+ 'gnn': 10,
},
"hpc": {
'cosmoflow': 10,
72 changes: 72 additions & 0 deletions mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json
@@ -0,0 +1,72 @@
{

"gnn_ref_4096":
{
"Benchmark": "gnn",
"Creator": "NVIDIA",
"When": "Reference RCPs before v4.0",
"Platform": "1xDGX-A100 and 8xDGX-A100",
"BS": 4096,
"Hyperparams": {
"opt_base_learning_rate": 0.001
},
"Epochs to converge": [
0.85,0.75,0.75,0.80,0.80,0.75,
0.75,0.85,0.75,0.75,0.80,0.80,
0.80,0.75,0.80,0.80,0.80,0.80,
0.80,0.85 ]
},

"gnn_ref_16384":
{
"Benchmark": "gnn",
"Creator": "NVIDIA",
"When": "Reference RCPs before v4.0",
"Platform": "8xDGX-A100",
"BS": 16384,
"Hyperparams": {
"opt_base_learning_rate": 0.002
},
"Epochs to converge": [
0.85,0.95,0.85,0.80,0.90,0.75,
0.80,0.90,0.90,0.85,0.90,0.85,
0.85,0.85,0.85,0.90,0.85,0.85,
0.85,0.90 ]
},

"gnn_ref_32768":
{
"Benchmark": "gnn",
"Creator": "Intel",
"When": "Reference RCPs before v4.0",
"Platform": "16xSPR-2S",
"BS": 32768,
"Hyperparams": {
"opt_base_learning_rate": 0.002
},
"Epochs to converge": [
1.00,0.95,0.90,0.95,0.95,1.00,
0.90,0.95,0.95,0.95,1.00,0.90,
0.95,0.95,0.95,0.90,0.95,0.90,
0.90,0.90 ]
},

"gnn_ref_65536":
{
"Benchmark": "gnn",
"Creator": "NVIDIA",
"When": "Reference RCPs before v4.0",
"Platform": "32xDGX-A100",
"BS": 65536,
"Hyperparams": {
"opt_base_learning_rate": 0.003
},
"Epochs to converge": [
1.25,1.20,1.25,1.20,1.15,1.15,
1.15,1.20,1.15,1.20,1.25,1.15,
1.20,1.20,1.15,1.25,1.20,1.15,
1.10,1.15
]
}
}
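To give a feel for how these reference convergence points are consumed, here is a simplified sketch that only computes the mean epochs to converge per batch size; the actual rcp_checker applies its own statistics (pruning, interpolation between batch sizes, and comparison against the submission's epochs).

import json

# Simplified illustration only; the real logic is in mlperf_logging/rcp_checker/rcp_checker.py.
with open('mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json') as f:
    rcps = json.load(f)

def mean_epochs(rcp_name):
    epochs = rcps[rcp_name]['Epochs to converge']
    return sum(epochs) / len(epochs)

# Mean reference epochs to converge at the smallest and largest batch sizes.
print(mean_epochs('gnn_ref_4096'))   # ~0.79
print(mean_epochs('gnn_ref_65536'))  # ~1.19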

1 change: 1 addition & 0 deletions mlperf_logging/result_summarizer/config.yaml
@@ -72,6 +72,7 @@ columns:
rnnt: ["Benchmark results (minutes)", "Speech recognition", "LibriSpeech", "RNN-T"]
unet3d: ["Benchmark results (minutes)", "Image segmentation (medical)", "KiTS19", "3D U-Net"]
stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"]
+ gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"]
default: [" ", " ", " "]

hpc: