From 27bf355dc5d620c5d1cf309637e37bbaacea3cbf Mon Sep 17 00:00:00 2001 From: drcanchi Date: Wed, 6 Mar 2024 11:12:57 -0800 Subject: [PATCH 1/3] Add GNN RCP --- mlperf_logging/rcp_checker/rcp_checker.py | 1 + .../rcp_checker/training_4.0.0/rcps_gnn.json | 72 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index c63cd57..9fd06ff 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -28,6 +28,7 @@ 'unet3d' : 40, 'rnnt': 10, 'stable_diffusion': 10, + 'gnn': 10, }, "hpc": { 'cosmoflow': 10, diff --git a/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json b/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json new file mode 100644 index 0000000..19d4430 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_4.0.0/rcps_gnn.json @@ -0,0 +1,72 @@ +{ + + "gnn_ref_4096": + { + "Benchmark": "gnn", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "1xDGX-A100 and 8xDGX-A100", + "BS": 4096, + "Hyperparams": { + "opt_base_learning_rate": 0.001 + }, + "Epochs to converge": [ + 0.85,0.75,0.75,0.80,0.80,0.75, + 0.75,0.85,0.75,0.75,0.80,0.80, + 0.80,0.75,0.80,0.80,0.80,0.80, + 0.80,0.85 ] + }, + + "gnn_ref_16384": + { + "Benchmark": "gnn", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "8xDGX-A100", + "BS": 16384, + "Hyperparams": { + "opt_base_learning_rate": 0.002 + }, + "Epochs to converge": [ + 0.85,0.95,0.85,0.80,0.90,0.75, + 0.80,0.90,0.90,0.85,0.90,0.85, + 0.85,0.85,0.85,0.90,0.85,0.85, + 0.85,0.90 ] + }, + + "gnn_ref_32768": + { + "Benchmark": "gnn", + "Creator": "Intel", + "When": "Reference RCPs before v4.0", + "Platform": "16xSPR-2S", + "BS": 32768, + "Hyperparams": { + "opt_base_learning_rate": 0.002 + }, + "Epochs to converge": [ + 1.00,0.95,0.90,0.95,0.95,1.00, + 
0.90,0.95,0.95,0.95,1.00,0.90, + 0.95,0.95,0.95,0.90,0.95,0.90, + 0.90,0.90 ] + }, + + "gnn_ref_65536": + { + "Benchmark": "gnn", + "Creator": "NVIDIA", + "When": "Reference RCPs before v4.0", + "Platform": "32xDGX-A100", + "BS": 65536, + "Hyperparams": { + "opt_base_learning_rate": 0.003 + }, + "Epochs to converge": [ + 1.25,1.20,1.25,1.20,1.15,1.15, + 1.15,1.20,1.15,1.20,1.25,1.15, + 1.20,1.20,1.15,1.25,1.20,1.15, + 1.10,1.15 + ] + } +} + From 8ebd5926039211f78b1c8d8b64bf8eb9cf8e86b3 Mon Sep 17 00:00:00 2001 From: drcanchi Date: Wed, 6 Mar 2024 11:45:20 -0800 Subject: [PATCH 2/3] Add GNN benchmark --- mlperf_logging/benchmark_meta.py | 12 ++++++++++++ mlperf_logging/mllog/constants.py | 1 + mlperf_logging/repo_checker/repo_checker.py | 4 ++-- mlperf_logging/result_summarizer/config.yaml | 12 +++++++++++- 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py index 1091b68..e97da9a 100644 --- a/mlperf_logging/benchmark_meta.py +++ b/mlperf_logging/benchmark_meta.py @@ -15,6 +15,7 @@ 'ncf': 10, 'rnnt': 10, 'unet3d': 40, + 'gnn' : 10, }, 'hpc' : { @@ -108,6 +109,17 @@ 'rnnt', 'unet3d', 'stable_diffusion' + ], + '4.0': [ + 'bert', + 'dlrm_dcnv2', + 'gpt3', + 'resnet', + 'ssd', + 'rnnt', + 'unet3d', + 'stable_diffusion', + 'gnn' ] }, diff --git a/mlperf_logging/mllog/constants.py b/mlperf_logging/mllog/constants.py index 7e4c169..4547518 100644 --- a/mlperf_logging/mllog/constants.py +++ b/mlperf_logging/mllog/constants.py @@ -50,6 +50,7 @@ UNET3D = "unet3d" BERT = "bert" GPT3 = "gpt3" +GNN = "gnn" # Constant values - model info ADAGRAD = "adagrad" diff --git a/mlperf_logging/repo_checker/repo_checker.py b/mlperf_logging/repo_checker/repo_checker.py index 2f9b1cf..f7494a3 100644 --- a/mlperf_logging/repo_checker/repo_checker.py +++ b/mlperf_logging/repo_checker/repo_checker.py @@ -127,8 +127,8 @@ def get_parser(): parser.add_argument( 'ruleset', type=str, - choices=['2.0.0', '2.1.0', '3.0.0', 
'3.1.0'], - help='the ruleset. 2.0.0, 2.1.0, 3.0.0 and 3.1.0 are currently supported.' + choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0'], + help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0 and 4.0.0 are currently supported.' ) parser.add_argument( '--log_output', diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index af01d5a..c801612 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -61,7 +61,17 @@ columns: unet3d: ["Benchmark results (minutes)", "Image segmentation (medical)", "KiTS19", "3D U-Net"] stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"] default: [" ", " ", " "] - + "4.0.0": + bert: ["Benchmark results (minutes)", "NLP", "Wikipedia", "BERT"] + gpt3: ["Benchmark results (minutes)", "LLM", "C4", "GPT3"] + dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] + resnet: ["Benchmark results (minutes)", "Image classification", "ImageNet", "ResNet"] + ssd: ["Benchmark results (minutes)", "Object detection, light-weight", "OpenImages", "RetinaNet"] + rnnt: ["Benchmark results (minutes)", "Speech recognition", "LibriSpeech", "RNN-T"] + unet3d: ["Benchmark results (minutes)", "Image segmentation (medical)", "KiTS19", "3D U-Net"] + stable_diffusion: ["Benchmark results (minutes)", "Text to image", "Laion 400m and Coco-2017", "StableDiffusion"] + gnn: ["Benchmark results (minutes)", "Graph node classification", "IGBH-Full", "R-GAT"] + default: [" ", " ", " "] hpc: "2.0.0": bert: ["Benchmark results (minutes)", "CosmoFlow"] From 596cb01a6ce95e41ec19067a10959ea18548bb35 Mon Sep 17 00:00:00 2001 From: drcanchi Date: Wed, 6 Mar 2024 12:30:39 -0800 Subject: [PATCH 3/3] GNN compliance check --- .../compliance_checker/mlp_parser/__init__.py | 3 + .../mlp_parser/ruleset_400.py | 105 ++++++++++++ .../training_4.0.0/closed_common.yaml | 11 ++ 
.../training_4.0.0/closed_gnn.yaml | 21 +++ .../training_4.0.0/common.yaml | 151 ++++++++++++++++++ .../training_4.0.0/open_common.yaml | 7 + .../training_4.0.0/open_gnn.yaml | 7 + 7 files changed, 305 insertions(+) create mode 100644 mlperf_logging/compliance_checker/mlp_parser/ruleset_400.py create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/common.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml create mode 100644 mlperf_logging/compliance_checker/training_4.0.0/open_gnn.yaml diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py index 94fb516..5b073d6 100644 --- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py +++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py @@ -6,6 +6,7 @@ from .ruleset_210 import parse_file as parse_file_210 from .ruleset_300 import parse_file as parse_file_300 from .ruleset_310 import parse_file as parse_file_310 +from .ruleset_400 import parse_file as parse_file_400 def parse_file(filename, ruleset='0.6.0'): @@ -25,5 +26,7 @@ def parse_file(filename, ruleset='0.6.0'): return parse_file_300(filename) elif ruleset == '3.1.0': return parse_file_310(filename) + elif ruleset == '4.0.0': + return parse_file_400(filename) else: raise Exception(f'Ruleset "{ruleset}" is not supported') diff --git a/mlperf_logging/compliance_checker/mlp_parser/ruleset_400.py b/mlperf_logging/compliance_checker/mlp_parser/ruleset_400.py new file mode 100644 index 0000000..e30b08d --- /dev/null +++ b/mlperf_logging/compliance_checker/mlp_parser/ruleset_400.py @@ -0,0 +1,105 @@ +''' +Parses a text MLPerf log into a structured format. 
+''' + +from __future__ import print_function + +import collections +import json +import re +import sys +from dataclasses import dataclass + +from io import open + +@dataclass +class LogLine: + """A single parsed MLPerf log line (one ':::MLLOG' record).""" + full_string: str + timestamp: float + key: str + value: str + lineno: int + +TOKEN = ':::MLLOG ' + + +def parse_line(line): + if not line.startswith(TOKEN): + return None + + return json.loads(line[len(TOKEN):]) + + +def string_to_logline(lineno, string): + ''' Returns a LogLine or raises a ValueError ''' + m = parse_line(string) + + if m is None: + raise ValueError('does not match regex') + + args = [] + args.append(string) # full string + + ts = float(m['time_ms']) # may raise error, e.g. "1.2.3" + # TODO check for weird values + args.append(ts) + + args.append(m['key']) # key + + j = { 'value': m['value'], 'metadata': m['metadata'] } + args.append(j) + + args.append(lineno) + return LogLine(*args) + + +def parse_file(filename): + ''' Reads a file by name and returns list of loglines and list of errors''' + with open(filename, encoding='latin-1') as f: + return parse_generator(f) + + +def strip_and_dedup(gen): + lines = [] + for l in gen: + if TOKEN not in l: + continue + lines.append(re.sub(".*"+TOKEN, TOKEN, l)) + return lines + + + +def parse_generator(gen): + ''' Reads a generator of lines and returns (loglines, errors) + The list of errors are any parsing issues as a tuple (str_line, error_msg) + ''' + loglines = [] + failed = [] + for lineno, line in enumerate(strip_and_dedup(gen)): + line = line.strip() + try: + ll = string_to_logline(lineno, line) + loglines.append(ll) + except ValueError as e: + failed.append((line, str(e))) + return loglines, failed + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print('usage: mlp_parser.py FILENAME') + print(' tests parsing on the file.') + sys.exit(1) + + filename = sys.argv[1] + lines, errors = parse_file(filename) + + print('Parsed {} log lines with {} 
errors.'.format(len(lines), len(errors))) + + if len(errors) > 0: + print('Lines which failed to parse:') + for line, error in errors: + print(' Following line failed: {}'.format(error)) + print(line) + diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml new file mode 100644 index 0000000..f1e6c7e --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml @@ -0,0 +1,11 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] " + POST: " enqueue_config('training_4.0.0/closed_{}.yaml'.format(v['value'])) " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " diff --git a/mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml b/mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml new file mode 100644 index 0000000..2c1f728 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/closed_gnn.yaml @@ -0,0 +1,21 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0" + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adam' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_4.0.0/common.yaml b/mlperf_logging/compliance_checker/training_4.0.0/common.yaml new file mode 100644 index 0000000..1360fed --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/common.yaml @@ -0,0 +1,151 @@ +# This file lists all the KEYs to be checked. Every line that matches mlperf logging regex (::MLL...) will be checked against these rules. 
+# Lines are processed in the order they appear in the log; for each line, the code specified under CHECK for that line's KEY is executed. +# The code is evaluated with local state 'v', which is the content of the value field of the log line, and global state 's'. +# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's', use a BEGIN record, whose CODE will +# be executed before any checks. +# In addition, occurrences of each key are counted, and at the end any requirement defined on the number of occurrences will +# be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported. +# +# KEY record: +# NAME +# REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE, AT_LEAST_ONE_OR(<other key>)} +# PRE - optional - code to be executed before CHECK +# CHECK - optional - expression to be evaluated to verify correctness +# POST - optional - code to be executed after CHECK + +- BEGIN: + CODE: > + s.update({ + 'init_started': False, + 'init_stopped' : False, + 'run_started' : False, + 'run_stopped' : False, + 'in_epoch' : False, + 'last_epoch' : 0, + 'in_block' : False, + 'block_first_epoch' : -1, + 'first_init_start': 9e99, + 'compile_time_mins': 0, + }) + +- KEY: + NAME: submission_org + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_platform + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_division + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['closed', 'open'] " + POST: " enqueue_config('training_4.0.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " + +- KEY: + NAME: submission_status + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['cloud', 'onprem', 'preview', 'research'] " + +# at least one record should be found, but any found records must pass the test +- KEY: + NAME: cache_clear + REQ: AT_LEAST_ONE + CHECK: + - "'value' in v" + +# frequency not checked +- KEY: + NAME: init_start + REQ: AT_LEAST_ONE + 
CHECK: + - "not s['init_stopped']" + - "not s['run_started']" + POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) " + +# confirm init_stop is less than s['compile_time_mins'] minutes (30 closed / 240 open) after the very first init_start +- KEY: + NAME: init_stop + REQ: EXACTLY_ONE + CHECK: + - "s['init_started']" + - "not s['run_started']" + - "ll.timestamp - s['first_init_start'] < (s['compile_time_mins']*60*1e3)" + POST: " s['init_stopped'] = True" + +- KEY: + NAME: run_start + REQ: EXACTLY_ONE + CHECK: " ( s['init_stopped'] == True )" + POST: " s['run_started'] = True " + +# status can also be aborted, but not allowing it here for now +# if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok +- KEY: + NAME: run_stop + REQ: EXACTLY_ONE + CHECK: + - "s['run_started']" + - "'status' in v['metadata']" + POST: " s['run_stopped'] = True " + +# FIXME: check epoch_count value match +- KEY: + NAME: block_start + REQ: AT_LEAST_ONE_OR(epoch_start) + CHECK: + - "s['run_started']" + - "('epoch_count' in v['metadata']) | ('step_num' in v['metadata'])" + - "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['step_num'] >= 0 if 'step_num' in v['metadata'] else True" + +- KEY: + NAME: block_stop + REQ: AT_LEAST_ONE_OR(epoch_stop) + CHECK: + - "('first_epoch_num' in v['metadata']) | ('step_num' in v['metadata'])" + +- KEY: + NAME: epoch_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "'epoch_num' in v['metadata']" + +- KEY: + NAME: epoch_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "'epoch_num' in v['metadata']" + +# making sure previous eval did print its accuracy result +- KEY: + NAME: eval_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('step_num' in v['metadata'])" + +- KEY: + NAME: eval_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | 
('step_num' in v['metadata'])" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "('epoch_num' in v['metadata']) | ('step_num' in v['metadata'])" + +- KEY: + NAME: train_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml new file mode 100644 index 0000000..43f781d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn'] " + POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) " + diff --git a/mlperf_logging/compliance_checker/training_4.0.0/open_gnn.yaml b/mlperf_logging/compliance_checker/training_4.0.0/open_gnn.yaml new file mode 100644 index 0000000..14c4176 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_4.0.0/open_gnn.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0"