-
Notifications
You must be signed in to change notification settings - Fork 46
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #357 from ahmadki/ahmadki/sd_iters_to_samples
[SD][v4.0] RCPs to use number of samples instead of iterations
- Loading branch information
Showing
17 changed files
with
478 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
105 changes: 105 additions & 0 deletions
105
mlperf_logging/compliance_checker/mlp_parser/ruleset_400.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
''' | ||
Parses a text MLPerf log into a structured format. | ||
''' | ||
|
||
from __future__ import print_function | ||
|
||
import collections | ||
import json | ||
import re | ||
import sys | ||
from dataclasses import dataclass | ||
|
||
from io import open | ||
|
||
@dataclass
class LogLine:
    """One structured record parsed out of an MLPerf result log."""
    full_string: str  # the complete line as it appeared in the log
    timestamp: float  # milliseconds, taken from the record's 'time_ms' field
    key: str          # MLLOG key, e.g. 'run_start', 'eval_accuracy'
    value: dict       # {'value': ..., 'metadata': ...} from the JSON payload
    lineno: int       # 0-based position within the filtered log stream
|
||
# Sentinel that prefixes every structured MLPerf log record.
TOKEN = ':::MLLOG '


def parse_line(line):
    """Decode the JSON payload of one MLLOG line.

    Returns the parsed dict, or None when the line does not start with
    the MLLOG sentinel.
    """
    if line.startswith(TOKEN):
        return json.loads(line[len(TOKEN):])
    return None
|
||
|
||
def string_to_logline(lineno, string):
    """Parse one stripped MLLOG line into a LogLine.

    :param lineno: 0-based index of the line in the filtered stream.
    :param string: the raw line text (expected to start with TOKEN).
    :returns: a LogLine.
    :raises ValueError: if the line is not an MLLOG record, is not valid
        JSON, is missing a required field, or has a malformed timestamp.
    """
    m = parse_line(string)

    if m is None:
        raise ValueError('does not match regex')

    try:
        ts = float(m['time_ms'])  # may raise ValueError, e.g. "1.2.3"
        # TODO check for weird values
        key = m['key']
        j = {'value': m['value'], 'metadata': m['metadata']}
    except KeyError as e:
        # A record missing a required field must surface as ValueError so
        # parse_generator records it as a parse failure instead of crashing
        # the whole run with an uncaught KeyError.
        raise ValueError('missing required field: {}'.format(e))

    return LogLine(string, ts, key, j, lineno)
|
||
|
||
def parse_file(filename):
    """Read a log file by name; return (loglines, errors) per parse_generator."""
    # latin-1 maps every byte to a codepoint, so decoding never raises on
    # arbitrary binary noise embedded in a log.
    with open(filename, encoding='latin-1') as log_file:
        return parse_generator(log_file)
|
||
|
||
def strip_and_dedup(gen):
    """Keep only lines carrying the MLLOG sentinel, trimming any prefix.

    Some launchers prepend rank tags or timestamps before the sentinel;
    everything up to the sentinel is discarded so each returned line
    starts at TOKEN. Lines without the sentinel are dropped.
    """
    # Compile once outside the loop; escape TOKEN so the pattern stays
    # literal even if the sentinel ever gains regex metacharacters.
    prefix = re.compile('.*' + re.escape(TOKEN))
    return [prefix.sub(TOKEN, line) for line in gen if TOKEN in line]
|
||
|
||
|
||
def parse_generator(gen):
    """Parse an iterable of log lines.

    Returns (loglines, errors) where errors is a list of
    (str_line, error_msg) tuples for every line that failed to parse.
    """
    parsed = []
    errors = []
    for lineno, raw in enumerate(strip_and_dedup(gen)):
        stripped = raw.strip()
        try:
            parsed.append(string_to_logline(lineno, stripped))
        except ValueError as err:
            errors.append((stripped, str(err)))
    return parsed, errors
|
||
|
||
if __name__ == '__main__':
    # Minimal self-test driver: parse one file and report failures.
    if len(sys.argv) != 2:
        print('usage: mlp_parser.py FILENAME')
        print(' tests parsing on the file.')
        sys.exit(1)

    lines, errors = parse_file(sys.argv[1])

    print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))

    if errors:
        print('Lines which failed to parse:')
        for failed_line, message in errors:
            print(' Following line failed: {}'.format(message))
            print(failed_line)
|
11 changes: 11 additions & 0 deletions
11
mlperf_logging/compliance_checker/training_4.0.0/closed_common.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
# Rules common to every closed-division submission.
# submission_benchmark selects which benchmark-specific rules file to
# enqueue next via the POST hook.

- KEY:
    NAME:  submission_benchmark
    REQ:   EXACTLY_ONE
    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
    POST:  " enqueue_config('training_4.0.0/closed_{}.yaml'.format(v['value'])) "

- KEY:
    NAME:  gradient_accumulation_steps
    REQ:   EXACTLY_ONE
    CHECK: " v['value'] > 0 "
74 changes: 74 additions & 0 deletions
74
mlperf_logging/compliance_checker/training_4.0.0/closed_stable_diffusion.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# Stable diffusion uses two metrics, FID and CLIP. | ||
# These metrics can be calculated offline, using different scripts | ||
# and logged seperatly. Therefore, we create a virtual key | ||
# called aggregated_eval_accuracy, which aggregates | ||
# both metrics into a single log line | ||
|
||
- BEGIN: | ||
CODE: | | ||
from dataclasses import replace | ||
agg_eval_lines = {} | ||
for line in loglines: | ||
if line.key == "eval_accuracy" and 'metric' in line.value['metadata']: | ||
samples_count = line.value['metadata']['samples_count'] | ||
if samples_count not in agg_eval_lines: | ||
new_line = replace(line) # Make a copy | ||
new_line.key = "aggregated_eval_accuracy" | ||
new_line.full_string = "" # Not needed | ||
new_line.lineno = -1 # Not needed | ||
new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}} | ||
agg_eval_lines[samples_count] = new_line | ||
agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp) | ||
agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value'] | ||
loglines.extend(agg_eval_lines.values()) | ||
- KEY: | ||
NAME: global_batch_size | ||
REQ: AT_LEAST_ONE | ||
CHECK: " v['value'] >= 0 " | ||
|
||
- KEY: | ||
NAME: opt_name | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] == 'adamw' " | ||
|
||
- KEY: | ||
NAME: opt_adamw_beta_1 | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] == 0.9 " | ||
|
||
- KEY: | ||
NAME: opt_adamw_beta_2 | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] == 0.999 " | ||
|
||
- KEY: | ||
NAME: opt_adamw_epsilon | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] == 1e-08 " | ||
|
||
- KEY: | ||
NAME: opt_adamw_weight_decay | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] == 0.01 " | ||
|
||
- KEY: | ||
NAME: opt_base_learning_rate | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] >= 0.0 " | ||
|
||
- KEY: | ||
NAME: opt_learning_rate_warmup_steps | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] >= 0 " | ||
|
||
- KEY: | ||
NAME: aggregated_eval_accuracy | ||
REQ: AT_LEAST(2) | ||
CHECK: | ||
- "'FID' in v['value']" | ||
- "'CLIP' in v['value']" | ||
- "'samples_count' in v['value']" | ||
ATLEAST_ONE_CHECK: "(0.0 <= v['value']['FID'] <= 90.0) and (0.15 <= v['value']['CLIP'] <= 1.0)" |
151 changes: 151 additions & 0 deletions
151
mlperf_logging/compliance_checker/training_4.0.0/common.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
# This file lists all the KEYs to be checked. Every line that matches mlperf logging regex (::MLL...) will be checked against these rules. | ||
# In the order of the appearance in the log, for each line will execute the code specified under CHECK for the KEY in that line. | ||
# The code will be launched using local state 'v' which is the content of value field in log line, and global state 's'. | ||
# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's' use BEGIN record which CODE will | ||
# be executed before any checks. | ||
# In addition, occurrence of each key will be counted and at the end if a requirement regarding the number of occurrences is defined it will | ||
# be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported. | ||
# | ||
# KEY record: | ||
# NAME | ||
# REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE} | ||
# PRE - optional - code to be executed before CHECK | ||
# CHECK - optional - expression to be evaluated to verify correctness | ||
# POST - optional - code to be executed after CHECK | ||
|
||
- BEGIN: | ||
CODE: > | ||
s.update({ | ||
'init_started': False, | ||
'init_stopped' : False, | ||
'run_started' : False, | ||
'run_stopped' : False, | ||
'in_epoch' : False, | ||
'last_epoch' : 0, | ||
'in_block' : False, | ||
'block_first_epoch' : -1, | ||
'first_init_start': 9e99, | ||
'compile_time_mins': 0, | ||
}) | ||
- KEY: | ||
NAME: submission_org | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] != '' " | ||
|
||
- KEY: | ||
NAME: submission_platform | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] != '' " | ||
|
||
- KEY: | ||
NAME: submission_division | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] in ['closed', 'open'] " | ||
POST: " enqueue_config('training_4.0.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " | ||
|
||
- KEY: | ||
NAME: submission_status | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] in ['cloud', 'onprem', 'preview', 'research'] " | ||
|
||
# at least one record should be found, but any found records must pass the test | ||
- KEY: | ||
NAME: cache_clear | ||
REQ: AT_LEAST_ONE | ||
CHECK: | ||
- "'value' in v" | ||
|
||
# frequency not checked | ||
- KEY: | ||
NAME: init_start | ||
REQ: AT_LEAST_ONE | ||
CHECK: | ||
- "not s['init_stopped']" | ||
- "not s['run_started']" | ||
POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) " | ||
|
||
# confirm less than 20min since the very first init_start | ||
- KEY: | ||
NAME: init_stop | ||
REQ: EXACTLY_ONE | ||
CHECK: | ||
- "s['init_started']" | ||
- "not s['run_started']" | ||
- "ll.timestamp - s['first_init_start'] < (s['compile_time_mins']*60*1e3)" | ||
POST: " s['init_stopped'] = True" | ||
|
||
- KEY: | ||
NAME: run_start | ||
REQ: EXACTLY_ONE | ||
CHECK: " ( s['init_stopped'] == True )" | ||
POST: " s['run_started'] = True " | ||
|
||
# status can also be aborted, but not allowing it here for now | ||
# if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok | ||
- KEY: | ||
NAME: run_stop | ||
REQ: EXACTLY_ONE | ||
CHECK: | ||
- "s['run_started']" | ||
- "'status' in v['metadata']" | ||
POST: " s['run_stopped'] = True " | ||
|
||
# FIXME: check epoch_count value match | ||
- KEY: | ||
NAME: block_start | ||
REQ: AT_LEAST_ONE_OR(epoch_start) | ||
CHECK: | ||
- "s['run_started']" | ||
- "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])" | ||
- "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True" | ||
- "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True" | ||
- "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True" | ||
|
||
- KEY: | ||
NAME: block_stop | ||
REQ: AT_LEAST_ONE_OR(epoch_stop) | ||
CHECK: | ||
- "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" | ||
|
||
- KEY: | ||
NAME: epoch_start | ||
REQ: AT_LEAST_ONE_OR(block_start) | ||
CHECK: | ||
- "'epoch_num' in v['metadata']" | ||
|
||
- KEY: | ||
NAME: epoch_stop | ||
REQ: AT_LEAST_ONE_OR(block_stop) | ||
CHECK: | ||
- "'epoch_num' in v['metadata']" | ||
|
||
# making sure previous eval did print it's accuracy result | ||
- KEY: | ||
NAME: eval_start | ||
REQ: AT_LEAST_ONE_OR(block_start) | ||
CHECK: | ||
- "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" | ||
|
||
- KEY: | ||
NAME: eval_stop | ||
REQ: AT_LEAST_ONE_OR(block_stop) | ||
CHECK: | ||
- "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" | ||
|
||
- KEY: | ||
NAME: eval_accuracy | ||
REQ: AT_LEAST_ONE | ||
CHECK: | ||
- "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" | ||
|
||
- KEY: | ||
NAME: train_samples | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] != '' " | ||
|
||
- KEY: | ||
NAME: eval_samples | ||
REQ: EXACTLY_ONE | ||
CHECK: " v['value'] != '' " | ||
|
6 changes: 6 additions & 0 deletions
6
mlperf_logging/compliance_checker/training_4.0.0/open_common.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
# Rules common to every open-division submission.
# submission_benchmark selects which benchmark-specific rules file to
# enqueue next via the POST hook.

- KEY:
    NAME:  submission_benchmark
    REQ:   EXACTLY_ONE
    CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
    POST:  " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
Oops, something went wrong.