Skip to content

Commit

Permalink
Merge pull request #357 from ahmadki/ahmadki/sd_iters_to_samples
Browse files Browse the repository at this point in the history
[SD][v4.0] RCPs to use num of samples instead iterations
  • Loading branch information
hiwotadese authored Mar 14, 2024
2 parents c7b23b3 + 19ce6fb commit 99ba37a
Show file tree
Hide file tree
Showing 17 changed files with 478 additions and 16 deletions.
3 changes: 3 additions & 0 deletions mlperf_logging/compliance_checker/mlp_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .ruleset_210 import parse_file as parse_file_210
from .ruleset_300 import parse_file as parse_file_300
from .ruleset_310 import parse_file as parse_file_310
from .ruleset_400 import parse_file as parse_file_400


def parse_file(filename, ruleset='0.6.0'):
Expand All @@ -25,5 +26,7 @@ def parse_file(filename, ruleset='0.6.0'):
return parse_file_300(filename)
elif ruleset == '3.1.0':
return parse_file_310(filename)
elif ruleset == '4.0.0':
return parse_file_400(filename)
else:
raise Exception(f'Ruleset "{ruleset}" is not supported')
105 changes: 105 additions & 0 deletions mlperf_logging/compliance_checker/mlp_parser/ruleset_400.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
'''
Parses a text MLPerf log into a structured format.
'''

from __future__ import print_function

import collections
import json
import re
import sys
from dataclasses import dataclass

from io import open

@dataclass
class LogLine:
    """One parsed MLLOG record from a compliance log file."""
    full_string: str  # the raw log line as read from the file
    timestamp: float  # value of the record's 'time_ms' field
    key: str          # value of the record's 'key' field
    value: str        # dict {'value': ..., 'metadata': ...} from the record
    lineno: int       # index of the line within the filtered log stream

# Prefix that marks a structured MLPerf log record.
TOKEN = ':::MLLOG '


def parse_line(line):
    """Decode the JSON payload of an MLLOG line.

    Returns the decoded dict, or None when the line does not start with
    the MLLOG token. May raise json.JSONDecodeError for a malformed payload.
    """
    if line.startswith(TOKEN):
        return json.loads(line[len(TOKEN):])
    return None


def string_to_logline(lineno, string):
    """Convert one raw log line into a LogLine.

    Raises ValueError when the line carries no MLLOG payload or when a
    field is malformed (e.g. a non-numeric timestamp such as "1.2.3").
    """
    parsed = parse_line(string)
    if parsed is None:
        raise ValueError('does not match regex')

    # May raise ValueError on malformed input.
    # TODO check for weird values
    timestamp = float(parsed['time_ms'])

    payload = {'value': parsed['value'], 'metadata': parsed['metadata']}
    return LogLine(
        full_string=string,
        timestamp=timestamp,
        key=parsed['key'],
        value=payload,
        lineno=lineno,
    )


def parse_file(filename):
    """Read the named log file and return (loglines, errors).

    latin-1 decoding never fails on arbitrary bytes, so unreadable
    content surfaces as parse errors rather than decode exceptions.
    """
    with open(filename, encoding='latin-1') as logfile:
        return parse_generator(logfile)


def strip_and_dedup(gen):
    """Keep only lines containing the MLLOG token, with any text before
    the token stripped off.

    NOTE(review): despite the name, this performs no deduplication —
    duplicate lines pass through unchanged. Confirm whether dedup was
    intended.
    """
    prefix_pattern = ".*" + TOKEN
    return [re.sub(prefix_pattern, TOKEN, line)
            for line in gen
            if TOKEN in line]



def parse_generator(gen):
    """Read a generator of lines and return (loglines, errors).

    errors is a list of (line, error_message) tuples for lines that
    carried the MLLOG token but failed to parse.

    NOTE(review): lineno is the index within the filtered MLLOG stream,
    not the line number in the original file.
    """
    loglines = []
    failed = []
    for lineno, raw in enumerate(strip_and_dedup(gen)):
        stripped = raw.strip()
        try:
            loglines.append(string_to_logline(lineno, stripped))
        except ValueError as err:
            failed.append((stripped, str(err)))
    return loglines, failed


if __name__ == '__main__':
    # Smoke-test the parser on a single file given on the command line.
    if len(sys.argv) != 2:
        print('usage: mlp_parser.py FILENAME')
        print('       tests parsing on the file.')
        sys.exit(1)

    filename = sys.argv[1]
    lines, errors = parse_file(filename)

    print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))

    if len(errors) > 0:
        print('Lines which failed to parse:')
        for line, error in errors:
            print('  Following line failed: {}'.format(error))
            print(line)

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
POST: " enqueue_config('training_4.0.0/closed_{}.yaml'.format(v['value'])) "

- KEY:
NAME: gradient_accumulation_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Stable diffusion uses two metrics, FID and CLIP.
# These metrics can be calculated offline, using different scripts
# and logged separately. Therefore, we create a virtual key
# called aggregated_eval_accuracy, which aggregates
# both metrics into a single log line

- BEGIN:
CODE: |
from dataclasses import replace
agg_eval_lines = {}
for line in loglines:
if line.key == "eval_accuracy" and 'metric' in line.value['metadata']:
samples_count = line.value['metadata']['samples_count']
if samples_count not in agg_eval_lines:
new_line = replace(line) # Make a copy
new_line.key = "aggregated_eval_accuracy"
new_line.full_string = "" # Not needed
new_line.lineno = -1 # Not needed
new_line.value = {'value': {'samples_count': samples_count}, 'metadata':{}}
agg_eval_lines[samples_count] = new_line
agg_eval_lines[samples_count].timestamp = max(line.timestamp, agg_eval_lines[samples_count].timestamp)
agg_eval_lines[samples_count].value['value'][line.value['metadata']['metric']] = line.value['value']
loglines.extend(agg_eval_lines.values())
- KEY:
NAME: global_batch_size
REQ: AT_LEAST_ONE
CHECK: " v['value'] >= 0 "

- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adamw' "

- KEY:
NAME: opt_adamw_beta_1
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.9 "

- KEY:
NAME: opt_adamw_beta_2
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.999 "

- KEY:
NAME: opt_adamw_epsilon
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1e-08 "

- KEY:
NAME: opt_adamw_weight_decay
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0.01 "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0.0 "

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0 "

- KEY:
NAME: aggregated_eval_accuracy
REQ: AT_LEAST(2)
CHECK:
- "'FID' in v['value']"
- "'CLIP' in v['value']"
- "'samples_count' in v['value']"
ATLEAST_ONE_CHECK: "(0.0 <= v['value']['FID'] <= 90.0) and (0.15 <= v['value']['CLIP'] <= 1.0)"
151 changes: 151 additions & 0 deletions mlperf_logging/compliance_checker/training_4.0.0/common.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# This file lists all the KEYs to be checked. Every line that matches the mlperf logging regex (:::MLLOG ...) will be checked against these rules.
# In the order of appearance in the log, the code specified under CHECK for the KEY on each line will be executed.
# The code will be launched using local state 'v' which is the content of value field in log line, and global state 's'.
# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's', use a BEGIN record, whose CODE will
# be executed before any checks.
# In addition, occurrence of each key will be counted and at the end if a requirement regarding the number of occurrences is defined it will
# be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported.
#
# KEY record:
# NAME
# REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE}
# PRE - optional - code to be executed before CHECK
# CHECK - optional - expression to be evaluated to verify correctness
# POST - optional - code to be executed after CHECK

- BEGIN:
CODE: >
s.update({
'init_started': False,
'init_stopped' : False,
'run_started' : False,
'run_stopped' : False,
'in_epoch' : False,
'last_epoch' : 0,
'in_block' : False,
'block_first_epoch' : -1,
'first_init_start': 9e99,
'compile_time_mins': 0,
})
- KEY:
NAME: submission_org
REQ: EXACTLY_ONE
CHECK: " v['value'] != '' "

- KEY:
NAME: submission_platform
REQ: EXACTLY_ONE
CHECK: " v['value'] != '' "

- KEY:
NAME: submission_division
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['closed', 'open'] "
POST: " enqueue_config('training_4.0.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 "

- KEY:
NAME: submission_status
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['cloud', 'onprem', 'preview', 'research'] "

# at least one record should be found, but any found records must pass the test
- KEY:
NAME: cache_clear
REQ: AT_LEAST_ONE
CHECK:
- "'value' in v"

# frequency not checked
- KEY:
NAME: init_start
REQ: AT_LEAST_ONE
CHECK:
- "not s['init_stopped']"
- "not s['run_started']"
POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) "

# confirm less than 20min since the very first init_start
- KEY:
NAME: init_stop
REQ: EXACTLY_ONE
CHECK:
- "s['init_started']"
- "not s['run_started']"
- "ll.timestamp - s['first_init_start'] < (s['compile_time_mins']*60*1e3)"
POST: " s['init_stopped'] = True"

- KEY:
NAME: run_start
REQ: EXACTLY_ONE
CHECK: " ( s['init_stopped'] == True )"
POST: " s['run_started'] = True "

# status can also be aborted, but not allowing it here for now
# if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok
- KEY:
NAME: run_stop
REQ: EXACTLY_ONE
CHECK:
- "s['run_started']"
- "'status' in v['metadata']"
POST: " s['run_stopped'] = True "

# FIXME: check epoch_count value match
- KEY:
NAME: block_start
REQ: AT_LEAST_ONE_OR(epoch_start)
CHECK:
- "s['run_started']"
- "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])"
- "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True"
- "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True"
- "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True"

- KEY:
NAME: block_stop
REQ: AT_LEAST_ONE_OR(epoch_stop)
CHECK:
- "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

- KEY:
NAME: epoch_start
REQ: AT_LEAST_ONE_OR(block_start)
CHECK:
- "'epoch_num' in v['metadata']"

- KEY:
NAME: epoch_stop
REQ: AT_LEAST_ONE_OR(block_stop)
CHECK:
- "'epoch_num' in v['metadata']"

# making sure the previous eval did print its accuracy result
- KEY:
NAME: eval_start
REQ: AT_LEAST_ONE_OR(block_start)
CHECK:
- "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

- KEY:
NAME: eval_stop
REQ: AT_LEAST_ONE_OR(block_stop)
CHECK:
- "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])"

- KEY:
NAME: train_samples
REQ: EXACTLY_ONE
CHECK: " v['value'] != '' "

- KEY:
NAME: eval_samples
REQ: EXACTLY_ONE
CHECK: " v['value'] != '' "

Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d'] "
POST: " enqueue_config('training_4.0.0/open_{}.yaml'.format(v['value'])) "
Loading

0 comments on commit 99ba37a

Please sign in to comment.