Skip to content

Commit

Permalink
Merge branch 'master' into ahmadki/sd_iters_to_samples
Browse files Browse the repository at this point in the history
  • Loading branch information
hiwotadese authored Mar 14, 2024
2 parents 1e848cf + c7b23b3 commit 19ce6fb
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 2 deletions.
132 changes: 132 additions & 0 deletions mlperf_logging/rcp_checker/training_4.0.0/rcps_unet3d.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{

"unet3d_ref_28":
{
"Benchmark": "unet3d",
"Creator": "NVIDIA",
"When": "Prior to 4.0 submission",
"Platform": "TBD",
"BS": 28,
"Hyperparams": {
"opt_base_learning_rate": 2.0,
"opt_momentum": 0.9,
"opt_learning_rate_warmup_epochs": 168000,
"opt_initial_learning_rate": 1e-4,
"opt_learning_rate_decay_boundary_epochs": [],
"opt_learning_rate_decay_factor": 1.0,
"opt_weight_decay": 0.0
},
"Epochs to converge": [
396480, 456960, 386400, 262080, 362880, 520800, 309120, 248640, 225120, 510720,
241920, 356160, 312480, 359520, 389760, 262080, 366240, 302400, 325920, 225120,
312480, 235200, 299040, 315840, 225120, 302400, 302400, 366240, 420000, 268800,
450240, 275520, 299040, 292320, 349440, 305760, 245280, 302400, 342720, 406560,
255360, 319200, 312480, 282240, 409920, 265440, 329280, 241920, 309120, 383040,
231840, 393120, 272160, 346080, 255360, 493920, 299040, 299040, 399840, 305760,
426720, 235200, 493920, 342720, 483840, 282240, 288960, 426720, 477120, 416640,
302400, 299040, 436800, 282240, 299040, 517440, 440160, 551040, 389760, 325920,
339360, 530880, 433440, 282240, 342720, 413280, 403200, 295680, 376320, 571200,
238560, 332640, 292320, 295680, 325920, 349440, 336000, 399840, 537600, 497280
]
},

"unet3d_ref_56":
{
"Benchmark": "unet3d",
"Creator": "NVIDIA",
"When": "Prior to 4.0 submission",
"Platform": "TBD",
"BS": 56,
"Hyperparams": {
"opt_base_learning_rate": 2.0,
"opt_momentum": 0.9,
"opt_learning_rate_warmup_epochs": 168000,
"opt_initial_learning_rate": 1e-4,
"opt_learning_rate_decay_boundary_epochs": [],
"opt_learning_rate_decay_factor": 1.0,
"opt_weight_decay": 0.0
},
"Epochs to converge": [
399840, 480480, 356160, 467040, 376320, 325920, 255360, 376320, 302400, 329280,
372960, 389760, 255360, 356160, 440160, 366240, 319200, 376320, 295680, 376320,
366240, 305760, 544320, 618240, 379680, 376320, 215040, 225120, 322560, 406560,
268800, 346080, 635040, 349440, 292320, 369600, 329280, 332640, 329280, 366240,
372960, 389760, 295680, 305760, 235200, 366240, 325920, 487200, 332640, 359520,
369600, 460320, 393120, 416640, 430080, 477120, 349440, 295680, 520800, 241920,
315840, 299040, 413280, 349440, 339360, 272160, 433440, 453600, 514080, 336000,
393120, 319200, 248640, 366240, 346080, 272160, 262080, 278880, 430080, 292320,
352800, 332640, 258720, 393120, 255360, 352800, 389760, 309120, 262080, 356160,
272160, 272160, 292320, 413280, 372960, 544320, 201600, 493920, 319200, 433440,
386400, 403200, 369600, 322560, 436800, 389760, 339360, 329280, 339360, 554400,
315840, 302400, 288960, 362880, 356160, 278880, 564480, 456960, 369600, 430080,
359520, 527520, 272160, 268800, 282240, 383040, 467040, 265440, 420000, 403200,
302400, 393120, 285600, 329280, 601440, 416640, 470400, 282240, 309120, 362880,
285600, 366240, 319200, 349440, 389760, 356160, 235200, 399840, 483840, 359520,
262080, 248640, 409920, 272160, 325920, 299040, 252000, 339360, 302400, 342720,
295680, 302400, 252000, 403200, 396480, 504000, 393120, 372960, 420000, 349440,
312480, 453600, 188160, 336000, 352800, 339360, 346080, 332640, 393120, 322560,
399840, 362880, 379680, 463680, 383040, 366240, 255360, 359520, 389760, 292320,
409920, 356160, 396480, 581280, 295680, 295680, 329280
]
},

"unet3d_ref_64":
{
"Benchmark": "unet3d",
"Creator": "NVIDIA",
"When": "Prior to 4.0 submission",
"Platform": "TBD",
"BS": 64,
"Hyperparams": {
"opt_base_learning_rate": 2.0,
"opt_momentum": 0.9,
"opt_learning_rate_warmup_epochs": 168000,
"opt_initial_learning_rate": 1e-4,
"opt_learning_rate_decay_boundary_epochs": [],
"opt_learning_rate_decay_factor": 1.0,
"opt_weight_decay": 0.0
},
"Epochs to converge": [
504000, 238592, 477120, 530880, 359552, 336000, 507392, 346112, 420032, 423360,
346112, 372992, 275520, 319232, 305792, 463680, 594752, 352832, 346112, 480512,
349440, 409920, 426752, 426752, 490560, 255360, 288960, 581312, 574592, 312512,
248640, 540992, 420032, 299072, 517440, 430080, 456960, 524160, 342720, 430080,
325952, 241920, 258752, 329280, 577920, 497280, 467072, 564480, 315840, 534272,
386432, 288960, 322560, 500672, 265472, 420032, 252032, 346112, 319232, 544320,
248640, 339392, 356160, 349440, 346112, 517440, 473792, 268800, 409920, 309120,
362880, 272192, 426752, 383040, 325952, 349440, 339392, 349440, 399872, 406592,
409920, 319232, 456960, 564480, 426752, 504000, 352832, 322560, 352832, 420032,
440192
]
},

"unet3d_ref_84":
{
"Benchmark": "unet3d",
"Creator": "NVIDIA",
"When": "Prior to 4.0 submission",
"Platform": "TBD",
"BS": 84,
"Hyperparams": {
"opt_base_learning_rate": 1.5,
"opt_momentum": 0.9,
"opt_learning_rate_warmup_epochs": 252000,
"opt_initial_learning_rate": 1e-4,
"opt_learning_rate_decay_boundary_epochs": [],
"opt_learning_rate_decay_factor": 1.0,
"opt_weight_decay": 0.0
},
"Epochs to converge": [
413280, 537600, 463680, 325920, 423360, 339360, 463680, 551040, 366240, 332640,
376320, 588000, 339360, 621600, 383040, 389760, 369600, 409920, 504000, 584640,
362880, 443520, 356160, 571200, 359520, 359520, 393120, 540960, 278880, 399840,
742560, 460320, 446880, 262080, 433440, 369600, 356160, 477120, 433440, 356160,
487200, 440160, 460320, 416640, 275520, 393120, 399840, 567840, 389760, 500640,
470400, 614880, 409920, 393120, 393120, 530880, 403200, 453600, 409920, 396480,
544320, 393120, 490560, 413280, 500640, 332640, 332640, 423360, 443520, 302400,
332640, 403200, 315840, 433440, 379680, 366240, 463680, 430080, 315840, 420000,
504000, 285600, 386400, 362880, 477120, 409920, 336000, 315840, 376320, 409920,
517440, 305760, 366240, 433440, 359520, 416640, 456960
]
}
}
20 changes: 18 additions & 2 deletions mlperf_logging/result_summarizer/result_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,12 +374,12 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset):
if has_power:
unsorted_scores = scores.copy()

scaling_factor = _get_scaling_factor(benchmark_folder)
if dropped_scores <= max_dropped_scores:
olympic_avg = _compute_olympic_average(
scores, dropped_scores, max_dropped_scores)
if olympic_avg is not None:
benchmark_scores[benchmark] = olympic_avg
scaling_factor = _get_scaling_factor(benchmark_folder)
benchmark_scores[benchmark] *= scaling_factor

if has_power and dropped_scores <= max_dropped_scores:
Expand All @@ -388,6 +388,7 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset):
power_scores, index, dropped_scores, max_dropped_scores)
if olympic_avg is not None:
benchmark_power_scores[benchmark] = olympic_avg
benchmark_power_scores[benchmark] *= scaling_factor
_fill_empty_benchmark_scores(benchmark_scores, usage, ruleset)
if len(benchmark_power_scores) > 0:
_fill_empty_benchmark_scores(benchmark_power_scores, usage, ruleset)
Expand Down Expand Up @@ -695,6 +696,10 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None):
urls.items(),
):
power_summary.push(column_name, value)
if column_name in strong_scaling_scores:
power_summary.push(column_name, strong_scaling_scores[column_name])
else:
power_summary.push(column_name, value)
if usage == 'hpc' and len(power_scores_weak_scaling) > 0:
for column_name, value in itertools.chain(
system_specs.items(),
Expand Down Expand Up @@ -818,6 +823,13 @@ def _update_summaries(folder):
def _map_columns_index(column, config):
map_ = config["columns"][args.usage][args.ruleset]
return tuple(map_.get(column, map_.get("default") + [column]))

def agg_columns_fn(df, benchmarks):
    """Collapse a per-system group into one row of (benchmark, metric) pairs.

    For each benchmark column, the first row of the group is taken as the
    "perf" score and the last row as the "power" score, yielding a Series
    keyed by (benchmark, metric) tuples suitable for a two-level header.

    Args:
        df: Group of summary rows (pd.DataFrame) sharing system specs.
        benchmarks: Iterable of benchmark column names present in *df*.

    Returns:
        pd.Series keyed by (benchmark, "perf"/"power") tuples.
    """
    return pd.Series({
        (model, metric): df[model].iloc[row]
        for model in benchmarks
        for metric, row in (("perf", 0), ("power", -1))
    })

def _summaries_to_xlsx(summaries: pd.DataFrame, path, version):
config_path = os.path.join(os.path.dirname(__file__), "config.yaml")
Expand Down Expand Up @@ -920,8 +932,12 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False):
csv = args.csv
assert csv.endswith(".csv")
if power:
benchmarks = get_allowed_benchmarks(args.usage, args.ruleset)
specs_and_notes = [c for c in summaries.columns if c not in benchmarks]
csv = csv.replace(".csv", "_power.csv")
summaries.to_csv(csv, index=False, mode=mode)
summaries.groupby(specs_and_notes).apply(lambda x: agg_columns_fn(x, benchmarks)).to_csv(csv, mode=mode)
else:
summaries.to_csv(csv, index=False, mode=mode)

if args.xlsx is not None:
_summaries_to_xlsx(summaries, args.xlsx, args.ruleset[:3])
Expand Down

0 comments on commit 19ce6fb

Please sign in to comment.