Skip to content

Commit

Permalink
Merge branch 'master' into ahmadki/sd_iters_to_samples
Browse files Browse the repository at this point in the history
  • Loading branch information
hiwotadese authored Mar 14, 2024
2 parents 1e848cf + c7b23b3 commit 19ce6fb
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 2 deletions.
132 changes: 132 additions & 0 deletions mlperf_logging/rcp_checker/training_4.0.0/rcps_unet3d.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
{

"unet3d_ref_28":
{
"Benchmark": "unet3d",
"Creator": "NVIDIA",
"When": "Prior to 4.0 submission",
"Platform": "TBD",
"BS": 28,
"Hyperparams": {
"opt_base_learning_rate": 2.0,
"opt_momentum": 0.9,
"opt_learning_rate_warmup_epochs": 168000,
"opt_initial_learning_rate": 1e-4,
"opt_learning_rate_decay_boundary_epochs": [],
"opt_learning_rate_decay_factor": 1.0,
"opt_weight_decay": 0.0
},
"Epochs to converge": [
396480, 456960, 386400, 262080, 362880, 520800, 309120, 248640, 225120, 510720,
241920, 356160, 312480, 359520, 389760, 262080, 366240, 302400, 325920, 225120,
312480, 235200, 299040, 315840, 225120, 302400, 302400, 366240, 420000, 268800,
450240, 275520, 299040, 292320, 349440, 305760, 245280, 302400, 342720, 406560,
255360, 319200, 312480, 282240, 409920, 265440, 329280, 241920, 309120, 383040,
231840, 393120, 272160, 346080, 255360, 493920, 299040, 299040, 399840, 305760,
426720, 235200, 493920, 342720, 483840, 282240, 288960, 426720, 477120, 416640,
302400, 299040, 436800, 282240, 299040, 517440, 440160, 551040, 389760, 325920,
339360, 530880, 433440, 282240, 342720, 413280, 403200, 295680, 376320, 571200,
238560, 332640, 292320, 295680, 325920, 349440, 336000, 399840, 537600, 497280
]
},

"unet3d_ref_56":
{
"Benchmark": "unet3d",
"Creator": "NVIDIA",
"When": "Prior to 4.0 submission",
"Platform": "TBD",
"BS": 56,
"Hyperparams": {
"opt_base_learning_rate": 2.0,
"opt_momentum": 0.9,
"opt_learning_rate_warmup_epochs": 168000,
"opt_initial_learning_rate": 1e-4,
"opt_learning_rate_decay_boundary_epochs": [],
"opt_learning_rate_decay_factor": 1.0,
"opt_weight_decay": 0.0
},
"Epochs to converge": [
399840, 480480, 356160, 467040, 376320, 325920, 255360, 376320, 302400, 329280,
372960, 389760, 255360, 356160, 440160, 366240, 319200, 376320, 295680, 376320,
366240, 305760, 544320, 618240, 379680, 376320, 215040, 225120, 322560, 406560,
268800, 346080, 635040, 349440, 292320, 369600, 329280, 332640, 329280, 366240,
372960, 389760, 295680, 305760, 235200, 366240, 325920, 487200, 332640, 359520,
369600, 460320, 393120, 416640, 430080, 477120, 349440, 295680, 520800, 241920,
315840, 299040, 413280, 349440, 339360, 272160, 433440, 453600, 514080, 336000,
393120, 319200, 248640, 366240, 346080, 272160, 262080, 278880, 430080, 292320,
352800, 332640, 258720, 393120, 255360, 352800, 389760, 309120, 262080, 356160,
272160, 272160, 292320, 413280, 372960, 544320, 201600, 493920, 319200, 433440,
386400, 403200, 369600, 322560, 436800, 389760, 339360, 329280, 339360, 554400,
315840, 302400, 288960, 362880, 356160, 278880, 564480, 456960, 369600, 430080,
359520, 527520, 272160, 268800, 282240, 383040, 467040, 265440, 420000, 403200,
302400, 393120, 285600, 329280, 601440, 416640, 470400, 282240, 309120, 362880,
285600, 366240, 319200, 349440, 389760, 356160, 235200, 399840, 483840, 359520,
262080, 248640, 409920, 272160, 325920, 299040, 252000, 339360, 302400, 342720,
295680, 302400, 252000, 403200, 396480, 504000, 393120, 372960, 420000, 349440,
312480, 453600, 188160, 336000, 352800, 339360, 346080, 332640, 393120, 322560,
399840, 362880, 379680, 463680, 383040, 366240, 255360, 359520, 389760, 292320,
409920, 356160, 396480, 581280, 295680, 295680, 329280
]
},

"unet3d_ref_64":
{
"Benchmark": "unet3d",
"Creator": "NVIDIA",
"When": "Prior to 4.0 submission",
"Platform": "TBD",
"BS": 64,
"Hyperparams": {
"opt_base_learning_rate": 2.0,
"opt_momentum": 0.9,
"opt_learning_rate_warmup_epochs": 168000,
"opt_initial_learning_rate": 1e-4,
"opt_learning_rate_decay_boundary_epochs": [],
"opt_learning_rate_decay_factor": 1.0,
"opt_weight_decay": 0.0
},
"Epochs to converge": [
504000, 238592, 477120, 530880, 359552, 336000, 507392, 346112, 420032, 423360,
346112, 372992, 275520, 319232, 305792, 463680, 594752, 352832, 346112, 480512,
349440, 409920, 426752, 426752, 490560, 255360, 288960, 581312, 574592, 312512,
248640, 540992, 420032, 299072, 517440, 430080, 456960, 524160, 342720, 430080,
325952, 241920, 258752, 329280, 577920, 497280, 467072, 564480, 315840, 534272,
386432, 288960, 322560, 500672, 265472, 420032, 252032, 346112, 319232, 544320,
248640, 339392, 356160, 349440, 346112, 517440, 473792, 268800, 409920, 309120,
362880, 272192, 426752, 383040, 325952, 349440, 339392, 349440, 399872, 406592,
409920, 319232, 456960, 564480, 426752, 504000, 352832, 322560, 352832, 420032,
440192
]
},

"unet3d_ref_84":
{
"Benchmark": "unet3d",
"Creator": "NVIDIA",
"When": "Prior to 4.0 submission",
"Platform": "TBD",
"BS": 84,
"Hyperparams": {
"opt_base_learning_rate": 1.5,
"opt_momentum": 0.9,
"opt_learning_rate_warmup_epochs": 252000,
"opt_initial_learning_rate": 1e-4,
"opt_learning_rate_decay_boundary_epochs": [],
"opt_learning_rate_decay_factor": 1.0,
"opt_weight_decay": 0.0
},
"Epochs to converge": [
413280, 537600, 463680, 325920, 423360, 339360, 463680, 551040, 366240, 332640,
376320, 588000, 339360, 621600, 383040, 389760, 369600, 409920, 504000, 584640,
362880, 443520, 356160, 571200, 359520, 359520, 393120, 540960, 278880, 399840,
742560, 460320, 446880, 262080, 433440, 369600, 356160, 477120, 433440, 356160,
487200, 440160, 460320, 416640, 275520, 393120, 399840, 567840, 389760, 500640,
470400, 614880, 409920, 393120, 393120, 530880, 403200, 453600, 409920, 396480,
544320, 393120, 490560, 413280, 500640, 332640, 332640, 423360, 443520, 302400,
332640, 403200, 315840, 433440, 379680, 366240, 463680, 430080, 315840, 420000,
504000, 285600, 386400, 362880, 477120, 409920, 336000, 315840, 376320, 409920,
517440, 305760, 366240, 433440, 359520, 416640, 456960
]
}
}
20 changes: 18 additions & 2 deletions mlperf_logging/result_summarizer/result_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,12 +374,12 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset):
if has_power:
unsorted_scores = scores.copy()

scaling_factor = _get_scaling_factor(benchmark_folder)
if dropped_scores <= max_dropped_scores:
olympic_avg = _compute_olympic_average(
scores, dropped_scores, max_dropped_scores)
if olympic_avg is not None:
benchmark_scores[benchmark] = olympic_avg
scaling_factor = _get_scaling_factor(benchmark_folder)
benchmark_scores[benchmark] *= scaling_factor

if has_power and dropped_scores <= max_dropped_scores:
Expand All @@ -388,6 +388,7 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset):
power_scores, index, dropped_scores, max_dropped_scores)
if olympic_avg is not None:
benchmark_power_scores[benchmark] = olympic_avg
benchmark_power_scores[benchmark] *= scaling_factor
_fill_empty_benchmark_scores(benchmark_scores, usage, ruleset)
if len(benchmark_power_scores) > 0:
_fill_empty_benchmark_scores(benchmark_power_scores, usage, ruleset)
Expand Down Expand Up @@ -695,6 +696,10 @@ def _check_and_update_system_specs(desc_keys, column_name, query=None):
urls.items(),
):
power_summary.push(column_name, value)
if column_name in strong_scaling_scores:
power_summary.push(column_name, strong_scaling_scores[column_name])
else:
power_summary.push(column_name, value)
if usage == 'hpc' and len(power_scores_weak_scaling) > 0:
for column_name, value in itertools.chain(
system_specs.items(),
Expand Down Expand Up @@ -818,6 +823,13 @@ def _update_summaries(folder):
def _map_columns_index(column, config):
map_ = config["columns"][args.usage][args.ruleset]
return tuple(map_.get(column, map_.get("default") + [column]))

def agg_columns_fn(df, benchmarks):
    """Collapse a per-system group into one row of (benchmark, metric) pairs.

    For each benchmark column, the first row of the group is taken as the
    "perf" score and the last row as the "power" score, yielding a Series
    keyed by (benchmark, metric) tuples suitable for a two-level header.

    Args:
        df: Group of summary rows (pd.DataFrame) sharing system specs.
        benchmarks: Iterable of benchmark column names present in *df*.

    Returns:
        pd.Series keyed by (benchmark, "perf"/"power") tuples.
    """
    return pd.Series({
        (model, metric): df[model].iloc[row]
        for model in benchmarks
        for metric, row in (("perf", 0), ("power", -1))
    })

def _summaries_to_xlsx(summaries: pd.DataFrame, path, version):
config_path = os.path.join(os.path.dirname(__file__), "config.yaml")
Expand Down Expand Up @@ -920,8 +932,12 @@ def _print_and_write(summaries, weak_scaling=False, mode='w', power = False):
csv = args.csv
assert csv.endswith(".csv")
if power:
benchmarks = get_allowed_benchmarks(args.usage, args.ruleset)
specs_and_notes = [c for c in summaries.columns if c not in benchmarks]
csv = csv.replace(".csv", "_power.csv")
summaries.to_csv(csv, index=False, mode=mode)
summaries.groupby(specs_and_notes).apply(lambda x: agg_columns_fn(x, benchmarks)).to_csv(csv, mode=mode)
else:
summaries.to_csv(csv, index=False, mode=mode)

if args.xlsx is not None:
_summaries_to_xlsx(summaries, args.xlsx, args.ruleset[:3])
Expand Down

0 comments on commit 19ce6fb

Please sign in to comment.