Skip to content

Commit

Permalink
Add MLPerf logging (#831)
Browse files Browse the repository at this point in the history
Adds an experimental logger to create MLPerf-compliant submission files
  • Loading branch information
hanlint authored and ravi-mosaicml committed May 3, 2022
1 parent 5280f9c commit 02a5414
Show file tree
Hide file tree
Showing 12 changed files with 648 additions and 18 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ test-dist-gpu:
clean-notebooks:
$(PYTHON) scripts/clean_notebooks.py -i notebooks/*.ipynb

.PHONY: test test-gpu test-dist test-dist-gpu lint style clean-notebooks
.PHONY: test test-gpu test-dist test-dist-gpu clean-notebooks
6 changes: 5 additions & 1 deletion composer/callbacks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
examples for writing your own callbacks at the :class:`~composer.core.callback.Callback` base class.
"""
from composer.callbacks.callback_hparams import (CallbackHparams, CheckpointSaverHparams, GradMonitorHparams,
LRMonitorHparams, MemoryMonitorHparams, SpeedMonitorHparams)
LRMonitorHparams, MemoryMonitorHparams, MLPerfCallbackHparams,
SpeedMonitorHparams)
from composer.callbacks.checkpoint_saver import CheckpointSaver
from composer.callbacks.grad_monitor import GradMonitor
from composer.callbacks.lr_monitor import LRMonitor
from composer.callbacks.memory_monitor import MemoryMonitor
from composer.callbacks.mlperf import MLPerfCallback
from composer.callbacks.speed_monitor import SpeedMonitor

__all__ = [
Expand All @@ -19,11 +21,13 @@
"MemoryMonitor",
"SpeedMonitor",
"CheckpointSaver",
"MLPerfCallback",
# hparams objects
"CallbackHparams",
"CheckpointSaverHparams",
"GradMonitorHparams",
"LRMonitorHparams",
"MemoryMonitorHparams",
"SpeedMonitorHparams",
"MLPerfCallbackHparams",
]
62 changes: 59 additions & 3 deletions composer/callbacks/callback_hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import abc
import textwrap
from dataclasses import dataclass
from dataclasses import asdict, dataclass
from typing import Optional

import yahp as hp
Expand All @@ -14,6 +14,7 @@
from composer.callbacks.grad_monitor import GradMonitor
from composer.callbacks.lr_monitor import LRMonitor
from composer.callbacks.memory_monitor import MemoryMonitor
from composer.callbacks.mlperf import MLPerfCallback
from composer.callbacks.speed_monitor import SpeedMonitor
from composer.core.callback import Callback
from composer.core.time import Time
Expand Down Expand Up @@ -48,7 +49,7 @@ class GradMonitorHparams(CallbackHparams):
""":class:`~.GradMonitor` hyperparamters.
Args:
log_layer_grad_norms (bool, optional):
log_layer_grad_norms (bool, optional):
See :class:`~.GradMonitor` for documentation.
"""

Expand Down Expand Up @@ -119,10 +120,65 @@ def initialize_object(self) -> SpeedMonitor:
return SpeedMonitor(window_size=self.window_size)


@dataclass
class MLPerfCallbackHparams(CallbackHparams):
    """:class:`~.MLPerfCallback` hyperparameters.

    Every field declared here is forwarded, by name, as a keyword argument to
    :class:`~.MLPerfCallback` when :meth:`initialize_object` is called.

    Args:
        root_folder (str): The root submission folder
        index (int): The repetition index of this run. The filename created will be
            ``result_[index].txt``.
        benchmark (str, optional): Benchmark name. Currently only ``resnet`` supported.
        target (float, optional): The target metric before the mllogger marks the stop
            of the timing run. Default: ``0.759`` (resnet benchmark).
        division (str, optional): Division of submission. Currently only ``open`` division supported.
        metric_name (str, optional): name of the metric to compare against the target. Default: ``Accuracy``.
        metric_label (str, optional): label name. The metric will be accessed via
            ``state.current_metrics[metric_label][metric_name]``. Default: ``eval``.
        submitter (str, optional): Submitting organization. Default: MosaicML.
        system_name (str, optional): Name of the system (e.g. 8xA100_composer). If
            not provided, system name will default to ``[world_size]x[device_name]_composer``,
            e.g. ``8xNVIDIA_A100_80GB_composer``.
        status (str, optional): Submission status. One of (onprem, cloud, or preview).
            Default: ``"onprem"``.
        cache_clear_cmd (str, optional): Command to invoke during the cache clear. This callback
            will call ``subprocess(cache_clear_cmd)``. Default is disabled (None)
    """

    root_folder: str = hp.required("The root submission folder.")
    index: int = hp.required("The repetition index of this run.")
    benchmark: str = hp.optional("Benchmark name. Default: resnet", default="resnet")
    target: float = hp.optional("The target metric before mllogger marks run_stop. Default: 0.759 (resnet)",
                                default=0.759)
    # Adjacent string literals are concatenated verbatim, so the first one must
    # end with a space; otherwise the help text reads "divisionis supported".
    division: Optional[str] = hp.optional(
        "Division of submission. Currently only open division "
        "is supported. Default: open",
        default="open")
    metric_name: str = hp.optional('name of the metric to compare against the target. Default: Accuracy',
                                   default='Accuracy')
    metric_label: str = hp.optional(
        'label name. The metric will be accessed via state.current_metrics[metric_label][metric_name]. Default: eval',
        default='eval')
    submitter: str = hp.optional("Submitting organization. Default: MosaicML", default='MosaicML')
    system_name: Optional[str] = hp.optional("Name of the system, defaults to [world_size]x[device_name]", default=None)
    status: str = hp.optional("Submission status. Default: onprem", default="onprem")
    cache_clear_cmd: Optional[str] = hp.optional(
        "Command to invoke during the cache clear. This callback will call subprocess(cache_clear_cmd). Default: Disabled.",
        default=None,
    )

    def initialize_object(self) -> MLPerfCallback:
        """Initialize the MLPerf Callback.

        Returns:
            MLPerfCallback: An instance of :class:`~.MLPerfCallback`
        """
        # The dataclass field names double as the callback's keyword arguments,
        # so the whole hparams object is unpacked straight into the constructor.
        return MLPerfCallback(**asdict(self))


@dataclass
class CheckpointSaverHparams(CallbackHparams):
""":class:`~.CheckpointSaver` hyperparameters.
Args:
save_folder (str, optional): See :class:`~.CheckpointSaver`.
filename (str, optional): See :class:`~.CheckpointSaver`.
Expand Down
Loading

0 comments on commit 02a5414

Please sign in to comment.