forked from SAITPublic/MLPerf_Training_v2.0
mlperf_logger.py
import collections
import os
import subprocess

import torch

from mlperf_logging import mllog
from mlperf_logging.mllog import constants

mllogger = mllog.get_mllogger()

def log_start(*args, **kwargs):
    _log_print(mllogger.start, *args, **kwargs)


def log_end(*args, **kwargs):
    _log_print(mllogger.end, *args, **kwargs)


def log_event(*args, **kwargs):
    _log_print(mllogger.event, *args, **kwargs)

def _log_print(logger, *args, **kwargs):
    """
    Shared implementation for log_start/log_end/log_event.

    Pops the wrapper-only kwargs: 'sync' (call barrier() first so ranks are
    aligned) and 'log_all_ranks' (log from every rank instead of rank 0 only).
    Defaults 'stack_offset' to 3 so the reported call site is the original
    caller rather than these wrappers, and 'value' to None.
    """
    if kwargs.pop('sync', False):
        barrier()
    if 'stack_offset' not in kwargs:
        kwargs['stack_offset'] = 3
    if 'value' not in kwargs:
        kwargs['value'] = None

    if kwargs.pop('log_all_ranks', False):
        log = True
    else:
        log = (get_rank() == 0)

    if log:
        logger(*args, **kwargs)

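# A minimal usage sketch (hypothetical call site, not part of this module):
# pair log_start/log_end around a unit of work. sync=True runs barrier()
# first, so every rank reaches this point before rank 0 emits the record;
# log_all_ranks=True would make every rank log instead of rank 0 only.
#
#     log_start(key=constants.EPOCH_START, metadata={'epoch_num': 1}, sync=True)
#     run_training_epoch()  # assumed training function, not defined here
#     log_end(key=constants.EPOCH_STOP, metadata={'epoch_num': 1}, sync=True)
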
def barrier():
    """
    Works as a temporary distributed barrier; at the time this was written,
    PyTorch did not implement barrier() for the NCCL backend.
    Calls all_reduce on a dummy tensor and synchronizes with the GPU.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
        torch.cuda.synchronize()

def get_rank():
    """
    Gets distributed rank or returns zero if distributed is not initialized.
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    else:
        rank = 0
    return rank

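# A minimal sketch of the process-group setup these helpers expect
# (assumption: a torchrun/Slurm-style launcher has already exported RANK,
# WORLD_SIZE, MASTER_ADDR and MASTER_PORT). Without an initialized process
# group, barrier() is a no-op and get_rank() returns 0:
#
#     torch.distributed.init_process_group(backend='nccl')
#     # 'worker_rank' below is an illustrative key, not an MLPerf constant
#     log_event(key='worker_rank', value=get_rank(), log_all_ranks=True)
#     barrier()
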
def mlperf_submission_log(benchmark):
    num_nodes = os.environ.get('TASK_NNODES', 1)

    mllog.config(filename=os.path.join(
        os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log'))
    mllogger = mllog.get_mllogger()
    mllogger.logger.propagate = False

    log_event(
        key=constants.SUBMISSION_BENCHMARK,
        value=benchmark)

    log_event(
        key=constants.SUBMISSION_ORG,
        value='SAMSUNG')

    log_event(
        key=constants.SUBMISSION_DIVISION,
        value='open')

    log_event(
        key=constants.SUBMISSION_STATUS,
        value='onprem')

    log_event(
        key=constants.SUBMISSION_PLATFORM,
        value=f'{num_nodes}xNVIDIA DGX A100')

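# A minimal, self-contained smoke test of the submission logging, included
# here only as a sketch: the benchmark name 'bert' and the TASK_NNODES value
# are placeholders chosen for illustration, not values prescribed by this
# module or the MLPerf harness.
if __name__ == '__main__':
    os.environ.setdefault('TASK_NNODES', '2')
    mlperf_submission_log('bert')
    log_event(key=constants.CACHE_CLEAR, value=True)
    log_start(key=constants.INIT_START)
    log_end(key=constants.INIT_STOP)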