From cec62ff6107f4e63b12cdeb900f3cd9c522594c9 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 10 Dec 2024 23:01:57 +0530 Subject: [PATCH 01/14] Update backend_pytorch.py | Fix lock usage (#1964) Co-authored-by: Miro --- text_to_image/backend_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/text_to_image/backend_pytorch.py b/text_to_image/backend_pytorch.py index 027e15565..8e52e0a61 100644 --- a/text_to_image/backend_pytorch.py +++ b/text_to_image/backend_pytorch.py @@ -387,7 +387,7 @@ def predict(self, inputs): pooled_prompt_embeds, negative_pooled_prompt_embeds, ) = self.prepare_inputs(inputs, i) - with lock: + with self.lock: generated = self.pipe( prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, From bc5e6fbeb9a47210cd47039ec12462140525ccb4 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 11 Dec 2024 17:23:47 -0500 Subject: [PATCH 02/14] Add llama3 metrics + remove llama3-99.9 (#1973) --- language/llama3-405b/README.md | 11 +++- tools/submission/submission_checker.py | 69 +++++++------------------- 2 files changed, 28 insertions(+), 52 deletions(-) diff --git a/language/llama3-405b/README.md b/language/llama3-405b/README.md index 8c04c202e..dcc5344c4 100644 --- a/language/llama3-405b/README.md +++ b/language/llama3-405b/README.md @@ -193,5 +193,12 @@ The ServerSUT was not tested for GPU runs. ## Accuracy Target -Running the GPU implementation in FP16 precision resulted in the following FP16 accuracy targets (normalized to a 0-100 -scale from a 0.0-1.0 scale): +Running the GPU implementation in FP16 precision resulted in the following FP16 accuracy targets: +``` +{ + 'rougeL': 21.6666, + 'exact_match': 90.1335, + 'tokens_per_sample': 684.68, +} +``` + diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 51f8c7aab..37cd3a0f8 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -271,8 +271,7 @@ "llama2-70b-99.9", "stable-diffusion-xl", "mixtral-8x7b", - "llama3-405b-99", - "llama3-405b-99.9", + "llama3-405b", "rgat", # TODO: add automotive ], @@ -289,8 +288,7 @@ "llama2-70b-99.9": ["Server", "Offline"], "stable-diffusion-xl": ["Server", "Offline"], "mixtral-8x7b": ["Server", "Offline"], - "llama3-405b-99": ["Server", "Offline"], - "llama3-405b-99.9": ["Server", "Offline"], + "llama3-405b": ["Server", "Offline"], "rgat": ["Offline"], }, "optional-scenarios-datacenter": {}, @@ -320,8 +318,7 @@ "llama2-70b-99.9": ["Server", "Offline"], "stable-diffusion-xl": ["SingleStream", "Offline", "Server"], "mixtral-8x7b": ["Server", "Offline"], - "llama3-405b-99": ["Server", "Offline"], - "llama3-405b-99.9": ["Server", "Offline"], + "llama3-405b": ["Server", "Offline"], "rgat": ["Offline"], }, "optional-scenarios-datacenter-edge": {}, @@ -395,22 +392,13 @@ "mbxp_accuracy", 60.12 * 0.99, ), - # TODO: Get llama3 metrics - "llama3-405b-99": ( + "llama3-405b": ( "ROUGEL", - 1 * 0.99, + 21.6666 * 0.99, "exact_match", - 1 * 0.99, + 90.1335 * 0.99, "TOKENS_PER_SAMPLE", - 1000 * 0.9, - ), - "llama3-405b-99.9": ( - "ROUGEL", - 1 * 0.99, - "exact_match", - 1 * 0.99, - "TOKENS_PER_SAMPLE", - 20000 * 0.9, + 684.68 * 0.9, ), "rgat": ("acc", 0.7286 * 0.99), }, @@ -424,8 +412,7 @@ "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1), - "llama3-405b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), - "llama3-405b-99.9": ("TOKENS_PER_SAMPLE", 20000 * 1.1), + 
"llama3-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1), }, "accuracy-delta-perc": { "stable-diffusion-xl": {"CLIP_SCORE": 1, "FID_SCORE": 2} @@ -445,8 +432,7 @@ "llama2-70b-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, - "llama3-405b-99": 8312, - "llama3-405b-99.9": 8312, + "llama3-405b": 8312, "rgat": 788379 }, @@ -510,8 +496,7 @@ "Offline": 1, }, "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "llama3-405b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "llama3-405b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "llama3-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "rgat": {"SingleStream": 1024, "Server": 270336, "Offline": 1} }, }, @@ -600,7 +585,7 @@ "llama2-70b-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, - "llama3-405b-99": 8312, + "llama3-405b": 8312, "llama2-405b-99.9": 8312, "rgat": 788379, } @@ -680,11 +665,7 @@ "Offline": "result_tokens_per_second", "Server": "result_completed_tokens_per_second", }, - "llama3-405b-99": { - "Offline": "result_tokens_per_second", - "Server": "result_completed_tokens_per_second", - }, - "llama3-405b-99.9": { + "llama3-405b": { "Offline": "result_tokens_per_second", "Server": "result_completed_tokens_per_second", }, @@ -699,10 +680,7 @@ "conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000} }, "mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}}, - "llama3-405b-99": { - "conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000} - }, - "llama3-405b-99.9": { + "llama3-405b": { "conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000} }, } @@ -986,8 +964,7 @@ def requires_equal_issue(self, model, division): "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b-99", - "llama3-405b-99.9", + "llama3-405b", ] and self.version not in ["v4.0", "v4.1"] ) @@ -1355,7 +1332,7 @@ def check_performance_dir( ) if model in ["llama2-70b-99", "llama2-70b-99.9", - "mixtral-8x7b", "llama3-405b-99", "llama3-405b-99.9"]: + "mixtral-8x7b", "llama3-405b"]: llama_constraint, is_valid = extra_check_llm( mlperf_log, scenario_fixed, model) @@ -1895,13 +1872,7 @@ def log_result( "Offline": "Tokens/s", "Server": "Tokens/s", }, - "llama3-405b-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", - "Offline": "Tokens/s", - "Server": "Tokens/s", - }, - "llama3-405b-99.9": { + "llama3-405b": { "SingleStream": "Latency (ms)", "MultiStream": "Latency (ms)", "Offline": "Tokens/s", @@ -2986,8 +2957,7 @@ def check_compliance_dir( "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b-99", - "llama3-405b-99.9", + "llama3-405b", "rgat", ]: test_list.remove("TEST04") @@ -3008,8 +2978,7 @@ def check_compliance_dir( "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b-99", - "llama3-405b-99.9", + "llama3-405b", ]: test_list.remove("TEST01") @@ -3018,7 +2987,7 @@ def check_compliance_dir( test_list.remove("TEST04") if model in ["llama2-70b-99", "llama2-70b-99.9", - "mixtral-8x7b", "llama3-405b-99", "llama3-405b-99.9"]: + "mixtral-8x7b", "llama3-405b"]: test_list.append("TEST06") if test_list and not os.path.exists(compliance_dir): From 8948934e2485e4f82d1712661167bcce64eadcec Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 12 Dec 2024 20:15:56 +0530 Subject: [PATCH 03/14] Fix submission checker for v5.0 rgat (#1974) * Fix submission checker for v5.0 rgat * Update submission_checker.py | Updates for v5.0 * [Automated Commit] Format Codebase * Update 
submission_checker.py | Fixes latency_constraints for v5.0 * [Automated Commit] Format Codebase --------- Co-authored-by: mlcommons-bot --- tools/submission/submission_checker.py | 46 +++++++++++--------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 37cd3a0f8..35b70a4e7 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1,4 +1,4 @@ -"""A checker for MLPerf Inference submissions from v4.0 onwards (for checking older submissions please use the submission checker from the respective release) +"""A checker for MLPerf Inference submissions from v4.1 onwards (for checking older submissions please use the submission checker from the respective release) """ from __future__ import division @@ -196,13 +196,11 @@ "resnet50": "resnet", }, "seeds": { - # TODO: Update random seeds "qsl_rng_seed": 3066443479025735752, "sample_index_rng_seed": 10688027786191513374, "schedule_rng_seed": 14962580496156340209, }, "test05_seeds": { - # TODO: Update random seeds "qsl_rng_seed": 16799458546791641818, "sample_index_rng_seed": 5453809927556429288, "schedule_rng_seed": 5435552105434836064, @@ -220,8 +218,7 @@ "llama2-70b-99": {"Server": 20000000000}, "llama2-70b-99.9": {"Server": 20000000000}, "stable-diffusion-xl": {"Server": 20000000000}, - # TODO: Mixtral metrics - # "mixtral-8x7b" : {"Server": 20000000000} + "mixtral-8x7b": {"Server": 20000000000} }, "min-queries": { "resnet": { @@ -260,7 +257,6 @@ "retinanet", "bert-99", "bert-99.9", - # TODO: remove dlrm? "dlrm-v2-99", "dlrm-v2-99.9", "3d-unet-99", @@ -273,7 +269,7 @@ "mixtral-8x7b", "llama3-405b", "rgat", - # TODO: add automotive + # TODO: add automotive? ], "required-scenarios-datacenter": { "resnet": ["Server", "Offline"], @@ -296,6 +292,7 @@ "resnet": ["SingleStream", "MultiStream", "Offline"], "retinanet": ["SingleStream", "MultiStream", "Offline"], "bert-99": ["SingleStream", "Offline"], + "bert-99.9": ["SingleStream", "Offline"], "3d-unet-99": ["SingleStream", "Offline"], "3d-unet-99.9": ["SingleStream", "Offline"], "gptj-99": ["SingleStream", "Offline"], @@ -306,8 +303,8 @@ "required-scenarios-datacenter-edge": { "resnet": ["SingleStream", "Offline", "MultiStream", "Server"], "retinanet": ["SingleStream", "Offline", "MultiStream", "Server"], - "bert-99": ["SingleStream", "Offline", "Server"], - "bert-99.9": ["Offline", "Server"], + "bert-99": ["SingleStream", "Offline"], + "bert-99.9": ["SingleStream", "Offline"], "dlrm-v2-99": ["Offline", "Server"], "dlrm-v2-99.9": ["Offline", "Server"], "3d-unet-99": ["SingleStream", "Offline"], @@ -436,10 +433,11 @@ "rgat": 788379 }, - # TODO: Update this list. 
+ # model_mapping.json is expected in the root directory of the + # submission folder for open submissions and so the below dictionary is + # not really needed "model_mapping": { # map model names to the official mlperf model class - "ssd-resnet34": "retinanet", "mobilenet": "resnet", "resnet50": "resnet", }, @@ -449,23 +447,19 @@ "sample_index_rng_seed": 10688027786191513374, "schedule_rng_seed": 14962580496156340209, }, - "test05_seeds": { - # TODO: Update random seeds - "qsl_rng_seed": 16799458546791641818, - "sample_index_rng_seed": 5453809927556429288, - "schedule_rng_seed": 5435552105434836064, - }, "ignore_errors": [], "latency-constraint": { "resnet": {"Server": 15000000}, "retinanet": {"Server": 100000000}, - "bert-99": {"Server": 130000000}, - "bert-99.9": {"Server": 130000000}, "dlrm-v2-99": {"Server": 60000000}, "dlrm-v2-99.9": {"Server": 60000000}, "gptj-99": {"Server": 20000000000}, "gptj-99.9": {"Server": 20000000000}, "stable-diffusion-xl": {"Server": 20000000000}, + "llama2-70b-99": {"Server": 20000000000}, + "llama2-70b-99.9": {"Server": 20000000000}, + "mixtral-8x7b": {"Server": 20000000000}, + "llama3-405b": {"Server": 60000000000} }, "min-queries": { "resnet": { @@ -480,8 +474,8 @@ "Server": 270336, "Offline": 1, }, - "bert-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "bert-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "bert-99": {"SingleStream": 1024, "Offline": 1}, + "bert-99.9": {"SingleStream": 1024, "Offline": 1}, "dlrm-v2-99": {"Server": 270336, "Offline": 1}, "dlrm-v2-99.9": {"Server": 270336, "Offline": 1}, "3d-unet-99": {"SingleStream": 1024, "Offline": 1}, @@ -497,7 +491,7 @@ }, "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "llama3-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "rgat": {"SingleStream": 1024, "Server": 270336, "Offline": 1} + "rgat": {"SingleStream": 1024, "Offline": 1} }, }, } @@ -605,17 +599,15 @@ } RESULT_FIELD_NEW = { - "v4.0": { + "v4.1": { "Offline": "result_samples_per_second", "SingleStream": "early_stopping_latency_ss", - "MultiStreamLegacy": "effective_samples_per_query", "MultiStream": "early_stopping_latency_ms", - "Server": "result_scheduled_samples_per_sec", + "Server": "result_completed_samples_per_sec", }, - "v4.1": { + "v5.0": { "Offline": "result_samples_per_second", "SingleStream": "early_stopping_latency_ss", - "MultiStreamLegacy": "effective_samples_per_query", "MultiStream": "early_stopping_latency_ms", "Server": "result_completed_samples_per_sec", }, From 2d2eb3081dcda64e766a28dc1b6fb9c40fb9eefa Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 13 Dec 2024 00:43:35 +0530 Subject: [PATCH 04/14] Fix test05 seeds missing error for v5.0 submission checker (#1976) --- tools/submission/submission_checker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 35b70a4e7..a5165b381 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -827,7 +827,8 @@ def __init__( self.version = version self.models = self.base["models"] self.seeds = self.base["seeds"] - self.test05_seeds = self.base["test05_seeds"] + if self.base.get("test05_seeds"): + self.test05_seeds = self.base["test05_seeds"] self.accuracy_target = self.base["accuracy-target"] self.accuracy_delta_perc = self.base["accuracy-delta-perc"] self.accuracy_upper_limit = self.base.get("accuracy-upper-limit", {}) @@ -968,7 +969,7 @@ def get_args(): 
parser.add_argument("--input", required=True, help="submission directory") parser.add_argument( "--version", - default="v4.1", + default="v5.0", choices=list(MODEL_CONFIG.keys()), help="mlperf version", ) From fa274563d74b555f0fb8cd6fe5d11a24e83684ce Mon Sep 17 00:00:00 2001 From: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:38:37 -0800 Subject: [PATCH 05/14] Fix llama3-405B docker workflow and performance sample count (#1978) * Fix llama3-405B docker workflow * Fix the performance sample count from 8312 to 8313 * More fixes --- .gitignore | 1 + language/llama3-405b/README.md | 81 ++++++++++++++----- .../{launch.sh => launch_docker.sh} | 0 language/llama3-405b/main.py | 2 +- language/llama3-405b/run_accuracy.sh | 5 +- language/llama3-405b/run_offline.sh | 17 ++-- language/llama3-405b/run_server.sh | 17 ++-- language/llama3-405b/with_the_same_user | 27 +++++++ loadgen/mlperf.conf | 8 +- tools/submission/submission_checker.py | 5 +- 10 files changed, 116 insertions(+), 47 deletions(-) rename language/llama3-405b/{launch.sh => launch_docker.sh} (100%) create mode 100755 language/llama3-405b/with_the_same_user diff --git a/.gitignore b/.gitignore index 9545a7977..eba8bb341 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ loadgen/build/ libmlperf_loadgen.a __pycache__/ generated/ +*.swp diff --git a/language/llama3-405b/README.md b/language/llama3-405b/README.md index dcc5344c4..8df2a81f1 100644 --- a/language/llama3-405b/README.md +++ b/language/llama3-405b/README.md @@ -9,34 +9,64 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3-405b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. - + ## Prepare environment -Copy the mlperf.conf file to this folder. -``` -cp ../../mlperf.conf . +### Local Environment Run + +The following steps were tested in Ubuntu 22.04 with python 3.10 + +- **Prerrequisite for GPU runs:** Install Nvidia Driver and cuda 12.1. + +The following links contain the commands for installing the [NVIDIA Driver](https://developer.nvidia.com/datacenter-driver-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local) and [Cuda](https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local) + +- **Prerrequisite:** Install conda. 
+ +```bash +mkdir -p ~/miniconda3 +wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh +bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 +rm ~/miniconda3/miniconda.sh +~/miniconda3/bin/conda init ``` -For a CPU-only run: +- Set the following helper variables +```bash +export ROOT=$PWD/inference +export LLAMA_FOLDER=$PWD/inference/language/llama3-405b +export LOADGEN_FOLDER=$PWD/inference/loadgen +export DATASET_FOLDER=$PWD/inference/language/llama3-405b/dataset +``` +- Clone the inference repository: +```bash +git clone --recurse-submodules https://github.com/mlcommons/inference.git \ + --depth 1 ``` -conda create -n llama3-405b python=3.9 + +- Create a conda environment: +```bash +conda create -y -n llama3-405b python=3.10 conda activate llama3-405b +conda install -y -c conda-forge libstdcxx-ng=12 +``` +- Install requirements and loadgen: +```bash +cd $LLAMA_FOLDER # Install packages pip install -r requirements.txt +``` -export CUR_DIR=${PWD} -cd /loadgen - - -python -m pip install . +```bash +cd $LOADGEN_FOLDER +pip install -e . ``` -For a GPU-based run: +### Docker Run A dockerfile is provided, along with scripts to help launch it. First, add any docker volume mounts you want in -`launch.sh`. There is a section at the top of the file that looks like: +`launch_docker.sh`. There is a section at the top of the file that looks like: ``` # Add any volume mounts here with the following syntax # /path/to/src:/path/to/dir/in/container @@ -54,10 +84,13 @@ MOUNTS=( /raid/data:/raid/data ) ``` -Once you have added all your mounts, launch the container with `bash launch.sh`. +Once you have added all your mounts, build and launch the container with `bash launch.sh`. -Inside the container, set up the environment with `bash build.sh`. This will install all the dependencies from the -CPU-only setup, as well as any GPU versions for applicable libraries like PyTorch. +Now install all the dependencies: +``` +pip install -r requirements.txt +pip install -e ../../loadgen +``` ## Get Model @@ -73,7 +106,7 @@ TODO: Host model and grant access to submitters export CHECKPOINT_PATH=Meta-Llama-3.1-405B-Instruct git lfs install git clone https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct ${CHECKPOINT_PATH} - +cd ${CHECKPOINT_PATH} && git checkout be673f326cab4cd22ccfef76109faf68e41aa5f1 ``` ## Get Dataset @@ -109,9 +142,10 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/llama3_405b/mlperf_llama ``` python -u main.py --scenario Offline \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --dtype float16 \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir output \ --tensor-parallel-size ${GPU_COUNT} \ @@ -123,9 +157,10 @@ python -u main.py --scenario Offline \ ``` python -u main.py --scenario Server \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --dtype float16 \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir output \ --tensor-parallel-size ${GPU_COUNT} \ @@ -145,10 +180,11 @@ mkdir -p "run_outputs" # The script will dump all the outputs to 'run_outputs'. 
python -u main.py --scenario Offline \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --accuracy \ --dtype float16 \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir output \ --tensor-parallel-size ${GPU_COUNT} \ @@ -172,10 +208,11 @@ OUTPUT_LOG_DIR=server-accuracy-logs python -u main.py --scenario Server \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --accuracy \ --dtype float16 \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir output \ --tensor-parallel-size ${GPU_COUNT} \ @@ -201,4 +238,4 @@ Running the GPU implementation in FP16 precision resulted in the following FP16 'tokens_per_sample': 684.68, } ``` - +The accuracy target is 99% for rougeL and exact_match, and 90% for tokens_per_sample diff --git a/language/llama3-405b/launch.sh b/language/llama3-405b/launch_docker.sh similarity index 100% rename from language/llama3-405b/launch.sh rename to language/llama3-405b/launch_docker.sh diff --git a/language/llama3-405b/main.py b/language/llama3-405b/main.py index 26d5726b3..f7802687e 100644 --- a/language/llama3-405b/main.py +++ b/language/llama3-405b/main.py @@ -77,7 +77,7 @@ def get_args(): parser.add_argument( "--total-sample-count", type=int, - default=8312, + default=8313, help="Number of samples to use in benchmark.", ) parser.add_argument( diff --git a/language/llama3-405b/run_accuracy.sh b/language/llama3-405b/run_accuracy.sh index 075245913..9a54d8f13 100644 --- a/language/llama3-405b/run_accuracy.sh +++ b/language/llama3-405b/run_accuracy.sh @@ -5,10 +5,11 @@ mkdir -p "run_outputs" python3 -u main.py --scenario Offline \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --accuracy \ --mlperf-conf mlperf.conf \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir offline_accuracy_loadgen_logs \ --dtype float32 | tee offline_accuracy_log.log @@ -17,5 +18,3 @@ python3 evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \ --mlperf-accuracy-file offline_accuracy_loadgen_logs/mlperf_log_accuracy.json \ --dataset-file ${DATASET_PATH} \ --dtype int32 - -python3 consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH} diff --git a/language/llama3-405b/run_offline.sh b/language/llama3-405b/run_offline.sh index 89fa9e45f..6b3a56e01 100644 --- a/language/llama3-405b/run_offline.sh +++ b/language/llama3-405b/run_offline.sh @@ -1,10 +1,13 @@ CHECKPOINT_PATH="${CHECKPOINT_PATH:Meta-Llama-3.1-405B-Instruct}" -DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}" +DATASET_PATH="${DATASET_PATH:mlperf_llama3.1_405b_dataset_8318.pkl}" python -u main.py --scenario Offline \ - --model-path ${CHECKPOINT_PATH} \ - --mlperf-conf mlperf.conf \ - --user-conf user.conf \ - --total-sample-count 8312 \ - --dataset-path ${DATASET_PATH} \ - --device cpu 2>&1 | tee server_log.log + --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ + --dtype float16 \ + --user-conf user.conf \ + --total-sample-count 8313 \ + --dataset-path ${DATASET_PATH} \ + --output-log-dir output \ + --tensor-parallel-size ${GPU_COUNT} \ + --vllm 2>&1 | tee offline.log diff --git a/language/llama3-405b/run_server.sh b/language/llama3-405b/run_server.sh index fe2a31c43..010a359de 100644 --- a/language/llama3-405b/run_server.sh +++ b/language/llama3-405b/run_server.sh @@ -1,12 +1,15 @@ 
CHECKPOINT_PATH="${CHECKPOINT_PATH:Meta-Llama-3.1-405B-Instruct}" -DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}" +DATASET_PATH="${DATASET_PATH:mlperf_llama3.1_405b_dataset_8318.pkl}" python -u main.py --scenario Server \ - --model-path ${CHECKPOINT_PATH} \ - --mlperf-conf mlperf.conf \ - --user-conf user.conf \ - --total-sample-count 8312 \ - --dataset-path ${DATASET_PATH} \ - --device cpu 2>&1 | tee server_log.log + --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ + --dtype float16 \ + --user-conf user.conf \ + --total-sample-count 8313 \ + --dataset-path ${DATASET_PATH} \ + --output-log-dir output \ + --tensor-parallel-size ${GPU_COUNT} \ + --vllm 2>&1 | tee server.log diff --git a/language/llama3-405b/with_the_same_user b/language/llama3-405b/with_the_same_user new file mode 100755 index 000000000..cfa57902f --- /dev/null +++ b/language/llama3-405b/with_the_same_user @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# wkong: manually set the user info in env first + +set -ex + +if [ -z "$@" ]; then + COMMAND=(bash) +else + COMMAND=("$@") +fi + +apt-get update && apt-get install -y sudo + +getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" --disabled-password --quiet "${CI_BUILD_USER}" + +usermod -a -G dip "${CI_BUILD_USER}" +usermod -a -G sudo "${CI_BUILD_USER}" +usermod -a -G root "${CI_BUILD_USER}" + +echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +sudo -H -u "#${CI_BUILD_UID}" --preserve-env \ + PATH="${PATH}" \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ + PYTHONPATH="${PYTHONPATH}" \ + ${COMMAND[@]} diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 31ad5ef62..1fe202253 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -14,7 +14,7 @@ dlrm-v2.*.performance_sample_count_override = 204800 rnnt.*.performance_sample_count_override = 2513 gptj.*.performance_sample_count_override = 13368 llama2-70b.*.performance_sample_count_override = 24576 -llama3-405b.*.performance_sample_count_override = 8312 +llama3-405b.*.performance_sample_count_override = 8313 stable-diffusion-xl.*.performance_sample_count_override = 5000 rgat.*.performance_sample_count_override = 788379 # set to 0 to let entire sample set to be performance sample @@ -84,8 +84,8 @@ llama3-405b.Server.tpot_latency = 175 *.Offline.min_duration = 600000 # In Offline scenario, we always have one query. But LoadGen maps this to -# min_sample_count internally in Offline scenario. If the dataset size is larger -# than 24576 we limit the min_query_count to 24576 and otherwise we use +# min_sample_count internally in Offline scenario. 
If the dataset size is larger +# than 24576 we limit the min_query_count to 24576 and otherwise we use # the dataset size as the limit resnet50.Offline.min_query_count = 24576 @@ -97,7 +97,7 @@ rnnt.Offline.min_query_count = 2513 3d-unet.Offline.min_query_count = 43 stable-diffusion-xl.Offline.min_query_count = 5000 llama2-70b.Offline.min_query_count = 24576 -llama3-405b.Offline.min_query_count = 8312 +llama3-405b.Offline.min_query_count = 8313 mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index a5165b381..43fa1350c 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -429,7 +429,7 @@ "llama2-70b-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, - "llama3-405b": 8312, + "llama3-405b": 8313, "rgat": 788379 }, @@ -579,8 +579,7 @@ "llama2-70b-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, - "llama3-405b": 8312, - "llama2-405b-99.9": 8312, + "llama3-405b": 8313, "rgat": 788379, } From 1e48c0dce32ab98b8c930b50e919e13f1fc8f57a Mon Sep 17 00:00:00 2001 From: mrmhodak Date: Fri, 13 Dec 2024 04:38:49 +0000 Subject: [PATCH 06/14] Increment version to 5.0.2 --- loadgen/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt index 6b244dcd6..a1ef0cae1 100644 --- a/loadgen/VERSION.txt +++ b/loadgen/VERSION.txt @@ -1 +1 @@ -5.0.1 +5.0.2 From c71b83f92f0cd2c3b2f07ffec9d3108bb227b712 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Fri, 13 Dec 2024 20:57:22 +0530 Subject: [PATCH 07/14] Fix submission generation for v5.0 (#1981) * Fix submission checker for v5.0 rgat * Fix accuracy pattern for rgat, report-generator for v5.0 --- tools/submission/generate_final_report.py | 99 +++++++++++++++-------- tools/submission/submission_checker.py | 2 +- 2 files changed, 67 insertions(+), 34 deletions(-) diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index 5e5d22c45..34ae82fb1 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -160,39 +160,72 @@ def main(): ], ] - filter_scenarios = { - "datacenter": { - "resnet": ["Server", "Offline"], - "retinanet": ["Server", "Offline"], - "rnnt": ["Server", "Offline"], - "bert-99": ["Server", "Offline"], - "bert-99.9": ["Server", "Offline"], - "dlrm-v2-99": ["Server", "Offline"], - "dlrm-v2-99.9": ["Server", "Offline"], - "3d-unet-99": ["Offline"], - "3d-unet-99.9": ["Offline"], - "gptj-99": ["Server", "Offline"], - "gptj-99.9": ["Server", "Offline"], - "stable-diffusion-xl": ["Server", "Offline"], - "llama2-70b-99": ["Server", "Offline"], - "llama2-70b-99.9": ["Server", "Offline"], - "mixtral-8x7b": ["Server", "Offline"], - }, - "edge": { - "resnet": ["SingleStream", "MultiStream", "Offline"], - "retinanet": ["SingleStream", "MultiStream", "Offline"], - "rnnt": ["SingleStream", "Offline"], - "bert-99": ["SingleStream", "Offline"], - "bert-99.9": [], - "dlrm-v2-99": [], - "dlrm-v2-99.9": [], - "3d-unet-99": ["SingleStream", "Offline"], - "3d-unet-99.9": ["SingleStream", "Offline"], - "gptj-99": ["SingleStream", "Offline"], - "gptj-99.9": ["SingleStream", "Offline"], - "stable-diffusion-xl": ["SingleStream", "Offline"], - }, - } + if args.version == "4.1": + filter_scenarios = { + "datacenter": { + "resnet": ["Server", "Offline"], + "retinanet": ["Server", "Offline"], + "rnnt": ["Server", "Offline"], + 
"bert-99": ["Server", "Offline"], + "bert-99.9": ["Server", "Offline"], + "dlrm-v2-99": ["Server", "Offline"], + "dlrm-v2-99.9": ["Server", "Offline"], + "3d-unet-99": ["Offline"], + "3d-unet-99.9": ["Offline"], + "gptj-99": ["Server", "Offline"], + "gptj-99.9": ["Server", "Offline"], + "stable-diffusion-xl": ["Server", "Offline"], + "llama2-70b-99": ["Server", "Offline"], + "llama2-70b-99.9": ["Server", "Offline"], + "mixtral-8x7b": ["Server", "Offline"], + }, + "edge": { + "resnet": ["SingleStream", "MultiStream", "Offline"], + "retinanet": ["SingleStream", "MultiStream", "Offline"], + "rnnt": ["SingleStream", "Offline"], + "bert-99": ["SingleStream", "Offline"], + "bert-99.9": [], + "dlrm-v2-99": [], + "dlrm-v2-99.9": [], + "3d-unet-99": ["SingleStream", "Offline"], + "3d-unet-99.9": ["SingleStream", "Offline"], + "gptj-99": ["SingleStream", "Offline"], + "gptj-99.9": ["SingleStream", "Offline"], + "stable-diffusion-xl": ["SingleStream", "Offline"], + }, + } + else: + filter_scenarios = { + "datacenter": { + "resnet": ["Server", "Offline"], + "retinanet": ["Server", "Offline"], + "rnnt": ["Server", "Offline"], + "dlrm-v2-99": ["Server", "Offline"], + "dlrm-v2-99.9": ["Server", "Offline"], + "3d-unet-99": ["Offline"], + "3d-unet-99.9": ["Offline"], + "gptj-99": ["Server", "Offline"], + "gptj-99.9": ["Server", "Offline"], + "stable-diffusion-xl": ["Server", "Offline"], + "llama2-70b-99": ["Server", "Offline"], + "llama2-70b-99.9": ["Server", "Offline"], + "mixtral-8x7b": ["Server", "Offline"], + "rgat": ["Offline"], + "llama3-405b": ["Offline", "Server"] + }, + "edge": { + "resnet": ["SingleStream", "MultiStream", "Offline"], + "retinanet": ["SingleStream", "MultiStream", "Offline"], + "rnnt": ["SingleStream", "Offline"], + "bert-99": ["SingleStream", "Offline"], + "bert-99.9": ["SingleStream", "Offline"], + "3d-unet-99": ["SingleStream", "Offline"], + "3d-unet-99.9": ["SingleStream", "Offline"], + "gptj-99": ["SingleStream", "Offline"], + "gptj-99.9": ["SingleStream", "Offline"], + "stable-diffusion-xl": ["SingleStream", "Offline"], + }, + } def MakeWorksheet(df, index, filter_dict, sheet_name, outjsondata=[]): for key, value in filter_dict.items(): diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 43fa1350c..4a463f304 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -677,7 +677,7 @@ } ACC_PATTERN = { - "acc": r"^accuracy=([\d\.]+).*", + "acc": r"^(?:\{\"accuracy|accuracy)[\": ]*=?\s*([\d\.]+).*", "AUC": r"^AUC=([\d\.]+).*", "mAP": r"^mAP=([\d\.]+).*", "bleu": r"^BLEU\:\s*([\d\.]+).*", From aebc0183310fa7f8dd4598d2d19985734c5113d3 Mon Sep 17 00:00:00 2001 From: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:27:19 -0800 Subject: [PATCH 08/14] More minor fixes for llama3.1-405b (#1983) * More minor fixes * Fix indentation for stats report --- language/llama3-405b/SUT_VLLM.py | 22 +++++++++++----------- language/llama3-405b/dataset.py | 2 +- language/llama3-405b/evaluate-accuracy.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/language/llama3-405b/SUT_VLLM.py b/language/llama3-405b/SUT_VLLM.py index e64999d09..f5a802021 100644 --- a/language/llama3-405b/SUT_VLLM.py +++ b/language/llama3-405b/SUT_VLLM.py @@ -31,7 +31,7 @@ def __init__( model_path=None, dtype="bfloat16", batch_size=None, - total_sample_count=8312, + total_sample_count=8313, dataset_path=None, use_cached_outputs=False, # Set this to True *only for test accuracy 
runs* in case your prior @@ -140,16 +140,16 @@ def process_queries(self): n_tokens)] lg.QuerySamplesComplete(response) - tok = time.time() + tok = time.time() - with self.sample_counter_lock: - self.sample_counter += len(qitem) - log.info(f"Samples run: {self.sample_counter}") - if tik1: - log.info(f"\tBatchMaker time: {tik2 - tik1}") - log.info(f"\tInference time: {tik3 - tik2}") - log.info(f"\tPostprocess time: {tok - tik3}") - log.info(f"\t==== Total time: {tok - tik1}") + with self.sample_counter_lock: + self.sample_counter += len(qitem) + log.info(f"Samples run: {self.sample_counter}") + if tik1: + log.info(f"\tBatchMaker time: {tik2 - tik1}") + log.info(f"\tInference time: {tik3 - tik2}") + log.info(f"\tPostprocess time: {tok - tik3}") + log.info(f"\t==== Total time: {tok - tik1}") def load_model(self): log.info("Loading model...") @@ -194,7 +194,7 @@ def __init__( self, model_path=None, dtype="bfloat16", - total_sample_count=8312, + total_sample_count=8313, dataset_path=None, batch_size=None, workers=1, diff --git a/language/llama3-405b/dataset.py b/language/llama3-405b/dataset.py index 04fe9c4b2..084f13208 100644 --- a/language/llama3-405b/dataset.py +++ b/language/llama3-405b/dataset.py @@ -24,7 +24,7 @@ class Dataset: def __init__( self, model_name=None, - total_sample_count=8312, + total_sample_count=8313, perf_count_override=None, dataset_path=None, dtype="bfloat16" diff --git a/language/llama3-405b/evaluate-accuracy.py b/language/llama3-405b/evaluate-accuracy.py index ccc87f71f..f5677820e 100644 --- a/language/llama3-405b/evaluate-accuracy.py +++ b/language/llama3-405b/evaluate-accuracy.py @@ -141,7 +141,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained( checkpoint_path, - model_max_length=2048, + model_max_length=22000, padding_side="left", use_fast=False, ) From 3ae2b2a20b01fda1236e6950d089f2bc6eac91aa Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 18 Dec 2024 12:28:57 -0500 Subject: [PATCH 09/14] Remove unused rgat files (#1961) Co-authored-by: Miro --- graph/R-GAT/igbh/tiny/models/dataloader.py | 82 ------ graph/R-GAT/igbh/tiny/models/gnn.py | 296 --------------------- graph/R-GAT/igbh/tiny/models/main.py | 79 ------ graph/R-GAT/igbh/tiny/models/utils.py | 224 ---------------- 4 files changed, 681 deletions(-) delete mode 100644 graph/R-GAT/igbh/tiny/models/dataloader.py delete mode 100644 graph/R-GAT/igbh/tiny/models/gnn.py delete mode 100644 graph/R-GAT/igbh/tiny/models/main.py delete mode 100644 graph/R-GAT/igbh/tiny/models/utils.py diff --git a/graph/R-GAT/igbh/tiny/models/dataloader.py b/graph/R-GAT/igbh/tiny/models/dataloader.py deleted file mode 100644 index cc64d1466..000000000 --- a/graph/R-GAT/igbh/tiny/models/dataloader.py +++ /dev/null @@ -1,82 +0,0 @@ -import torch -from torch_geometric.data import InMemoryDataset, Data -from dgl.data import DGLDataset - -from utils import IGL260MDataset - -# TODO: Make a PyG dataloader for large datasets - - -class IGL260M_PyG(InMemoryDataset): - def __init__(self, args): - super().__init__(root, transform, pre_transform, pre_filter) - - def process(self): - dataset = IGL260MDataset(root=self.dir, size=args.dataset_size, - in_memory=args.in_memory, classes=args.type_classes, synthetic=args.synthetic) - node_features = torch.from_numpy(dataset.paper_feat) - node_edges = torch.from_numpy(dataset.paper_edge).T - node_labels = torch.from_numpy(dataset.paper_label).to(torch.long) - data = Data(x=node_features, edge_index=node_edges, y=node_labels) - - n_nodes = node_features.shape[0] - - n_train = int(n_nodes * 
0.6) - n_val = int(n_nodes * 0.2) - - train_mask = torch.zeros(n_nodes, dtype=torch.bool) - val_mask = torch.zeros(n_nodes, dtype=torch.bool) - test_mask = torch.zeros(n_nodes, dtype=torch.bool) - - train_mask[:n_train] = True - val_mask[n_train:n_train + n_val] = True - test_mask[n_train + n_val:] = True - - data.train_mask = train_mask - data.val_mask = val_mask - data.test_mask = test_mask - - -class IGL260M_DGL(DGLDataset): - def __init__(self, args): - self.dir = args.path - super().__init__(name='IGB260M') - - def process(self): - dataset = IGL260MDataset(root=self.dir, size=args.dataset_size, - in_memory=args.in_memory, classes=args.type_classes, synthetic=args.synthetic) - node_features = torch.from_numpy(dataset.paper_feat) - node_edges = torch.from_numpy(dataset.paper_edge) - node_labels = torch.from_numpy(dataset.paper_label).to(torch.long) - - self.graph = dgl.graph( - (node_edges[:, 0], node_edges[:, 1]), num_nodes=node_features.shape[0]) - - self.graph.ndata['feat'] = node_features - self.graph.ndata['label'] = node_labels - - self.graph = dgl.remove_self_loop(self.graph) - self.graph = dgl.add_self_loop(self.graph) - - n_nodes = node_features.shape[0] - - n_train = int(n_nodes * 0.6) - n_val = int(n_nodes * 0.2) - - train_mask = torch.zeros(n_nodes, dtype=torch.bool) - val_mask = torch.zeros(n_nodes, dtype=torch.bool) - test_mask = torch.zeros(n_nodes, dtype=torch.bool) - - train_mask[:n_train] = True - val_mask[n_train:n_train + n_val] = True - test_mask[n_train + n_val:] = True - - self.graph.ndata['train_mask'] = train_mask - self.graph.ndata['val_mask'] = val_mask - self.graph.ndata['test_mask'] = test_mask - - def __getitem__(self, i): - return self.graph - - def __len__(self): - return 1 diff --git a/graph/R-GAT/igbh/tiny/models/gnn.py b/graph/R-GAT/igbh/tiny/models/gnn.py deleted file mode 100644 index 20d5ecd72..000000000 --- a/graph/R-GAT/igbh/tiny/models/gnn.py +++ /dev/null @@ -1,296 +0,0 @@ -from utils import IGL260MDataset -import warnings -from tqdm import tqdm -import numpy as np -import time -import torch.nn.functional as F -import torch.optim as optim -import torch.nn as nn -import dgl -from dgl.data import DGLDataset -import dgl.nn.pytorch as dglnn -from dgl.nn.pytorch import GATConv, GraphConv, SAGEConv -import os.path as osp -from sys import getsizeof - - -import torch -torch.manual_seed(0) -dgl.seed(0) -warnings.filterwarnings("ignore") - - -class GCN(nn.Module): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout): - super(GCN, self).__init__() - self.layers = nn.ModuleList() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - # input layer - self.layers.append( - GraphConv( - in_feats, - n_hidden, - activation=activation)) - # hidden layers - for i in range(n_layers - 1): - self.layers.append( - GraphConv( - n_hidden, - n_hidden, - activation=activation)) - # output layer - self.layers.append(GraphConv(n_hidden, n_classes)) - self.dropout = nn.Dropout(p=dropout) - self.activation = activation - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - if l != len(self.layers) - 1: - # h = self.activation(h) - h = self.dropout(h) - h = layer(block, h) - return h - - def inference(self, g, x, batch_size, device): - """ - Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. 
- The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? - for l, layer in enumerate(self.layers): - y = torch.zeros(g.number_of_nodes(), self.n_hidden if l != - len(self.layers) - 1 else self.n_classes) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.NodeDataLoader( - g, - torch.arange(g.number_of_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0] - - block = block.int().to(device) - h = x[input_nodes].to(device) - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - - y[output_nodes] = h.cpu() - - x = y - return y - - -class GAT(nn.Module): - def __init__( - self, in_feats, n_hidden, n_classes, n_layers, num_heads, activation - ): - super().__init__() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - self.layers = nn.ModuleList() - self.layers.append( - dglnn.GATConv( - (in_feats, in_feats), - n_hidden, - num_heads=num_heads, - activation=activation, - ) - ) - for i in range(1, n_layers - 1): - self.layers.append( - dglnn.GATConv( - (n_hidden * num_heads, n_hidden * num_heads), - n_hidden, - num_heads=num_heads, - activation=activation, - ) - ) - self.layers.append( - dglnn.GATConv( - (n_hidden * num_heads, n_hidden * num_heads), - n_classes, - num_heads=num_heads, - activation=None, - ) - ) - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - # We need to first copy the representation of nodes on the RHS from the - # appropriate nodes on the LHS. - # Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst - # would be (num_nodes_RHS, D) - h_dst = h[: block.num_dst_nodes()] - # Then we compute the updated representation on the RHS. - # The shape of h now becomes (num_nodes_RHS, D) - if l < self.n_layers - 1: - h = layer(block, (h, h_dst)).flatten(1) - else: - h = layer(block, (h, h_dst)) - h = h.mean(1) - return h.log_softmax(dim=-1) - - def inference(self, g, x, batch_size, device): - """ - Inference with the GAT model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? 
- # TODO: make thiw into a variable - num_heads = 2 - for l, layer in enumerate(self.layers): - if l < self.n_layers - 1: - y = torch.zeros( - g.num_nodes(), - self.n_hidden * num_heads - if l != len(self.layers) - 1 - else self.n_classes, - ) - else: - y = torch.zeros( - g.num_nodes(), - self.n_hidden - if l != len(self.layers) - 1 - else self.n_classes, - ) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.DataLoader( - g, - torch.arange(g.num_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4, - ) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0].int().to(device) - - h = x[input_nodes].to(device) - h_dst = h[: block.num_dst_nodes()] - if l < self.n_layers - 1: - h = layer(block, (h, h_dst)).flatten(1) - else: - h = layer(block, (h, h_dst)) - h = h.mean(1) - h = h.log_softmax(dim=-1) - - y[output_nodes] = h.cpu() - - x = y - return y - - -class SAGE(nn.Module): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout, - aggregator_type): - super().__init__() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - self.layers = nn.ModuleList() - self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, aggregator_type)) - for i in range(1, n_layers - 1): - self.layers.append( - dglnn.SAGEConv( - n_hidden, - n_hidden, - aggregator_type)) - self.layers.append( - dglnn.SAGEConv( - n_hidden, - n_classes, - aggregator_type)) - self.dropout = nn.Dropout(dropout) - self.activation = activation - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - return h - - def inference(self, g, x, batch_size, device): - """ - Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? 
- for l, layer in enumerate(self.layers): - y = torch.zeros(g.number_of_nodes(), self.n_hidden if l != - len(self.layers) - 1 else self.n_classes) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.NodeDataLoader( - g, - torch.arange(g.number_of_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0] - - block = block.int().to(device) - h = x[input_nodes].to(device) - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - - y[output_nodes] = h.cpu() - - x = y - return y diff --git a/graph/R-GAT/igbh/tiny/models/main.py b/graph/R-GAT/igbh/tiny/models/main.py deleted file mode 100644 index 4ab22eb75..000000000 --- a/graph/R-GAT/igbh/tiny/models/main.py +++ /dev/null @@ -1,79 +0,0 @@ -import argparse - - -def main(): - parser = argparse.ArgumentParser() - - # Input/output paths - parser.add_argument('--path', type=str, default='/gnndataset/') - parser.add_argument('--modelpath', type=str, default='gcn_19.pt') - - # Dataset selection - parser.add_argument( - '--dataset_size', - type=str, - default='experimental', - choices=[ - 'experimental', - 'small', - 'medium', - 'large', - 'full']) - parser.add_argument( - '--type_classes', - type=int, - default=19, - choices=[ - 19, - 292, - 2983]) - - # Hyperparameters - parser.add_argument('--hidden_channels', type=int, default=16) - parser.add_argument('--fan_out', type=str, default='5,10') - parser.add_argument('--num_layers', type=int, default=2) - parser.add_argument('--learning_rate', type=int, default=0.01) - parser.add_argument('--decay', type=int, default=0.001) - parser.add_argument('--num_workers', type=int, default=4) - parser.add_argument('--batch_size', type=int, default=2048 * 16) - parser.add_argument('--dropout', type=float, default=0.2) - parser.add_argument('--epochs', type=int, default=20) - parser.add_argument( - '--model_type', - type=str, - default='gcn', - choices=[ - 'gat', - 'sage', - 'gcn']) - parser.add_argument('--in_memory', type=int, default=0) - parser.add_argument('--synthetic', type=int, default=0) - parser.add_argument('--device', type=str, default='1') - args = parser.parse_args() - - print("Dataset_size: " + args.dataset_size) - print("Model : " + args.model) - print("Num_classes : " + str(args.num_classes)) - print() - - device = f'cuda:' + args.device if torch.cuda.is_available() else 'cpu' - - dataset = IGL260M_DGL(args) - g = dataset[0] - - best_test_acc, train_acc, test_acc = track_acc(g, args) - - print( - f"Train accuracy: {np.mean(train_acc):.2f} \u00B1 {np.std(train_acc):.2f} \t Best: {np.max(train_acc) * 100:.4f}%") - print( - f"Test accuracy: {np.mean(test_acc):.2f} \u00B1 {np.std(test_acc):.2f} \t Best: {np.max(test_acc) * 100:.4f}%") - print() - print(" -------- For debugging --------- ") - print("Parameters: ", args) - print(g) - print("Train accuracy: ", train_acc) - print("Test accuracy: ", test_acc) - - -if __name__ == '__main__': - main() diff --git a/graph/R-GAT/igbh/tiny/models/utils.py b/graph/R-GAT/igbh/tiny/models/utils.py deleted file mode 100644 index 5e9e1a25d..000000000 --- a/graph/R-GAT/igbh/tiny/models/utils.py +++ /dev/null @@ -1,224 +0,0 @@ -import numpy as np -import torch - - -class IGL260MDataset(object): - def __init__(self, root: str, size: str, in_memory: int, - classes: int, synthetic: int): - self.dir = root - self.size = size - self.synthetic = synthetic - self.in_memory = 
in_memory - self.num_classes = classes - self.__meta__ = torch.load(osp.join(self.dir, self.size, 'meta.pt')) - - self.num_features = self.__meta__['paper']['emb_dim'] - self.num_nodes = self.__meta__['paper']['num_node'] - self.num_edges = self.__meta__['cites']['num_edge'] - - @property - def paper_feat(self) -> np.ndarray: - if self.synthetic: - return np.random((self.num_nodes, self.num_edges)) - - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_feat.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - @property - def paper_label(self) -> np.ndarray: - if self.num_classes == 19: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_label_19.npy') - else: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_label_2K.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - @property - def paper_edge(self) -> np.ndarray: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper__cites__paper', - 'edge_index.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - -def compute_acc(pred, labels): - """ - Compute the accuracy of prediction given the labels. - """ - labels = labels.long() - return (torch.argmax(pred, dim=1) == labels).float().sum() / len(pred) - - -def evaluate(model, g, inputs, labels, val_nid, batch_size, device): - """ - Evaluate the model on the validation set specified by ``val_nid``. - g : The entire graph. - inputs : The features of all the nodes. - labels : The labels of all the nodes. - val_nid : the node Ids for validation. - batch_size : Number of nodes to compute at the same time. - device : The GPU device to evaluate on. - """ - model.eval() - with torch.no_grad(): - pred = model.inference(g, inputs, batch_size, device) - model.train() - return compute_acc(pred[val_nid], labels[val_nid]) - - -def load_subtensor(g, seeds, input_nodes, device): - """ - Copys features and labels of a set of nodes onto GPU. - """ - batch_inputs = g.ndata['features'][input_nodes].to(device) - batch_labels = g.ndata['labels'][seeds].to(device) - return batch_inputs, batch_labels - - -def track_acc(g, args): - train_accuracy = [] - test_accuracy = [] - g.ndata['features'] = g.ndata['feat'] - g.ndata['labels'] = g.ndata['label'] - in_feats = g.ndata['features'].shape[1] - n_classes = args.num_classes - - # Create csr/coo/csc formats before launching training processes with multi-gpu. - # This avoids creating certain formats in each sub-process, which saves - # momory and CPU. 
- g.create_formats_() - - num_epochs = args.epochs - num_hidden = args.hidden_channels - num_layers = args.num_layers - fan_out = args.fan_out - batch_size = args.batch_size - lr = args.learning_rate - dropout = args.dropout - num_workers = args.num_workers - - train_nid = torch.nonzero(g.ndata['train_mask'], as_tuple=True)[0] - - # Create PyTorch DataLoader for constructing blocks - sampler = dgl.dataloading.MultiLayerNeighborSampler( - [int(fanout) for fanout in fan_out.split(',')]) - - dataloader = dgl.dataloading.NodeDataLoader( - g, - train_nid, - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=num_workers) - - # Define model and optimizer - if args.model_type == 'gcn': - model = GCN(in_feats, num_hidden, n_classes, 1, F.relu, dropout) - if args.model_type == 'sage': - model = SAGE( - in_feats, - num_hidden, - n_classes, - num_layers, - F.relu, - dropout, - 'gcn') - if args.model_type == 'gat': - model = GAT(in_feats, num_hidden, n_classes, num_layers, 2, F.relu) - - model = model.to(device) - loss_fcn = nn.CrossEntropyLoss() - loss_fcn = loss_fcn.to(device) - optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=args.decay) - - # Training loop - avg = 0 - best_test_acc = 0 - log_every = 1 - training_start = time.time() - for epoch in (range(num_epochs)): - # Loop over the dataloader to sample the computation dependency graph as a list of - # blocks. - epoch_loss = 0 - gpu_mem_alloc = 0 - epoch_start = time.time() - for step, (input_nodes, seeds, blocks) in (enumerate(dataloader)): - # Load the input features as well as output labels - # batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device) - blocks = [block.int().to(device) for block in blocks] - batch_inputs = blocks[0].srcdata['features'] - batch_labels = blocks[-1].dstdata['labels'] - - # Compute loss and prediction - batch_pred = model(blocks, batch_inputs) - loss = loss_fcn(batch_pred, batch_labels) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - epoch_loss += loss.detach() - - gpu_mem_alloc += ( - torch.cuda.max_memory_allocated() / 1000000 - if torch.cuda.is_available() - else 0 - ) - - train_g = g - train_nid = torch.nonzero( - train_g.ndata['train_mask'], as_tuple=True)[0] - train_acc = evaluate( - model, train_g, train_g.ndata['features'], train_g.ndata['labels'], train_nid, batch_size, device) - - test_g = g - test_nid = torch.nonzero( - test_g.ndata['test_mask'], as_tuple=True)[0] - test_acc = evaluate( - model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, batch_size, device) - - if test_acc.item() > best_test_acc: - best_test_acc = test_acc.item() - tqdm.write( - "Epoch {:05d} | Loss {:.4f} | Train Acc {:.4f} | Test Acc {:.4f} | Time {:.2f}s | GPU {:.1f} MB".format( - epoch, - epoch_loss, - train_acc.item(), - test_acc.item(), - time.time() - epoch_start, - gpu_mem_alloc - ) - ) - test_accuracy.append(test_acc.item()) - train_accuracy.append(train_acc.item()) - torch.save(model.state_dict(), args.modelpath) - print() - print("Total time taken: ", time.time() - training_start) - - return best_test_acc, train_accuracy, test_accuracy From 03c96663dc2bd47cc5e8f5fbb0fc4079ae2c784d Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 18 Dec 2024 12:31:23 -0500 Subject: [PATCH 10/14] Update docker GPU, avoid long build time (#1966) Co-authored-by: Miro --- graph/R-GAT/README.md | 5 +++-- graph/R-GAT/dockerfile.gpu | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/graph/R-GAT/README.md 
b/graph/R-GAT/README.md index 569233ac6..aecf7ffe9 100644 --- a/graph/R-GAT/README.md +++ b/graph/R-GAT/README.md @@ -181,9 +181,10 @@ docker build . -f dockerfile.gpu -t rgat-gpu ``` Run docker container: ```bash -docker run --rm -it -v $(pwd):/root --gpus all rgat-gpu +docker run --rm -it -v $(pwd):/workspace/root --gpus all rgat-gpu ``` -Run benchmark inside the docker container: +Go inside the root folder and run benchmark inside the docker container: ```bash +cd root python3 main.py --dataset igbh-dgl --dataset-path igbh/ --profile rgat-dgl-full --device gpu [--model-path ] [--in-memory] [--dtype ] [--scenario ] ``` diff --git a/graph/R-GAT/dockerfile.gpu b/graph/R-GAT/dockerfile.gpu index fae65081f..f600028fe 100644 --- a/graph/R-GAT/dockerfile.gpu +++ b/graph/R-GAT/dockerfile.gpu @@ -26,6 +26,8 @@ RUN apt install -y --no-install-recommends rsync # Upgrade pip RUN python3 -m pip install --upgrade pip +RUN pip install torch-geometric torch-scatter torch-sparse -f https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html +RUN pip install dgl -f https://data.dgl.ai/wheels/torch-2.1/cu121/repo.html COPY requirements.txt requirements.txt RUN pip install -r requirements.txt @@ -35,10 +37,6 @@ RUN cd /tmp && \ pip install pybind11 && \ CFLAGS="-std=c++14" python3 setup.py install -RUN export TORCH_VERSION=$(python -c "import torch; print(torch.__version__)") -RUN pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-${TORCH_VERSION}.html -RUN pip install dgl -f https://data.dgl.ai/wheels/torch-2.1/cu121/repo.html - # Clean up RUN rm -rf mlperf \ rm requirements.txt \ No newline at end of file From 867def46417627eaa9de8f926721bf88167009ba Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 18 Dec 2024 13:15:33 -0500 Subject: [PATCH 11/14] Require equal issue mode for R-GAT (#1968) * Require equal issue mode for R-GAT * Add equal issue note in readme --------- Co-authored-by: Miro --- graph/R-GAT/README.md | 2 ++ loadgen/mlperf.conf | 3 +++ tools/submission/submission_checker.py | 1 + 3 files changed, 6 insertions(+) diff --git a/graph/R-GAT/README.md b/graph/R-GAT/README.md index aecf7ffe9..69883c0d1 100644 --- a/graph/R-GAT/README.md +++ b/graph/R-GAT/README.md @@ -188,3 +188,5 @@ Go inside the root folder and run benchmark inside the docker container: cd root python3 main.py --dataset igbh-dgl --dataset-path igbh/ --profile rgat-dgl-full --device gpu [--model-path ] [--in-memory] [--dtype ] [--scenario ] ``` + +**NOTE:** For official submissions, this benchmark is required to run in equal issue mode. Please make sure that the flag `rgat.*.sample_concatenate_permutation` is set to one in the [mlperf.conf](../../loadgen/mlperf.conf) file when loadgen is built. 
diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1fe202253..95cc08351 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -42,6 +42,9 @@ retinanet.MultiStream.target_latency = 528 # 3D-UNet uses equal issue mode because it has non-uniform inputs 3d-unet.*.sample_concatenate_permutation = 1 +# R-GAT uses equal issue mode because it may have non-uniform inputs +rgat.*.sample_concatenate_permutation = 1 + # LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario gptj.*.sample_concatenate_permutation = 1 llama2-70b.*.sample_concatenate_permutation = 1 diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 4a463f304..dcdad1180 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -957,6 +957,7 @@ def requires_equal_issue(self, model, division): "llama2-70b-99.9", "mixtral-8x7b", "llama3-405b", + "rgat", ] and self.version not in ["v4.0", "v4.1"] ) From b3e1e8e636908a6989a3d04c4c09c21b756f3a4a Mon Sep 17 00:00:00 2001 From: mrmhodak Date: Wed, 18 Dec 2024 18:15:47 +0000 Subject: [PATCH 12/14] Increment version to 5.0.3 --- loadgen/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt index a1ef0cae1..50e2274e6 100644 --- a/loadgen/VERSION.txt +++ b/loadgen/VERSION.txt @@ -1 +1 @@ -5.0.2 +5.0.3 From 8397bec7447afc2eba5a0b630594981cc4dfed27 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 18 Dec 2024 18:17:07 +0000 Subject: [PATCH 13/14] Docs update for r-gat (#1969) * Fixes #1648, restrict loadgen uncommitted error message to within the loadgen directory * Update test-rnnt.yml (#1688) Stopping the github action for rnnt * Added docs init Added github action for website publish Update benchmark documentation Update publish.yaml Update publish.yaml Update benchmark documentation Improved the submission documentation Fix taskname Removed unused images * Fix benchmark URLs * Fix links * Add _full variation to run commands * Added script flow diagram * Added docker setup command for CM, extra run options * Added support for docker options in the docs * Added --quiet to the CM run_cmds in docs * Fix the test query count for cm commands * Support ctuning-cpp implementation * Added commands for mobilenet models * Docs cleanup * Docs cleanup * Added separate files for dataset and models in the docs * Remove redundant tab in the docs * Fixes some WIP models in the docs * Use the official docs page for CM installation * Fix the deadlink in docs * Fix indendation issue in docs * Added dockerinfo for nvidia implementation * Added run options for gptj * Added execution environment tabs * Cleanup of the docs * Cleanup of the docs * Reordered the sections of the docs page * Removed an unnecessary heading in the docs * Fixes the commands for datacenter * Fix the build --sdist for loadgen * Fixes #1761, llama2 and mixtral runtime error on CPU systems * Added mixtral to the benchmark list, improved benchmark docs * Update docs for MLPerf inference v4.1 * Update docs for MLPerf inference v4.1 * Fix typo * Gave direct link to implementation readmes * Added tables detailing implementations * Update vision README.md, split the frameworks into separate rows * Update README.md * pointed links to specific frameworks * pointed links to specific frameworks * Update Submission_Guidelines.md * Update Submission_Guidelines.md * Update Submission_Guidelines.md * api support llama2 * Added request 
module and reduced max token len * Fix for llama2 api server * Update SUT_API offline to work for OpenAI * Update SUT_API.py * Minor fixes * Fix json import in SUT_API.py * Fix llama2 token length * Added model name verification with server * clean temp files * support num_workers in LLAMA2 SUTs * Remove batching from Offline SUT_API.py * Update SUT_API.py * Minor fixes for llama2 API * Fix for llama2 API * removed table of contents * enabled llama2-nvidia + vllm-NM : WIP * enabled dlrm for intel * lower cased implementation * added raw data input * corrected data download commands * renamed filename * changes for bert and vllm * documentation to work on custom repo and branch * benchmark index page update * enabled sdxl for nvidia and intel * updated vllm server run cmd * benchmark page information addition * fix indendation issue * Added submission categories * update submission page - generate submission with or w/o using CM for benchmarking * Updated kits dataset documentation * Updated model parameters * updation of information * updated non cm based benchmark * added info about hf password * added links to model and access tokens * Updated reference results structuree tree * submission docs cleanup * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info * added generic stubs deepsparse * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info (FID and CLIP data added) * typo fix for bert deepsparse framework * added min system requirements for models * fixed code version * changes for displaying reference and intel implementation tip * added reference to installation page * updated neural magic documentation * Added links to the install page, redirect benchmarks page * added tips about batch size and dataset for nvidia llama2 * fix conditions logic * modified tips and additional run cmds * sentence corrections * Minor fix for the documentation * fixed bug in deepsparse generic model stubs + styling * added more information to stubs * Added SCC24 readme, support reproducibility in the docs * Made clear the custom CM repo URL format * Support conditional implementation, setup and run tips * Support rocm for sdxl * Fix _short tag support * Fix install URL * Expose bfloat16 and float16 options for sdxl * Expose download model to host option for sdxl * IndySCC24 documentation added * Improve the SCC24 docs * Improve the support of short variation * Improved the indyscc24 documentation * Updated scc run commands * removed test_query_count option for scc * Remove scc24 in the main docs * Remove scc24 in the main docs * Fix docs: indendation issue on the submission page * generalised code for skipping test query count * Fixes for SCC24 docs * Fix scenario text in main.py * Fix links for scc24 * Fix links for scc24 * Improve the general docs * Fix links for scc24 * Use float16 in scc24 doc * Improve scc24 docs * Improve scc24 docs * Use float16 in scc24 doc * fixed command bug * Fix typo in docs * Fix typo in docs * Remove unnecessary indendation in docs * initial commit for tip - native run CUDA * Updated tip * added docker_cm_repo_branch to more run option - docker * Update docs for IndySCC24 * Support custom repo branch and owner for final report generation * enabled amd implementation for llama2 * updations for amd - docs * Fix scenarios in docs page * formatted the files to pass the gh action * scenarios -> fixed_scenarios in docs * [Automated Commit] Format 
Codebase * Update indyscc24-bert.md * Update scc24.md * updated tip for reference implementation (#1912) * [Automated Commit] Format Codebase * fix for run suffix (#1913) * [Automated Commit] Format Codebase * Updation for adding submission flow diagram * Added submission flow diagram * Update scc24.md * changes in submission documentation (#1946) * update results category (#1947) * changes for adding rgat to docs (#1965) * Update index.md | Added R-GAT details (WIP) * Update index.md * Create system_requirements.yml * Update system_requirements.yml * Update system_requirements.yml * Update system_requirements.yml --------- Co-authored-by: anandhu-eng Co-authored-by: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: arjunsuresh Co-authored-by: Pablo Gonzalez Co-authored-by: Mitchelle Rasquinha <80070689+mrasquinha-g@users.noreply.github.com> Co-authored-by: Miro --- docs/benchmarks/graph/get-rgat-data.md | 39 ++++++ docs/benchmarks/graph/rgat.md | 13 ++ docs/index.md | 19 ++- docs/submission/index.md | 160 +++++++++++++------------ docs/system_requirements.yml | 50 ++++++++ main.py | 9 +- mkdocs.yml | 2 + 7 files changed, 211 insertions(+), 81 deletions(-) create mode 100644 docs/benchmarks/graph/get-rgat-data.md create mode 100644 docs/benchmarks/graph/rgat.md create mode 100644 docs/system_requirements.yml diff --git a/docs/benchmarks/graph/get-rgat-data.md b/docs/benchmarks/graph/get-rgat-data.md new file mode 100644 index 000000000..189c25b87 --- /dev/null +++ b/docs/benchmarks/graph/get-rgat-data.md @@ -0,0 +1,39 @@ +--- +hide: + - toc +--- + +# Graph Neural Network using R-GAT + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. + +=== "Full Dataset" + R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges. + + ### Get Full Dataset + ``` + cm run script --tags=get,dataset,igbh,_full -j + ``` + +=== "Debug Dataset" + R-GAT debug run uses the IGBH debug dataset(tiny). + + ### Get Full Dataset + ``` + cm run script --tags=get,dataset,igbh,_debug -j + ``` + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. + +Get the Official MLPerf R-GAT Model + +=== "PyTorch" + + ### PyTorch + ``` + cm run script --tags=get,ml-model,rgat -j + ``` + diff --git a/docs/benchmarks/graph/rgat.md b/docs/benchmarks/graph/rgat.md new file mode 100644 index 000000000..ffff467a4 --- /dev/null +++ b/docs/benchmarks/graph/rgat.md @@ -0,0 +1,13 @@ +--- +hide: + - toc +--- + + +# Graph Neural Network using R-GAT + + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "rgat", "reference", devices = ["CPU", "CUDA"]) }} \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 11f2a52c2..b46d4c274 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,7 @@ # MLPerf Inference Benchmarks ## Overview -The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v4.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, reference accuracy, server latency constraints etc. 
+The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v5.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, reference accuracy, server latency constraints etc. --- @@ -80,7 +80,7 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe - **Server Scenario Latency Constraint**: 130ms - **Equal Issue mode**: False - **High accuracy variant**: yes -- **Submission Category**: Datacenter, Edge +- **Submission Category**: Edge #### [LLAMA2-70B](benchmarks/language/llama2-70b.md) - **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) @@ -157,11 +157,22 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe - **High accuracy variant**: Yes - **Submission Category**: Datacenter +## Graph Neural Networks +### [R-GAT](benchmarks/graph/rgat.md) +- **Dataset**: Illinois Graph Benchmark Heterogeneous validation dataset + - **Dataset Size**: 788,379 + - **QSL Size**: 788,379 +- **Number of Parameters**: +- **Reference Model Accuracy**: ACC = ? +- **Server Scenario Latency Constraint**: N/A +- **Equal Issue mode**: True +- **High accuracy variant**: No +- **Submission Category**: Datacenter --- ## Submission Categories -- **Datacenter Category**: All the current inference benchmarks are applicable to the datacenter category. -- **Edge Category**: All benchmarks except DLRMv2, LLAMA2-70B, and Mixtral-8x7B are applicable to the edge category. +- **Datacenter Category**: All benchmarks except bert are applicable to the datacenter category for inference v5.0. +- **Edge Category**: All benchmarks except DLRMv2, LLAMA2-70B, Mixtral-8x7B and R-GAT are applicable to the edge category for v5.0. ## High Accuracy Variants - **Benchmarks**: `bert`, `llama2-70b`, `gpt-j`, `dlrm_v2`, and `3d-unet` have a normal accuracy variant as well as a high accuracy variant. diff --git a/docs/submission/index.md b/docs/submission/index.md index c99802420..1050f5fb0 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -13,13 +13,15 @@ hide: Click [here](https://youtu.be/eI1Hoecc3ho) to view the recording of the workshop: Streamlining your MLPerf Inference results using CM. -=== "CM based benchmark" +Click [here](https://docs.google.com/presentation/d/1cmbpZUpVr78EIrhzyMBnnWnjJrD-mZ2vmSb-yETkTA8/edit?usp=sharing) to view the prposal slide for Common Automation for MLPerf Inference Submission Generation through CM. + +=== "CM based results" If you have followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `cm cache` folder. The following command could be used to browse the structure of inference results folder generated by CM. ### Get results folder structure ```bash cm find cache --tags=get,mlperf,inference,results,dir | xargs tree ``` -=== "Non CM based benchmark" +=== "Non CM based results" If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way. 
``` └── System description ID(SUT Name) @@ -35,18 +37,20 @@ Click [here](https://youtu.be/eI1Hoecc3ho) to view the recording of the workshop | ├── mlperf_log_detail.txt | ├── mlperf_log_accuracy.json | └── accuracy.txt - └── Compliance_Test_ID - ├── Performance - | └── run_x/#1 run for all scenarios - | ├── mlperf_log_summary.txt - | └── mlperf_log_detail.txt - ├── Accuracy - | ├── baseline_accuracy.txt - | ├── compliance_accuracy.txt - | ├── mlperf_log_accuracy.json - | └── accuracy.txt - ├── verify_performance.txt - └── verify_accuracy.txt #for TEST01 only + |── Compliance_Test_ID + | ├── Performance + | | └── run_x/#1 run for all scenarios + | | ├── mlperf_log_summary.txt + | | └── mlperf_log_detail.txt + | ├── Accuracy + | | ├── baseline_accuracy.txt + | | ├── compliance_accuracy.txt + | | ├── mlperf_log_accuracy.json + | | └── accuracy.txt + | ├── verify_performance.txt + | └── verify_accuracy.txt #for TEST01 only + |── user.conf + └── measurements.json ```
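Before invoking the submission checker on a hand-built results tree like this, a quick pre-flight walk can surface missing files early. The helper below is purely illustrative and not part of the MLPerf tooling; the directory names mirror the layout above, while `run_1`, the compliance test list, and the example path are assumptions for the sketch.

```python
# Illustrative pre-flight check for one scenario directory of a non-CM results tree.
import os

def missing_submission_files(scenario_dir, compliance_tests=("TEST01", "TEST06")):
    """Return the expected files that are absent under one scenario directory."""
    expected = [
        os.path.join("Performance", "run_1", "mlperf_log_summary.txt"),
        os.path.join("Performance", "run_1", "mlperf_log_detail.txt"),
        os.path.join("Accuracy", "mlperf_log_accuracy.json"),
        os.path.join("Accuracy", "accuracy.txt"),
        "user.conf",
        "measurements.json",
    ]
    # Each compliance test directory carries at least a verify_performance.txt
    # report; verify_accuracy.txt exists for TEST01 only, so it is not required here.
    for test in compliance_tests:
        expected.append(os.path.join(test, "verify_performance.txt"))
    return [rel for rel in expected
            if not os.path.exists(os.path.join(scenario_dir, rel))]

if __name__ == "__main__":
    for rel in missing_submission_files("results/my_sut/rgat/Offline"):
        print("missing:", rel)
```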
@@ -67,67 +71,69 @@ Once all the results across all the models are ready you can use the following c ## Generate actual submission tree -=== "Closed Edge" - ### Closed Edge Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --category=edge \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` - -=== "Closed Datacenter" - ### Closed Datacenter Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --category=datacenter \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` -=== "Open Edge" - ### Open Edge Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --category=edge \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` -=== "Open Datacenter" - ### Closed Datacenter Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --category=datacenter \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` +=== "Docker run" + ### Docker run + === "Closed" + ### Closed Submission + ```bash + cm docker script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=closed \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + + === "Open" + ### Open Submission + ```bash + cm docker script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=open \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + +=== "Native run" + ### Native run + === "Closed" + ### Closed Submission + ```bash + cm run script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=closed \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + + === "Open" + ### Open Submission + ```bash + cm run script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=open \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` * Use `--hw_name="My system name"` to give a meaningful system name. Examples can be seen [here](https://github.com/mlcommons/inference_results_v3.0/tree/main/open/cTuning/systems) @@ -137,6 +143,10 @@ Once all the results across all the models are ready you can use the following c * Use `--results_dir` option to specify the results folder for Non CM based benchmarks +* Use `--category` option to specify the category for which submission is generated(datacenter/edge). 
By default, the category is taken from `system_meta.json` file located in the SUT root directory. + +* Use `--submission_base_dir` to specify the directory to which outputs from preprocess submission script and final submission is to be dumped. No need to provide `--submission_dir` along with this. For `docker run`, use `--submission_base_dir` instead of `--submission_dir`. + The above command should generate "submission.tar.gz" if there are no submission checker issues and you can upload it to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). ## Aggregate Results in GitHub diff --git a/docs/system_requirements.yml b/docs/system_requirements.yml new file mode 100644 index 000000000..5dfec202a --- /dev/null +++ b/docs/system_requirements.yml @@ -0,0 +1,50 @@ +# All memory requirements in GB +resnet: + reference: + fp32: + system_memory: 8 + accelerator_memory: 4 + disk_storage: 25 + nvidia: + int8: + system_memory: 8 + accelerator_memory: 4 + disk_storage: 100 + intel: + int8: + system_memory: 8 + accelerator_memory: 0 + disk_storage: 50 + qualcomm: + int8: + system_memory: 8 + accelerator_memory: 8 + disk_storage: 50 +retinanet: + reference: + fp32: + system_memory: 8 + accelerator_memory: 8 + disk_storage: 200 + nvidia: + int8: + system_memory: 8 + accelerator_memory: 8 + disk_storage: 200 + intel: + int8: + system_memory: 8 + accelerator_memory: 0 + disk_storage: 200 + qualcomm: + int8: + system_memory: 8 + accelerator_memory: 8 + disk_storage: 200 +rgat: + reference: + fp32: + system_memory: 768 + accelerator_memory: 8 + disk_storage: 2300 + diff --git a/main.py b/main.py index c8c64b8c3..c5b22a705 100755 --- a/main.py +++ b/main.py @@ -239,7 +239,8 @@ def mlperf_inference_implementation_readme( common_info = get_common_info( spaces + 16, - implementation + implementation, + model.lower() ) if ( @@ -488,7 +489,7 @@ def get_venv_command(spaces): # contains run command information which is common to both docker and # native runs - def get_common_info(spaces, implementation): + def get_common_info(spaces, implementation, model): info = "" pre_space = "" for i in range(1, spaces): @@ -496,7 +497,11 @@ def get_common_info(spaces, implementation): pre_space += " " # pre_space = " " info += f"\n{pre_space}!!! tip\n\n" + info += f"{pre_space} - Number of threads could be adjusted using `--threads=#`, where `#` is the desired number of threads. This option works only if the implementation in use supports threading.\n\n" info += f"{pre_space} - Batch size could be adjusted using `--batch_size=#`, where `#` is the desired batch size. This option works only if the implementation in use is supporting the given batch size.\n\n" + if model == "rgat": + info += f"{pre_space} - Add `--env.CM_DATASET_IGBH_PATH=` if you have already downloaded the dataset. The path will be automatically mounted when using docker run.\n\n" + info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. 
The path will be automatically mounted when using docker run.\n\n" if implementation.lower() == "reference": info += f"{pre_space} - Add `--adr.mlperf-implementation.tags=_branch.master,_repo.` if you are modifying the official MLPerf Inference implementation in a custom fork.\n\n" info += f"{pre_space} - Add `--adr.inference-src.tags=_repo.` if you are modifying the model config accuracy script in the submission checker within a custom fork.\n\n" diff --git a/mkdocs.yml b/mkdocs.yml index 95dfb6e86..96bcfb758 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,6 +42,8 @@ nav: - MIXTRAL-8x7B: benchmarks/language/mixtral-8x7b.md - Recommendation: - DLRM-v2: benchmarks/recommendation/dlrm-v2.md + - Graph Neural Networks: + - R-GAT: benchmarks/graph/rgat.md - Install CM: - install/index.md - Submission: From 647f9f84ff91394eb865ed9eaf5de688a1d37448 Mon Sep 17 00:00:00 2001 From: mlcommons-bot Date: Wed, 18 Dec 2024 18:17:31 +0000 Subject: [PATCH 14/14] [Automated Commit] Format Codebase --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index c5b22a705..6a34587dd 100755 --- a/main.py +++ b/main.py @@ -501,7 +501,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - Batch size could be adjusted using `--batch_size=#`, where `#` is the desired batch size. This option works only if the implementation in use is supporting the given batch size.\n\n" if model == "rgat": info += f"{pre_space} - Add `--env.CM_DATASET_IGBH_PATH=` if you have already downloaded the dataset. The path will be automatically mounted when using docker run.\n\n" - info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. The path will be automatically mounted when using docker run.\n\n" + info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. The path will be automatically mounted when using docker run.\n\n" if implementation.lower() == "reference": info += f"{pre_space} - Add `--adr.mlperf-implementation.tags=_branch.master,_repo.` if you are modifying the official MLPerf Inference implementation in a custom fork.\n\n" info += f"{pre_space} - Add `--adr.inference-src.tags=_repo.` if you are modifying the model config accuracy script in the submission checker within a custom fork.\n\n"
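The `docs/system_requirements.yml` file introduced above is plain YAML keyed by model, implementation, and precision (values in GB), so the same data can be consumed programmatically, for example when pre-checking a host before a run. A small sketch follows, assuming PyYAML is available; the helper name and usage are illustrative rather than part of the repository.

```python
# Illustrative only: read docs/system_requirements.yml and report the minimum
# host requirements for a given model/implementation/precision combination.
import yaml

def minimum_requirements(model, implementation="reference", precision="fp32",
                         path="docs/system_requirements.yml"):
    with open(path) as handle:
        requirements = yaml.safe_load(handle)
    return requirements[model][implementation][precision]

if __name__ == "__main__":
    # For the values added in the docs patch, this prints something like
    # {'system_memory': 768, 'accelerator_memory': 8, 'disk_storage': 2300}
    print(minimum_requirements("rgat"))
```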