From a536cd2be1f039a43e59b10384d4d2fc17583c62 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 20 Jun 2024 01:42:39 -0500 Subject: [PATCH 1/4] Update for v4.1: Add new seeds + update checker + update compliance test table (#1736) --- compliance/nvidia/README.md | 5 +- mlperf.conf | 12 +- text_to_image/tools/sample_ids.py | 2 +- text_to_image/tools/sample_ids.txt | 20 +-- tools/submission/submission_checker.py | 234 +++++++++++++++++++++++-- 5 files changed, 239 insertions(+), 34 deletions(-) diff --git a/compliance/nvidia/README.md b/compliance/nvidia/README.md index e3a751e983..6306761807 100755 --- a/compliance/nvidia/README.md +++ b/compliance/nvidia/README.md @@ -37,5 +37,6 @@ The `run_verification.py` found in each test directory will copy the test files | 3d-unet | [TEST01](./TEST01/), [TEST05](./TEST05/) | | rnnt | [TEST01](./TEST01/), [TEST05](./TEST05/) | | gpt-j | - | -| stable-diffusion-xl | - | -| Llama2-70b | [TEST06]() | +| stable-diffusion-xl | [TEST01](./TEST01/), [TEST04](./TEST04/), [TEST05](./TEST05/) | +| Llama2-70b | [TEST06](./TEST06/) | +| mixtral-8x7b | [TEST06](./TEST06/) | diff --git a/mlperf.conf b/mlperf.conf index 5a3c78b22f..6487bfc238 100644 --- a/mlperf.conf +++ b/mlperf.conf @@ -19,13 +19,13 @@ stable-diffusion-xl.*.performance_sample_count_override = 5000 3d-unet.*.performance_sample_count_override = 0 # Set seeds. The seeds will be distributed two weeks before the submission. -*.*.qsl_rng_seed = 13281865557512327830 -*.*.sample_index_rng_seed = 198141574272810017 -*.*.schedule_rng_seed = 7575108116881280410 +*.*.qsl_rng_seed = 3066443479025735752 +*.*.sample_index_rng_seed = 10688027786191513374 +*.*.schedule_rng_seed = 14962580496156340209 # Set seeds for TEST_05. The seeds will be distributed two weeks before the submission. 
-*.*.test05_qsl_rng_seed = 2376919268182438552 -*.*.test05_sample_index_rng_seed = 11176391829184272374 -*.*.test05_schedule_rng_seed = 3911940905271271337 +*.*.test05_qsl_rng_seed = 16799458546791641818 +*.*.test05_sample_index_rng_seed = 5453809927556429288 +*.*.test05_schedule_rng_seed = 5435552105434836064 *.SingleStream.target_latency_percentile = 90 diff --git a/text_to_image/tools/sample_ids.py b/text_to_image/tools/sample_ids.py index e1d6effb4b..8c440ec5b3 100644 --- a/text_to_image/tools/sample_ids.py +++ b/text_to_image/tools/sample_ids.py @@ -16,7 +16,7 @@ def get_args(): "--n", type=int, default=10, help="Dataset download location" ) parser.add_argument( - "--seed", "-s", type=int, default=926019364, help="Dataset download location" + "--seed", "-s", type=int, default=633994880, help="Dataset download location" ) args = parser.parse_args() return args diff --git a/text_to_image/tools/sample_ids.txt b/text_to_image/tools/sample_ids.txt index 65c9f5641a..8bf3d2be8d 100644 --- a/text_to_image/tools/sample_ids.txt +++ b/text_to_image/tools/sample_ids.txt @@ -1,10 +1,10 @@ -4459 -4015 -2705 -1682 -4048 -4683 -3757 -1578 -3319 -95 \ No newline at end of file +4655 +2569 +1303 +109 +4509 +3009 +2179 +1826 +2094 +3340 \ No newline at end of file diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 451cd66b6a..9263a5f687 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -186,6 +186,169 @@ "stable-diffusion-xl": {"SingleStream": 1024, "Server": 270336, "Offline": 1} }, }, + "v4.1": { + "models": [ + "resnet", + "retinanet", + "bert-99", + "bert-99.9", + "dlrm-v2-99", + "dlrm-v2-99.9", + "3d-unet-99", + "3d-unet-99.9", + "gptj-99", + "gptj-99.9", + "llama2-70b-99", + "llama2-70b-99.9", + "stable-diffusion-xl", + "mixtral-8x7b" + ], + "required-scenarios-datacenter": { + "resnet": ["Server", "Offline"], + "retinanet": ["Server", "Offline"], + "bert-99": ["Server", 
"Offline"], + "bert-99.9": ["Server", "Offline"], + "dlrm-v2-99": ["Server", "Offline"], + "dlrm-v2-99.9": ["Server", "Offline"], + "3d-unet-99": ["Offline"], + "3d-unet-99.9": ["Offline"], + "gptj-99": ["Server", "Offline"], + "gptj-99.9": ["Server", "Offline"], + "llama2-70b-99": ["Server", "Offline"], + "llama2-70b-99.9": ["Server", "Offline"], + "stable-diffusion-xl": ["Server", "Offline"], + "mixtral-8x7b": ["Server", "Offline"] + }, + "optional-scenarios-datacenter": {}, + "required-scenarios-edge": { + "resnet": ["SingleStream", "MultiStream", "Offline"], + "retinanet": ["SingleStream", "MultiStream", "Offline"], + "bert-99": ["SingleStream", "Offline"], + "3d-unet-99": ["SingleStream", "Offline"], + "3d-unet-99.9": ["SingleStream", "Offline"], + "gptj-99": ["SingleStream", "Offline"], + "gptj-99.9": ["SingleStream", "Offline"], + "stable-diffusion-xl": ["SingleStream", "Offline"], + }, + "optional-scenarios-edge": {}, + "required-scenarios-datacenter-edge": { + "resnet": ["SingleStream", "Offline", "MultiStream", "Server"], + "retinanet": ["SingleStream", "Offline", "MultiStream", "Server"], + "bert-99": ["SingleStream", "Offline", "Server"], + "bert-99.9": ["Offline", "Server"], + "dlrm-v2-99": ["Offline", "Server"], + "dlrm-v2-99.9": ["Offline", "Server"], + "3d-unet-99": ["SingleStream", "Offline"], + "3d-unet-99.9": ["SingleStream", "Offline"], + "gptj-99": ["SingleStream", "Offline", "Server"], + "gptj-99.9": ["SingleStream", "Offline", "Server"], + "llama2-70b-99": ["Server", "Offline"], + "llama2-70b-99.9": ["Server", "Offline"], + "stable-diffusion-xl": ["SingleStream", "Offline", "Server"], + "mixtral-8x7b": ["SingleStream", "Server", "Offline"] + }, + "optional-scenarios-datacenter-edge": {}, + "accuracy-target": { + "resnet": ("acc", 76.46 * 0.99), + "retinanet": ("mAP", 37.55 * 0.99), + "bert-99": ("F1", 90.874 * 0.99), + "bert-99.9": ("F1", 90.874 * 0.999), + "dlrm-v2-99": ("AUC", 80.31 * 0.99), + "dlrm-v2-99.9": ("AUC", 80.31 * 0.999), +
"3d-unet-99": ("DICE", 0.86170 * 0.99), + "3d-unet-99.9": ("DICE", 0.86170 * 0.999), + "gptj-99" : ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878*0.9), + "gptj-99.9" : ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878*0.9), + "llama2-70b-99" : ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45*0.9), + "llama2-70b-99.9" : ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45*0.9), + "stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758), + # TODO: Mixtral metrics + # "mixtral-8x7b" : ("ROUGE1", X * 0.99, "ROUGE2", X * 0.99, "ROUGEL", X * 0.99, "TOKENS_PER_SAMPLE", X * 0.9, "gsm8k_accuracy", 73.78*0.99, "mbxp_accuracy", 60.12 * 0.99), + }, + "accuracy-upper-limit": { + "stable-diffusion-xl": ("CLIP_SCORE", 31.81331801, "FID_SCORE", 23.95007626), + "llama2-70b-99" : ("TOKENS_PER_SAMPLE", 294.45*1.1), + "llama2-70b-99.9" : ("TOKENS_PER_SAMPLE", 294.45*1.1) + # "mixtral-8x7b" :("TOKENS_PER_SAMPLE", X * 1.1) + }, + "performance-sample-count": { + "resnet": 1024, + "retinanet": 64, + "bert-99": 10833, + "bert-99.9": 10833, + "dlrm-v2-99": 204800, + "dlrm-v2-99.9": 204800, + "3d-unet-99": 43, + "3d-unet-99.9": 43, + "gptj-99": 13368, + "gptj-99.9": 13368, + "llama2-70b-99": 24576, + "llama2-70b-99.9": 24576, + "stable-diffusion-xl": 5000, + "mixtral-8x7b": 15000, + }, + # TODO: Update this list.
+ "model_mapping": { + # map model names to the official mlperf model class + "ssd-resnet34": "retinanet", + "mobilenet": "resnet", + "resnet50": "resnet" + }, + "seeds": { + # TODO: Update random seeds + "qsl_rng_seed": 3066443479025735752, + "sample_index_rng_seed": 10688027786191513374, + "schedule_rng_seed": 14962580496156340209, + }, + "test05_seeds": { + # TODO: Update random seeds + "qsl_rng_seed": 16799458546791641818, + "sample_index_rng_seed": 5453809927556429288, + "schedule_rng_seed": 5435552105434836064, + }, + "ignore_errors": [], + "latency-constraint": { + "resnet": {"Server": 15000000}, + "retinanet": {"Server": 100000000}, + "bert-99": {"Server": 130000000}, + "bert-99.9": {"Server": 130000000}, + "dlrm-v2-99": {"Server": 60000000}, + "dlrm-v2-99.9": {"Server": 60000000}, + "gptj-99": {"Server": 20000000000}, + "gptj-99.9": {"Server": 20000000000}, + "llama2-70b-99": {"Server": 20000000000}, + "llama2-70b-99.9": {"Server": 20000000000}, + "stable-diffusion-xl" : {"Server": 20000000000} + # TODO: Mixtral metrics + # "mixtral-8x7b" : {"Server": 20000000000} + }, + "min-queries": { + "resnet": { + "SingleStream": 1024, + "MultiStream": 270336, + "Server": 270336, + "Offline": 1, + }, + "retinanet": { + "SingleStream": 1024, + "MultiStream": 270336, + "Server": 270336, + "Offline": 1, + }, + "bert-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "bert-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "dlrm-v2-99": {"Server": 270336, "Offline": 1}, + "dlrm-v2-99.9": {"Server": 270336, "Offline": 1}, + "3d-unet-99": {"SingleStream": 1024, "Offline": 1}, + "3d-unet-99.9": {"SingleStream": 1024, "Offline": 1}, + "gptj-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "stable-diffusion-xl": 
{"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + }, + }, } VALID_DIVISIONS = ["open", "closed", "network"] @@ -221,6 +384,20 @@ "3319", "95" ] + }, + "v4.1": { + "images": [ + "4655", + "2569", + "1303", + "109", + "4509", + "3009", + "2179", + "1826", + "2094", + "3340" + ] } } } @@ -255,7 +432,8 @@ "gptj-99.9": 13368, "llama2-70b-99": 24576, "llama2-70b-99.9": 24576, - "stable-diffusion-xl": 5000 + "stable-diffusion-xl": 5000, + "mixtral-8x7b": 15000 } SCENARIO_MAPPING = { @@ -302,8 +480,8 @@ }, "v4.1": { "llama2-70b-99": { - "Offline": "result_tokens_per_second", - "Server": "result_completed_tokens_per_second", + "Offline": "result_tokens_per_second", + "Server": "result_completed_tokens_per_second", }, "llama2-70b-99.9": { "Offline": "result_tokens_per_second", @@ -316,16 +494,33 @@ "gptj-99.9": { "Offline": "result_inferred_tokens_per_second", "Server": "result_inferred_completed_tokens_per_second", + }, + "mixtral-8x7b": { + "Offline": "result_tokens_per_second", + "Server": "result_completed_tokens_per_second", } } } -LLAMA2_LATENCY_LIMITS = { - # We might add interactive in the next round. 
Latency in ns - "conversational": { - "ttft": 2000 * 1000000, - "tpot": 200 * 1000000 - } +LLM_LATENCY_LIMITS = { + "llama2-70b-99":{ + "conversational": { + "ttft": 2000 * 1000000, + "tpot": 200 * 1000000 + } + }, + "llama2-70b-99.9":{ + "conversational": { + "ttft": 2000 * 1000000, + "tpot": 200 * 1000000 + } + }, + # "mixtral-8x7b":{ + # "conversational": { + # "ttft": 2000 * 1000000, + # "tpot": 200 * 1000000 + # } + # } } ACC_PATTERN = { @@ -799,13 +994,13 @@ def check_accuracy_dir(config, model, path, verbose): return is_valid, result_acc -def extra_check_llama2(mlperf_log, scenario): +def extra_check_llm(mlperf_log, scenario, model): if (mlperf_log["requested_use_token_latencies"]): if scenario == "Offline": # For offline no further checks are necessary return None, True else: - for constraint, limits in LLAMA2_LATENCY_LIMITS.items(): + for constraint, limits in LLM_LATENCY_LIMITS[model].items(): if mlperf_log["result_first_token_99.00_percentile_latency_ns"] < limits["ttft"] and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] < limits["tpot"]: return constraint, True else: @@ -867,8 +1062,8 @@ def check_performance_dir( res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version][model][scenario]]) - if model in ["llama2-70b-99", "llama2-70b-99.9"]: - llama_constraint, is_valid = extra_check_llama2(mlperf_log, scenario_fixed) + if model in ["llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b"]: + llama_constraint, is_valid = extra_check_llm(mlperf_log, scenario_fixed, model) latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"] latency_mean = mlperf_log["result_mean_latency_ns"] @@ -2344,8 +2539,7 @@ def check_compliance_dir( "gptj-99.9", "llama2-70b-99", "llama2-70b-99.9", - "stable-diffusion-xl" - + "mixtral-8x7b" ]: test_list.remove("TEST04") @@ -2355,13 +2549,23 @@ def check_compliance_dir( "llama2-70b-99", "llama2-70b-99.9", "stable-diffusion-xl" + "mixtral-8x7b" ]: test_list.remove("TEST05") + + if model in [ + 
"gptj-99", + "gptj-99.9", + "llama2-70b-99", + "llama2-70b-99.9", + "mixtral-8x7b" + ]: test_list.remove("TEST01") if model in [ "llama2-70b-99", "llama2-70b-99.9", + "mixtral-8x7b" ]: test_list.append("TEST06") From 84e77719f117149940e326e92acd4f7d885090e8 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 20 Jun 2024 20:21:45 +0100 Subject: [PATCH 2/4] Fix build --sdist for loadgen (#1732) * Fix build --sdist for loadgen * Update build_wheels.yml * Update build_wheels.yml * Update build_wheels.yml * Update setup.py * Simplify the github actions based on latest CM * Simplify the github actions based on latest CM * Fix the version of submission checker in the github action --- .github/workflows/build_wheels.yml | 8 +++++++- .github/workflows/test-bert.yml | 6 ++---- .github/workflows/test-loadgen.yml | 5 ++--- .github/workflows/test-resnet50.yml | 6 ++---- .github/workflows/test-retinanet.yml | 6 ++---- .github/workflows/test-submission-checker.yml | 2 +- .github/workflows/test-tvm.yml | 6 ++---- loadgen/pyproject.toml | 2 +- loadgen/setup.py | 12 +++++++++--- loadgen/version_generator.py | 1 - 10 files changed, 28 insertions(+), 26 deletions(-) diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml index 3c14198e81..30a88f11b2 100644 --- a/.github/workflows/build_wheels.yml +++ b/.github/workflows/build_wheels.yml @@ -3,12 +3,18 @@ name: Build loadgen wheels and release them into PYPI on: release: types: [published] + push: + branches: + - master + paths: + - loadgen/setup.py jobs: build_wheels: name: Build wheels on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macOS-latest] @@ -18,7 +24,7 @@ jobs: - uses: actions/setup-python@v3 - name: Install requirements - run: python -m pip install cibuildwheel==2.16.2 twine==4.0.2 + run: python -m pip install cibuildwheel twine - name: Build wheels run: python -m cibuildwheel loadgen/ --output-dir wheels diff --git 
a/.github/workflows/test-bert.yml b/.github/workflows/test-bert.yml index fb5ba3e515..6f6d77a397 100755 --- a/.github/workflows/test-bert.yml +++ b/.github/workflows/test-bert.yml @@ -30,9 +30,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install cmind - cm pull repo mlcommons@ck - cm run script --quiet --tags=get,sys-utils-cm + python3 -m pip install cm4mlops - name: Test BERT and end to end submission generation run: | - cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=bert-99 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} --target_qps=1 + cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=bert-99 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} diff --git a/.github/workflows/test-loadgen.yml b/.github/workflows/test-loadgen.yml index 125825e390..b010f82584 100755 --- a/.github/workflows/test-loadgen.yml +++ b/.github/workflows/test-loadgen.yml @@ -28,8 +28,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install cmind - cm pull repo mlcommons@ck + python3 -m pip install cm4mlops - name: Test Loadgen run: | - cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} 
--adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} + cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} diff --git a/.github/workflows/test-resnet50.yml b/.github/workflows/test-resnet50.yml index ab14d4ca50..b5d09a66ad 100755 --- a/.github/workflows/test-resnet50.yml +++ b/.github/workflows/test-resnet50.yml @@ -30,9 +30,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install cmind - cm pull repo mlcommons@ck - cm run script --quiet --tags=get,sys-utils-cm + python3 -m pip install cm4mlops - name: Test Resnet50 and end to end submission generation run: | - cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} + cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom diff --git a/.github/workflows/test-retinanet.yml b/.github/workflows/test-retinanet.yml index 3afcf086d2..20b05cc25f 100755 --- a/.github/workflows/test-retinanet.yml +++ b/.github/workflows/test-retinanet.yml @@ -30,9 +30,7 @@ jobs: 
python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install cmind - cm pull repo mlcommons@ck - cm run script --quiet --tags=get,sys-utils-cm + python3 -m pip install cm4mlops - name: Test Retinanet and end to end submission generation run: | - cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} + cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} diff --git a/.github/workflows/test-submission-checker.yml b/.github/workflows/test-submission-checker.yml index 14ac907230..2d1b129034 100644 --- a/.github/workflows/test-submission-checker.yml +++ b/.github/workflows/test-submission-checker.yml @@ -33,4 +33,4 @@ jobs: git clone https://github.com/mlcommons/inference_results_v4.0 --depth 1 - name: Test MLPerf inference submission checker run: | - cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --quiet + cm run script --tags=run,mlperf,inference,submission,checker 
--adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --version=r4.0 --quiet diff --git a/.github/workflows/test-tvm.yml b/.github/workflows/test-tvm.yml index fc57c88120..b132fc0025 100755 --- a/.github/workflows/test-tvm.yml +++ b/.github/workflows/test-tvm.yml @@ -30,9 +30,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python3 -m pip install cmind - cm pull repo mlcommons@ck - cm run script --quiet --tags=get,sys-utils-cm + python3 -m pip install cm4mlops - name: Test Resnet50 TVM backend run: | - cm run script --tags=run,mlperf,inference,generate-run-cmds --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=5 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} --target_qps=1 + cm run script --tags=run,mlperf,inference,generate-run-cmds --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=5 --adr.compiler.tags=gcc --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }} diff --git a/loadgen/pyproject.toml b/loadgen/pyproject.toml index f62b2c5f7a..9be24eea6d 100755 --- a/loadgen/pyproject.toml +++ b/loadgen/pyproject.toml @@ -4,4 +4,4 @@ build-backend = "setuptools.build_meta:__legacy__" [tool.cibuildwheel] environment = "CFLAGS='-std=c++14'" -build = "cp3{7,8,9,10,11}-*" +build = "cp3{7,8,9,10,11,12}-*" diff --git a/loadgen/setup.py 
b/loadgen/setup.py index d7b8224de4..8dfc5b9f0e 100644 --- a/loadgen/setup.py +++ b/loadgen/setup.py @@ -40,6 +40,9 @@ "query_sample_library.h", "system_under_test.h", "test_settings.h", + "issue_query_controller.h", + "early_stopping.h", + "query_dispatch_library.h", ] lib_headers = [ @@ -49,6 +52,8 @@ "utils.h", "version.h", "results.h", + "bindings/c_api.h", + "version_generator.py" ] lib_sources = [ @@ -63,6 +68,7 @@ ] lib_bindings = [ + "bindings/c_api.cc", "bindings/python_api.cc", ] @@ -76,13 +82,13 @@ mlperf_loadgen_module = Pybind11Extension( "mlperf_loadgen", - define_macros=[("MAJOR_VERSION", "4"), ("MINOR_VERSION", "0")], + define_macros=[("MAJOR_VERSION", "4"), ("MINOR_VERSION", "1")], include_dirs=[".", get_include()], sources=mlperf_loadgen_sources, depends=mlperf_loadgen_headers) -setup(name="mlperf_loadgen", - version="4.0", +setup(name="mlcommons_loadgen", + version="4.1", description="MLPerf Inference LoadGen python bindings", url="https://mlcommons.org/", cmdclass={"build_ext": build_ext}, diff --git a/loadgen/version_generator.py b/loadgen/version_generator.py index 4de930a633..2c5d11331a 100644 --- a/loadgen/version_generator.py +++ b/loadgen/version_generator.py @@ -69,7 +69,6 @@ def generate_loadgen_version_definitions_sha1(ofile, loadgen_root): sha1s = "" loadgen_files = ( ["/bindings/" + s for s in os.listdir(loadgen_root + "/bindings")] + - ["/demos/" + s for s in os.listdir(loadgen_root + "/demos")] + ["/" + s for s in os.listdir(loadgen_root)]) for fn in sorted(loadgen_files): full_fn = loadgen_root + fn From 53f6475910954facecdce05399d0976e2bbcc931 Mon Sep 17 00:00:00 2001 From: Anton Lokhmotov Date: Thu, 20 Jun 2024 23:04:48 +0100 Subject: [PATCH 3/4] Set v4.1 as default version for submission checker. 
(#1737) --- tools/submission/submission_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 9263a5f687..b15a8596fd 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -783,7 +783,7 @@ def get_args(): parser.add_argument("--input", required=True, help="submission directory") parser.add_argument( "--version", - default="v4.0", + default="v4.1", choices=list(MODEL_CONFIG.keys()), help="mlperf version", ) From 2a46c7afb88b89a6c5def96953902793996ee37b Mon Sep 17 00:00:00 2001 From: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:54:53 +0200 Subject: [PATCH 4/4] Small fixes in MoE eval script (#1743) * Removed suffixes "_accuracy" * Removed redundant code * Similarly to llama we only check rouge1,rouge2 and rougeL scores --- language/mixtral-8x7b/README.md | 6 +++--- language/mixtral-8x7b/evaluate-accuracy.py | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/language/mixtral-8x7b/README.md b/language/mixtral-8x7b/README.md index 20cfe67c90..69f8035770 100644 --- a/language/mixtral-8x7b/README.md +++ b/language/mixtral-8x7b/README.md @@ -247,15 +247,15 @@ python -u evaluate-accuracy.py --checkpoint-path mistralai/Mixtral-8x7B-instruct Reference scores: Open Orca: ```json -{'rouge1': 45.4911, 'rouge2': 23.2829, 'rougeL': 30.3615, 'rougeLsum': 42.4333} +{'rouge1': 45.4911, 'rouge2': 23.2829, 'rougeL': 30.3615} ``` GSM8K: ```json -{'gsm8k_accuracy': 73.78} +{'gsm8k': 73.78} ``` MBXP: ```json -{'mbxp_accuracy': 60.16} +{'mbxp': 60.16} ``` For official submissions, 99% of each reference score is enforced. 
Additionally, 90%-110% of the generated tokens_per_samples: ```json diff --git a/language/mixtral-8x7b/evaluate-accuracy.py b/language/mixtral-8x7b/evaluate-accuracy.py index e20834c412..d25b8178ba 100644 --- a/language/mixtral-8x7b/evaluate-accuracy.py +++ b/language/mixtral-8x7b/evaluate-accuracy.py @@ -193,11 +193,11 @@ def main(): continue correct += (ref == tgt) - gsm8k_accuracy = 100.0 * correct / gsm8k_total + result['gsm8k'] = 100.0 * correct / gsm8k_total # MBXP metric from evaluate_mbxp import evaluate_mbxp - mbxp_accuracy = evaluate_mbxp(results_MBXP, args.n_workers) + result['mbxp'] = evaluate_mbxp(results_MBXP, args.n_workers) result = { **result, @@ -205,8 +205,6 @@ def main(): 'gen_num': gen_num, 'gen_tok_len': gen_tok_len, 'tokens_per_sample': round(gen_tok_len / gen_num, 1), - 'gsm8k_accuracy': gsm8k_accuracy, - 'mbxp_accuracy': mbxp_accuracy } print("\nResults\n")