From a536cd2be1f039a43e59b10384d4d2fc17583c62 Mon Sep 17 00:00:00 2001
From: Pablo Gonzalez <pablo.gonzalez@factored.ai>
Date: Thu, 20 Jun 2024 01:42:39 -0500
Subject: [PATCH 1/4] Update for v4.1: Add new seeds + update checker + update
 compliance test table (#1736)

---
 compliance/nvidia/README.md            |   5 +-
 mlperf.conf                            |  12 +-
 text_to_image/tools/sample_ids.py      |   2 +-
 text_to_image/tools/sample_ids.txt     |  20 +--
 tools/submission/submission_checker.py | 234 +++++++++++++++++++++++--
 5 files changed, 239 insertions(+), 34 deletions(-)

diff --git a/compliance/nvidia/README.md b/compliance/nvidia/README.md
index e3a751e983..6306761807 100755
--- a/compliance/nvidia/README.md
+++ b/compliance/nvidia/README.md
@@ -37,5 +37,6 @@ The `run_verification.py` found in each test directory will copy the test files
 | 3d-unet | [TEST01](./TEST01/), [TEST05](./TEST05/) |
 | rnnt | [TEST01](./TEST01/), [TEST05](./TEST05/) |
 | gpt-j | - |
-| stable-diffusion-xl | - |
-| Llama2-70b | [TEST06]() |
+| stable-diffusion-xl | [TEST01](./TEST01/), [TEST04](./TEST04/), [TEST05](./TEST05/) |
+| Llama2-70b | [TEST06](./TEST06/) |
+| mixtral-8x7b | [TEST06](./TEST06/) |
diff --git a/mlperf.conf b/mlperf.conf
index 5a3c78b22f..6487bfc238 100644
--- a/mlperf.conf
+++ b/mlperf.conf
@@ -19,13 +19,13 @@ stable-diffusion-xl.*.performance_sample_count_override = 5000
 3d-unet.*.performance_sample_count_override = 0
 
 # Set seeds. The seeds will be distributed two weeks before the submission.
-*.*.qsl_rng_seed = 13281865557512327830
-*.*.sample_index_rng_seed = 198141574272810017
-*.*.schedule_rng_seed = 7575108116881280410
+*.*.qsl_rng_seed = 3066443479025735752
+*.*.sample_index_rng_seed = 10688027786191513374
+*.*.schedule_rng_seed = 14962580496156340209
 # Set seeds for TEST_05. The seeds will be distributed two weeks before the submission.
-*.*.test05_qsl_rng_seed = 2376919268182438552
-*.*.test05_sample_index_rng_seed = 11176391829184272374
-*.*.test05_schedule_rng_seed = 3911940905271271337
+*.*.test05_qsl_rng_seed = 16799458546791641818
+*.*.test05_sample_index_rng_seed = 5453809927556429288
+*.*.test05_schedule_rng_seed = 5435552105434836064
 
 
 *.SingleStream.target_latency_percentile = 90
diff --git a/text_to_image/tools/sample_ids.py b/text_to_image/tools/sample_ids.py
index e1d6effb4b..8c440ec5b3 100644
--- a/text_to_image/tools/sample_ids.py
+++ b/text_to_image/tools/sample_ids.py
@@ -16,7 +16,7 @@ def get_args():
         "--n", type=int, default=10, help="Dataset download location"
     )
     parser.add_argument(
-        "--seed", "-s", type=int, default=926019364, help="Dataset download location"
+        "--seed", "-s", type=int, default=633994880, help="Dataset download location"
     )
     args = parser.parse_args()
     return args
diff --git a/text_to_image/tools/sample_ids.txt b/text_to_image/tools/sample_ids.txt
index 65c9f5641a..8bf3d2be8d 100644
--- a/text_to_image/tools/sample_ids.txt
+++ b/text_to_image/tools/sample_ids.txt
@@ -1,10 +1,10 @@
-4459
-4015
-2705
-1682
-4048
-4683
-3757
-1578
-3319
-95
\ No newline at end of file
+4655
+2569
+1303
+109
+4509
+3009
+2179
+1826
+2094
+3340
\ No newline at end of file
diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index 451cd66b6a..9263a5f687 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -186,6 +186,169 @@
             "stable-diffusion-xl": {"SingleStream": 1024, "Server": 270336, "Offline": 1}
         },
     },
+    "v4.1": {
+        "models": [
+            "resnet",
+            "retinanet",
+            "bert-99",
+            "bert-99.9",
+            "dlrm-v2-99",
+            "dlrm-v2-99.9",
+            "3d-unet-99",
+            "3d-unet-99.9",
+            "gptj-99",
+            "gptj-99.9",
+            "llama2-70b-99",
+            "llama2-70b-99.9",
+            "stable-diffusion-xl",
+            "mixtral-8x7b"
+        ],
+        "required-scenarios-datacenter": {
+            "resnet": ["Server", "Offline"],
+            "retinanet": ["Server", "Offline"],
+            "bert-99": ["Server", "Offline"],
+            "bert-99.9": ["Server", "Offline"],
+            "dlrm-v2-99": ["Server", "Offline"],
+            "dlrm-v2-99.9": ["Server", "Offline"],
+            "3d-unet-99": ["Offline"],
+            "3d-unet-99.9": ["Offline"],
+            "gptj-99": ["Server", "Offline"],
+            "gptj-99.9": ["Server", "Offline"],
+            "llama2-70b-99": ["Server", "Offline"],
+            "llama2-70b-99.9": ["Server", "Offline"],
+            "stable-diffusion-xl": ["Server", "Offline"],
+            "mixtral-8x7b": ["Server", "Offline"]
+        },
+        "optional-scenarios-datacenter": {},
+        "required-scenarios-edge": {
+            "resnet": ["SingleStream", "MultiStream", "Offline"],
+            "retinanet": ["SingleStream", "MultiStream", "Offline"],
+            "bert-99": ["SingleStream", "Offline"],
+            "3d-unet-99": ["SingleStream", "Offline"],
+            "3d-unet-99.9": ["SingleStream", "Offline"],
+            "gptj-99": ["SingleStream", "Offline"],
+            "gptj-99.9": ["SingleStream", "Offline"],
+            "stable-diffusion-xl": ["SingleStream", "Offline"],
+        },
+        "optional-scenarios-edge": {},
+        "required-scenarios-datacenter-edge": {
+            "resnet": ["SingleStream", "Offline", "MultiStream", "Server"],
+            "retinanet": ["SingleStream", "Offline", "MultiStream", "Server"],
+            "bert-99": ["SingleStream", "Offline", "Server"],
+            "bert-99.9": ["Offline", "Server"],
+            "dlrm-v2-99": ["Offline", "Server"],
+            "dlrm-v2-99.9": ["Offline", "Server"],
+            "3d-unet-99": ["SingleStream", "Offline"],
+            "3d-unet-99.9": ["SingleStream", "Offline"],
+            "gptj-99": ["SingleStream", "Offline", "Server"],
+            "gptj-99.9": ["SingleStream", "Offline", "Server"],
+            "llama2-70b-99": ["Server", "Offline"],
+            "llama2-70b-99.9": ["Server", "Offline"],
+            "stable-diffusion-xl": ["SingleStream", "Offline", "Server"],
+            "mixtral-8x7b": ["SingleStream", "Server", "Offline"]
+        },
+        "optional-scenarios-datacenter-edge": {},
+        "accuracy-target": {
+            "resnet": ("acc", 76.46 * 0.99),
+            "retinanet": ("mAP", 37.55 * 0.99),
+            "bert-99": ("F1", 90.874 * 0.99),
+            "bert-99.9": ("F1", 90.874 * 0.999),
+            "dlrm-v2-99": ("AUC", 80.31 * 0.99),
+            "dlrm-v2-99.9": ("AUC", 80.31 * 0.999),
+            "3d-unet-99": ("DICE", 0.86170 * 0.99),
+            "3d-unet-99.9": ("DICE", 0.86170 * 0.999),
+            "gptj-99" : ("ROUGE1", 42.9865 * 0.99, "ROUGE2", 20.1235 * 0.99, "ROUGEL", 29.9881 * 0.99, "GEN_LEN", 4016878*0.9),
+            "gptj-99.9" : ("ROUGE1", 42.9865 * 0.999, "ROUGE2", 20.1235 * 0.999, "ROUGEL", 29.9881 * 0.999, "GEN_LEN", 4016878*0.9),
+            "llama2-70b-99" : ("ROUGE1", 44.4312 * 0.99, "ROUGE2", 22.0352 * 0.99, "ROUGEL", 28.6162 * 0.99, "TOKENS_PER_SAMPLE", 294.45*0.9),
+            "llama2-70b-99.9" : ("ROUGE1", 44.4312 * 0.999, "ROUGE2", 22.0352 * 0.999, "ROUGEL", 28.6162 * 0.999, "TOKENS_PER_SAMPLE", 294.45*0.9),
+            "stable-diffusion-xl": ("CLIP_SCORE", 31.68631873, "FID_SCORE", 23.01085758),
+            # TODO: Mixtral metrics
+            # "mixtral-8x7b" : ("ROUGE1", X * 0.99, "ROUGE2", X * 0.99, "ROUGEL", X * 0.99, "TOKENS_PER_SAMPLE", X * 0.9, "gsm8k_accuracy", 73.78 * 0.99, "mbxp_accuracy", 60.12 * 0.99),
+        },
+        "accuracy-upper-limit": {
+            "stable-diffusion-xl": ("CLIP_SCORE", 31.81331801, "FID_SCORE", 23.95007626),
+            "llama2-70b-99" : ("TOKENS_PER_SAMPLE", 294.45*1.1),
+            "llama2-70b-99.9" : ("TOKENS_PER_SAMPLE", 294.45*1.1)
+            # "mixtral-8x7b" : ("TOKENS_PER_SAMPLE", X * 1.1)
+        },
+        "performance-sample-count": {
+            "resnet": 1024,
+            "retinanet": 64,
+            "bert-99": 10833,
+            "bert-99.9": 10833,
+            "dlrm-v2-99": 204800,
+            "dlrm-v2-99.9": 204800,
+            "3d-unet-99": 43,
+            "3d-unet-99.9": 43,
+            "gptj-99": 13368,
+            "gptj-99.9": 13368,
+            "llama2-70b-99": 24576,
+            "llama2-70b-99.9": 24576,
+            "stable-diffusion-xl": 5000,
+            "mixtral-8x7b": 15000,
+        },
+        # TODO: Update this list.
+        "model_mapping": {
+            # map model names to the official mlperf model class
+            "ssd-resnet34": "retinanet",
+            "mobilenet": "resnet",
+            "resnet50": "resnet"
+        },
+        "seeds": {
+            # v4.1 seeds; same values as the *.*.<name> entries in mlperf.conf
+            "qsl_rng_seed": 3066443479025735752,
+            "sample_index_rng_seed": 10688027786191513374,
+            "schedule_rng_seed": 14962580496156340209,
+        },
+        "test05_seeds": {
+            # v4.1 TEST05 seeds; same values as the *.*.test05_<name> entries in mlperf.conf
+            "qsl_rng_seed": 16799458546791641818,
+            "sample_index_rng_seed": 5453809927556429288,
+            "schedule_rng_seed": 5435552105434836064,
+        },
+        "ignore_errors": [],
+        "latency-constraint": {
+            "resnet": {"Server": 15000000},
+            "retinanet": {"Server": 100000000},
+            "bert-99": {"Server": 130000000},
+            "bert-99.9": {"Server": 130000000},
+            "dlrm-v2-99": {"Server": 60000000},
+            "dlrm-v2-99.9": {"Server": 60000000},
+            "gptj-99": {"Server": 20000000000},
+            "gptj-99.9": {"Server": 20000000000},
+            "llama2-70b-99": {"Server": 20000000000},
+            "llama2-70b-99.9": {"Server": 20000000000},
+            "stable-diffusion-xl" : {"Server": 20000000000}
+            # TODO: Mixtral metrics
+            # "mixtral-8x7b" : {"Server": 20000000000}
+        },
+        "min-queries": {
+            "resnet": {
+                "SingleStream": 1024,
+                "MultiStream": 270336,
+                "Server": 270336,
+                "Offline": 1,
+            },
+            "retinanet": {
+                "SingleStream": 1024,
+                "MultiStream": 270336,
+                "Server": 270336,
+                "Offline": 1,
+            },
+            "bert-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "bert-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "dlrm-v2-99": {"Server": 270336, "Offline": 1},
+            "dlrm-v2-99.9": {"Server": 270336, "Offline": 1},
+            "3d-unet-99": {"SingleStream": 1024, "Offline": 1},
+            "3d-unet-99.9": {"SingleStream": 1024, "Offline": 1},
+            "gptj-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "gptj-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "llama2-70b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "llama2-70b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "stable-diffusion-xl": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+            "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1},
+        },
+    },
 }
 
 VALID_DIVISIONS = ["open", "closed", "network"]
@@ -221,6 +384,20 @@
                 "3319",
                 "95"
             ]
+        },
+        "v4.1": {
+            "images": [
+                "4655",
+                "2569",
+                "1303",
+                "109",
+                "4509",
+                "3009",
+                "2179",
+                "1826",
+                "2094",
+                "3340"
+            ]
         }
     }
 }
@@ -255,7 +432,8 @@
     "gptj-99.9": 13368,
     "llama2-70b-99": 24576,
     "llama2-70b-99.9": 24576,
-    "stable-diffusion-xl": 5000
+    "stable-diffusion-xl": 5000,
+    "mixtral-8x7b": 15000
 }
 
 SCENARIO_MAPPING = {
@@ -302,8 +480,8 @@
     },
     "v4.1": {
         "llama2-70b-99": {
-        "Offline": "result_tokens_per_second",
-        "Server": "result_completed_tokens_per_second",
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
         },
         "llama2-70b-99.9": {
             "Offline": "result_tokens_per_second",
@@ -316,16 +494,33 @@
         "gptj-99.9": {
             "Offline": "result_inferred_tokens_per_second",
             "Server": "result_inferred_completed_tokens_per_second",
+        },
+        "mixtral-8x7b": {
+            "Offline": "result_tokens_per_second",
+            "Server": "result_completed_tokens_per_second",
         }
     }
 }
 
-LLAMA2_LATENCY_LIMITS = {
-    # We might add interactive in the next round. Latency in ns
-    "conversational": {
-        "ttft": 2000 * 1000000,
-        "tpot": 200 * 1000000
-    }
+LLM_LATENCY_LIMITS = {
+    "llama2-70b-99":{
+        "conversational": {
+            "ttft": 2000 * 1000000,
+            "tpot": 200 * 1000000
+        }
+    },
+    "llama2-70b-99.9":{
+        "conversational": {
+            "ttft": 2000 * 1000000,
+            "tpot": 200 * 1000000
+        }
+    },
+    "mixtral-8x7b":{
+        "conversational": {
+            "ttft": 2000 * 1000000,
+            "tpot": 200 * 1000000
+        }
+    }
 }
 
 ACC_PATTERN = {
@@ -799,13 +994,13 @@ def check_accuracy_dir(config, model, path, verbose):
     return is_valid, result_acc
 
 
-def extra_check_llama2(mlperf_log, scenario):
+def extra_check_llm(mlperf_log, scenario, model):
     if (mlperf_log["requested_use_token_latencies"]):
         if scenario == "Offline":
             # For offline no further checks are necessary
             return None, True
         else:
-            for constraint, limits in LLAMA2_LATENCY_LIMITS.items():
+            for constraint, limits in LLM_LATENCY_LIMITS[model].items():
                 if mlperf_log["result_first_token_99.00_percentile_latency_ns"] < limits["ttft"] and mlperf_log["result_time_per_output_token_99.00_percentile_ns"] < limits["tpot"]:
                     return constraint, True
     else:
@@ -867,8 +1062,8 @@ def check_performance_dir(
         res = float(mlperf_log[RESULT_FIELD_BENCHMARK_OVERWRITE[version][model][scenario]])
 
         
-    if model in ["llama2-70b-99", "llama2-70b-99.9"]:
-        llama_constraint, is_valid = extra_check_llama2(mlperf_log, scenario_fixed)
+    if model in ["llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b"]:
+        llama_constraint, is_valid = extra_check_llm(mlperf_log, scenario_fixed, model)
 
     latency_99_percentile = mlperf_log["result_99.00_percentile_latency_ns"]
     latency_mean = mlperf_log["result_mean_latency_ns"]
@@ -2344,8 +2539,7 @@ def check_compliance_dir(
         "gptj-99.9",
         "llama2-70b-99",
         "llama2-70b-99.9",
-        "stable-diffusion-xl"
-
+        "mixtral-8x7b"
     ]:
         test_list.remove("TEST04")
 
@@ -2355,13 +2549,23 @@ def check_compliance_dir(
         "llama2-70b-99",
         "llama2-70b-99.9",
-        "stable-diffusion-xl"
+        "stable-diffusion-xl",
+        "mixtral-8x7b"
     ]:
         test_list.remove("TEST05")
+
+    if model in [
+        "gptj-99",
+        "gptj-99.9",
+        "llama2-70b-99",
+        "llama2-70b-99.9",
+        "mixtral-8x7b"
+    ]:
         test_list.remove("TEST01") 
 
     if model in [
         "llama2-70b-99",
         "llama2-70b-99.9",
+        "mixtral-8x7b"
     ]:
         test_list.append("TEST06") 
 

From 84e77719f117149940e326e92acd4f7d885090e8 Mon Sep 17 00:00:00 2001
From: Arjun Suresh <arjunsuresh1987@gmail.com>
Date: Thu, 20 Jun 2024 20:21:45 +0100
Subject: [PATCH 2/4] Fix build --sdist for loadgen (#1732)

* Fix build --sdist for loadgen

* Update build_wheels.yml

* Update build_wheels.yml

* Update build_wheels.yml

* Update setup.py

* Simplify the github actions based on latest CM

* Simplify the github actions based on latest CM

* Fix the version of submission checker in the github action
---
 .github/workflows/build_wheels.yml            |  8 +++++++-
 .github/workflows/test-bert.yml               |  6 ++----
 .github/workflows/test-loadgen.yml            |  5 ++---
 .github/workflows/test-resnet50.yml           |  6 ++----
 .github/workflows/test-retinanet.yml          |  6 ++----
 .github/workflows/test-submission-checker.yml |  2 +-
 .github/workflows/test-tvm.yml                |  6 ++----
 loadgen/pyproject.toml                        |  2 +-
 loadgen/setup.py                              | 12 +++++++++---
 loadgen/version_generator.py                  |  1 -
 10 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
index 3c14198e81..30a88f11b2 100644
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@@ -3,12 +3,18 @@ name: Build loadgen wheels and release them into PYPI
 on:
   release:
     types: [published]
+  push:
+    branches:
+      - master
+    paths:
+      - loadgen/setup.py
 
 jobs:
   build_wheels:
     name: Build wheels on ${{ matrix.os }}
     runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, windows-latest, macOS-latest]
 
@@ -18,7 +24,7 @@ jobs:
       - uses: actions/setup-python@v3
 
       - name: Install requirements
-        run: python -m pip install cibuildwheel==2.16.2 twine==4.0.2
+        run: python -m pip install cibuildwheel twine
 
       - name: Build wheels
         run: python -m cibuildwheel loadgen/ --output-dir wheels
diff --git a/.github/workflows/test-bert.yml b/.github/workflows/test-bert.yml
index fb5ba3e515..6f6d77a397 100755
--- a/.github/workflows/test-bert.yml
+++ b/.github/workflows/test-bert.yml
@@ -30,9 +30,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python3 -m pip install cmind
-        cm pull repo mlcommons@ck
-        cm run script --quiet --tags=get,sys-utils-cm
+        python3 -m pip install cm4mlops
     - name: Test BERT and end to end submission generation
       run: |
-        cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=bert-99 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --adr.compiler.tags=gcc  --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} --target_qps=1
+        cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=bert-99 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --adr.compiler.tags=gcc  --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }}
diff --git a/.github/workflows/test-loadgen.yml b/.github/workflows/test-loadgen.yml
index 125825e390..b010f82584 100755
--- a/.github/workflows/test-loadgen.yml
+++ b/.github/workflows/test-loadgen.yml
@@ -28,8 +28,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python3 -m pip install cmind
-        cm pull repo mlcommons@ck
+        python3 -m pip install cm4mlops
     - name: Test Loadgen
       run: |
-        cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }}
+        cm run script --tags=get,mlperf,inference,loadgen --quiet --version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }}
diff --git a/.github/workflows/test-resnet50.yml b/.github/workflows/test-resnet50.yml
index ab14d4ca50..b5d09a66ad 100755
--- a/.github/workflows/test-resnet50.yml
+++ b/.github/workflows/test-resnet50.yml
@@ -30,9 +30,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python3 -m pip install cmind
-        cm pull repo mlcommons@ck
-        cm run script --quiet --tags=get,sys-utils-cm
+        python3 -m pip install cm4mlops
     - name: Test Resnet50 and end to end submission generation
       run: |
-        cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }}
+        cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=500 --adr.compiler.tags=gcc --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom
diff --git a/.github/workflows/test-retinanet.yml b/.github/workflows/test-retinanet.yml
index 3afcf086d2..20b05cc25f 100755
--- a/.github/workflows/test-retinanet.yml
+++ b/.github/workflows/test-retinanet.yml
@@ -30,9 +30,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python3 -m pip install cmind
-        cm pull repo mlcommons@ck
-        cm run script --quiet --tags=get,sys-utils-cm
+        python3 -m pip install cm4mlops
     - name: Test Retinanet and end to end submission generation
       run: |
-        cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc  --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }}
+        cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --quiet --submitter="MLCommons" --hw_name=default --model=retinanet --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=10 --adr.compiler.tags=gcc  --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }}
diff --git a/.github/workflows/test-submission-checker.yml b/.github/workflows/test-submission-checker.yml
index 14ac907230..2d1b129034 100644
--- a/.github/workflows/test-submission-checker.yml
+++ b/.github/workflows/test-submission-checker.yml
@@ -33,4 +33,4 @@ jobs:
         git clone https://github.com/mlcommons/inference_results_v4.0 --depth 1
     - name: Test MLPerf inference submission checker
       run: |
-        cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --quiet 
+        cm run script --tags=run,mlperf,inference,submission,checker --adr.inference-src.tags=_branch.${{ github.event.pull_request.head.ref }},_repo.${{ github.event.pull_request.head.repo.html_url }} --adr.inference-src.version=custom --input=`pwd`/inference_results_v4.0 --version=r4.0 --quiet 
diff --git a/.github/workflows/test-tvm.yml b/.github/workflows/test-tvm.yml
index fc57c88120..b132fc0025 100755
--- a/.github/workflows/test-tvm.yml
+++ b/.github/workflows/test-tvm.yml
@@ -30,9 +30,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python3 -m pip install cmind
-        cm pull repo mlcommons@ck
-        cm run script --quiet --tags=get,sys-utils-cm
+        python3 -m pip install cm4mlops
     - name: Test Resnet50 TVM backend
       run: |
-        cm run script --tags=run,mlperf,inference,generate-run-cmds --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=5 --adr.compiler.tags=gcc  --adr.inference-src.version=custom --adr.inference-src.env.CM_GIT_CHECKOUT=${{ github.event.pull_request.head.ref }} --adr.inference-src.env.CM_GIT_URL=${{ github.event.pull_request.head.repo.html_url }} --target_qps=1
+        cm run script --tags=run,mlperf,inference,generate-run-cmds --quiet --submitter="MLCommons" --hw_name=default --model=resnet50 --implementation=reference --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=5 --adr.compiler.tags=gcc  --adr.inference-src.version=custom --adr.inference-src.tags=_repo.${{ github.event.pull_request.head.repo.html_url }},_branch.${{ github.event.pull_request.head.ref }}
diff --git a/loadgen/pyproject.toml b/loadgen/pyproject.toml
index f62b2c5f7a..9be24eea6d 100755
--- a/loadgen/pyproject.toml
+++ b/loadgen/pyproject.toml
@@ -4,4 +4,4 @@ build-backend = "setuptools.build_meta:__legacy__"
 
 [tool.cibuildwheel]
 environment = "CFLAGS='-std=c++14'"
-build = "cp3{7,8,9,10,11}-*"
+build = "cp3{7,8,9,10,11,12}-*"
diff --git a/loadgen/setup.py b/loadgen/setup.py
index d7b8224de4..8dfc5b9f0e 100644
--- a/loadgen/setup.py
+++ b/loadgen/setup.py
@@ -40,6 +40,9 @@
     "query_sample_library.h",
     "system_under_test.h",
     "test_settings.h",
+    "issue_query_controller.h",
+    "early_stopping.h",
+    "query_dispatch_library.h",
 ]
 
 lib_headers = [
@@ -49,6 +52,8 @@
     "utils.h",
     "version.h",
     "results.h",
+    "bindings/c_api.h",
+    "version_generator.py"
 ]
 
 lib_sources = [
@@ -63,6 +68,7 @@
 ]
 
 lib_bindings = [
+    "bindings/c_api.cc",
     "bindings/python_api.cc",
 ]
 
@@ -76,13 +82,13 @@
 
 mlperf_loadgen_module = Pybind11Extension(
         "mlperf_loadgen",
-        define_macros=[("MAJOR_VERSION", "4"), ("MINOR_VERSION", "0")],
+        define_macros=[("MAJOR_VERSION", "4"), ("MINOR_VERSION", "1")],
         include_dirs=[".", get_include()],
         sources=mlperf_loadgen_sources,
         depends=mlperf_loadgen_headers)
 
-setup(name="mlperf_loadgen",
-      version="4.0",
+setup(name="mlcommons_loadgen",
+      version="4.1",
       description="MLPerf Inference LoadGen python bindings",
       url="https://mlcommons.org/",
       cmdclass={"build_ext": build_ext},
diff --git a/loadgen/version_generator.py b/loadgen/version_generator.py
index 4de930a633..2c5d11331a 100644
--- a/loadgen/version_generator.py
+++ b/loadgen/version_generator.py
@@ -69,7 +69,6 @@ def generate_loadgen_version_definitions_sha1(ofile, loadgen_root):
     sha1s = ""
     loadgen_files = (
         ["/bindings/" + s for s in os.listdir(loadgen_root + "/bindings")] +
-        ["/demos/" + s for s in os.listdir(loadgen_root + "/demos")] +
         ["/" + s for s in os.listdir(loadgen_root)])
     for fn in sorted(loadgen_files):
         full_fn = loadgen_root + fn

From 53f6475910954facecdce05399d0976e2bbcc931 Mon Sep 17 00:00:00 2001
From: Anton Lokhmotov <psyhtest@users.noreply.github.com>
Date: Thu, 20 Jun 2024 23:04:48 +0100
Subject: [PATCH 3/4] Set v4.1 as default version for submission checker.
 (#1737)

---
 tools/submission/submission_checker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py
index 9263a5f687..b15a8596fd 100755
--- a/tools/submission/submission_checker.py
+++ b/tools/submission/submission_checker.py
@@ -783,7 +783,7 @@ def get_args():
     parser.add_argument("--input", required=True, help="submission directory")
     parser.add_argument(
         "--version",
-        default="v4.0",
+        default="v4.1",
         choices=list(MODEL_CONFIG.keys()),
         help="mlperf version",
     )

From 2a46c7afb88b89a6c5def96953902793996ee37b Mon Sep 17 00:00:00 2001
From: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com>
Date: Mon, 24 Jun 2024 18:54:53 +0200
Subject: [PATCH 4/4] Small fixes in MoE eval script (#1743)

* Removed suffixes "_accuracy"
* Removed redundant code
* Similarly to llama we only check rouge1,rouge2 and rougeL scores
---
 language/mixtral-8x7b/README.md            | 6 +++---
 language/mixtral-8x7b/evaluate-accuracy.py | 6 ++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/language/mixtral-8x7b/README.md b/language/mixtral-8x7b/README.md
index 20cfe67c90..69f8035770 100644
--- a/language/mixtral-8x7b/README.md
+++ b/language/mixtral-8x7b/README.md
@@ -247,15 +247,15 @@ python -u evaluate-accuracy.py --checkpoint-path mistralai/Mixtral-8x7B-instruct
 Reference scores:
 Open Orca:
 ```json
-{'rouge1': 45.4911, 'rouge2': 23.2829, 'rougeL': 30.3615, 'rougeLsum': 42.4333}
+{'rouge1': 45.4911, 'rouge2': 23.2829, 'rougeL': 30.3615}
 ```
 GSM8K:
 ```json
-{'gsm8k_accuracy': 73.78}
+{'gsm8k': 73.78}
 ```
 MBXP:
 ```json
-{'mbxp_accuracy': 60.16}
+{'mbxp': 60.16}
 ```
 For official submissions, 99% of each reference score is enforced. Additionally, 90%-110% of the generated tokens_per_samples:
 ```json
diff --git a/language/mixtral-8x7b/evaluate-accuracy.py b/language/mixtral-8x7b/evaluate-accuracy.py
index e20834c412..d25b8178ba 100644
--- a/language/mixtral-8x7b/evaluate-accuracy.py
+++ b/language/mixtral-8x7b/evaluate-accuracy.py
@@ -193,11 +193,11 @@ def main():
             continue
         correct += (ref == tgt)
 
-    gsm8k_accuracy = 100.0 * correct / gsm8k_total
+    result['gsm8k'] = 100.0 * correct / gsm8k_total
 
     # MBXP metric
     from evaluate_mbxp import evaluate_mbxp
-    mbxp_accuracy = evaluate_mbxp(results_MBXP, args.n_workers)
+    result['mbxp'] = evaluate_mbxp(results_MBXP, args.n_workers)
 
     result = {
         **result,
@@ -205,8 +205,6 @@ def main():
         'gen_num': gen_num,
         'gen_tok_len': gen_tok_len,
         'tokens_per_sample': round(gen_tok_len / gen_num, 1),
-        'gsm8k_accuracy': gsm8k_accuracy,
-        'mbxp_accuracy': mbxp_accuracy
     }
 
     print("\nResults\n")