diff --git a/.gitignore b/.gitignore
index 9545a7977..eba8bb341 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ loadgen/build/
 libmlperf_loadgen.a
 __pycache__/
 generated/
+*.swp
diff --git a/docs/benchmarks/graph/get-rgat-data.md b/docs/benchmarks/graph/get-rgat-data.md
new file mode 100644
index 000000000..189c25b87
--- /dev/null
+++ b/docs/benchmarks/graph/get-rgat-data.md
@@ -0,0 +1,39 @@
+---
+hide:
+  - toc
+---
+
+# Graph Neural Network using R-GAT
+
+## Dataset
+
+The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. If you want to download only the datasets, you can use the commands below.
+
+=== "Full Dataset"
+    The R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges.
+
+    ### Get Full Dataset
+    ```
+    cm run script --tags=get,dataset,igbh,_full -j
+    ```
+
+=== "Debug Dataset"
+    The R-GAT debug run uses the IGBH debug dataset (tiny).
+
+    ### Get Debug Dataset
+    ```
+    cm run script --tags=get,dataset,igbh,_debug -j
+    ```
+
+## Model
+The benchmark implementation run command will automatically download the required model and do the necessary conversions. If you want to download only the official model, you can use the commands below.
+
+Get the Official MLPerf R-GAT Model
+
+=== "PyTorch"
+
+    ### PyTorch
+    ```
+    cm run script --tags=get,ml-model,rgat -j
+    ```
+
diff --git a/docs/benchmarks/graph/rgat.md b/docs/benchmarks/graph/rgat.md
new file mode 100644
index 000000000..ffff467a4
--- /dev/null
+++ b/docs/benchmarks/graph/rgat.md
@@ -0,0 +1,13 @@
+---
+hide:
+  - toc
+---
+
+
+# Graph Neural Network using R-GAT
+
+
+=== "MLCommons-Python"
+    ## MLPerf Reference Implementation in Python
+
+{{ mlperf_inference_implementation_readme (4, "rgat", "reference", devices = ["CPU", "CUDA"]) }}
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index 11f2a52c2..b46d4c274 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,7 +1,7 @@
 # MLPerf Inference Benchmarks
 
 ## Overview
 
-The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v4.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, reference accuracy, server latency constraints etc.
+The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v5.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, reference accuracy, server latency constraints etc.
 
 ---
 
@@ -80,7 +80,7 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe
 - **Server Scenario Latency Constraint**: 130ms
 - **Equal Issue mode**: False
 - **High accuracy variant**: yes
-- **Submission Category**: Datacenter, Edge
+- **Submission Category**: Edge
 
 #### [LLAMA2-70B](benchmarks/language/llama2-70b.md)
 - **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024)
@@ -157,11 +157,22 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe
 - **High accuracy variant**: Yes
 - **Submission Category**: Datacenter
 
+## Graph Neural Networks
+### [R-GAT](benchmarks/graph/rgat.md)
+- **Dataset**: Illinois Graph Benchmark Heterogeneous validation dataset
+    - **Dataset Size**: 788,379
+    - **QSL Size**: 788,379
+- **Number of Parameters**:
+- **Reference Model Accuracy**: ACC = ?
+- **Server Scenario Latency Constraint**: N/A
+- **Equal Issue mode**: True
+- **High accuracy variant**: No
+- **Submission Category**: Datacenter
 
 ---
 
 ## Submission Categories
-- **Datacenter Category**: All the current inference benchmarks are applicable to the datacenter category.
-- **Edge Category**: All benchmarks except DLRMv2, LLAMA2-70B, and Mixtral-8x7B are applicable to the edge category.
+- **Datacenter Category**: All benchmarks except bert are applicable to the datacenter category for inference v5.0.
+- **Edge Category**: All benchmarks except DLRMv2, LLAMA2-70B, Mixtral-8x7B, and R-GAT are applicable to the edge category for v5.0.
 
 ## High Accuracy Variants
 - **Benchmarks**: `bert`, `llama2-70b`, `gpt-j`, `dlrm_v2`, and `3d-unet` have a normal accuracy variant as well as a high accuracy variant.
diff --git a/docs/submission/index.md b/docs/submission/index.md
index c99802420..1050f5fb0 100644
--- a/docs/submission/index.md
+++ b/docs/submission/index.md
@@ -13,13 +13,15 @@ hide:
 
 Click [here](https://youtu.be/eI1Hoecc3ho) to view the recording of the workshop: Streamlining your MLPerf Inference results using CM.
 
-=== "CM based benchmark"
+Click [here](https://docs.google.com/presentation/d/1cmbpZUpVr78EIrhzyMBnnWnjJrD-mZ2vmSb-yETkTA8/edit?usp=sharing) to view the proposal slide for Common Automation for MLPerf Inference Submission Generation through CM.
+
+=== "CM based results"
     If you have followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `cm cache` folder. The following command could be used to browse the structure of the inference results folder generated by CM.
     ### Get results folder structure
    ```bash
    cm find cache --tags=get,mlperf,inference,results,dir | xargs tree
    ```
-=== "Non CM based benchmark"
+=== "Non CM based results"
     If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way.
    ```
    └── System description ID(SUT Name)
@@ -35,18 +37,20 @@ Click [here](https://youtu.be/eI1Hoecc3ho) to view the recording of the workshop
    |   ├── mlperf_log_detail.txt
    |   ├── mlperf_log_accuracy.json
    |   └── accuracy.txt
-   └── Compliance_Test_ID
-       ├── Performance
-       |   └── run_x/#1 run for all scenarios
-       |       ├── mlperf_log_summary.txt
-       |       └── mlperf_log_detail.txt
-       ├── Accuracy
-       |   ├── baseline_accuracy.txt
-       |   ├── compliance_accuracy.txt
-       |   ├── mlperf_log_accuracy.json
-       |   └── accuracy.txt
-       ├── verify_performance.txt
-       └── verify_accuracy.txt #for TEST01 only
+   |── Compliance_Test_ID
+   |   ├── Performance
+   |   |   └── run_x/#1 run for all scenarios
+   |   |       ├── mlperf_log_summary.txt
+   |   |       └── mlperf_log_detail.txt
+   |   ├── Accuracy
+   |   |   ├── baseline_accuracy.txt
+   |   |   ├── compliance_accuracy.txt
+   |   |   ├── mlperf_log_accuracy.json
+   |   |   └── accuracy.txt
+   |   ├── verify_performance.txt
+   |   └── verify_accuracy.txt #for TEST01 only
+   |── user.conf
+   └── measurements.json
    ```
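A quick way to sanity-check a non-CM results folder against the layout above, before generating the submission, is to inspect it with the standard `tree` utility (a minimal sketch; `<results_dir>` is a placeholder for your actual results folder):

```bash
tree <results_dir> -L 4
```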
@@ -67,67 +71,69 @@ Once all the results across all the models are ready you can use the following c ## Generate actual submission tree -=== "Closed Edge" - ### Closed Edge Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --category=edge \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` - -=== "Closed Datacenter" - ### Closed Datacenter Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --category=datacenter \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` -=== "Open Edge" - ### Open Edge Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --category=edge \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` -=== "Open Datacenter" - ### Closed Datacenter Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --category=datacenter \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` +=== "Docker run" + ### Docker run + === "Closed" + ### Closed Submission + ```bash + cm docker script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=closed \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + + === "Open" + ### Open Submission + ```bash + cm docker script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=open \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + +=== "Native run" + ### Native run + === "Closed" + ### Closed Submission + ```bash + cm run script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=closed \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + + === "Open" + ### Open Submission + ```bash + cm run script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=open \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` * Use `--hw_name="My system name"` to give a meaningful system name. Examples can be seen [here](https://github.com/mlcommons/inference_results_v3.0/tree/main/open/cTuning/systems) @@ -137,6 +143,10 @@ Once all the results across all the models are ready you can use the following c * Use `--results_dir` option to specify the results folder for Non CM based benchmarks +* Use `--category` option to specify the category for which submission is generated(datacenter/edge). 
By default, the category is taken from the `system_meta.json` file located in the SUT root directory.
+
+* Use `--submission_base_dir` to specify the directory to which the outputs of the submission preprocessing script and the final submission are dumped. There is no need to provide `--submission_dir` along with this. For `docker run`, use `--submission_base_dir` instead of `--submission_dir`.
+
 The above command should generate "submission.tar.gz" if there are no submission checker issues and you can upload it to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission).
 
 ## Aggregate Results in GitHub
diff --git a/docs/system_requirements.yml b/docs/system_requirements.yml
new file mode 100644
index 000000000..5dfec202a
--- /dev/null
+++ b/docs/system_requirements.yml
@@ -0,0 +1,50 @@
+# All memory requirements in GB
+resnet:
+  reference:
+    fp32:
+      system_memory: 8
+      accelerator_memory: 4
+      disk_storage: 25
+  nvidia:
+    int8:
+      system_memory: 8
+      accelerator_memory: 4
+      disk_storage: 100
+  intel:
+    int8:
+      system_memory: 8
+      accelerator_memory: 0
+      disk_storage: 50
+  qualcomm:
+    int8:
+      system_memory: 8
+      accelerator_memory: 8
+      disk_storage: 50
+retinanet:
+  reference:
+    fp32:
+      system_memory: 8
+      accelerator_memory: 8
+      disk_storage: 200
+  nvidia:
+    int8:
+      system_memory: 8
+      accelerator_memory: 8
+      disk_storage: 200
+  intel:
+    int8:
+      system_memory: 8
+      accelerator_memory: 0
+      disk_storage: 200
+  qualcomm:
+    int8:
+      system_memory: 8
+      accelerator_memory: 8
+      disk_storage: 200
+rgat:
+  reference:
+    fp32:
+      system_memory: 768
+      accelerator_memory: 8
+      disk_storage: 2300
+
diff --git a/graph/R-GAT/README.md b/graph/R-GAT/README.md
index 29d359686..fbfca4709 100644
--- a/graph/R-GAT/README.md
+++ b/graph/R-GAT/README.md
@@ -232,9 +232,12 @@ docker build . -f dockerfile.gpu -t rgat-gpu
 ```
 Run docker container:
 ```bash
-docker run --rm -it -v $(pwd):/root --gpus all rgat-gpu
+docker run --rm -it -v $(pwd):/workspace/root --gpus all rgat-gpu
 ```
-Run benchmark inside the docker container:
+Go inside the root folder and run the benchmark inside the docker container:
 ```bash
+cd root
 python3 main.py --dataset igbh-dgl --dataset-path igbh/ --profile rgat-dgl-full --device gpu [--model-path ] [--in-memory] [--dtype ] [--scenario ]
 ```
+
+**NOTE:** For official submissions, this benchmark is required to run in equal issue mode. Please make sure that the flag `rgat.*.sample_concatenate_permutation` is set to 1 in the [mlperf.conf](../../loadgen/mlperf.conf) file when loadgen is built.
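For reference, the equal issue mode setting mentioned in the note above corresponds to the following entry, which this change also adds to `loadgen/mlperf.conf`:

```
# R-GAT uses equal issue mode because it may have non-uniform inputs
rgat.*.sample_concatenate_permutation = 1
```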
diff --git a/graph/R-GAT/dockerfile.gpu b/graph/R-GAT/dockerfile.gpu index fae65081f..f600028fe 100644 --- a/graph/R-GAT/dockerfile.gpu +++ b/graph/R-GAT/dockerfile.gpu @@ -26,6 +26,8 @@ RUN apt install -y --no-install-recommends rsync # Upgrade pip RUN python3 -m pip install --upgrade pip +RUN pip install torch-geometric torch-scatter torch-sparse -f https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html +RUN pip install dgl -f https://data.dgl.ai/wheels/torch-2.1/cu121/repo.html COPY requirements.txt requirements.txt RUN pip install -r requirements.txt @@ -35,10 +37,6 @@ RUN cd /tmp && \ pip install pybind11 && \ CFLAGS="-std=c++14" python3 setup.py install -RUN export TORCH_VERSION=$(python -c "import torch; print(torch.__version__)") -RUN pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-${TORCH_VERSION}.html -RUN pip install dgl -f https://data.dgl.ai/wheels/torch-2.1/cu121/repo.html - # Clean up RUN rm -rf mlperf \ rm requirements.txt \ No newline at end of file diff --git a/graph/R-GAT/igbh/tiny/models/dataloader.py b/graph/R-GAT/igbh/tiny/models/dataloader.py deleted file mode 100644 index cc64d1466..000000000 --- a/graph/R-GAT/igbh/tiny/models/dataloader.py +++ /dev/null @@ -1,82 +0,0 @@ -import torch -from torch_geometric.data import InMemoryDataset, Data -from dgl.data import DGLDataset - -from utils import IGL260MDataset - -# TODO: Make a PyG dataloader for large datasets - - -class IGL260M_PyG(InMemoryDataset): - def __init__(self, args): - super().__init__(root, transform, pre_transform, pre_filter) - - def process(self): - dataset = IGL260MDataset(root=self.dir, size=args.dataset_size, - in_memory=args.in_memory, classes=args.type_classes, synthetic=args.synthetic) - node_features = torch.from_numpy(dataset.paper_feat) - node_edges = torch.from_numpy(dataset.paper_edge).T - node_labels = torch.from_numpy(dataset.paper_label).to(torch.long) - data = Data(x=node_features, edge_index=node_edges, y=node_labels) - - n_nodes = node_features.shape[0] - - n_train = int(n_nodes * 0.6) - n_val = int(n_nodes * 0.2) - - train_mask = torch.zeros(n_nodes, dtype=torch.bool) - val_mask = torch.zeros(n_nodes, dtype=torch.bool) - test_mask = torch.zeros(n_nodes, dtype=torch.bool) - - train_mask[:n_train] = True - val_mask[n_train:n_train + n_val] = True - test_mask[n_train + n_val:] = True - - data.train_mask = train_mask - data.val_mask = val_mask - data.test_mask = test_mask - - -class IGL260M_DGL(DGLDataset): - def __init__(self, args): - self.dir = args.path - super().__init__(name='IGB260M') - - def process(self): - dataset = IGL260MDataset(root=self.dir, size=args.dataset_size, - in_memory=args.in_memory, classes=args.type_classes, synthetic=args.synthetic) - node_features = torch.from_numpy(dataset.paper_feat) - node_edges = torch.from_numpy(dataset.paper_edge) - node_labels = torch.from_numpy(dataset.paper_label).to(torch.long) - - self.graph = dgl.graph( - (node_edges[:, 0], node_edges[:, 1]), num_nodes=node_features.shape[0]) - - self.graph.ndata['feat'] = node_features - self.graph.ndata['label'] = node_labels - - self.graph = dgl.remove_self_loop(self.graph) - self.graph = dgl.add_self_loop(self.graph) - - n_nodes = node_features.shape[0] - - n_train = int(n_nodes * 0.6) - n_val = int(n_nodes * 0.2) - - train_mask = torch.zeros(n_nodes, dtype=torch.bool) - val_mask = torch.zeros(n_nodes, dtype=torch.bool) - test_mask = torch.zeros(n_nodes, dtype=torch.bool) - - train_mask[:n_train] = True - val_mask[n_train:n_train + n_val] = True - 
test_mask[n_train + n_val:] = True - - self.graph.ndata['train_mask'] = train_mask - self.graph.ndata['val_mask'] = val_mask - self.graph.ndata['test_mask'] = test_mask - - def __getitem__(self, i): - return self.graph - - def __len__(self): - return 1 diff --git a/graph/R-GAT/igbh/tiny/models/gnn.py b/graph/R-GAT/igbh/tiny/models/gnn.py deleted file mode 100644 index 20d5ecd72..000000000 --- a/graph/R-GAT/igbh/tiny/models/gnn.py +++ /dev/null @@ -1,296 +0,0 @@ -from utils import IGL260MDataset -import warnings -from tqdm import tqdm -import numpy as np -import time -import torch.nn.functional as F -import torch.optim as optim -import torch.nn as nn -import dgl -from dgl.data import DGLDataset -import dgl.nn.pytorch as dglnn -from dgl.nn.pytorch import GATConv, GraphConv, SAGEConv -import os.path as osp -from sys import getsizeof - - -import torch -torch.manual_seed(0) -dgl.seed(0) -warnings.filterwarnings("ignore") - - -class GCN(nn.Module): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout): - super(GCN, self).__init__() - self.layers = nn.ModuleList() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - # input layer - self.layers.append( - GraphConv( - in_feats, - n_hidden, - activation=activation)) - # hidden layers - for i in range(n_layers - 1): - self.layers.append( - GraphConv( - n_hidden, - n_hidden, - activation=activation)) - # output layer - self.layers.append(GraphConv(n_hidden, n_classes)) - self.dropout = nn.Dropout(p=dropout) - self.activation = activation - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - if l != len(self.layers) - 1: - # h = self.activation(h) - h = self.dropout(h) - h = layer(block, h) - return h - - def inference(self, g, x, batch_size, device): - """ - Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? 
- for l, layer in enumerate(self.layers): - y = torch.zeros(g.number_of_nodes(), self.n_hidden if l != - len(self.layers) - 1 else self.n_classes) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.NodeDataLoader( - g, - torch.arange(g.number_of_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0] - - block = block.int().to(device) - h = x[input_nodes].to(device) - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - - y[output_nodes] = h.cpu() - - x = y - return y - - -class GAT(nn.Module): - def __init__( - self, in_feats, n_hidden, n_classes, n_layers, num_heads, activation - ): - super().__init__() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - self.layers = nn.ModuleList() - self.layers.append( - dglnn.GATConv( - (in_feats, in_feats), - n_hidden, - num_heads=num_heads, - activation=activation, - ) - ) - for i in range(1, n_layers - 1): - self.layers.append( - dglnn.GATConv( - (n_hidden * num_heads, n_hidden * num_heads), - n_hidden, - num_heads=num_heads, - activation=activation, - ) - ) - self.layers.append( - dglnn.GATConv( - (n_hidden * num_heads, n_hidden * num_heads), - n_classes, - num_heads=num_heads, - activation=None, - ) - ) - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - # We need to first copy the representation of nodes on the RHS from the - # appropriate nodes on the LHS. - # Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst - # would be (num_nodes_RHS, D) - h_dst = h[: block.num_dst_nodes()] - # Then we compute the updated representation on the RHS. - # The shape of h now becomes (num_nodes_RHS, D) - if l < self.n_layers - 1: - h = layer(block, (h, h_dst)).flatten(1) - else: - h = layer(block, (h, h_dst)) - h = h.mean(1) - return h.log_softmax(dim=-1) - - def inference(self, g, x, batch_size, device): - """ - Inference with the GAT model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? 
- # TODO: make thiw into a variable - num_heads = 2 - for l, layer in enumerate(self.layers): - if l < self.n_layers - 1: - y = torch.zeros( - g.num_nodes(), - self.n_hidden * num_heads - if l != len(self.layers) - 1 - else self.n_classes, - ) - else: - y = torch.zeros( - g.num_nodes(), - self.n_hidden - if l != len(self.layers) - 1 - else self.n_classes, - ) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.DataLoader( - g, - torch.arange(g.num_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4, - ) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0].int().to(device) - - h = x[input_nodes].to(device) - h_dst = h[: block.num_dst_nodes()] - if l < self.n_layers - 1: - h = layer(block, (h, h_dst)).flatten(1) - else: - h = layer(block, (h, h_dst)) - h = h.mean(1) - h = h.log_softmax(dim=-1) - - y[output_nodes] = h.cpu() - - x = y - return y - - -class SAGE(nn.Module): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout, - aggregator_type): - super().__init__() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - self.layers = nn.ModuleList() - self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, aggregator_type)) - for i in range(1, n_layers - 1): - self.layers.append( - dglnn.SAGEConv( - n_hidden, - n_hidden, - aggregator_type)) - self.layers.append( - dglnn.SAGEConv( - n_hidden, - n_classes, - aggregator_type)) - self.dropout = nn.Dropout(dropout) - self.activation = activation - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - return h - - def inference(self, g, x, batch_size, device): - """ - Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? 
- for l, layer in enumerate(self.layers): - y = torch.zeros(g.number_of_nodes(), self.n_hidden if l != - len(self.layers) - 1 else self.n_classes) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.NodeDataLoader( - g, - torch.arange(g.number_of_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0] - - block = block.int().to(device) - h = x[input_nodes].to(device) - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - - y[output_nodes] = h.cpu() - - x = y - return y diff --git a/graph/R-GAT/igbh/tiny/models/main.py b/graph/R-GAT/igbh/tiny/models/main.py deleted file mode 100644 index 4ab22eb75..000000000 --- a/graph/R-GAT/igbh/tiny/models/main.py +++ /dev/null @@ -1,79 +0,0 @@ -import argparse - - -def main(): - parser = argparse.ArgumentParser() - - # Input/output paths - parser.add_argument('--path', type=str, default='/gnndataset/') - parser.add_argument('--modelpath', type=str, default='gcn_19.pt') - - # Dataset selection - parser.add_argument( - '--dataset_size', - type=str, - default='experimental', - choices=[ - 'experimental', - 'small', - 'medium', - 'large', - 'full']) - parser.add_argument( - '--type_classes', - type=int, - default=19, - choices=[ - 19, - 292, - 2983]) - - # Hyperparameters - parser.add_argument('--hidden_channels', type=int, default=16) - parser.add_argument('--fan_out', type=str, default='5,10') - parser.add_argument('--num_layers', type=int, default=2) - parser.add_argument('--learning_rate', type=int, default=0.01) - parser.add_argument('--decay', type=int, default=0.001) - parser.add_argument('--num_workers', type=int, default=4) - parser.add_argument('--batch_size', type=int, default=2048 * 16) - parser.add_argument('--dropout', type=float, default=0.2) - parser.add_argument('--epochs', type=int, default=20) - parser.add_argument( - '--model_type', - type=str, - default='gcn', - choices=[ - 'gat', - 'sage', - 'gcn']) - parser.add_argument('--in_memory', type=int, default=0) - parser.add_argument('--synthetic', type=int, default=0) - parser.add_argument('--device', type=str, default='1') - args = parser.parse_args() - - print("Dataset_size: " + args.dataset_size) - print("Model : " + args.model) - print("Num_classes : " + str(args.num_classes)) - print() - - device = f'cuda:' + args.device if torch.cuda.is_available() else 'cpu' - - dataset = IGL260M_DGL(args) - g = dataset[0] - - best_test_acc, train_acc, test_acc = track_acc(g, args) - - print( - f"Train accuracy: {np.mean(train_acc):.2f} \u00B1 {np.std(train_acc):.2f} \t Best: {np.max(train_acc) * 100:.4f}%") - print( - f"Test accuracy: {np.mean(test_acc):.2f} \u00B1 {np.std(test_acc):.2f} \t Best: {np.max(test_acc) * 100:.4f}%") - print() - print(" -------- For debugging --------- ") - print("Parameters: ", args) - print(g) - print("Train accuracy: ", train_acc) - print("Test accuracy: ", test_acc) - - -if __name__ == '__main__': - main() diff --git a/graph/R-GAT/igbh/tiny/models/utils.py b/graph/R-GAT/igbh/tiny/models/utils.py deleted file mode 100644 index 5e9e1a25d..000000000 --- a/graph/R-GAT/igbh/tiny/models/utils.py +++ /dev/null @@ -1,224 +0,0 @@ -import numpy as np -import torch - - -class IGL260MDataset(object): - def __init__(self, root: str, size: str, in_memory: int, - classes: int, synthetic: int): - self.dir = root - self.size = size - self.synthetic = synthetic - self.in_memory = 
in_memory - self.num_classes = classes - self.__meta__ = torch.load(osp.join(self.dir, self.size, 'meta.pt')) - - self.num_features = self.__meta__['paper']['emb_dim'] - self.num_nodes = self.__meta__['paper']['num_node'] - self.num_edges = self.__meta__['cites']['num_edge'] - - @property - def paper_feat(self) -> np.ndarray: - if self.synthetic: - return np.random((self.num_nodes, self.num_edges)) - - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_feat.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - @property - def paper_label(self) -> np.ndarray: - if self.num_classes == 19: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_label_19.npy') - else: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_label_2K.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - @property - def paper_edge(self) -> np.ndarray: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper__cites__paper', - 'edge_index.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - -def compute_acc(pred, labels): - """ - Compute the accuracy of prediction given the labels. - """ - labels = labels.long() - return (torch.argmax(pred, dim=1) == labels).float().sum() / len(pred) - - -def evaluate(model, g, inputs, labels, val_nid, batch_size, device): - """ - Evaluate the model on the validation set specified by ``val_nid``. - g : The entire graph. - inputs : The features of all the nodes. - labels : The labels of all the nodes. - val_nid : the node Ids for validation. - batch_size : Number of nodes to compute at the same time. - device : The GPU device to evaluate on. - """ - model.eval() - with torch.no_grad(): - pred = model.inference(g, inputs, batch_size, device) - model.train() - return compute_acc(pred[val_nid], labels[val_nid]) - - -def load_subtensor(g, seeds, input_nodes, device): - """ - Copys features and labels of a set of nodes onto GPU. - """ - batch_inputs = g.ndata['features'][input_nodes].to(device) - batch_labels = g.ndata['labels'][seeds].to(device) - return batch_inputs, batch_labels - - -def track_acc(g, args): - train_accuracy = [] - test_accuracy = [] - g.ndata['features'] = g.ndata['feat'] - g.ndata['labels'] = g.ndata['label'] - in_feats = g.ndata['features'].shape[1] - n_classes = args.num_classes - - # Create csr/coo/csc formats before launching training processes with multi-gpu. - # This avoids creating certain formats in each sub-process, which saves - # momory and CPU. 
- g.create_formats_() - - num_epochs = args.epochs - num_hidden = args.hidden_channels - num_layers = args.num_layers - fan_out = args.fan_out - batch_size = args.batch_size - lr = args.learning_rate - dropout = args.dropout - num_workers = args.num_workers - - train_nid = torch.nonzero(g.ndata['train_mask'], as_tuple=True)[0] - - # Create PyTorch DataLoader for constructing blocks - sampler = dgl.dataloading.MultiLayerNeighborSampler( - [int(fanout) for fanout in fan_out.split(',')]) - - dataloader = dgl.dataloading.NodeDataLoader( - g, - train_nid, - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=num_workers) - - # Define model and optimizer - if args.model_type == 'gcn': - model = GCN(in_feats, num_hidden, n_classes, 1, F.relu, dropout) - if args.model_type == 'sage': - model = SAGE( - in_feats, - num_hidden, - n_classes, - num_layers, - F.relu, - dropout, - 'gcn') - if args.model_type == 'gat': - model = GAT(in_feats, num_hidden, n_classes, num_layers, 2, F.relu) - - model = model.to(device) - loss_fcn = nn.CrossEntropyLoss() - loss_fcn = loss_fcn.to(device) - optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=args.decay) - - # Training loop - avg = 0 - best_test_acc = 0 - log_every = 1 - training_start = time.time() - for epoch in (range(num_epochs)): - # Loop over the dataloader to sample the computation dependency graph as a list of - # blocks. - epoch_loss = 0 - gpu_mem_alloc = 0 - epoch_start = time.time() - for step, (input_nodes, seeds, blocks) in (enumerate(dataloader)): - # Load the input features as well as output labels - # batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device) - blocks = [block.int().to(device) for block in blocks] - batch_inputs = blocks[0].srcdata['features'] - batch_labels = blocks[-1].dstdata['labels'] - - # Compute loss and prediction - batch_pred = model(blocks, batch_inputs) - loss = loss_fcn(batch_pred, batch_labels) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - epoch_loss += loss.detach() - - gpu_mem_alloc += ( - torch.cuda.max_memory_allocated() / 1000000 - if torch.cuda.is_available() - else 0 - ) - - train_g = g - train_nid = torch.nonzero( - train_g.ndata['train_mask'], as_tuple=True)[0] - train_acc = evaluate( - model, train_g, train_g.ndata['features'], train_g.ndata['labels'], train_nid, batch_size, device) - - test_g = g - test_nid = torch.nonzero( - test_g.ndata['test_mask'], as_tuple=True)[0] - test_acc = evaluate( - model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, batch_size, device) - - if test_acc.item() > best_test_acc: - best_test_acc = test_acc.item() - tqdm.write( - "Epoch {:05d} | Loss {:.4f} | Train Acc {:.4f} | Test Acc {:.4f} | Time {:.2f}s | GPU {:.1f} MB".format( - epoch, - epoch_loss, - train_acc.item(), - test_acc.item(), - time.time() - epoch_start, - gpu_mem_alloc - ) - ) - test_accuracy.append(test_acc.item()) - train_accuracy.append(train_acc.item()) - torch.save(model.state_dict(), args.modelpath) - print() - print("Total time taken: ", time.time() - training_start) - - return best_test_acc, train_accuracy, test_accuracy diff --git a/language/llama3-405b/README.md b/language/llama3-405b/README.md index 8c04c202e..8df2a81f1 100644 --- a/language/llama3-405b/README.md +++ b/language/llama3-405b/README.md @@ -9,34 +9,64 @@ Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3-405b) for an automated way to run this benchmark across different available implementations and 
do an end-to-end submission with or without docker.
-
+
 ## Prepare environment
-Copy the mlperf.conf file to this folder.
-```
-cp ../../mlperf.conf .
+
+### Local Environment Run
+
+The following steps were tested on Ubuntu 22.04 with Python 3.10.
+
+- **Prerequisite for GPU runs:** Install the NVIDIA driver and CUDA 12.1.
+
+The following links contain the commands for installing the [NVIDIA Driver](https://developer.nvidia.com/datacenter-driver-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local) and [CUDA](https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_local).
+
+- **Prerequisite:** Install conda.
+
+```bash
+mkdir -p ~/miniconda3
+wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
+bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+rm ~/miniconda3/miniconda.sh
+~/miniconda3/bin/conda init
 ```
-For a CPU-only run:
+- Set the following helper variables:
+```bash
+export ROOT=$PWD/inference
+export LLAMA_FOLDER=$PWD/inference/language/llama3-405b
+export LOADGEN_FOLDER=$PWD/inference/loadgen
+export DATASET_FOLDER=$PWD/inference/language/llama3-405b/dataset
+```
+- Clone the inference repository:
+```bash
+git clone --recurse-submodules https://github.com/mlcommons/inference.git \
+ --depth 1
 ```
-conda create -n llama3-405b python=3.9
+
+- Create a conda environment:
+```bash
+conda create -y -n llama3-405b python=3.10
 conda activate llama3-405b
+conda install -y -c conda-forge libstdcxx-ng=12
+```
+- Install requirements and loadgen:
+```bash
+cd $LLAMA_FOLDER
 # Install packages
 pip install -r requirements.txt
+```
 
-export CUR_DIR=${PWD}
-cd /loadgen
-
-
-python -m pip install .
+```bash
+cd $LOADGEN_FOLDER
+pip install -e .
 ```
 
-For a GPU-based run:
+### Docker Run
 
 A dockerfile is provided, along with scripts to help launch it. First, add any docker volume mounts you want in
-`launch.sh`. There is a section at the top of the file that looks like:
+`launch_docker.sh`. There is a section at the top of the file that looks like:
 ```
 # Add any volume mounts here with the following syntax
 # /path/to/src:/path/to/dir/in/container
@@ -54,10 +84,13 @@ MOUNTS=(
     /raid/data:/raid/data
 )
 ```
-Once you have added all your mounts, launch the container with `bash launch.sh`.
+Once you have added all your mounts, build and launch the container with `bash launch_docker.sh`.
 
-Inside the container, set up the environment with `bash build.sh`. This will install all the dependencies from the
-CPU-only setup, as well as any GPU versions for applicable libraries like PyTorch.
+Now install all the dependencies: +``` +pip install -r requirements.txt +pip install -e ../../loadgen +``` ## Get Model @@ -73,7 +106,7 @@ TODO: Host model and grant access to submitters export CHECKPOINT_PATH=Meta-Llama-3.1-405B-Instruct git lfs install git clone https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct ${CHECKPOINT_PATH} - +cd ${CHECKPOINT_PATH} && git checkout be673f326cab4cd22ccfef76109faf68e41aa5f1 ``` ## Get Dataset @@ -109,9 +142,10 @@ rclone copy mlc-inference:mlcommons-inference-wg-public/llama3_405b/mlperf_llama ``` python -u main.py --scenario Offline \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --dtype float16 \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir output \ --tensor-parallel-size ${GPU_COUNT} \ @@ -123,9 +157,10 @@ python -u main.py --scenario Offline \ ``` python -u main.py --scenario Server \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --dtype float16 \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir output \ --tensor-parallel-size ${GPU_COUNT} \ @@ -145,10 +180,11 @@ mkdir -p "run_outputs" # The script will dump all the outputs to 'run_outputs'. python -u main.py --scenario Offline \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --accuracy \ --dtype float16 \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir output \ --tensor-parallel-size ${GPU_COUNT} \ @@ -172,10 +208,11 @@ OUTPUT_LOG_DIR=server-accuracy-logs python -u main.py --scenario Server \ --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ --accuracy \ --dtype float16 \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir output \ --tensor-parallel-size ${GPU_COUNT} \ @@ -193,5 +230,12 @@ The ServerSUT was not tested for GPU runs. 
## Accuracy Target -Running the GPU implementation in FP16 precision resulted in the following FP16 accuracy targets (normalized to a 0-100 -scale from a 0.0-1.0 scale): +Running the GPU implementation in FP16 precision resulted in the following FP16 accuracy targets: +``` +{ + 'rougeL': 21.6666, + 'exact_match': 90.1335, + 'tokens_per_sample': 684.68, +} +``` +The accuracy target is 99% for rougeL and exact_match, and 90% for tokens_per_sample diff --git a/language/llama3-405b/SUT_VLLM.py b/language/llama3-405b/SUT_VLLM.py index e64999d09..f5a802021 100644 --- a/language/llama3-405b/SUT_VLLM.py +++ b/language/llama3-405b/SUT_VLLM.py @@ -31,7 +31,7 @@ def __init__( model_path=None, dtype="bfloat16", batch_size=None, - total_sample_count=8312, + total_sample_count=8313, dataset_path=None, use_cached_outputs=False, # Set this to True *only for test accuracy runs* in case your prior @@ -140,16 +140,16 @@ def process_queries(self): n_tokens)] lg.QuerySamplesComplete(response) - tok = time.time() + tok = time.time() - with self.sample_counter_lock: - self.sample_counter += len(qitem) - log.info(f"Samples run: {self.sample_counter}") - if tik1: - log.info(f"\tBatchMaker time: {tik2 - tik1}") - log.info(f"\tInference time: {tik3 - tik2}") - log.info(f"\tPostprocess time: {tok - tik3}") - log.info(f"\t==== Total time: {tok - tik1}") + with self.sample_counter_lock: + self.sample_counter += len(qitem) + log.info(f"Samples run: {self.sample_counter}") + if tik1: + log.info(f"\tBatchMaker time: {tik2 - tik1}") + log.info(f"\tInference time: {tik3 - tik2}") + log.info(f"\tPostprocess time: {tok - tik3}") + log.info(f"\t==== Total time: {tok - tik1}") def load_model(self): log.info("Loading model...") @@ -194,7 +194,7 @@ def __init__( self, model_path=None, dtype="bfloat16", - total_sample_count=8312, + total_sample_count=8313, dataset_path=None, batch_size=None, workers=1, diff --git a/language/llama3-405b/dataset.py b/language/llama3-405b/dataset.py index 04fe9c4b2..084f13208 100644 --- a/language/llama3-405b/dataset.py +++ b/language/llama3-405b/dataset.py @@ -24,7 +24,7 @@ class Dataset: def __init__( self, model_name=None, - total_sample_count=8312, + total_sample_count=8313, perf_count_override=None, dataset_path=None, dtype="bfloat16" diff --git a/language/llama3-405b/evaluate-accuracy.py b/language/llama3-405b/evaluate-accuracy.py index ccc87f71f..f5677820e 100644 --- a/language/llama3-405b/evaluate-accuracy.py +++ b/language/llama3-405b/evaluate-accuracy.py @@ -141,7 +141,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained( checkpoint_path, - model_max_length=2048, + model_max_length=22000, padding_side="left", use_fast=False, ) diff --git a/language/llama3-405b/launch.sh b/language/llama3-405b/launch_docker.sh similarity index 100% rename from language/llama3-405b/launch.sh rename to language/llama3-405b/launch_docker.sh diff --git a/language/llama3-405b/main.py b/language/llama3-405b/main.py index 26d5726b3..f7802687e 100644 --- a/language/llama3-405b/main.py +++ b/language/llama3-405b/main.py @@ -77,7 +77,7 @@ def get_args(): parser.add_argument( "--total-sample-count", type=int, - default=8312, + default=8313, help="Number of samples to use in benchmark.", ) parser.add_argument( diff --git a/language/llama3-405b/run_accuracy.sh b/language/llama3-405b/run_accuracy.sh index 075245913..9a54d8f13 100644 --- a/language/llama3-405b/run_accuracy.sh +++ b/language/llama3-405b/run_accuracy.sh @@ -5,10 +5,11 @@ mkdir -p "run_outputs" python3 -u main.py --scenario Offline \ --model-path 
${CHECKPOINT_PATH} \ + --batch-size 16 \ --accuracy \ --mlperf-conf mlperf.conf \ --user-conf user.conf \ - --total-sample-count 8312 \ + --total-sample-count 8313 \ --dataset-path ${DATASET_PATH} \ --output-log-dir offline_accuracy_loadgen_logs \ --dtype float32 | tee offline_accuracy_log.log @@ -17,5 +18,3 @@ python3 evaluate-accuracy.py --checkpoint-path ${CHECKPOINT_PATH} \ --mlperf-accuracy-file offline_accuracy_loadgen_logs/mlperf_log_accuracy.json \ --dataset-file ${DATASET_PATH} \ --dtype int32 - -python3 consolidate_results.py --dataset-path ${DATASET_PATH} --model-dir ${CHECKPOINT_PATH} diff --git a/language/llama3-405b/run_offline.sh b/language/llama3-405b/run_offline.sh index 89fa9e45f..6b3a56e01 100644 --- a/language/llama3-405b/run_offline.sh +++ b/language/llama3-405b/run_offline.sh @@ -1,10 +1,13 @@ CHECKPOINT_PATH="${CHECKPOINT_PATH:Meta-Llama-3.1-405B-Instruct}" -DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}" +DATASET_PATH="${DATASET_PATH:mlperf_llama3.1_405b_dataset_8318.pkl}" python -u main.py --scenario Offline \ - --model-path ${CHECKPOINT_PATH} \ - --mlperf-conf mlperf.conf \ - --user-conf user.conf \ - --total-sample-count 8312 \ - --dataset-path ${DATASET_PATH} \ - --device cpu 2>&1 | tee server_log.log + --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ + --dtype float16 \ + --user-conf user.conf \ + --total-sample-count 8313 \ + --dataset-path ${DATASET_PATH} \ + --output-log-dir output \ + --tensor-parallel-size ${GPU_COUNT} \ + --vllm 2>&1 | tee offline.log diff --git a/language/llama3-405b/run_server.sh b/language/llama3-405b/run_server.sh index fe2a31c43..010a359de 100644 --- a/language/llama3-405b/run_server.sh +++ b/language/llama3-405b/run_server.sh @@ -1,12 +1,15 @@ CHECKPOINT_PATH="${CHECKPOINT_PATH:Meta-Llama-3.1-405B-Instruct}" -DATASET_PATH="${DATASET_PATH:-open-orca-val-set.pkl}" +DATASET_PATH="${DATASET_PATH:mlperf_llama3.1_405b_dataset_8318.pkl}" python -u main.py --scenario Server \ - --model-path ${CHECKPOINT_PATH} \ - --mlperf-conf mlperf.conf \ - --user-conf user.conf \ - --total-sample-count 8312 \ - --dataset-path ${DATASET_PATH} \ - --device cpu 2>&1 | tee server_log.log + --model-path ${CHECKPOINT_PATH} \ + --batch-size 16 \ + --dtype float16 \ + --user-conf user.conf \ + --total-sample-count 8313 \ + --dataset-path ${DATASET_PATH} \ + --output-log-dir output \ + --tensor-parallel-size ${GPU_COUNT} \ + --vllm 2>&1 | tee server.log diff --git a/language/llama3-405b/with_the_same_user b/language/llama3-405b/with_the_same_user new file mode 100755 index 000000000..cfa57902f --- /dev/null +++ b/language/llama3-405b/with_the_same_user @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# wkong: manually set the user info in env first + +set -ex + +if [ -z "$@" ]; then + COMMAND=(bash) +else + COMMAND=("$@") +fi + +apt-get update && apt-get install -y sudo + +getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}" +getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" --disabled-password --quiet "${CI_BUILD_USER}" + +usermod -a -G dip "${CI_BUILD_USER}" +usermod -a -G sudo "${CI_BUILD_USER}" +usermod -a -G root "${CI_BUILD_USER}" + +echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers + +sudo -H -u "#${CI_BUILD_UID}" --preserve-env \ + PATH="${PATH}" \ + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" \ + PYTHONPATH="${PYTHONPATH}" \ + ${COMMAND[@]} diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt index 
6b244dcd6..50e2274e6 100644 --- a/loadgen/VERSION.txt +++ b/loadgen/VERSION.txt @@ -1 +1 @@ -5.0.1 +5.0.3 diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 31ad5ef62..95cc08351 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -14,7 +14,7 @@ dlrm-v2.*.performance_sample_count_override = 204800 rnnt.*.performance_sample_count_override = 2513 gptj.*.performance_sample_count_override = 13368 llama2-70b.*.performance_sample_count_override = 24576 -llama3-405b.*.performance_sample_count_override = 8312 +llama3-405b.*.performance_sample_count_override = 8313 stable-diffusion-xl.*.performance_sample_count_override = 5000 rgat.*.performance_sample_count_override = 788379 # set to 0 to let entire sample set to be performance sample @@ -42,6 +42,9 @@ retinanet.MultiStream.target_latency = 528 # 3D-UNet uses equal issue mode because it has non-uniform inputs 3d-unet.*.sample_concatenate_permutation = 1 +# R-GAT uses equal issue mode because it may have non-uniform inputs +rgat.*.sample_concatenate_permutation = 1 + # LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario gptj.*.sample_concatenate_permutation = 1 llama2-70b.*.sample_concatenate_permutation = 1 @@ -84,8 +87,8 @@ llama3-405b.Server.tpot_latency = 175 *.Offline.min_duration = 600000 # In Offline scenario, we always have one query. But LoadGen maps this to -# min_sample_count internally in Offline scenario. If the dataset size is larger -# than 24576 we limit the min_query_count to 24576 and otherwise we use +# min_sample_count internally in Offline scenario. If the dataset size is larger +# than 24576 we limit the min_query_count to 24576 and otherwise we use # the dataset size as the limit resnet50.Offline.min_query_count = 24576 @@ -97,7 +100,7 @@ rnnt.Offline.min_query_count = 2513 3d-unet.Offline.min_query_count = 43 stable-diffusion-xl.Offline.min_query_count = 5000 llama2-70b.Offline.min_query_count = 24576 -llama3-405b.Offline.min_query_count = 8312 +llama3-405b.Offline.min_query_count = 8313 mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 diff --git a/main.py b/main.py index c8c64b8c3..6a34587dd 100755 --- a/main.py +++ b/main.py @@ -239,7 +239,8 @@ def mlperf_inference_implementation_readme( common_info = get_common_info( spaces + 16, - implementation + implementation, + model.lower() ) if ( @@ -488,7 +489,7 @@ def get_venv_command(spaces): # contains run command information which is common to both docker and # native runs - def get_common_info(spaces, implementation): + def get_common_info(spaces, implementation, model): info = "" pre_space = "" for i in range(1, spaces): @@ -496,7 +497,11 @@ def get_common_info(spaces, implementation): pre_space += " " # pre_space = " " info += f"\n{pre_space}!!! tip\n\n" + info += f"{pre_space} - Number of threads could be adjusted using `--threads=#`, where `#` is the desired number of threads. This option works only if the implementation in use supports threading.\n\n" info += f"{pre_space} - Batch size could be adjusted using `--batch_size=#`, where `#` is the desired batch size. This option works only if the implementation in use is supporting the given batch size.\n\n" + if model == "rgat": + info += f"{pre_space} - Add `--env.CM_DATASET_IGBH_PATH=` if you have already downloaded the dataset. The path will be automatically mounted when using docker run.\n\n" + info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. 
The path will be automatically mounted when using docker run.\n\n" if implementation.lower() == "reference": info += f"{pre_space} - Add `--adr.mlperf-implementation.tags=_branch.master,_repo.` if you are modifying the official MLPerf Inference implementation in a custom fork.\n\n" info += f"{pre_space} - Add `--adr.inference-src.tags=_repo.` if you are modifying the model config accuracy script in the submission checker within a custom fork.\n\n" diff --git a/mkdocs.yml b/mkdocs.yml index 95dfb6e86..96bcfb758 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,6 +42,8 @@ nav: - MIXTRAL-8x7B: benchmarks/language/mixtral-8x7b.md - Recommendation: - DLRM-v2: benchmarks/recommendation/dlrm-v2.md + - Graph Neural Networks: + - R-GAT: benchmarks/graph/rgat.md - Install CM: - install/index.md - Submission: diff --git a/text_to_image/backend_pytorch.py b/text_to_image/backend_pytorch.py index 027e15565..8e52e0a61 100644 --- a/text_to_image/backend_pytorch.py +++ b/text_to_image/backend_pytorch.py @@ -387,7 +387,7 @@ def predict(self, inputs): pooled_prompt_embeds, negative_pooled_prompt_embeds, ) = self.prepare_inputs(inputs, i) - with lock: + with self.lock: generated = self.pipe( prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index 5e5d22c45..34ae82fb1 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -160,39 +160,72 @@ def main(): ], ] - filter_scenarios = { - "datacenter": { - "resnet": ["Server", "Offline"], - "retinanet": ["Server", "Offline"], - "rnnt": ["Server", "Offline"], - "bert-99": ["Server", "Offline"], - "bert-99.9": ["Server", "Offline"], - "dlrm-v2-99": ["Server", "Offline"], - "dlrm-v2-99.9": ["Server", "Offline"], - "3d-unet-99": ["Offline"], - "3d-unet-99.9": ["Offline"], - "gptj-99": ["Server", "Offline"], - "gptj-99.9": ["Server", "Offline"], - "stable-diffusion-xl": ["Server", "Offline"], - "llama2-70b-99": ["Server", "Offline"], - "llama2-70b-99.9": ["Server", "Offline"], - "mixtral-8x7b": ["Server", "Offline"], - }, - "edge": { - "resnet": ["SingleStream", "MultiStream", "Offline"], - "retinanet": ["SingleStream", "MultiStream", "Offline"], - "rnnt": ["SingleStream", "Offline"], - "bert-99": ["SingleStream", "Offline"], - "bert-99.9": [], - "dlrm-v2-99": [], - "dlrm-v2-99.9": [], - "3d-unet-99": ["SingleStream", "Offline"], - "3d-unet-99.9": ["SingleStream", "Offline"], - "gptj-99": ["SingleStream", "Offline"], - "gptj-99.9": ["SingleStream", "Offline"], - "stable-diffusion-xl": ["SingleStream", "Offline"], - }, - } + if args.version == "4.1": + filter_scenarios = { + "datacenter": { + "resnet": ["Server", "Offline"], + "retinanet": ["Server", "Offline"], + "rnnt": ["Server", "Offline"], + "bert-99": ["Server", "Offline"], + "bert-99.9": ["Server", "Offline"], + "dlrm-v2-99": ["Server", "Offline"], + "dlrm-v2-99.9": ["Server", "Offline"], + "3d-unet-99": ["Offline"], + "3d-unet-99.9": ["Offline"], + "gptj-99": ["Server", "Offline"], + "gptj-99.9": ["Server", "Offline"], + "stable-diffusion-xl": ["Server", "Offline"], + "llama2-70b-99": ["Server", "Offline"], + "llama2-70b-99.9": ["Server", "Offline"], + "mixtral-8x7b": ["Server", "Offline"], + }, + "edge": { + "resnet": ["SingleStream", "MultiStream", "Offline"], + "retinanet": ["SingleStream", "MultiStream", "Offline"], + "rnnt": ["SingleStream", "Offline"], + "bert-99": ["SingleStream", "Offline"], + "bert-99.9": [], + "dlrm-v2-99": [], + 
"dlrm-v2-99.9": [], + "3d-unet-99": ["SingleStream", "Offline"], + "3d-unet-99.9": ["SingleStream", "Offline"], + "gptj-99": ["SingleStream", "Offline"], + "gptj-99.9": ["SingleStream", "Offline"], + "stable-diffusion-xl": ["SingleStream", "Offline"], + }, + } + else: + filter_scenarios = { + "datacenter": { + "resnet": ["Server", "Offline"], + "retinanet": ["Server", "Offline"], + "rnnt": ["Server", "Offline"], + "dlrm-v2-99": ["Server", "Offline"], + "dlrm-v2-99.9": ["Server", "Offline"], + "3d-unet-99": ["Offline"], + "3d-unet-99.9": ["Offline"], + "gptj-99": ["Server", "Offline"], + "gptj-99.9": ["Server", "Offline"], + "stable-diffusion-xl": ["Server", "Offline"], + "llama2-70b-99": ["Server", "Offline"], + "llama2-70b-99.9": ["Server", "Offline"], + "mixtral-8x7b": ["Server", "Offline"], + "rgat": ["Offline"], + "llama3-405b": ["Offline", "Server"] + }, + "edge": { + "resnet": ["SingleStream", "MultiStream", "Offline"], + "retinanet": ["SingleStream", "MultiStream", "Offline"], + "rnnt": ["SingleStream", "Offline"], + "bert-99": ["SingleStream", "Offline"], + "bert-99.9": ["SingleStream", "Offline"], + "3d-unet-99": ["SingleStream", "Offline"], + "3d-unet-99.9": ["SingleStream", "Offline"], + "gptj-99": ["SingleStream", "Offline"], + "gptj-99.9": ["SingleStream", "Offline"], + "stable-diffusion-xl": ["SingleStream", "Offline"], + }, + } def MakeWorksheet(df, index, filter_dict, sheet_name, outjsondata=[]): for key, value in filter_dict.items(): diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 51f8c7aab..dcdad1180 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -1,4 +1,4 @@ -"""A checker for MLPerf Inference submissions from v4.0 onwards (for checking older submissions please use the submission checker from the respective release) +"""A checker for MLPerf Inference submissions from v4.1 onwards (for checking older submissions please use the submission checker from the respective release) """ from __future__ import division @@ -196,13 +196,11 @@ "resnet50": "resnet", }, "seeds": { - # TODO: Update random seeds "qsl_rng_seed": 3066443479025735752, "sample_index_rng_seed": 10688027786191513374, "schedule_rng_seed": 14962580496156340209, }, "test05_seeds": { - # TODO: Update random seeds "qsl_rng_seed": 16799458546791641818, "sample_index_rng_seed": 5453809927556429288, "schedule_rng_seed": 5435552105434836064, @@ -220,8 +218,7 @@ "llama2-70b-99": {"Server": 20000000000}, "llama2-70b-99.9": {"Server": 20000000000}, "stable-diffusion-xl": {"Server": 20000000000}, - # TODO: Mixtral metrics - # "mixtral-8x7b" : {"Server": 20000000000} + "mixtral-8x7b": {"Server": 20000000000} }, "min-queries": { "resnet": { @@ -260,7 +257,6 @@ "retinanet", "bert-99", "bert-99.9", - # TODO: remove dlrm? "dlrm-v2-99", "dlrm-v2-99.9", "3d-unet-99", @@ -271,10 +267,9 @@ "llama2-70b-99.9", "stable-diffusion-xl", "mixtral-8x7b", - "llama3-405b-99", - "llama3-405b-99.9", + "llama3-405b", "rgat", - # TODO: add automotive + # TODO: add automotive? 
], "required-scenarios-datacenter": { "resnet": ["Server", "Offline"], @@ -289,8 +284,7 @@ "llama2-70b-99.9": ["Server", "Offline"], "stable-diffusion-xl": ["Server", "Offline"], "mixtral-8x7b": ["Server", "Offline"], - "llama3-405b-99": ["Server", "Offline"], - "llama3-405b-99.9": ["Server", "Offline"], + "llama3-405b": ["Server", "Offline"], "rgat": ["Offline"], }, "optional-scenarios-datacenter": {}, @@ -298,6 +292,7 @@ "resnet": ["SingleStream", "MultiStream", "Offline"], "retinanet": ["SingleStream", "MultiStream", "Offline"], "bert-99": ["SingleStream", "Offline"], + "bert-99.9": ["SingleStream", "Offline"], "3d-unet-99": ["SingleStream", "Offline"], "3d-unet-99.9": ["SingleStream", "Offline"], "gptj-99": ["SingleStream", "Offline"], @@ -308,8 +303,8 @@ "required-scenarios-datacenter-edge": { "resnet": ["SingleStream", "Offline", "MultiStream", "Server"], "retinanet": ["SingleStream", "Offline", "MultiStream", "Server"], - "bert-99": ["SingleStream", "Offline", "Server"], - "bert-99.9": ["Offline", "Server"], + "bert-99": ["SingleStream", "Offline"], + "bert-99.9": ["SingleStream", "Offline"], "dlrm-v2-99": ["Offline", "Server"], "dlrm-v2-99.9": ["Offline", "Server"], "3d-unet-99": ["SingleStream", "Offline"], @@ -320,8 +315,7 @@ "llama2-70b-99.9": ["Server", "Offline"], "stable-diffusion-xl": ["SingleStream", "Offline", "Server"], "mixtral-8x7b": ["Server", "Offline"], - "llama3-405b-99": ["Server", "Offline"], - "llama3-405b-99.9": ["Server", "Offline"], + "llama3-405b": ["Server", "Offline"], "rgat": ["Offline"], }, "optional-scenarios-datacenter-edge": {}, @@ -395,22 +389,13 @@ "mbxp_accuracy", 60.12 * 0.99, ), - # TODO: Get llama3 metrics - "llama3-405b-99": ( + "llama3-405b": ( "ROUGEL", - 1 * 0.99, + 21.6666 * 0.99, "exact_match", - 1 * 0.99, + 90.1335 * 0.99, "TOKENS_PER_SAMPLE", - 1000 * 0.9, - ), - "llama3-405b-99.9": ( - "ROUGEL", - 1 * 0.99, - "exact_match", - 1 * 0.99, - "TOKENS_PER_SAMPLE", - 20000 * 0.9, + 684.68 * 0.9, ), "rgat": ("acc", 0.7286 * 0.99), }, @@ -424,8 +409,7 @@ "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1), - "llama3-405b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), - "llama3-405b-99.9": ("TOKENS_PER_SAMPLE", 20000 * 1.1), + "llama3-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1), }, "accuracy-delta-perc": { "stable-diffusion-xl": {"CLIP_SCORE": 1, "FID_SCORE": 2} @@ -445,15 +429,15 @@ "llama2-70b-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, - "llama3-405b-99": 8312, - "llama3-405b-99.9": 8312, + "llama3-405b": 8313, "rgat": 788379 }, - # TODO: Update this list. 
+ # model_mapping.json is expected in the root directory of the + # submission folder for open submissions and so the below dictionary is + # not really needed "model_mapping": { # map model names to the official mlperf model class - "ssd-resnet34": "retinanet", "mobilenet": "resnet", "resnet50": "resnet", }, @@ -463,23 +447,19 @@ "sample_index_rng_seed": 10688027786191513374, "schedule_rng_seed": 14962580496156340209, }, - "test05_seeds": { - # TODO: Update random seeds - "qsl_rng_seed": 16799458546791641818, - "sample_index_rng_seed": 5453809927556429288, - "schedule_rng_seed": 5435552105434836064, - }, "ignore_errors": [], "latency-constraint": { "resnet": {"Server": 15000000}, "retinanet": {"Server": 100000000}, - "bert-99": {"Server": 130000000}, - "bert-99.9": {"Server": 130000000}, "dlrm-v2-99": {"Server": 60000000}, "dlrm-v2-99.9": {"Server": 60000000}, "gptj-99": {"Server": 20000000000}, "gptj-99.9": {"Server": 20000000000}, "stable-diffusion-xl": {"Server": 20000000000}, + "llama2-70b-99": {"Server": 20000000000}, + "llama2-70b-99.9": {"Server": 20000000000}, + "mixtral-8x7b": {"Server": 20000000000}, + "llama3-405b": {"Server": 60000000000} }, "min-queries": { "resnet": { @@ -494,8 +474,8 @@ "Server": 270336, "Offline": 1, }, - "bert-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "bert-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "bert-99": {"SingleStream": 1024, "Offline": 1}, + "bert-99.9": {"SingleStream": 1024, "Offline": 1}, "dlrm-v2-99": {"Server": 270336, "Offline": 1}, "dlrm-v2-99.9": {"Server": 270336, "Offline": 1}, "3d-unet-99": {"SingleStream": 1024, "Offline": 1}, @@ -510,9 +490,8 @@ "Offline": 1, }, "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "llama3-405b-99": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "llama3-405b-99.9": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "rgat": {"SingleStream": 1024, "Server": 270336, "Offline": 1} + "llama3-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "rgat": {"SingleStream": 1024, "Offline": 1} }, }, } @@ -600,8 +579,7 @@ "llama2-70b-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, - "llama3-405b-99": 8312, - "llama2-405b-99.9": 8312, + "llama3-405b": 8313, "rgat": 788379, } @@ -620,17 +598,15 @@ } RESULT_FIELD_NEW = { - "v4.0": { + "v4.1": { "Offline": "result_samples_per_second", "SingleStream": "early_stopping_latency_ss", - "MultiStreamLegacy": "effective_samples_per_query", "MultiStream": "early_stopping_latency_ms", - "Server": "result_scheduled_samples_per_sec", + "Server": "result_completed_samples_per_sec", }, - "v4.1": { + "v5.0": { "Offline": "result_samples_per_second", "SingleStream": "early_stopping_latency_ss", - "MultiStreamLegacy": "effective_samples_per_query", "MultiStream": "early_stopping_latency_ms", "Server": "result_completed_samples_per_sec", }, @@ -680,11 +656,7 @@ "Offline": "result_tokens_per_second", "Server": "result_completed_tokens_per_second", }, - "llama3-405b-99": { - "Offline": "result_tokens_per_second", - "Server": "result_completed_tokens_per_second", - }, - "llama3-405b-99.9": { + "llama3-405b": { "Offline": "result_tokens_per_second", "Server": "result_completed_tokens_per_second", }, @@ -699,16 +671,13 @@ "conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000} }, "mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}}, - "llama3-405b-99": { - "conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000} - }, - 
"llama3-405b-99.9": { + "llama3-405b": { "conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000} }, } ACC_PATTERN = { - "acc": r"^accuracy=([\d\.]+).*", + "acc": r"^(?:\{\"accuracy|accuracy)[\": ]*=?\s*([\d\.]+).*", "AUC": r"^AUC=([\d\.]+).*", "mAP": r"^mAP=([\d\.]+).*", "bleu": r"^BLEU\:\s*([\d\.]+).*", @@ -857,7 +826,8 @@ def __init__( self.version = version self.models = self.base["models"] self.seeds = self.base["seeds"] - self.test05_seeds = self.base["test05_seeds"] + if self.base.get("test05_seeds"): + self.test05_seeds = self.base["test05_seeds"] self.accuracy_target = self.base["accuracy-target"] self.accuracy_delta_perc = self.base["accuracy-delta-perc"] self.accuracy_upper_limit = self.base.get("accuracy-upper-limit", {}) @@ -986,8 +956,8 @@ def requires_equal_issue(self, model, division): "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b-99", - "llama3-405b-99.9", + "llama3-405b", + "rgat", ] and self.version not in ["v4.0", "v4.1"] ) @@ -999,7 +969,7 @@ def get_args(): parser.add_argument("--input", required=True, help="submission directory") parser.add_argument( "--version", - default="v4.1", + default="v5.0", choices=list(MODEL_CONFIG.keys()), help="mlperf version", ) @@ -1355,7 +1325,7 @@ def check_performance_dir( ) if model in ["llama2-70b-99", "llama2-70b-99.9", - "mixtral-8x7b", "llama3-405b-99", "llama3-405b-99.9"]: + "mixtral-8x7b", "llama3-405b"]: llama_constraint, is_valid = extra_check_llm( mlperf_log, scenario_fixed, model) @@ -1895,13 +1865,7 @@ def log_result( "Offline": "Tokens/s", "Server": "Tokens/s", }, - "llama3-405b-99": { - "SingleStream": "Latency (ms)", - "MultiStream": "Latency (ms)", - "Offline": "Tokens/s", - "Server": "Tokens/s", - }, - "llama3-405b-99.9": { + "llama3-405b": { "SingleStream": "Latency (ms)", "MultiStream": "Latency (ms)", "Offline": "Tokens/s", @@ -2986,8 +2950,7 @@ def check_compliance_dir( "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b-99", - "llama3-405b-99.9", + "llama3-405b", "rgat", ]: test_list.remove("TEST04") @@ -3008,8 +2971,7 @@ def check_compliance_dir( "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b-99", - "llama3-405b-99.9", + "llama3-405b", ]: test_list.remove("TEST01") @@ -3018,7 +2980,7 @@ def check_compliance_dir( test_list.remove("TEST04") if model in ["llama2-70b-99", "llama2-70b-99.9", - "mixtral-8x7b", "llama3-405b-99", "llama3-405b-99.9"]: + "mixtral-8x7b", "llama3-405b"]: test_list.append("TEST06") if test_list and not os.path.exists(compliance_dir):