From 3a33e9248ed13b575b2a3fc2f652d253b9367db4 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 11 Mar 2024 12:31:55 -0700 Subject: [PATCH 01/30] fix(build): call AC_USE_SYSTEM_EXTENSIONS, AC_LANG([C]) prior to this commit, some (but not all) files #define _GNU_SOURCE individually. Mixing gnu/posix extensions at a per-TU level is a recipe for disaster, ABI incompabitilities between files in the same project is never your first guess. Opt into extensions once and for all at the top for consistency, and also add AC_LANG([C]) to ensure that the compiler isn't guessing whether it's a C++ or C source file. --- configure.ac | 4 +++- src/nccl_ofi_net.c | 1 - src/nccl_ofi_ofiutils.c | 1 - src/platform-aws.c | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 15f7e5e6f..eb82badf3 100644 --- a/configure.ac +++ b/configure.ac @@ -6,8 +6,9 @@ # # Initialization -AC_INIT([aws-ofi-nccl], [GitHub-dev], [al-ofi-nccl-team@amazon.com], , [http://github.com/aws/aws-ofi-nccl]) +AC_INIT([aws-ofi-nccl],[GitHub-dev],[al-ofi-nccl-team@amazon.com],[],[http://github.com/aws/aws-ofi-nccl]) AC_PREREQ([2.69]) +AC_LANG([C]) AC_CONFIG_SRCDIR([src/nccl_ofi_net.c]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_HEADERS([config.h]) @@ -29,6 +30,7 @@ NCCL_NET_OFI_DISTCHCK_CONFIGURE_FLAGS= # Checks for programs AC_PROG_CC m4_version_prereq([2.70], [], [AC_PROG_CC_STDC]) +AC_USE_SYSTEM_EXTENSIONS AC_C_INLINE dnl ac_check_enable_debug will make sure AC_PROG_CC doesn't add a diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c index 87af1c9fc..5effa8329 100644 --- a/src/nccl_ofi_net.c +++ b/src/nccl_ofi_net.c @@ -5,7 +5,6 @@ #include "config.h" -#define _GNU_SOURCE #include #include #include diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c index 87e7abac0..0a44685a9 100644 --- a/src/nccl_ofi_ofiutils.c +++ b/src/nccl_ofi_ofiutils.c @@ -5,7 +5,6 @@ #include "config.h" -#define _GNU_SOURCE #include #include #include diff --git a/src/platform-aws.c b/src/platform-aws.c index b79ec3c50..a885cb099 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -5,7 +5,6 @@ #include "config.h" -#define _GNU_SOURCE #include #include #include From 9aeb73a8720dd025d154b3068aa55b91712292cb Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 17:20:19 -0700 Subject: [PATCH 02/30] build: delete config.h, pass defines directly This commits parent set _GNU_SOURCE globally via automake to opt into gnu extensions for the entire project, rather than directly setting it inline in several (but not all) files. The rationale behind this was that conditionally enabling gnu extensions can lead to very hard to debug issues where different compilation units are using different (potentially abi incompatible) implementations. Similarly, not all files in our tree include `config.h' (or in some cases, TUs were including the file but not placing it at the very top, which defeats the purpose of eg: feature test macros.) Our config is ultimately very small (on the order of a dozen or so "actual" options, well within reasonable command line argument max length constraints) and can be passed directly in the compiler command line arguments instead. Do this instead so we don't have to worry about this footgun. --- configure.ac | 1 - include/nccl_ofi_tracepoint.h | 1 - src/nccl_ofi_api.c | 1 - src/nccl_ofi_cuda.c | 1 - src/nccl_ofi_deque.c | 1 - src/nccl_ofi_freelist.c | 1 - src/nccl_ofi_idpool.c | 1 - src/nccl_ofi_interface_neuron.c | 1 - src/nccl_ofi_interface_nvidia.c | 1 - src/nccl_ofi_msgbuff.c | 1 - src/nccl_ofi_net.c | 1 - src/nccl_ofi_ofiutils.c | 1 - src/nccl_ofi_rdma.c | 1 - src/nccl_ofi_scheduler.c | 1 - src/nccl_ofi_sendrecv.c | 1 - src/nccl_ofi_topo.c | 1 - src/platform-aws.c | 1 - src/tracepoint.c | 1 - src/tuner/nccl_ofi_model.c | 1 - src/tuner/nccl_ofi_tuner.c | 1 - tests/functional/cuda_check.c | 1 - tests/functional/nccl_connection.c | 1 - tests/functional/nccl_message_transfer.c | 1 - tests/functional/ring.c | 1 - tests/functional/test-common.h | 1 - tests/unit/deque.c | 1 - tests/unit/freelist.c | 1 - tests/unit/idpool.c | 1 - tests/unit/msgbuff.c | 1 - tests/unit/scheduler.c | 1 - 30 files changed, 30 deletions(-) diff --git a/configure.ac b/configure.ac index eb82badf3..db5126ff2 100644 --- a/configure.ac +++ b/configure.ac @@ -11,7 +11,6 @@ AC_PREREQ([2.69]) AC_LANG([C]) AC_CONFIG_SRCDIR([src/nccl_ofi_net.c]) AC_CONFIG_AUX_DIR([build-aux]) -AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIRS([m4]) AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects tar-pax]) diff --git a/include/nccl_ofi_tracepoint.h b/include/nccl_ofi_tracepoint.h index 1e1ac0b16..3b596dc62 100644 --- a/include/nccl_ofi_tracepoint.h +++ b/include/nccl_ofi_tracepoint.h @@ -6,7 +6,6 @@ #ifndef NCCL_OFI_TRACEPOINT_H_ #define NCCL_OFI_TRACEPOINT_H_ -#include "config.h" #include "tracing_impl/nvtx.h" #include "tracing_impl/lttng.h" diff --git a/src/nccl_ofi_api.c b/src/nccl_ofi_api.c index 41c1f8c82..0faebe81e 100644 --- a/src/nccl_ofi_api.c +++ b/src/nccl_ofi_api.c @@ -3,7 +3,6 @@ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. */ -#include "config.h" #include "nccl_ofi.h" #include "nccl_ofi_api.h" diff --git a/src/nccl_ofi_cuda.c b/src/nccl_ofi_cuda.c index 2ce1c1261..8e0523b34 100644 --- a/src/nccl_ofi_cuda.c +++ b/src/nccl_ofi_cuda.c @@ -3,7 +3,6 @@ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. */ -#include "config.h" #include diff --git a/src/nccl_ofi_deque.c b/src/nccl_ofi_deque.c index 94bc1365d..494f65ac3 100644 --- a/src/nccl_ofi_deque.c +++ b/src/nccl_ofi_deque.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include diff --git a/src/nccl_ofi_freelist.c b/src/nccl_ofi_freelist.c index 95ba68b59..dfb762fec 100644 --- a/src/nccl_ofi_freelist.c +++ b/src/nccl_ofi_freelist.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include diff --git a/src/nccl_ofi_idpool.c b/src/nccl_ofi_idpool.c index b3c0bcda4..36bb52edb 100644 --- a/src/nccl_ofi_idpool.c +++ b/src/nccl_ofi_idpool.c @@ -2,7 +2,6 @@ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include "nccl_ofi.h" #include "nccl_ofi_idpool.h" diff --git a/src/nccl_ofi_interface_neuron.c b/src/nccl_ofi_interface_neuron.c index 697ae8d8d..d6a7743f2 100644 --- a/src/nccl_ofi_interface_neuron.c +++ b/src/nccl_ofi_interface_neuron.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include "nccl_ofi.h" #include "nccl_ofi_api.h" diff --git a/src/nccl_ofi_interface_nvidia.c b/src/nccl_ofi_interface_nvidia.c index 4d1d12300..c10bec99b 100644 --- a/src/nccl_ofi_interface_nvidia.c +++ b/src/nccl_ofi_interface_nvidia.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include "nccl_ofi.h" #include "nccl_ofi_api.h" diff --git a/src/nccl_ofi_msgbuff.c b/src/nccl_ofi_msgbuff.c index 13183ff29..d3e8653f0 100644 --- a/src/nccl_ofi_msgbuff.c +++ b/src/nccl_ofi_msgbuff.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include #include diff --git a/src/nccl_ofi_net.c b/src/nccl_ofi_net.c index 5effa8329..0197d7232 100644 --- a/src/nccl_ofi_net.c +++ b/src/nccl_ofi_net.c @@ -3,7 +3,6 @@ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. */ -#include "config.h" #include #include diff --git a/src/nccl_ofi_ofiutils.c b/src/nccl_ofi_ofiutils.c index 0a44685a9..fd1a13fd6 100644 --- a/src/nccl_ofi_ofiutils.c +++ b/src/nccl_ofi_ofiutils.c @@ -3,7 +3,6 @@ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. */ -#include "config.h" #include #include diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 5e87cf0ff..5b5ab3924 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -1,7 +1,6 @@ /* * Copyright (c) 2023=2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include #include diff --git a/src/nccl_ofi_scheduler.c b/src/nccl_ofi_scheduler.c index ac1aa8db5..75d5fb636 100644 --- a/src/nccl_ofi_scheduler.c +++ b/src/nccl_ofi_scheduler.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include #include diff --git a/src/nccl_ofi_sendrecv.c b/src/nccl_ofi_sendrecv.c index 3db61c7ce..3dac38238 100644 --- a/src/nccl_ofi_sendrecv.c +++ b/src/nccl_ofi_sendrecv.c @@ -2,7 +2,6 @@ * Copyright (c) 2023-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include #include diff --git a/src/nccl_ofi_topo.c b/src/nccl_ofi_topo.c index 84bd4a144..4b46fed3d 100644 --- a/src/nccl_ofi_topo.c +++ b/src/nccl_ofi_topo.c @@ -2,7 +2,6 @@ * Copyright (c) 2023-2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include #include diff --git a/src/platform-aws.c b/src/platform-aws.c index a885cb099..d831db454 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -3,7 +3,6 @@ * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. */ -#include "config.h" #include #include diff --git a/src/tracepoint.c b/src/tracepoint.c index ce7741f1a..dcba6808e 100644 --- a/src/tracepoint.c +++ b/src/tracepoint.c @@ -2,7 +2,6 @@ * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include #if HAVE_LIBLTTNG_UST == 1 #define TRACEPOINT_CREATE_PROBES diff --git a/src/tuner/nccl_ofi_model.c b/src/tuner/nccl_ofi_model.c index 58ed91b4e..fa2c324c3 100644 --- a/src/tuner/nccl_ofi_model.c +++ b/src/tuner/nccl_ofi_model.c @@ -1,4 +1,3 @@ -#include "config.h" #include #include diff --git a/src/tuner/nccl_ofi_tuner.c b/src/tuner/nccl_ofi_tuner.c index 497599fe2..e1c6baac8 100644 --- a/src/tuner/nccl_ofi_tuner.c +++ b/src/tuner/nccl_ofi_tuner.c @@ -1,4 +1,3 @@ -#include "config.h" #include #include diff --git a/tests/functional/cuda_check.c b/tests/functional/cuda_check.c index c14c3b06b..66ae6c8af 100644 --- a/tests/functional/cuda_check.c +++ b/tests/functional/cuda_check.c @@ -8,7 +8,6 @@ * will fail if a direct cuda call is made from the plugin. */ -#include "config.h" #include "nccl_ofi_api.h" diff --git a/tests/functional/nccl_connection.c b/tests/functional/nccl_connection.c index 34a202fac..f9d9e9138 100644 --- a/tests/functional/nccl_connection.c +++ b/tests/functional/nccl_connection.c @@ -6,7 +6,6 @@ * This test validates functionality of NCCL connection establishment APIs */ -#include "config.h" #include "test-common.h" diff --git a/tests/functional/nccl_message_transfer.c b/tests/functional/nccl_message_transfer.c index 094234335..631325ad8 100644 --- a/tests/functional/nccl_message_transfer.c +++ b/tests/functional/nccl_message_transfer.c @@ -7,7 +7,6 @@ * data transfer APIs */ -#include "config.h" #include "test-common.h" diff --git a/tests/functional/ring.c b/tests/functional/ring.c index d54796de1..54bf9a8f3 100644 --- a/tests/functional/ring.c +++ b/tests/functional/ring.c @@ -2,7 +2,6 @@ * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include "test-common.h" diff --git a/tests/functional/test-common.h b/tests/functional/test-common.h index a46bfeb3b..c2cdaed2c 100644 --- a/tests/functional/test-common.h +++ b/tests/functional/test-common.h @@ -13,7 +13,6 @@ #include "nccl_ofi_log.h" #include "nccl_ofi_param.h" #include "mpi.h" -#include "config.h" #include #include #include diff --git a/tests/unit/deque.c b/tests/unit/deque.c index d061176c6..ec5e367c3 100644 --- a/tests/unit/deque.c +++ b/tests/unit/deque.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include diff --git a/tests/unit/freelist.c b/tests/unit/freelist.c index 34bab0685..920043a64 100644 --- a/tests/unit/freelist.c +++ b/tests/unit/freelist.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include diff --git a/tests/unit/idpool.c b/tests/unit/idpool.c index c66980cf6..e8289a46e 100644 --- a/tests/unit/idpool.c +++ b/tests/unit/idpool.c @@ -2,7 +2,6 @@ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include diff --git a/tests/unit/msgbuff.c b/tests/unit/msgbuff.c index d495d5fa2..6661d0c58 100644 --- a/tests/unit/msgbuff.c +++ b/tests/unit/msgbuff.c @@ -4,7 +4,6 @@ #include -#include "config.h" #include "nccl_ofi_msgbuff.h" diff --git a/tests/unit/scheduler.c b/tests/unit/scheduler.c index 5b9bfa0ec..998604892 100644 --- a/tests/unit/scheduler.c +++ b/tests/unit/scheduler.c @@ -2,7 +2,6 @@ * Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved. */ -#include "config.h" #include From 9144be812fe2390b743662de0ca068754a190801 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 18 Feb 2024 18:37:12 -0800 Subject: [PATCH 03/30] ci(pre-commit): introduce pre-commit pre-commit is a tool for composing portable/generic git hooks. Introduce this to the repository with a small set of base hooks. Update docs with instructions on how to use it. --- .pre-commit-config.yaml | 15 +++++++++++++++ CONTRIBUTING.md | 11 ++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..7fd8bebec --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,15 @@ +--- +fail_fast: false +exclude: | + (?x)^( + include/nccl-headers + ).*$ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: check-merge-conflict + - id: detect-private-key + - id: mixed-line-ending + - id: check-added-large-files + - id: check-shebang-scripts-are-executable diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f68529965..c3b5514fd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -30,11 +30,12 @@ Contributions via pull requests are much appreciated. Before sending us a pull r To send us a pull request, please: 1. Fork the repository. -2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. -3. Ensure local tests pass. -4. Commit to your fork using clear commit messages. -5. Send us a pull request, answering any default questions in the pull request interface. -6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. +2. Install [pre-commit](https://pre-commit.com/) through your preferred mechanism (eg: `pip install`), then run `pre-commit install` **inside this repo**. This will ensure that a collection of [git hooks](./.pre-commit-config.yaml) run on each of your commits, which help minimize your diffs, catch errors automatically, and avoid nitpicking. +3. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. +4. Ensure local tests pass. +5. Commit to your fork using clear commit messages. +6. Send us a pull request, answering any default questions in the pull request interface. +7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). From 850113e0d82ff5c48da5f8858a41d9662fb3372e Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 21:28:28 -0700 Subject: [PATCH 04/30] ci(pre-commit): add yamlfmt Add a pre-commit hook to format yaml files (including pre-commit's config file.) --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7fd8bebec..25b984c35 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,3 +13,7 @@ repos: - id: mixed-line-ending - id: check-added-large-files - id: check-shebang-scripts-are-executable + - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt + rev: 0.2.3 + hooks: + - id: yamlfmt From 3d8a9d1ffed34da06581ec2bd971f03d41dcb6ac Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 21:29:51 -0700 Subject: [PATCH 05/30] nit: apply yamlfmt fixes to tree Apply yamlfmt to files under .ci and .github. This is split into a separate commit so that it can be hidden from git blame. --- .ci/aws/aws_ofi_nccl_pr_ci.yaml | 17 +-- .github/workflows/distcheck.yaml | 211 ++++++++++++++++--------------- 2 files changed, 115 insertions(+), 113 deletions(-) diff --git a/.ci/aws/aws_ofi_nccl_pr_ci.yaml b/.ci/aws/aws_ofi_nccl_pr_ci.yaml index 49339e927..9da8860e9 100644 --- a/.ci/aws/aws_ofi_nccl_pr_ci.yaml +++ b/.ci/aws/aws_ofi_nccl_pr_ci.yaml @@ -1,11 +1,12 @@ +--- general: - timeout: 180 - enable_live_log: true + timeout: 180 + enable_live_log: true cluster: - cluster_type: manual_cluster + cluster_type: manual_cluster testing: - test_target: aws-ofi-nccl - test_type: pr - aws_ofi_nccl_build_type: debug - test_list: - - test_nccl_test + test_target: aws-ofi-nccl + test_type: pr + aws_ofi_nccl_build_type: debug + test_list: + - test_nccl_test diff --git a/.github/workflows/distcheck.yaml b/.github/workflows/distcheck.yaml index ac309f231..8d3800e1c 100644 --- a/.github/workflows/distcheck.yaml +++ b/.github/workflows/distcheck.yaml @@ -1,121 +1,122 @@ +--- name: PR CI on: [push, pull_request] env: - APT_PACKAGES: >- - build-essential - clang - gcc - git - libhwloc-dev - make + APT_PACKAGES: >- + build-essential + clang + gcc + git + libhwloc-dev + make jobs: - distcheck: - runs-on: ubuntu-22.04 - strategy: - matrix: - cc: - - gcc - - clang - sdk: - - cuda - - neuron - fail-fast: false - steps: - - uses: actions/checkout@v4 + distcheck: + runs-on: ubuntu-22.04 + strategy: + matrix: + cc: + - gcc + - clang + sdk: + - cuda + - neuron + fail-fast: false + steps: + - uses: actions/checkout@v4 - - name: Install Dependencies - run: | - sudo apt-get update -y - sudo apt-get install -y ${{ env.APT_PACKAGES }} - - name: Install CUDA SDK - if: matrix.sdk == 'cuda' - run: | - sudo apt-get install -y nvidia-cuda-toolkit - - name: Install Neuron SDK - if: matrix.sdk == 'neuron' - run: | - # Configure Linux for Neuron repository updates - sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null << EOF - deb https://apt.repos.neuron.amazonaws.com jammy main - EOF - wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - - sudo apt update -y + - name: Install Dependencies + run: | + sudo apt-get update -y + sudo apt-get install -y ${{ env.APT_PACKAGES }} + - name: Install CUDA SDK + if: matrix.sdk == 'cuda' + run: | + sudo apt-get install -y nvidia-cuda-toolkit + - name: Install Neuron SDK + if: matrix.sdk == 'neuron' + run: | + # Configure Linux for Neuron repository updates + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null << EOF + deb https://apt.repos.neuron.amazonaws.com jammy main + EOF + wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - + sudo apt update -y - # Install Neuron Runtime - sudo apt-get install aws-neuronx-runtime-lib -y + # Install Neuron Runtime + sudo apt-get install aws-neuronx-runtime-lib -y - - name: Install Libfabric - run: | - # We're just doing distchecks, so it is fine if we - # just grab the latest master and built a lean build. - git clone --depth 1 https://github.com/ofiwg/libfabric.git - pushd libfabric - ./autogen.sh - ./configure --prefix=$PWD/install \ - --disable-sockets \ - --disable-udp \ - --disable-mrail \ - --disable-opx \ - CC=${{ matrix.cc }} - make -j $(nproc) - make install - popd + - name: Install Libfabric + run: | + # We're just doing distchecks, so it is fine if we + # just grab the latest master and built a lean build. + git clone --depth 1 https://github.com/ofiwg/libfabric.git + pushd libfabric + ./autogen.sh + ./configure --prefix=$PWD/install \ + --disable-sockets \ + --disable-udp \ + --disable-mrail \ + --disable-opx \ + CC=${{ matrix.cc }} + make -j $(nproc) + make install + popd - - name: Build Plugin - run: | - set -x + - name: Build Plugin + run: | + set -x - # actions/checkout@v4 would drop the plugin source in $PWD, - # so go ahead and build it. - ./autogen.sh - if [ ${{ matrix.sdk }} == "cuda" ] - then - ./configure --with-libfabric=$PWD/libfabric/install \ - --with-cuda=/usr/local/cuda/ \ - --enable-platform-aws \ - CC=${{ matrix.cc }} - else - ./configure --with-libfabric=$PWD/libfabric/install \ - --enable-neuron \ - --enable-platform-aws \ - CC=${{ matrix.cc }} - fi - make -j $(nproc) + # actions/checkout@v4 would drop the plugin source in $PWD, + # so go ahead and build it. + ./autogen.sh + if [ ${{ matrix.sdk }} == "cuda" ] + then + ./configure --with-libfabric=$PWD/libfabric/install \ + --with-cuda=/usr/local/cuda/ \ + --enable-platform-aws \ + CC=${{ matrix.cc }} + else + ./configure --with-libfabric=$PWD/libfabric/install \ + --enable-neuron \ + --enable-platform-aws \ + CC=${{ matrix.cc }} + fi + make -j $(nproc) - - name: Run Dist Check - run: make distcheck + - name: Run Dist Check + run: make distcheck - - name: Upload build logs - if: failure() - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.cc }}-config.log - path: config.log + - name: Upload build logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.cc }}-config.log + path: config.log - - uses: actions/setup-python@v5 - if: matrix.cc == 'clang' - with: - python-version: '3.9' + - uses: actions/setup-python@v5 + if: matrix.cc == 'clang' + with: + python-version: '3.9' - - name: Run CodeChecker - if: matrix.cc == 'clang' - uses: whisperity/codechecker-analysis-action@v1 - id: codechecker - with: + - name: Run CodeChecker + if: matrix.cc == 'clang' + uses: whisperity/codechecker-analysis-action@v1 + id: codechecker + with: # clean and rebuild so that compile_commands.json can be detected - build-command: "make clean && make" - ctu: true + build-command: make clean && make + ctu: true - - name: Save CodeChecker HTML output. - if: matrix.cc == 'clang' - uses: actions/upload-artifact@v4 - with: - name: "CodeChecker Bug Reports for ${{ matrix.sdk }}" - path: ${{ steps.codechecker.outputs.result-html-dir }}/*.html + - name: Save CodeChecker HTML output. + if: matrix.cc == 'clang' + uses: actions/upload-artifact@v4 + with: + name: CodeChecker Bug Reports for ${{ matrix.sdk }} + path: ${{ steps.codechecker.outputs.result-html-dir }}/*.html - - name: CodeChecker Pass Or Fail? - if: matrix.cc == 'clang' && ${{ steps.codechecker.outputs.warnings-in-diff == 'true' }} - shell: bash - run: | - echo "::error title=Static Analyzers Failed::Analysed commit(s) caused static analysis warnings" - exit 0 + - name: CodeChecker Pass Or Fail? + if: matrix.cc == 'clang' && ${{ steps.codechecker.outputs.warnings-in-diff == 'true' }} + shell: bash + run: | + echo "::error title=Static Analyzers Failed::Analysed commit(s) caused static analysis warnings" + exit 0 From f9f5348579c33979c5e56f94be870e90b5a48c35 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 18 Feb 2024 19:25:20 -0800 Subject: [PATCH 06/30] ci(pre-commit): add actionlint add a pre-commit hook to validate that all github actions in this repository are properly formatted and don't have common gh action typos/bugs. --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 25b984c35..48bf8d700 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,3 +17,7 @@ repos: rev: 0.2.3 hooks: - id: yamlfmt + - repo: https://github.com/rhysd/actionlint + rev: v1.7.0 + hooks: + - id: actionlint From 6ded8d12f3f2802977cfc3d4ce3ae1c598fbabcd Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 21:49:28 -0700 Subject: [PATCH 07/30] fix: take actionlint fixes + protect against quoting issues in shell expansion. + Fix logic error with unnecessary ${{}} usage. > This step was not conditionally executed prior to this commit. > > "[the condition] is always evaluated to true because extra characters > > are around ${{ }}" See https://github.com/rhysd/actionlint/blob/main/docs/checks.md for details. --- .github/workflows/distcheck.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/distcheck.yaml b/.github/workflows/distcheck.yaml index 8d3800e1c..7d8452e14 100644 --- a/.github/workflows/distcheck.yaml +++ b/.github/workflows/distcheck.yaml @@ -52,13 +52,13 @@ jobs: git clone --depth 1 https://github.com/ofiwg/libfabric.git pushd libfabric ./autogen.sh - ./configure --prefix=$PWD/install \ + ./configure --prefix="$PWD/install" \ --disable-sockets \ --disable-udp \ --disable-mrail \ --disable-opx \ - CC=${{ matrix.cc }} - make -j $(nproc) + CC="${{ matrix.cc }}" + make -j "$(nproc)" make install popd @@ -69,19 +69,19 @@ jobs: # actions/checkout@v4 would drop the plugin source in $PWD, # so go ahead and build it. ./autogen.sh - if [ ${{ matrix.sdk }} == "cuda" ] + if [ "${{ matrix.sdk }}" == "cuda" ] then - ./configure --with-libfabric=$PWD/libfabric/install \ + ./configure --with-libfabric="$PWD/libfabric/install" \ --with-cuda=/usr/local/cuda/ \ --enable-platform-aws \ - CC=${{ matrix.cc }} + CC="${{ matrix.cc }}" else - ./configure --with-libfabric=$PWD/libfabric/install \ + ./configure --with-libfabric="$PWD/libfabric/install" \ --enable-neuron \ --enable-platform-aws \ - CC=${{ matrix.cc }} + CC="${{ matrix.cc }}" fi - make -j $(nproc) + make -j "$(nproc)" - name: Run Dist Check run: make distcheck @@ -115,7 +115,7 @@ jobs: path: ${{ steps.codechecker.outputs.result-html-dir }}/*.html - name: CodeChecker Pass Or Fail? - if: matrix.cc == 'clang' && ${{ steps.codechecker.outputs.warnings-in-diff == 'true' }} + if: matrix.cc == 'clang' && steps.codechecker.outputs.warnings-in-diff == 'true' shell: bash run: | echo "::error title=Static Analyzers Failed::Analysed commit(s) caused static analysis warnings" From 0833b3894611fb5482f0b0958c7cb4d1a2483f55 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 18 Feb 2024 19:09:01 -0800 Subject: [PATCH 08/30] ci(pre-commit): introduce conventional commit hook This introduces a hook to encourage conventional commits [1] in this repository. Usage instructions, and expected types/scopes are added to the project README. Note that this does not pass the --strict flag or the --force-scope flag, so in practice this is toothless for now (we can reevaluate later) [1]: https://www.conventionalcommits.org --- .pre-commit-config.yaml | 6 ++++++ CONTRIBUTING.md | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 48bf8d700..c088a482a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -21,3 +21,9 @@ repos: rev: v1.7.0 hooks: - id: actionlint + - repo: https://github.com/compilerla/conventional-pre-commit + rev: v3.2.0 + hooks: + - id: conventional-pre-commit + stages: [commit-msg] + args: [misc, build, ci, docs, feat, fix, perf, refactor, style, test, nit] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c3b5514fd..7de82c9c3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,7 +33,11 @@ To send us a pull request, please: 2. Install [pre-commit](https://pre-commit.com/) through your preferred mechanism (eg: `pip install`), then run `pre-commit install` **inside this repo**. This will ensure that a collection of [git hooks](./.pre-commit-config.yaml) run on each of your commits, which help minimize your diffs, catch errors automatically, and avoid nitpicking. 3. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 4. Ensure local tests pass. -5. Commit to your fork using clear commit messages. +5. Commit to your fork using a [conventional commit message](https://www.conventionalcommits.org/en/v1.0.0/). + This repository encourages the following conventional commit message tags:\ + `misc, build, ci, docs, feat, fix, perf, refactor, style, test, nit`\ + However, there are no constraints on acceptable scopes or tags. See `git log` + for real world examples. 6. Send us a pull request, answering any default questions in the pull request interface. 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. From e640f436b06851803a3d3971fc4b03bc1d8ba20a Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 18 Feb 2024 18:53:08 -0800 Subject: [PATCH 09/30] ci(pre-commit): add eof/whitespace checks enable pre-commit checks which trim trailing whitespace in all files, and enforces that all files either are empty, or end with one newline. --- .pre-commit-config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c088a482a..9e871890f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,6 +8,8 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 hooks: + - id: trailing-whitespace + - id: end-of-file-fixer - id: check-merge-conflict - id: detect-private-key - id: mixed-line-ending From a50ebf1b5e1e12d7788a1a22a53f0d7e8c77b946 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 20:51:29 -0700 Subject: [PATCH 10/30] nit: apply eof/trailing whitespace fixes Fix EOLs and EOFs via 'pre-commit run --all`, given new check introduced in previous commit. This is split such that it can be specified in a git-blame-ignore-revs file and filtered in git blame. --- CODE_OF_CONDUCT.md | 4 ++-- CONTRIBUTING.md | 14 +++++++------- NOTICE | 2 +- include/nccl_ofi.h | 4 ++-- include/nccl_ofi_sendrecv.h | 2 +- include/nccl_ofi_topo.h | 36 ++++++++++++++++++------------------ src/nccl_ofi_api.c | 2 +- src/nccl_ofi_cuda.c | 3 +-- src/nccl_ofi_deque.c | 2 +- src/nccl_ofi_rdma.c | 16 ++++++++-------- src/nccl_ofi_topo.c | 4 ++-- src/platform-aws.c | 4 ++-- src/tuner/nccl_ofi_tuner.c | 4 ++-- 13 files changed, 48 insertions(+), 49 deletions(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 3b6446687..5b627cfa6 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,4 +1,4 @@ ## Code of Conduct -This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7de82c9c3..7584e29d2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,9 +1,9 @@ # Contributing Guidelines -Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional +Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional documentation, we greatly value feedback and contributions from our community. -Please read through this document before submitting any issues or pull requests to ensure we have all the necessary +Please read through this document before submitting any issues or pull requests to ensure we have all the necessary information to effectively respond to your bug report or contribution. @@ -11,7 +11,7 @@ information to effectively respond to your bug report or contribution. We welcome you to use the GitHub issue tracker to report bugs or suggest features. -When filing an issue, please check [existing open](https://github.com/aws/aws-ofi-nccl/issues), or [recently closed](https://github.com/aws/aws-ofi-nccl/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already +When filing an issue, please check [existing open](https://github.com/aws/aws-ofi-nccl/issues), or [recently closed](https://github.com/aws/aws-ofi-nccl/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: * A reproducible test case or series of steps @@ -41,17 +41,17 @@ To send us a pull request, please: 6. Send us a pull request, answering any default questions in the pull request interface. 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. -GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and +GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). ## Finding contributions to work on -Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-ofi-nccl/labels/help%20wanted) issues is a great place to start. +Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-ofi-nccl/labels/help%20wanted) issues is a great place to start. ## Code of Conduct -This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/NOTICE b/NOTICE index 098829c94..789c00a23 100644 --- a/NOTICE +++ b/NOTICE @@ -1,2 +1,2 @@ AWS Ofi Nccl -Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/include/nccl_ofi.h b/include/nccl_ofi.h index b6d88af92..58abfb902 100644 --- a/include/nccl_ofi.h +++ b/include/nccl_ofi.h @@ -274,8 +274,8 @@ struct nccl_net_ofi_device { * nccl_ofi_device. Create if it does not exist. Store * in pthread key. Increase reference counter. Must be * protected by lock stored in device. - * - * During the plugin initialization, this function will be + * + * During the plugin initialization, this function will be * called once per process using one of the instantiated device structs * to create and configure the endpoint of the initializing thread. */ diff --git a/include/nccl_ofi_sendrecv.h b/include/nccl_ofi_sendrecv.h index d2366f15b..6b852ecbe 100644 --- a/include/nccl_ofi_sendrecv.h +++ b/include/nccl_ofi_sendrecv.h @@ -193,7 +193,7 @@ typedef struct nccl_net_ofi_sendrecv_device { /* Memory registration key pool */ nccl_ofi_idpool_t key_pool; } nccl_net_ofi_sendrecv_device_t; - + typedef struct nccl_net_ofi_sendrecv_req { nccl_net_ofi_req_t base; diff --git a/include/nccl_ofi_topo.h b/include/nccl_ofi_topo.h index 28525f4bd..494486f50 100644 --- a/include/nccl_ofi_topo.h +++ b/include/nccl_ofi_topo.h @@ -141,7 +141,7 @@ void nccl_ofi_topo_free(nccl_ofi_topo_t *topo); * groups with the exception that the first groups get assigned an * additional member. * - * + * * The example below shows a schematic representation of the topology * while executing the grouping algorithm. The topology shows one * bridge, two nodes that correspond to a Nvidia GPUs and four nodes @@ -149,33 +149,33 @@ void nccl_ofi_topo_free(nccl_ofi_topo_t *topo); * Note that the schematic view below is a simplification of an actual * hardware topology. It is also only a selective view of a * hypothetical hardware topology. - * + * * Legend: * Topology nodes: * X:YZ * | - * + * * Topology node type X: * B: Bridge * G: GPU * N: Node corresponding to a NIC - * + * * num_groups Y: * Number indicating the group count - * + * * Mark Z: * u: Unmarked node * m: Marked node - * + * * Pointer "|": * The symbol "|" indicates the libfabric NIC info list pointer stored in * the user data of the topology node. The list pointer may be NULL (no * list) or stores a list of info objects (NIC0, NIC1, NIC2, NIC3, NIC4). - * - * + * + * * Initial state after NCCL OFI topology has been created via function * nccl_ofi_topo_create(). - * + * * / * B:0u * / / | | \ \ @@ -184,10 +184,10 @@ void nccl_ofi_topo_free(nccl_ofi_topo_t *topo); * G:0u G:0u N:0u N:0u N:0u N:0u * | | | | | | * NIC0 NIC1 NIC2 NIC3 - * + * * State after step 1. Topology nodes with libfabric NIC info structs and * the nodes' ancestors are marked. - * + * * / * B:0m * / / | | \ \ @@ -196,10 +196,10 @@ void nccl_ofi_topo_free(nccl_ofi_topo_t *topo); * G:0u G:0u N:0m N:0m N:0m N:0m * | | | | | | * NIC0 NIC1 NIC2 NIC3 - * + * * State after step 2. 'num_groups' of bridge node (closest marked node of * GPU topology nodes) is increased to two. - * + * * / * B:2m * / / | | \ \ @@ -208,10 +208,10 @@ void nccl_ofi_topo_free(nccl_ofi_topo_t *topo); * G:0u G:0u N:0m N:0m N:0m N:0m * | | | | | | * NIC0 NIC1 NIC2 NIC3 - * + * * State after step 3. The libfabric NIC info structs are lifted up to * bridge node (node with 'num_groups' larger then zero). - * + * * NIC0-NIC1-NIC2-NIC3 * / * B:2m @@ -220,11 +220,11 @@ void nccl_ofi_topo_free(nccl_ofi_topo_t *topo); * / / | | \ \ * G:0u G:0u N:0m N:0m N:0m N:0m * | | | | | | - * + * * State after step 4. List of libfabric NIC info structs is split into * two groups. Each group is attached to the topology node of the head of * the list. - * + * * / * B:0m * / / | | \ \ @@ -235,7 +235,7 @@ void nccl_ofi_topo_free(nccl_ofi_topo_t *topo); * NIC0 NIC2 * | | * NIC1 NIC3 - * + * * @param topo * The NCCL OFI topology. * diff --git a/src/nccl_ofi_api.c b/src/nccl_ofi_api.c index 0faebe81e..207e434d2 100644 --- a/src/nccl_ofi_api.c +++ b/src/nccl_ofi_api.c @@ -212,7 +212,7 @@ ncclResult_t nccl_net_ofi_listen_v4(int dev, void* handle, void** listenComm) /* * @brief Non-blocking connect which returns sComm as NULL - * with an expectation that it will be called again until + * with an expectation that it will be called again until * sComm != NULL * * The callee obtains one endpoint handle via the device's get_ep() diff --git a/src/nccl_ofi_cuda.c b/src/nccl_ofi_cuda.c index 8e0523b34..ddfb60316 100644 --- a/src/nccl_ofi_cuda.c +++ b/src/nccl_ofi_cuda.c @@ -74,7 +74,7 @@ int nccl_net_ofi_get_cuda_device(void *data, int *dev_id) CUresult cuda_ret_mem = nccl_net_ofi_cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) data); - CUresult cuda_ret_dev = nccl_net_ofi_cuPointerGetAttribute(&device_ordinal, + CUresult cuda_ret_dev = nccl_net_ofi_cuPointerGetAttribute(&device_ordinal, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr) data); @@ -95,4 +95,3 @@ int nccl_net_ofi_get_cuda_device(void *data, int *dev_id) *dev_id = cuda_device; return ret; } - diff --git a/src/nccl_ofi_deque.c b/src/nccl_ofi_deque.c index 494f65ac3..9cef2a1b5 100644 --- a/src/nccl_ofi_deque.c +++ b/src/nccl_ofi_deque.c @@ -47,4 +47,4 @@ int nccl_ofi_deque_finalize(nccl_ofi_deque_t *deque) free(deque); return 0; -} \ No newline at end of file +} diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c index 5b5ab3924..2ac00f1e0 100644 --- a/src/nccl_ofi_rdma.c +++ b/src/nccl_ofi_rdma.c @@ -436,7 +436,7 @@ static int write_topo_file(nccl_ofi_topo_t *topo) * @return Populated I/O vector, on success * @return 0 on success * non-zero on error - */ + */ static int set_mr_req_attr(nccl_ofi_idpool_t *key_pool, int dev_id, void *data, size_t size, int type, struct fi_mr_attr *mr_attr, struct iovec *iov) @@ -452,7 +452,7 @@ static int set_mr_req_attr(nccl_ofi_idpool_t *key_pool, int dev_id, mr_attr->iov_count = 1; mr_attr->access = FI_SEND | FI_RECV; - /* Add FI_WRITE (source of fi_write) and FI_REMOTE_WRITE (target of fi_write) + /* Add FI_WRITE (source of fi_write) and FI_REMOTE_WRITE (target of fi_write) for RDMA send/recv buffers */ mr_attr->access |= (FI_WRITE | FI_REMOTE_WRITE); @@ -819,7 +819,7 @@ static inline int inc_recv_seg_completion(nccl_net_ofi_rdma_req_t *req, assert(req->type == NCCL_OFI_RDMA_RECV_SEGMS); int ret = 0; bool segms_received; - + ret = pthread_mutex_lock(&req->req_lock); if (OFI_UNLIKELY(ret)) { NCCL_OFI_WARN("Unable to acquire req_lock mutex"); @@ -834,7 +834,7 @@ static inline int inc_recv_seg_completion(nccl_net_ofi_rdma_req_t *req, /* The arrival of the last segment is treated as a single * request completion of the parent request */ segms_received = req->ncompls == total_nsegms; - + /* Mark receive segments request and receive request as completed */ if (segms_received) { rdma_req_recv_segms_data_t *recv_segms_data = get_recv_segms_data(req); @@ -853,7 +853,7 @@ static inline int inc_recv_seg_completion(nccl_net_ofi_rdma_req_t *req, NCCL_OFI_WARN("Failed to unlock req_lock mutex"); return -ret; } - + /* Add completion to parent request */ ret = inc_req_completion(recv_req, req->size, recv_data->total_num_compls); } else { @@ -1277,7 +1277,7 @@ static inline int handle_bounce_recv(nccl_net_ofi_rdma_ep_t *ep, int rail_id, st /** * @brief Get request associated with RDMA write immediate data - * + * * @param ep, to look up r_comm from ID encoded in data * @param data, the immediate data */ @@ -1766,7 +1766,7 @@ static inline int free_base_req(uint64_t *num_inflight_reqs, bool dec_inflight_reqs) { int ret = 0; - + if (OFI_UNLIKELY(req == NULL)) { ret = -EINVAL; NCCL_OFI_WARN("Provided null request for cleanup"); @@ -5981,7 +5981,7 @@ int nccl_net_ofi_rdma_init(const char *provider_filter, ret = -EINVAL; goto error; } - + /* Allocate device */ nccl_net_ofi_rdma_device_t *device = calloc(1, sizeof(nccl_net_ofi_rdma_device_t)); if (!device) { diff --git a/src/nccl_ofi_topo.c b/src/nccl_ofi_topo.c index 4b46fed3d..8777e34dc 100644 --- a/src/nccl_ofi_topo.c +++ b/src/nccl_ofi_topo.c @@ -1010,7 +1010,7 @@ static hwloc_obj_t get_numa_mem_child(hwloc_obj_t node) return child; } -/* +/* * @brief Return PCI device property of PCI device * * This function reads first `MAX_DEV_PROPERTY_LENGTH` characters from @@ -1031,7 +1031,7 @@ static hwloc_obj_t get_numa_mem_child(hwloc_obj_t node) * File name of the device property * @return Pointer to an element of a char array to write device property to. * The array has to be allocated by the caller of this function. - * There must be space for at least `MAX_DEV_PROPERTY_LENGTH` + * There must be space for at least `MAX_DEV_PROPERTY_LENGTH` * characters in addition to the delimiting `\0`. * @return 0, on sucess * non-zero, on error diff --git a/src/platform-aws.c b/src/platform-aws.c index d831db454..07a4b5c8a 100644 --- a/src/platform-aws.c +++ b/src/platform-aws.c @@ -235,7 +235,7 @@ static int validate_rdma_write(struct fid_ep *ep) ret = -EINVAL; goto exit; } - NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Get endpoint option FI_OPT_EFA_EMULATED_WRITE. optval: %d", + NCCL_OFI_TRACE(NCCL_INIT | NCCL_NET, "Get endpoint option FI_OPT_EFA_EMULATED_WRITE. optval: %d", optval); #else NCCL_OFI_WARN("FI_OPT_EFA_EMULATED_WRITE not declared when the communication protocol is RDMA write."); @@ -487,7 +487,7 @@ int platform_init(const char **provider_filter) #endif /* - * Update topology if platform topology is available and + * Update topology if platform topology is available and * environment variable NCCL_TOPO_FILE is not set. */ if (getenv("NCCL_TOPO_FILE")) { diff --git a/src/tuner/nccl_ofi_tuner.c b/src/tuner/nccl_ofi_tuner.c index e1c6baac8..7b0293899 100644 --- a/src/tuner/nccl_ofi_tuner.c +++ b/src/tuner/nccl_ofi_tuner.c @@ -24,7 +24,7 @@ ncclResult_t nccl_ofi_tuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t /* * The tuner API is missing a mechanism to pass around context after * initialization. For now, init a plugin-lobal context once. - */ + */ pthread_mutex_lock(&nccl_ofi_tuner_ctx_lock); nccl_ofi_tuner_ctx = calloc(1, sizeof(struct nccl_ofi_tuner_context)); if (!nccl_ofi_tuner_ctx) { @@ -63,7 +63,7 @@ ncclResult_t nccl_ofi_tuner_get_coll_info(void *context, ncclFunc_t collType, si /* * Ideally, this should just be a lookup and not be in-flight math * We do not want divs in the hot path, but working with the API we've - * got now. + * got now. */ for (algo = 0; algo < NCCL_NUM_ALGORITHMS; algo++) { /* No CollNet on AWS today */ From afcee3e25a0762ac1f4820591845298a0c23a317 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 18 Feb 2024 19:07:22 -0800 Subject: [PATCH 11/30] ci(pre-commit): introduce shellcheck This check runs shellcheck over all .sh files. --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9e871890f..5a42b505d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,3 +29,7 @@ repos: - id: conventional-pre-commit stages: [commit-msg] args: [misc, build, ci, docs, feat, fix, perf, refactor, style, test, nit] + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck From 9ee8973176285988c5b4813832640e106e0fe806 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 21:52:05 -0700 Subject: [PATCH 12/30] fix: take shellcheck fixes on autogen.sh Put the shebang at the top, use /usr/bin/env sh, split -x onto its own line. --- autogen.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autogen.sh b/autogen.sh index 56bed122c..9a365022b 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1,9 +1,9 @@ +#!/usr/bin/env sh # # Copyright (c) 2018-2019, Amazon.com, Inc. or its affiliates. All rights reserved. # # See LICENSE.txt for license information # -#! /bin/sh -x - +set -x autoreconf -ivf From 3f6d6b0d6dc9adefd44000431f1bfa29f8728e7f Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Wed, 21 Feb 2024 12:27:32 -0800 Subject: [PATCH 13/30] ci(pre-commit): add mdformat mdformat is a tool that ensures consistent markdown style and ensures that content is legal. It also applies some fixes that can help minimize markdown diffs, eg: to avoid explicit numbering (which are ignored by renderers) so that list items can be added without manually incrementing all preceding elements. Add this to pre-commit. Fixes are applied in a follow up commit for the sake of .git-blame-ignore-revs. --- .pre-commit-config.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5a42b505d..fbb158e69 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,6 +23,18 @@ repos: rev: v1.7.0 hooks: - id: actionlint + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.17 + hooks: + - id: mdformat + additional_dependencies: + - mdformat-gfm + - mdformat-black + - mdformat-web + - mdformat-toc + - mdformat-tables + - mdformat-footnote + - mdformat-simple-breaks - repo: https://github.com/compilerla/conventional-pre-commit rev: v3.2.0 hooks: From f9cbf3b8914db0a8744456441cb57443388b7dfe Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 21:52:51 -0700 Subject: [PATCH 14/30] fix: take mdformat fixes remove some unnecessary blank lines, prefer lists which can be inserted into without creating diffs for preceding elements. --- .github/PULL_REQUEST_TEMPLATE.md | 1 - CODE_OF_CONDUCT.md | 1 + CONTRIBUTING.md | 42 ++++++++++++++++---------------- INSTALL.md | 3 +-- README.md | 4 +-- doc/efa-env-var.md | 2 +- doc/tracing.md | 21 ++++++++-------- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ab40d21d7..df92f3a44 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,5 +2,4 @@ *Description of changes:* - By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 5b627cfa6..ec98f2b76 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,4 +1,5 @@ ## Code of Conduct + This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7584e29d2..b625562c6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,7 +6,6 @@ documentation, we greatly value feedback and contributions from our community. Please read through this document before submitting any issues or pull requests to ensure we have all the necessary information to effectively respond to your bug report or contribution. - ## Reporting Bugs/Feature Requests We welcome you to use the GitHub issue tracker to report bugs or suggest features. @@ -14,48 +13,49 @@ We welcome you to use the GitHub issue tracker to report bugs or suggest feature When filing an issue, please check [existing open](https://github.com/aws/aws-ofi-nccl/issues), or [recently closed](https://github.com/aws/aws-ofi-nccl/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: -* A reproducible test case or series of steps -* The version of our code being used -* Any modifications you've made relevant to the bug -* Anything unusual about your environment or deployment - +- A reproducible test case or series of steps +- The version of our code being used +- Any modifications you've made relevant to the bug +- Anything unusual about your environment or deployment ## Contributing via Pull Requests + Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 1. You are working against the latest source on the *master* branch. -2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. -3. You open an issue to discuss any significant work - we would hate for your time to be wasted. +1. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. +1. You open an issue to discuss any significant work - we would hate for your time to be wasted. To send us a pull request, please: 1. Fork the repository. -2. Install [pre-commit](https://pre-commit.com/) through your preferred mechanism (eg: `pip install`), then run `pre-commit install` **inside this repo**. This will ensure that a collection of [git hooks](./.pre-commit-config.yaml) run on each of your commits, which help minimize your diffs, catch errors automatically, and avoid nitpicking. -3. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. -4. Ensure local tests pass. -5. Commit to your fork using a [conventional commit message](https://www.conventionalcommits.org/en/v1.0.0/). - This repository encourages the following conventional commit message tags:\ - `misc, build, ci, docs, feat, fix, perf, refactor, style, test, nit`\ - However, there are no constraints on acceptable scopes or tags. See `git log` - for real world examples. -6. Send us a pull request, answering any default questions in the pull request interface. -7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. +1. Install [pre-commit](https://pre-commit.com/) through your preferred mechanism (eg: `pip install`), then run `pre-commit install` **inside this repo**. This will ensure that a collection of [git hooks](./.pre-commit-config.yaml) run on each of your commits, which help minimize your diffs, catch errors automatically, and avoid nitpicking. +1. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. +1. Ensure local tests pass. +1. Commit to your fork using a [conventional commit message](https://www.conventionalcommits.org/en/v1.0.0/). + This repository uses the following conventional commit message tags:\ + `misc, build, ci, docs, feat, fix, perf, refactor, style, test`\ + and our pre-commit checks enforce that a scope must be provided. For example:\ + `tag(scope): short change description`\ + There are no constraints on acceptable scopes. See `git log` for real world examples. +1. Send us a pull request, answering any default questions in the pull request interface. +1. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). - ## Finding contributions to work on -Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-ofi-nccl/labels/help%20wanted) issues is a great place to start. +Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-ofi-nccl/labels/help%20wanted) issues is a great place to start. ## Code of Conduct + This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact opensource-codeofconduct@amazon.com with any additional questions or comments. - ## Security issue notifications + If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. We run [CodeChecker](https://codechecker.readthedocs.io/en/latest/) (LLVM/Clang Static Analysis) on every pull request. diff --git a/INSTALL.md b/INSTALL.md index 1b3287fa3..7948c1818 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -60,8 +60,7 @@ the following config option: In case plugin is configured with `--enable-asan` and the executable binary is not compiled and linked with ASAN support, it is required to -preload the ASAN library, i.e., run the application with `export -LD_PRELOAD=`. +preload the ASAN library, i.e., run the application with `export LD_PRELOAD=`. In case plugin is configured with `--enable-asan` and the plugin is run within a CUDA application, environment variable `ASAN_OPTIONS` diff --git a/README.md b/README.md index 62f0641bc..b3207538b 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,8 @@ reports if that is the case. The plugin is regularly tested on the following operating systems: -* Amazon Linux 2 -* Ubuntu 20.04 LTS and 22.04 LTS +- Amazon Linux 2 +- Ubuntu 20.04 LTS and 22.04 LTS Other operating systems are likely to work; there is very little distribution-specific code in the plugin. diff --git a/doc/efa-env-var.md b/doc/efa-env-var.md index cf75034e2..5e6e7f0a1 100644 --- a/doc/efa-env-var.md +++ b/doc/efa-env-var.md @@ -97,7 +97,7 @@ support both EFA and NVIDIA GPUDirect Remote Direct Memory Access (RDMA). Actions - + p5.48xlarge • cuda>=12.0
diff --git a/doc/tracing.md b/doc/tracing.md index bca3762bd..b1ca549d4 100644 --- a/doc/tracing.md +++ b/doc/tracing.md @@ -7,21 +7,22 @@ The plugin request address (req) is used to associate requests between NCCL call The NCCL request address (request) is used to associate calls from NCCL to the plugin. To trace aws-ofi-nccl using LTTNG Tracing: + 1. Install the LTTNG libraries and dependencies as documented at https://lttng.org/docs/v2.13/#doc-building-from-source -2. Configure aws-ofi-nccl as normal, but with the addition of the --with-lttng= flag to ./configure. Use as +1. Configure aws-ofi-nccl as normal, but with the addition of the --with-lttng= flag to ./configure. Use as the location you chose in the previous step for LTTNG. For example, --with-lttng=/usr/local -3. Start the LTTNG session daemon. +1. Start the LTTNG session daemon. lttng-sessiond --daemonize -4. Enable up to 1 Gigabyte of tracing in a memory ring as a single tracing channel. +1. Enable up to 1 Gigabyte of tracing in a memory ring as a single tracing channel. lttng enable-channel --userspace --overwrite --subbuf-size=1M --num-subbuf=1024 efa -5. Enable all tracepoints on the channel you configured. +1. Enable all tracepoints on the channel you configured. lttng enable-event --userspace -a --channel=efa -6. Start tracing +1. Start tracing lttng start -7. Run your test. Traces should be recorded. -8. Destroy the tracing that was enabled. +1. Run your test. Traces should be recorded. +1. Destroy the tracing that was enabled. lttng destroy -9. To read and print the traces LTTNG recorded, install the babelfish2 utility. -10. Print the traces. - babelfish2 ~/lttng-traces +1. To read and print the traces LTTNG recorded, install the babelfish2 utility. +1. Print the traces. + babelfish2 ~/lttng-traces From f34a7aec9957dab11bad06799c997b108f38430c Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 15:50:07 -0700 Subject: [PATCH 15/30] ci(pre-commit): add include-what-you-use include-what-you-use is a tool which almost entirely avoid traditional header-hell: "for every symbol (type, function variable, or macro) that you use in foo.c, either foo.c or foo.h should include a .h file that exports the declaration of that symbol". In other words, any file transiently including a symbol is made to directly include it after running the tool. It also checks whether the include can be replaced with forward declarations, in which case, the dependency can be removed entirely. This is more commonly used in c++ codebases for the sake of reducing compile times, but it also has other benefits such as making refactoring easier and makes it more clear at a glance what files depend on what other files. This relies on compile_commands.json/libtooling and only runs when manually called (for now) as we don't have a portable way to create that, but this hook works if you manually add compiler flags and library paths to the hook args list. --- .iwyu.imp | 5 +++++ .pre-commit-config.yaml | 11 +++++++++++ 2 files changed, 16 insertions(+) create mode 100644 .iwyu.imp diff --git a/.iwyu.imp b/.iwyu.imp new file mode 100644 index 000000000..46c19ca70 --- /dev/null +++ b/.iwyu.imp @@ -0,0 +1,5 @@ +[ + { "symbol": ["ENOMEM", "private", "", "public"] }, + { "symbol": ["EINVAL", "private", "", "public"] }, + { "symbol": ["memset", "private", "", "public"] } +] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fbb158e69..b9e2e4746 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -45,3 +45,14 @@ repos: rev: v0.10.0.1 hooks: - id: shellcheck + - repo: https://github.com/pocc/pre-commit-hooks + rev: v1.3.5 + hooks: + # xxx: difficult to use right now, need to manually supply include + # args to make dependencies resolve. Need compile_commands.json in our + # build system longterm. + - id: include-what-you-use + stages: [manual] + files: (src/.*[.]c|include/.*[.]h) + exclude: (include/.*memcheck.*[.]h|include/nccl-headers.*|src/nccl_ofi_interface_neuron.c) + args: [-Xiwyu, --mapping_file=.iwyu.imp] From 0e2d51dbd070df17cc5c3d1966a7c1f9e81b3a99 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 23:38:04 -0700 Subject: [PATCH 16/30] build: take iwyu fixes The previous commits iwyu hook was made to work locally, and when run, added the following missing includes. This will not, unfortunately, be enforced going forward by pre-commit until the build can export compile_commands.json consistently. --- include/nccl_ofi_api.h | 2 ++ include/nccl_ofi_deque.h | 4 ++++ include/nccl_ofi_freelist.h | 4 +++- include/nccl_ofi_memcheck_nop.h | 3 +++ include/nccl_ofi_ofiutils.h | 2 ++ src/nccl_ofi_deque.c | 5 ++++- src/nccl_ofi_idpool.c | 7 +++++++ src/nccl_ofi_msgbuff.c | 4 ++++ 8 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/nccl_ofi_api.h b/include/nccl_ofi_api.h index a7bd41372..2ed8a8a27 100644 --- a/include/nccl_ofi_api.h +++ b/include/nccl_ofi_api.h @@ -6,6 +6,8 @@ #ifndef NET_OFI_API_H_ #define NET_OFI_API_H_ +#include + #include "nccl-headers/net.h" #include "nccl-headers/error.h" diff --git a/include/nccl_ofi_deque.h b/include/nccl_ofi_deque.h index 48f8ca6cc..b42004986 100644 --- a/include/nccl_ofi_deque.h +++ b/include/nccl_ofi_deque.h @@ -10,9 +10,13 @@ extern "C" { #endif #include +#include +#include #include #include +#include "nccl_ofi_log.h" + /* * Internal: deque element structure * diff --git a/include/nccl_ofi_freelist.h b/include/nccl_ofi_freelist.h index 52f0c45b8..7527aa969 100644 --- a/include/nccl_ofi_freelist.h +++ b/include/nccl_ofi_freelist.h @@ -10,8 +10,10 @@ extern "C" { #endif #include -#include #include +#include +#include +#include #include "nccl_ofi_log.h" #include "nccl_ofi_memcheck.h" diff --git a/include/nccl_ofi_memcheck_nop.h b/include/nccl_ofi_memcheck_nop.h index 9c15a8ff1..dcb621c30 100644 --- a/include/nccl_ofi_memcheck_nop.h +++ b/include/nccl_ofi_memcheck_nop.h @@ -9,6 +9,9 @@ extern "C" { #endif +#include +#include + /** * @file This module provides empty implementations of the interface * for providing hints about the expected state of a memory region. diff --git a/include/nccl_ofi_ofiutils.h b/include/nccl_ofi_ofiutils.h index 37a09b854..177c8be35 100644 --- a/include/nccl_ofi_ofiutils.h +++ b/include/nccl_ofi_ofiutils.h @@ -9,6 +9,8 @@ extern "C" { #endif +#include + int nccl_ofi_ofiutils_get_providers(const char *prov_include, int required_version, struct fi_info *hints, diff --git a/src/nccl_ofi_deque.c b/src/nccl_ofi_deque.c index 9cef2a1b5..7a7baa3c2 100644 --- a/src/nccl_ofi_deque.c +++ b/src/nccl_ofi_deque.c @@ -4,9 +4,12 @@ #include +#include +#include +#include -#include "nccl_ofi.h" #include "nccl_ofi_deque.h" +#include "nccl_ofi_log.h" int nccl_ofi_deque_init(nccl_ofi_deque_t **deque_p) { diff --git a/src/nccl_ofi_idpool.c b/src/nccl_ofi_idpool.c index 36bb52edb..b74362979 100644 --- a/src/nccl_ofi_idpool.c +++ b/src/nccl_ofi_idpool.c @@ -2,9 +2,16 @@ * Copyright (c) 2024 Amazon.com, Inc. or its affiliates. All rights reserved. */ +#include +#include +#include +#include +#include +#include #include "nccl_ofi.h" #include "nccl_ofi_idpool.h" +#include "nccl_ofi_log.h" #include "nccl_ofi_math.h" /* diff --git a/src/nccl_ofi_msgbuff.c b/src/nccl_ofi_msgbuff.c index d3e8653f0..93965c04d 100644 --- a/src/nccl_ofi_msgbuff.c +++ b/src/nccl_ofi_msgbuff.c @@ -3,6 +3,10 @@ */ +#include +#include +#include +#include #include #include From c39432ea07ff0a7a6c238d0ea425a40a87ddf61f Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sat, 24 Feb 2024 18:44:33 -0800 Subject: [PATCH 17/30] ci(clang-tidy): add basic clang-tidy config file. Exclude all unit tests from reporting, and select a handful of checks that are relevant to this project. --- .clang-tidy | 53 +++++++++++++++++++++++++++++++++++++++++++++++ tests/.clang-tidy | 2 ++ 2 files changed, 55 insertions(+) create mode 100644 .clang-tidy create mode 100644 tests/.clang-tidy diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..f5fc6f63e --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,53 @@ +--- +Checks: + - -* + - cert-dcl16-c + - optin-mpi-mpi-checker-c + - optin.core.EnumCastOutOfRange + - optin-performance-padding + - clang-analyzer-core* + - clang-analyzer-nullability* + - clang-analyzer-optin.core.EnumCastOutOfRange + - clang-analyzer-optin.performance.Padding + - clang-analyzer-security* + - clang-analyzer-unix* + - -clang-analyzer-security.insecureAPI* + - bugprone-* + - -bugprone-easily-swappable-parameters + - -bugprone-assignment-in-if-condition + - -bugprone-multi-level-implicit-pointer-conversion + - misc-const-correctness + - misc-static-assert + - misc-misplaced-const + - misc-redundant-expression + - misc-header-include-cycle + - misc-definitions-in-headers + #- misc-include-cleaner + - readability-avoid-const-params-in-decls + - readability-avoid-return-with-void-value + - readability-avoid-unconditional-preprocessor-if + - readability-braces-around-statements + - readability-const-return-type + - readability-const-return-type + - readability-duplicate-include + - readability-implicit-bool-conversion + - readability-inconsistent-declaration-parameter-name + - readability-misleading-indentation + - readability-misplaced-array-index + - readability-named-parameter + - readability-non-const-parameter + - readability-redundant-control-flow + - readability-redundant-declaration + - readability-redundant-inline-specifier + - readability-simplify-boolean-expr + - readability-simplify-subscript-expr + - readability-string-compare + - readability-suspicious-call-argument + - readability-uppercase-literal-suffix +WarningsAsErrors: true +FormatStyle: file +CheckOptions: + - key: readability-uppercase-literal-suffix.NewSuffixes + value: L;LL;LU;LLU + - key: misc-include-cleaner.IgnoreHeaders + value: include/nccl-headers/.* diff --git a/tests/.clang-tidy b/tests/.clang-tidy new file mode 100644 index 000000000..b6d2b0844 --- /dev/null +++ b/tests/.clang-tidy @@ -0,0 +1,2 @@ +--- +Checks: -* From 82f8d590f70c23f69692255c5133ca5beedddd30 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 23:11:14 -0700 Subject: [PATCH 18/30] ci(pre-commit): add clang tidy hook Add a pre-commit hook to run clang-tidy for files under src/ on every commit. --- .pre-commit-config.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b9e2e4746..478610f7f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,3 +56,10 @@ repos: files: (src/.*[.]c|include/.*[.]h) exclude: (include/.*memcheck.*[.]h|include/nccl-headers.*|src/nccl_ofi_interface_neuron.c) args: [-Xiwyu, --mapping_file=.iwyu.imp] + # This works partially without compile_commands.json, better than + # iwyu, but is disabled for the same reason. + - id: clang-tidy + files: (src/.*[.]c|include/.*[.]h) + stages: [manual] + exclude: (include/.*memcheck.*[.]h|include/nccl-headers.*|src/nccl_ofi_interface_neuron.c) + args: [--config-file=.clang-tidy] From a5b9a7e2cb4e0f07c3887e9d4e616146c3f7d3c7 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 23:12:38 -0700 Subject: [PATCH 19/30] fix: cast id to `signed long long' for ffsll __builtin_ffsll takes a signed long long, not a uint64_t. Explicitly cast to `signed long long' before passing it. Caught with clang tidy. --- src/nccl_ofi_idpool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nccl_ofi_idpool.c b/src/nccl_ofi_idpool.c index b74362979..c1bb92dfc 100644 --- a/src/nccl_ofi_idpool.c +++ b/src/nccl_ofi_idpool.c @@ -108,7 +108,7 @@ int nccl_ofi_idpool_allocate_id(nccl_ofi_idpool_t *idpool) int entry_index = 0; int id = -1; for (size_t i = 0; i < num_long_elements; i++) { - entry_index = __builtin_ffsll(idpool->ids[i]); + entry_index = __builtin_ffsll((signed long long)idpool->ids[i]); if (0 != entry_index) { /* Found one available ID */ From f39051a727cb77f299dca3a2876ff340dbc90839 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 12 May 2024 23:58:31 -0700 Subject: [PATCH 20/30] fix: implicit conversions: fix rr thres This is initialized with a signed integer, but assigned to a uint64_t. Do the initialization math in a uint64_t so that there is no conversion. Caught with clang tidy. --- include/nccl_ofi_param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nccl_ofi_param.h b/include/nccl_ofi_param.h index 023c52c45..0a0897aa5 100644 --- a/include/nccl_ofi_param.h +++ b/include/nccl_ofi_param.h @@ -186,7 +186,7 @@ OFI_NCCL_PARAM_INT(disable_gdr_required_check, "DISABLE_GDR_REQUIRED_CHECK", 0); /* * Maximum size of a message in bytes before message is multiplexed */ -OFI_NCCL_PARAM_INT(round_robin_threshold, "ROUND_ROBIN_THRESHOLD", (256 * 1024)); +OFI_NCCL_PARAM_INT(round_robin_threshold, "ROUND_ROBIN_THRESHOLD", (256UL * 1024UL)); /* * Minimum bounce buffers posted per endpoint. The plugin will attempt to post From e713ba53190997fc324d7b8c49968c4ed8f4d261 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 19 Feb 2024 00:13:31 -0800 Subject: [PATCH 21/30] ci(pre-commit): add editorconfig This adds a generic editorconfig [1] file for the repo to ensure that CRLF/LF and tabwidth settings are automatically applied when possible, and adds a pre-commit hook to ensure that it is coherent with the tree. [1]: https://editorconfig.org --- .ecrc | 19 +++++++++++++++++++ .editorconfig | 15 +++++++++++++++ .pre-commit-config.yaml | 6 ++++++ 3 files changed, 40 insertions(+) create mode 100644 .ecrc create mode 100644 .editorconfig diff --git a/.ecrc b/.ecrc new file mode 100644 index 000000000..dfaba73dc --- /dev/null +++ b/.ecrc @@ -0,0 +1,19 @@ +{ + "Version": "2.7.2", + "Verbose": false, + "Debug": false, + "IgnoreDefaults": false, + "SpacesAftertabs": true, + "NoColor": false, + "Exclude": [], + "AllowedContentTypes": [], + "PassedFiles": [], + "Disable": { + "EndOfLine": false, + "Indentation": false, + "InsertFinalNewline": false, + "TrimTrailingWhitespace": false, + "IndentSize": false, + "MaxLineLength": false + } +} diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..fb360d0e2 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,15 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true + +[*.md] +indent_style = space +indent_size = 1 + +[*.[ch]] +indent_style = tab +indent_size = 8 +trim_trailing_whitespace = true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 478610f7f..d3b9668c4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,3 +63,9 @@ repos: stages: [manual] exclude: (include/.*memcheck.*[.]h|include/nccl-headers.*|src/nccl_ofi_interface_neuron.c) args: [--config-file=.clang-tidy] + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python + rev: 2.7.3 + hooks: + - id: editorconfig-checker + stages: [manual] + alias: ec From 9bc5d2f3f74bd331230e8be60204675baf932e7b Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Wed, 21 Feb 2024 12:41:15 -0800 Subject: [PATCH 22/30] ci(github): run pre-commit on PRs rename the distcheck workflow to pr-ci. Add a second job to the file for pre-commit, that runs all non-manual pre-commit hooks. Add workflow_dispatch to the file so it can be triggered manually if desired. --- .github/workflows/{distcheck.yaml => pr-ci.yaml} | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) rename .github/workflows/{distcheck.yaml => pr-ci.yaml} (92%) diff --git a/.github/workflows/distcheck.yaml b/.github/workflows/pr-ci.yaml similarity index 92% rename from .github/workflows/distcheck.yaml rename to .github/workflows/pr-ci.yaml index 7d8452e14..8bb4aaeb3 100644 --- a/.github/workflows/distcheck.yaml +++ b/.github/workflows/pr-ci.yaml @@ -1,6 +1,6 @@ --- name: PR CI -on: [push, pull_request] +on: [push, pull_request, workflow_dispatch] env: APT_PACKAGES: >- build-essential @@ -10,8 +10,18 @@ env: libhwloc-dev make jobs: + pre-commit: + runs-on: ubuntu-latest + name: pre-commit formatting checks + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.9' + - uses: pre-commit/action@v3.0.1 distcheck: runs-on: ubuntu-22.04 + needs: pre-commit strategy: matrix: cc: From 36de2e45f31f044627c82dfd9899d9c639bd2f82 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 13 May 2024 00:15:39 -0700 Subject: [PATCH 23/30] misc: add .git-blame-ignore-revs --- .git-blame-ignore-revs | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..643e2219d --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,11 @@ +# build: take iwyu fixes +0e2d51dbd070df17cc5c3d1966a7c1f9e81b3a99 + +# fix: take mdformat fixes +f9cbf3b8914db0a8744456441cb57443388b7dfe + +# nit: apply eof/trailing whitespace fixes +a50ebf1b5e1e12d7788a1a22a53f0d7e8c77b946 + +# nit: apply yamlfmt fixes to tree +3d8a9d1ffed34da06581ec2bd971f03d41dcb6ac From 3e83cd4f61b1cd127710d02293922c63552d1cc9 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Thu, 25 Apr 2024 16:08:11 -0700 Subject: [PATCH 24/30] test: nit: null-initialize test member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fixes: `warning: ‘schedule’ may be used uninitialized [-Wmaybe-uninitialized]' Signed-off-by: Nicholas Sielicki --- tests/unit/scheduler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/scheduler.c b/tests/unit/scheduler.c index 998604892..5a5ae0230 100644 --- a/tests/unit/scheduler.c +++ b/tests/unit/scheduler.c @@ -72,7 +72,7 @@ int verify_schedule(nccl_net_ofi_schedule_t *schedule, nccl_net_ofi_schedule_t * int test_multiplexing_schedule() { - nccl_net_ofi_schedule_t *schedule; + nccl_net_ofi_schedule_t *schedule = NULL; nccl_net_ofi_schedule_t *ref_schedule = malloc(sizeof(nccl_net_ofi_schedule_t) + 3 * sizeof(nccl_net_ofi_xfer_info_t)); if (!ref_schedule) { From 29a2bf25dcb1d369b11dc0839ad869132b03a803 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 13 May 2024 01:20:46 -0700 Subject: [PATCH 25/30] ci(github): distcheck: use github cache use the gh actions cache for apt packages, rather than redownloading them every time. This should accelerate the build somewhat. --- .github/workflows/pr-ci.yaml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr-ci.yaml b/.github/workflows/pr-ci.yaml index 8bb4aaeb3..154c2966a 100644 --- a/.github/workflows/pr-ci.yaml +++ b/.github/workflows/pr-ci.yaml @@ -35,14 +35,19 @@ jobs: - uses: actions/checkout@v4 - name: Install Dependencies - run: | - sudo apt-get update -y - sudo apt-get install -y ${{ env.APT_PACKAGES }} + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: ${{ env.APT_PACKAGES }} + version: base-packages + - name: Install CUDA SDK if: matrix.sdk == 'cuda' - run: | - sudo apt-get install -y nvidia-cuda-toolkit - - name: Install Neuron SDK + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: nvidia-cuda-toolkit + version: cuda-packages + + - name: Configure Neuron SDK Repository if: matrix.sdk == 'neuron' run: | # Configure Linux for Neuron repository updates @@ -52,8 +57,12 @@ jobs: wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - sudo apt update -y - # Install Neuron Runtime - sudo apt-get install aws-neuronx-runtime-lib -y + - name: Install Neuron SDK + if: matrix.sdk == 'neuron' + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: aws-neuronx-runtime-lib + version: neuron-packages - name: Install Libfabric run: | From faed91bb98bf0c50ce9e6989de2ce6430333444d Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 13 May 2024 01:21:00 -0700 Subject: [PATCH 26/30] ci(github): rename config.log artifacts Uploading the artifact with only matrix.cc in the name can cause collisions when both fail. Upload it as `${{ matrix.cc }}-${{ matrix.sdk}}-config.log' instead so that it is unique. --- .github/workflows/pr-ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-ci.yaml b/.github/workflows/pr-ci.yaml index 154c2966a..a400e4d83 100644 --- a/.github/workflows/pr-ci.yaml +++ b/.github/workflows/pr-ci.yaml @@ -109,7 +109,7 @@ jobs: if: failure() uses: actions/upload-artifact@v4 with: - name: ${{ matrix.cc }}-config.log + name: ${{ matrix.cc }}-${{ matrix.sdk }}-config.log path: config.log - uses: actions/setup-python@v5 From 7c4b02abe4e53a2e1640d1f2e6d95d0b59184f0e Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 13 May 2024 01:50:36 -0700 Subject: [PATCH 27/30] ci(github): expand matrix for latest cc --- .github/workflows/pr-ci.yaml | 111 ++++++++++++++++++++++++++++------- 1 file changed, 90 insertions(+), 21 deletions(-) diff --git a/.github/workflows/pr-ci.yaml b/.github/workflows/pr-ci.yaml index a400e4d83..1136da527 100644 --- a/.github/workflows/pr-ci.yaml +++ b/.github/workflows/pr-ci.yaml @@ -2,10 +2,10 @@ name: PR CI on: [push, pull_request, workflow_dispatch] env: + GCC_LATEST: 13 + LLVM_LATEST: 19 APT_PACKAGES: >- build-essential - clang - gcc git libhwloc-dev make @@ -24,17 +24,80 @@ jobs: needs: pre-commit strategy: matrix: + cc-version: + - latest + - legacy cc: - gcc - clang sdk: - cuda - neuron + + exclude: + # Disable clang-latest runs. + - cc: clang + cc-version: latest + # fails with + # > checking if __builtin_expect is available... no + # > configure: error: __builtin_expect not available + # clang-19 definitely supports this, so it's something in our m4. fail-fast: false + name: u2204/${{matrix.cc}}(${{matrix.cc-version}})/distcheck/${{ matrix.sdk }} steps: - uses: actions/checkout@v4 - - name: Install Dependencies + - name: Configure Neuron SDK Repository + if: matrix.sdk == 'neuron' + run: | + # Configure Linux for Neuron repository updates + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null << EOF + deb https://apt.repos.neuron.amazonaws.com jammy main + EOF + wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - + sudo apt update -y + + - name: Add compiler repositories + if: matrix.cc-version == 'latest' + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + # Delete the last line, allowing us to use the cache below for + # actually installing the package; this just adds the + # repository. + sed -i '$ d' llvm.sh + sudo ./llvm.sh ${{ env.LLVM_LATEST }} + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + + - name: Install GCC + uses: awalsh128/cache-apt-pkgs-action@latest + if: matrix.cc-version == 'legacy' && matrix.cc == 'gcc' + with: + packages: gcc + version: compiler-${{ matrix.cc }}-${{ matrix.cc-version }} + + - name: Install Latest GCC + uses: awalsh128/cache-apt-pkgs-action@latest + if: matrix.cc-version == 'latest' && matrix.cc == 'gcc' + with: + packages: gcc-${{ env.GCC_LATEST }} + version: compiler-${{ matrix.cc }}-${{ matrix.cc-version }} + + - name: Install Clang + uses: awalsh128/cache-apt-pkgs-action@latest + if: matrix.cc-version == 'legacy' && matrix.cc == 'clang' + with: + packages: clang + version: compiler-${{ matrix.cc }}-${{ matrix.cc-version }} + + - name: Install Latest Clang + uses: awalsh128/cache-apt-pkgs-action@latest + if: matrix.cc-version == 'latest' && matrix.cc == 'clang' + with: + packages: clang-${{ env.LLVM_LATEST }} + version: compiler-${{ matrix.cc }}-${{ matrix.cc-version }} + + - name: Install Base Dependencies uses: awalsh128/cache-apt-pkgs-action@latest with: packages: ${{ env.APT_PACKAGES }} @@ -47,16 +110,6 @@ jobs: packages: nvidia-cuda-toolkit version: cuda-packages - - name: Configure Neuron SDK Repository - if: matrix.sdk == 'neuron' - run: | - # Configure Linux for Neuron repository updates - sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null << EOF - deb https://apt.repos.neuron.amazonaws.com jammy main - EOF - wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - - sudo apt update -y - - name: Install Neuron SDK if: matrix.sdk == 'neuron' uses: awalsh128/cache-apt-pkgs-action@latest @@ -71,12 +124,21 @@ jobs: git clone --depth 1 https://github.com/ofiwg/libfabric.git pushd libfabric ./autogen.sh + + export CC="${{ matrix.cc }}" + if [ "${{ matrix.cc-version }}" == "latest" ]; then + if [ "${{ matrix.cc }}" == "clang" ]; then + export CC="$CC-${{ env.LLVM_LATEST }}" + else + export CC="$CC-${{ env.GCC_LATEST }}" + fi + fi + ./configure --prefix="$PWD/install" \ --disable-sockets \ --disable-udp \ --disable-mrail \ - --disable-opx \ - CC="${{ matrix.cc }}" + --disable-opx make -j "$(nproc)" make install popd @@ -85,6 +147,15 @@ jobs: run: | set -x + export CC="${{ matrix.cc }}" + if [ "${{ matrix.cc-version }}" == "latest" ]; then + if [ "${{ matrix.cc }}" == "clang" ]; then + export CC="$CC-${{ env.LLVM_LATEST }}" + else + export CC="$CC-${{ env.GCC_LATEST }}" + fi + fi + # actions/checkout@v4 would drop the plugin source in $PWD, # so go ahead and build it. ./autogen.sh @@ -92,13 +163,11 @@ jobs: then ./configure --with-libfabric="$PWD/libfabric/install" \ --with-cuda=/usr/local/cuda/ \ - --enable-platform-aws \ - CC="${{ matrix.cc }}" + --enable-platform-aws else ./configure --with-libfabric="$PWD/libfabric/install" \ --enable-neuron \ - --enable-platform-aws \ - CC="${{ matrix.cc }}" + --enable-platform-aws fi make -j "$(nproc)" @@ -122,7 +191,7 @@ jobs: uses: whisperity/codechecker-analysis-action@v1 id: codechecker with: - # clean and rebuild so that compile_commands.json can be detected + # clean and rebuild so that no files are skipped. build-command: make clean && make ctu: true @@ -130,7 +199,7 @@ jobs: if: matrix.cc == 'clang' uses: actions/upload-artifact@v4 with: - name: CodeChecker Bug Reports for ${{ matrix.sdk }} + name: CodeChecker Bug Reports for ${{ matrix.sdk }}-${{ matrix.cc-version }} path: ${{ steps.codechecker.outputs.result-html-dir }}/*.html - name: CodeChecker Pass Or Fail? From c6cdc1ac27c0ec36a6eab7a129ae180c3b7b0c53 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 13 May 2024 02:42:09 -0700 Subject: [PATCH 28/30] ci(github): add a configuration file for codechecker + ignore unit tests + disable unused parameter warnings. --- .github/codechecker.yaml | 6 ++++++ .github/codechecker_skips.txt | 3 +++ .github/workflows/pr-ci.yaml | 1 + 3 files changed, 10 insertions(+) create mode 100644 .github/codechecker.yaml create mode 100644 .github/codechecker_skips.txt diff --git a/.github/codechecker.yaml b/.github/codechecker.yaml new file mode 100644 index 000000000..b09fcbfd8 --- /dev/null +++ b/.github/codechecker.yaml @@ -0,0 +1,6 @@ +--- +analyzer: + - --disable=clang-diagnostic-unused-parameter + - --ignore=.github/codechecker_skips.txt + - --analyzer-config=clangsa:unroll-loops=true + - --report-hash=context-free-v2 diff --git a/.github/codechecker_skips.txt b/.github/codechecker_skips.txt new file mode 100644 index 000000000..8edfda67e --- /dev/null +++ b/.github/codechecker_skips.txt @@ -0,0 +1,3 @@ +-/tests/functional/* +-/tests/unit/* +-/include/nccl-headers/*/* diff --git a/.github/workflows/pr-ci.yaml b/.github/workflows/pr-ci.yaml index 1136da527..319638ad3 100644 --- a/.github/workflows/pr-ci.yaml +++ b/.github/workflows/pr-ci.yaml @@ -194,6 +194,7 @@ jobs: # clean and rebuild so that no files are skipped. build-command: make clean && make ctu: true + config: .github/codechecker.yaml - name: Save CodeChecker HTML output. if: matrix.cc == 'clang' From e51fc103c2cb5518d35557e032c7fe328d4b830c Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Mon, 13 May 2024 03:28:20 -0700 Subject: [PATCH 29/30] ci(github): split codechecker to separate job --- .github/workflows/pr-ci.yaml | 106 ++++++++++++++++++++++++++++++++--- 1 file changed, 99 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pr-ci.yaml b/.github/workflows/pr-ci.yaml index 319638ad3..b9c9ec37d 100644 --- a/.github/workflows/pr-ci.yaml +++ b/.github/workflows/pr-ci.yaml @@ -181,30 +181,122 @@ jobs: name: ${{ matrix.cc }}-${{ matrix.sdk }}-config.log path: config.log + codechecker: + runs-on: ubuntu-22.04 + name: CodeChecker - ${{ matrix.sdk }} + needs: [pre-commit, distcheck] + strategy: + matrix: + sdk: + - cuda + - neuron + steps: + - uses: actions/checkout@v4 - uses: actions/setup-python@v5 - if: matrix.cc == 'clang' with: python-version: '3.9' + - name: Configure Neuron SDK Repository + if: matrix.sdk == 'neuron' + run: | + # Configure Linux for Neuron repository updates + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null << EOF + deb https://apt.repos.neuron.amazonaws.com jammy main + EOF + wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - + sudo apt update -y + + - name: Install Base Dependencies + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: ${{ env.APT_PACKAGES }} + version: base-packages + + - name: Install CUDA SDK + if: matrix.sdk == 'cuda' + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: nvidia-cuda-toolkit + version: cuda-packages + + - name: Install Neuron SDK + if: matrix.sdk == 'neuron' + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: aws-neuronx-runtime-lib + version: neuron-packages + + - name: Add compiler repositories + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + # Delete the last line, allowing us to use the cache below for + # actually installing the package; this just adds the + # repository. + sed -i '$ d' llvm.sh + sudo ./llvm.sh ${{ env.LLVM_LATEST }} + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + + - name: Install Latest Clang + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: clang-${{ env.LLVM_LATEST }} + version: compiler-clang-latest + + - name: Install cppcheck + uses: awalsh128/cache-apt-pkgs-action@latest + with: + packages: cppcheck + version: codechecker-cppcheck + + - name: Install Libfabric + run: | + # We're just doing distchecks, so it is fine if we + # just grab the latest master and built a lean build. + git clone --depth 1 https://github.com/ofiwg/libfabric.git + pushd libfabric + ./autogen.sh + export CC="clang-${{ env.LLVM_LATEST }}" + ./configure --prefix="$PWD/install" \ + --disable-sockets \ + --disable-udp \ + --disable-mrail \ + --disable-opx + make -j "$(nproc)" + make install + popd + + - name: Run Configure + run: | + ./autogen.sh + if [ "${{ matrix.sdk }}" == "neuron" ]; then + ./configure \ + --with-libfabric="$PWD/libfabric/install" \ + --enable-neuron \ + --enable-platform-aws + else + ./configure \ + --with-libfabric="$PWD/libfabric/install" \ + --with-cuda=/usr/local/cuda/ \ + --enable-platform-aws + fi + - name: Run CodeChecker - if: matrix.cc == 'clang' uses: whisperity/codechecker-analysis-action@v1 id: codechecker with: - # clean and rebuild so that no files are skipped. - build-command: make clean && make + build-command: make ctu: true config: .github/codechecker.yaml - name: Save CodeChecker HTML output. - if: matrix.cc == 'clang' uses: actions/upload-artifact@v4 with: - name: CodeChecker Bug Reports for ${{ matrix.sdk }}-${{ matrix.cc-version }} + name: CodeChecker Bug Reports for ${{ matrix.sdk }} path: ${{ steps.codechecker.outputs.result-html-dir }}/*.html - name: CodeChecker Pass Or Fail? - if: matrix.cc == 'clang' && steps.codechecker.outputs.warnings-in-diff == 'true' + if: steps.codechecker.outputs.warnings-in-diff == 'true' shell: bash run: | echo "::error title=Static Analyzers Failed::Analysed commit(s) caused static analysis warnings" From 1ea28ea8f0d087c664b9e8ff2c0a6e143676f277 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Sun, 18 Feb 2024 19:33:52 -0800 Subject: [PATCH 30/30] ci(pre-commit): introduce clang-format Add a basic clang-format spec to the repository and a pre-commit hook to enforce it. Actually formatting the repository by the rules within this file is split into a separate commit for the sake of doing as much as possible to keep git-blame clean (via .git-blame-ignore-revs) This style was chosen with the help of a tool called whatstyle [1]. --- .clang-format | 61 ++++++++++++++++++++++++++++++ .pre-commit-config.yaml | 6 +++ include/nccl-headers/.clang-format | 1 + 3 files changed, 68 insertions(+) create mode 100644 .clang-format create mode 100644 include/nccl-headers/.clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..b75b3b9d7 --- /dev/null +++ b/.clang-format @@ -0,0 +1,61 @@ +--- +BasedOnStyle: Google +InsertNewlineAtEOF: true +LineEnding: LF +ColumnLimit: 100 +TabWidth: 8 +IndentWidth: 8 +BracedInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +SpacesBeforeTrailingComments: 2 +ReflowComments: true +IndentCaseLabels: false + +BreakBeforeBraces: Linux +UseTab: ForContinuationAndIndentation +MaxEmptyLinesToKeep: 2 +AlignOperands: Align +AllowAllArgumentsOnNextLine: false +BinPackArguments: false +BinPackParameters: false +AlignConsecutiveDeclarations: false +AlignConsecutiveAssignments: false + +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AlignConsecutiveShortCaseStatements: + Enabled: true + AcrossEmptyLines: true + AcrossComments: true + AlignCaseColons: false +AlignConsecutiveMacros: + Enabled: true + AcrossEmptyLines: true + AcrossComments: true + +AlignConsecutiveBitFields: + Enabled: true + AcrossEmptyLines: true + AcrossComments: true + +PointerAlignment: Right +ReferenceAlignment: Right +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +RemoveParentheses: MultipleParentheses +InsertBraces: true + +SortIncludes: CaseSensitive +IncludeBlocks: Regroup +IncludeCategories: + # External dependencies at the top. + - Regex: ^([<].*[.]h[>])$ + Priority: -50 + SortPriority: -50 + + # local headers below them. + - Regex: ^(["].*[.]h["])$ + Priority: -10 + SortPriority: -10 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d3b9668c4..26e3d2bf2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,6 +63,12 @@ repos: stages: [manual] exclude: (include/.*memcheck.*[.]h|include/nccl-headers.*|src/nccl_ofi_interface_neuron.c) args: [--config-file=.clang-tidy] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.5 + hooks: + - id: clang-format + stages: [manual] + types_or: [c++, c, cuda] - repo: https://github.com/editorconfig-checker/editorconfig-checker.python rev: 2.7.3 hooks: diff --git a/include/nccl-headers/.clang-format b/include/nccl-headers/.clang-format new file mode 100644 index 000000000..e3845288a --- /dev/null +++ b/include/nccl-headers/.clang-format @@ -0,0 +1 @@ +DisableFormat: true