From 2d5fc12d9ce80d95abfb21eaa03036650dd6099c Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Thu, 9 Nov 2023 16:30:13 -0500
Subject: [PATCH 01/24] v24.02 Updates [skip ci]

---
 .github/workflows/build.yaml                  | 14 +++++------
 .github/workflows/pr.yaml                     | 24 +++++++++----------
 .github/workflows/test.yaml                   |  8 +++----
 VERSION                                       |  2 +-
 ci/build_docs.sh                              |  2 +-
 ci/test_wheel.sh                              |  2 +-
 .../all_cuda-118_arch-x86_64.yaml             | 20 ++++++++--------
 .../all_cuda-120_arch-x86_64.yaml             | 20 ++++++++--------
 .../clang_tidy_cuda-118_arch-x86_64.yaml      |  8 +++----
 .../cpp_all_cuda-118_arch-x86_64.yaml         |  8 +++----
 .../cpp_all_cuda-120_arch-x86_64.yaml         |  8 +++----
 cpp/CMakeLists.txt                            |  2 +-
 cpp/Doxyfile.in                               |  2 +-
 dependencies.yaml                             | 20 ++++++++--------
 docs/source/conf.py                           |  4 ++--
 fetch_rapids.cmake                            |  2 +-
 python/CMakeLists.txt                         |  2 +-
 python/pyproject.toml                         | 12 +++++-----
 18 files changed, 80 insertions(+), 80 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 776c7ae761..84c73a1f3b 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -38,7 +38,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: [python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -60,7 +60,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -68,7 +68,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cuml:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,12 +79,12 @@ jobs:
       # the CMake variables in get_cumlprims_mg.cmake since CMake will just use
       # the clone as is.
       extra-repo: rapidsai/cumlprims_mg
-      extra-repo-sha: branch-23.12
+      extra-repo-sha: branch-24.02
       extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
   wheel-publish-cuml:
     needs: wheel-build-cuml
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 7828a4b8cb..0e786e2c8f 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -24,16 +24,16 @@ jobs:
       - wheel-build-cuml
       - wheel-tests-cuml
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02
     with:
       enable_check_generated_files: false
   clang-tidy:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       build_type: pull-request
       node_type: "cpu8"
@@ -43,39 +43,39 @@ jobs:
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02
     with:
       build_type: pull-request
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02
     with:
       build_type: pull-request
   conda-python-tests-singlegpu:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02
     with:
       build_type: pull-request
       test_script: "ci/test_python_singlegpu.sh"
   conda-python-tests-dask:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02
     with:
       build_type: pull-request
       test_script: "ci/test_python_dask.sh"
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -85,7 +85,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -95,7 +95,7 @@ jobs:
   wheel-build-cuml:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/build_wheel.sh
@@ -105,7 +105,7 @@ jobs:
   wheel-tests-cuml:
     needs: wheel-build-cuml
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: pull-request
       script: ci/test_wheel.sh
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 3485d2fc01..9ef163e2fd 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -24,7 +24,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests-singlegpu:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       test_script: "ci/test_python_singlegpu.sh"
   conda-python-tests-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       test_script: "ci/test_python_dask.sh"
   wheel-tests-cuml:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/VERSION b/VERSION
index a193fff41e..3c6c5e2b70 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-23.12.00
+24.02.00
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 81917e9278..22c2ba5cfe 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -24,7 +24,7 @@ rapids-mamba-retry install \
   --channel "${PYTHON_CHANNEL}" \
   cuml libcuml
 
-export RAPIDS_VERSION_NUMBER="23.12"
+export RAPIDS_VERSION_NUMBER="24.02"
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build CPP docs"
diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
index f6c61eabac..7f9b8d91ee 100755
--- a/ci/test_wheel.sh
+++ b/ci/test_wheel.sh
@@ -13,7 +13,7 @@ if [[ "$(arch)" == "aarch64" ]]; then
 fi
 
 # Always install latest dask for testing
-python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12
+python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-24.02
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/cuml*.whl)[test]
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 8d4b9ab3ce..c7d3627614 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -12,13 +12,13 @@ dependencies:
 - cuda-python>=11.7.1,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
-- cudf==23.12.*
+- cudf==24.2.*
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
 - dask-core>=2023.9.2
-- dask-cuda==23.12.*
-- dask-cudf==23.12.*
+- dask-cuda==24.2.*
+- dask-cudf==24.2.*
 - dask-ml
 - dask>=2023.9.2
 - distributed>=2023.9.2
@@ -36,16 +36,16 @@ dependencies:
 - libcublas=11.11.3.6
 - libcufft-dev=10.9.0.58
 - libcufft=10.9.0.58
-- libcumlprims==23.12.*
+- libcumlprims==24.2.*
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libcusolver-dev=11.4.1.48
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libraft-headers==23.12.*
-- libraft==23.12.*
-- librmm==23.12.*
+- libraft-headers==24.2.*
+- libraft==24.2.*
+- librmm==24.2.*
 - nbsphinx
 - ninja
 - nltk
@@ -54,7 +54,7 @@ dependencies:
 - nvcc_linux-64=11.8
 - pip
 - pydata-sphinx-theme!=0.14.2
-- pylibraft==23.12.*
+- pylibraft==24.2.*
 - pynndescent==0.5.8
 - pytest
 - pytest-benchmark
@@ -62,9 +62,9 @@ dependencies:
 - pytest-cov
 - pytest-xdist
 - python>=3.9,<3.11
-- raft-dask==23.12.*
+- raft-dask==24.2.*
 - recommonmark
-- rmm==23.12.*
+- rmm==24.2.*
 - scikit-build>=0.13.1
 - scikit-learn==1.2
 - scipy>=1.8.0
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index af119ecb72..a97c5759ea 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -14,13 +14,13 @@ dependencies:
 - cuda-profiler-api
 - cuda-python>=12.0,<13.0a0
 - cuda-version=12.0
-- cudf==23.12.*
+- cudf==24.2.*
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
 - dask-core>=2023.9.2
-- dask-cuda==23.12.*
-- dask-cudf==23.12.*
+- dask-cuda==24.2.*
+- dask-cudf==24.2.*
 - dask-ml
 - dask>=2023.9.2
 - distributed>=2023.9.2
@@ -36,13 +36,13 @@ dependencies:
 - joblib>=0.11
 - libcublas-dev
 - libcufft-dev
-- libcumlprims==23.12.*
+- libcumlprims==24.2.*
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==23.12.*
-- libraft==23.12.*
-- librmm==23.12.*
+- libraft-headers==24.2.*
+- libraft==24.2.*
+- librmm==24.2.*
 - nbsphinx
 - ninja
 - nltk
@@ -50,7 +50,7 @@ dependencies:
 - numpydoc
 - pip
 - pydata-sphinx-theme!=0.14.2
-- pylibraft==23.12.*
+- pylibraft==24.2.*
 - pynndescent==0.5.8
 - pytest
 - pytest-benchmark
@@ -58,9 +58,9 @@ dependencies:
 - pytest-cov
 - pytest-xdist
 - python>=3.9,<3.11
-- raft-dask==23.12.*
+- raft-dask==24.2.*
 - recommonmark
-- rmm==23.12.*
+- rmm==24.2.*
 - scikit-build>=0.13.1
 - scikit-learn==1.2
 - scipy>=1.8.0
diff --git a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml
index 515abd8929..6231671f95 100644
--- a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml
@@ -21,16 +21,16 @@ dependencies:
 - libcublas=11.11.3.6
 - libcufft-dev=10.9.0.58
 - libcufft=10.9.0.58
-- libcumlprims==23.12.*
+- libcumlprims==24.2.*
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libcusolver-dev=11.4.1.48
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libraft-headers==23.12.*
-- libraft==23.12.*
-- librmm==23.12.*
+- libraft-headers==24.2.*
+- libraft==24.2.*
+- librmm==24.2.*
 - ninja
 - nvcc_linux-64=11.8
 - sysroot_linux-64==2.17
diff --git a/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml b/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml
index be86e4397b..7038450c73 100644
--- a/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml
@@ -19,16 +19,16 @@ dependencies:
 - libcublas=11.11.3.6
 - libcufft-dev=10.9.0.58
 - libcufft=10.9.0.58
-- libcumlprims==23.12.*
+- libcumlprims==24.2.*
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libcusolver-dev=11.4.1.48
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- libraft-headers==23.12.*
-- libraft==23.12.*
-- librmm==23.12.*
+- libraft-headers==24.2.*
+- libraft==24.2.*
+- librmm==24.2.*
 - ninja
 - nvcc_linux-64=11.8
 - sysroot_linux-64==2.17
diff --git a/conda/environments/cpp_all_cuda-120_arch-x86_64.yaml b/conda/environments/cpp_all_cuda-120_arch-x86_64.yaml
index 83a97fcd4e..e8304388de 100644
--- a/conda/environments/cpp_all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/cpp_all_cuda-120_arch-x86_64.yaml
@@ -19,13 +19,13 @@ dependencies:
 - gtest>=1.13.0
 - libcublas-dev
 - libcufft-dev
-- libcumlprims==23.12.*
+- libcumlprims==24.2.*
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- libraft-headers==23.12.*
-- libraft==23.12.*
-- librmm==23.12.*
+- libraft-headers==24.2.*
+- libraft==24.2.*
+- librmm==24.2.*
 - ninja
 - sysroot_linux-64==2.17
 name: cpp_all_cuda-120_arch-x86_64
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 10c4d12ea0..29ee669a85 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -26,7 +26,7 @@ include(rapids-find)
 
 rapids_cuda_init_architectures(CUML)
 
-project(CUML VERSION 23.12.00 LANGUAGES CXX CUDA)
+project(CUML VERSION 24.02.00 LANGUAGES CXX CUDA)
 
 # Write the version header
 rapids_cmake_write_version_file(include/cuml/version_config.hpp)
diff --git a/cpp/Doxyfile.in b/cpp/Doxyfile.in
index 8a476e15df..80bddac6e0 100644
--- a/cpp/Doxyfile.in
+++ b/cpp/Doxyfile.in
@@ -38,7 +38,7 @@ PROJECT_NAME           = "cuML C++ API"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = "23.12"
+PROJECT_NUMBER         = "24.02"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/dependencies.yaml b/dependencies.yaml
index 568781a45f..73682f33db 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -111,10 +111,10 @@ dependencies:
           - cxx-compiler
           - gmock>=1.13.0
           - gtest>=1.13.0
-          - libcumlprims==23.12.*
-          - libraft==23.12.*
-          - libraft-headers==23.12.*
-          - librmm==23.12.*
+          - libcumlprims==24.2.*
+          - libraft==24.2.*
+          - libraft-headers==24.2.*
+          - librmm==24.2.*
     specific:
       - output_types: conda
         matrices:
@@ -153,8 +153,8 @@ dependencies:
           - scikit-build>=0.13.1
           - cython>=3.0.0
           - &treelite treelite==3.9.1
-          - pylibraft==23.12.*
-          - rmm==23.12.*
+          - pylibraft==24.2.*
+          - rmm==24.2.*
       - output_types: pyproject
         packages:
           - wheel
@@ -174,10 +174,10 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - cudf==23.12.*
+          - cudf==24.2.*
           - dask>=2023.9.2
-          - dask-cuda==23.12.*
-          - dask-cudf==23.12.*
+          - dask-cuda==24.2.*
+          - dask-cudf==24.2.*
           - distributed>=2023.9.2
           - joblib>=0.11
           - numba>=0.57
@@ -185,7 +185,7 @@ dependencies:
             # we make it optional (i.e. an extra for pip
             # installation/run_constrained for conda)?
           - scipy>=1.8.0
-          - raft-dask==23.12.*
+          - raft-dask==24.2.*
           - *treelite
       - output_types: [conda, requirements]
         packages:
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 2039945021..abaf2bdcc7 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -77,9 +77,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '23.12'
+version = '24.02'
 # The full version, including alpha/beta/rc tags.
-release = '23.12.00'
+release = '24.02.00'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index 37fb090fd6..98ce6888ba 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUML_RAPIDS.cmake)
-  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake
+  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/CUML_RAPIDS.cmake
   )
 endif()
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 746c4a5f6d..4314afb06e 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 
 include(../fetch_rapids.cmake)
 
-set(CUML_VERSION 23.12.00)
+set(CUML_VERSION 24.02.00)
 
 option(CUML_CPU "Build only cuML CPU Python components." OFF)
 set(language_list "C;CXX")
diff --git a/python/pyproject.toml b/python/pyproject.toml
index ed9b4fd45c..76006d3ab5 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -18,8 +18,8 @@ requires = [
     "cuda-python>=11.7.1,<12.0a0",
     "cython>=3.0.0",
     "ninja",
-    "pylibraft==23.12.*",
-    "rmm==23.12.*",
+    "pylibraft==24.2.*",
+    "rmm==24.2.*",
     "scikit-build>=0.13.1",
     "setuptools",
     "treelite==3.9.1",
@@ -57,15 +57,15 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==23.12.*",
+    "cudf==24.2.*",
     "cupy-cuda11x>=12.0.0",
-    "dask-cuda==23.12.*",
-    "dask-cudf==23.12.*",
+    "dask-cuda==24.2.*",
+    "dask-cudf==24.2.*",
     "dask>=2023.9.2",
     "distributed>=2023.9.2",
     "joblib>=0.11",
     "numba>=0.57",
-    "raft-dask==23.12.*",
+    "raft-dask==24.2.*",
     "scipy>=1.8.0",
     "treelite==3.9.1",
     "treelite_runtime==3.9.1",

From e1a7da3f358b4d8ffcc0ca3374a2efc68a573924 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 6 Dec 2023 12:36:06 -0800
Subject: [PATCH 02/24] Remove CUML_BUILD_WHEELS and standardize Python builds
 (#5689)

Some minor simplification in advance of the scikit-build-core migration to better align wheel and non-wheel Python builds.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cuml/pull/5689
---
 ci/build_wheel.sh     |  2 +-
 python/CMakeLists.txt | 42 ++++++++++++++++--------------------------
 2 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index e4941ad1a8..8b15323b33 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -49,7 +49,7 @@ fi
 
 cd ${package_dir}
 
-SKBUILD_CONFIGURE_OPTIONS="-DCUML_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF -DDISABLE_DEPRECATION_WARNINGS=ON -DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/" \
+SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DDISABLE_DEPRECATION_WARNINGS=ON -DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/" \
   python -m pip wheel . \
     -w dist \
     -vvv \
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 4314afb06e..e639f8a71c 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -43,7 +43,6 @@ project(
 # - User Options  --------------------------------------------------------------
 option(CUML_UNIVERSAL "Build all cuML Python components." ON)
 option(FIND_CUML_CPP "Search for existing CUML C++ installations before defaulting to local files" OFF)
-option(CUML_BUILD_WHEELS "Whether this build is generating a Python wheel." OFF)
 option(SINGLEGPU "Disable all mnmg components and comms libraries" OFF)
 set(CUML_RAFT_CLONE_ON_PIN OFF)
 
@@ -78,11 +77,7 @@ endif()
 
 include(rapids-cython)
 
-if(CUML_BUILD_WHEELS)
-  set(CUML_PYTHON_TREELITE_TARGET treelite::treelite_static)
-else()
-  set(CUML_PYTHON_TREELITE_TARGET treelite::treelite)
-endif()
+set(CUML_PYTHON_TREELITE_TARGET treelite::treelite)
 
 if(NOT ${CUML_CPU})
   if(NOT cuml_FOUND)
@@ -93,26 +88,21 @@ if(NOT ${CUML_CPU})
     set(BUILD_CUML_BENCH OFF)
     set(BUILD_CUML_PRIMS_BENCH OFF)
     set(CUML_EXPORT_TREELITE_LINKAGE ON)
-
-    set(_exclude_from_all "")
-    if(CUML_BUILD_WHEELS)
-      # Statically link dependencies if building wheels
-      set(CUDA_STATIC_RUNTIME ON)
-      set(CUML_USE_RAFT_STATIC ON)
-      set(CUML_USE_FAISS_STATIC ON)
-      set(CUML_USE_TREELITE_STATIC ON)
-      set(CUML_USE_CUMLPRIMS_MG_STATIC ON)
-      # Don't install the static libs into wheels
-      set(CUML_EXCLUDE_RAFT_FROM_ALL ON)
-      set(RAFT_EXCLUDE_FAISS_FROM_ALL ON)
-      set(CUML_EXCLUDE_TREELITE_FROM_ALL ON)
-      set(CUML_EXCLUDE_CUMLPRIMS_MG_FROM_ALL ON)
-
-      # Don't install the cuML C++ targets into wheels
-      set(_exclude_from_all EXCLUDE_FROM_ALL)
-    endif()
-
-    add_subdirectory(../cpp cuml-cpp ${_exclude_from_all})
+    set(CUML_PYTHON_TREELITE_TARGET treelite::treelite_static)
+
+    # Statically link dependencies (now done for wheel and non-wheel builds alike)
+    set(CUDA_STATIC_RUNTIME ON)
+    set(CUML_USE_RAFT_STATIC ON)
+    set(CUML_USE_FAISS_STATIC ON)
+    set(CUML_USE_TREELITE_STATIC ON)
+    set(CUML_USE_CUMLPRIMS_MG_STATIC ON)
+    # Don't install the static libs into the built Python package
+    set(CUML_EXCLUDE_RAFT_FROM_ALL ON)
+    set(RAFT_EXCLUDE_FAISS_FROM_ALL ON)
+    set(CUML_EXCLUDE_TREELITE_FROM_ALL ON)
+    set(CUML_EXCLUDE_CUMLPRIMS_MG_FROM_ALL ON)
+
+    add_subdirectory(../cpp cuml-cpp EXCLUDE_FROM_ALL)
 
     set(cython_lib_dir cuml)
     install(TARGETS ${CUML_CPP_TARGET} DESTINATION ${cython_lib_dir})

From 6cd2fb3092f8867e32e6050b9354a0192766f925 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 7 Dec 2023 15:35:06 -0800
Subject: [PATCH 03/24] Fix all deprecated function calls in TUs where warnings
 are errors (#5692)

This PR is sufficient to enable compilation in devcontainers.

Contributes to #5510

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - William Hicks (https://github.com/wphicks)

URL: https://github.com/rapidsai/cuml/pull/5692
---
 cpp/src/arima/batched_arima.cu                |  4 +--
 cpp/src/dbscan/vertexdeg/algo.cuh             |  4 +--
 cpp/src/dbscan/vertexdeg/precomputed.cuh      |  4 +--
 .../batched-levelalgo/builder.cuh             |  4 +--
 .../kernels/builder_kernels.cuh               |  6 ++---
 .../batched-levelalgo/objectives.cuh          | 18 ++++++-------
 cpp/src/genetic/fitness.cuh                   | 12 ++++-----
 cpp/src/glm/preprocess.cuh                    |  4 +--
 cpp/src/glm/qn/glm_linear.cuh                 |  2 +-
 cpp/src/glm/qn/glm_logistic.cuh               |  4 +--
 cpp/src/glm/qn/glm_softmax.cuh                | 26 ++++++++-----------
 cpp/src/glm/qn/glm_svm.cuh                    |  4 +--
 cpp/src/glm/qn/simple_mat/dense.hpp           |  4 +--
 cpp/src/solver/cd.cuh                         |  4 +--
 cpp/src/svm/linear.cu                         | 10 +++----
 cpp/src_prims/functions/log.cuh               |  2 +-
 cpp/src_prims/functions/sigmoid.cuh           |  2 +-
 cpp/src_prims/functions/softThres.cuh         |  4 +--
 cpp/src_prims/linalg/batched/gemv.cuh         |  4 +--
 cpp/src_prims/linalg/batched/make_symm.cuh    |  4 +--
 cpp/src_prims/matrix/reverse.cuh              |  4 +--
 cpp/src_prims/timeSeries/jones_transform.cuh  |  4 +--
 cpp/src_prims/timeSeries/stationarity.cuh     |  4 +--
 cpp/test/prims/distance_base.cuh              |  4 +--
 cpp/test/prims/knn_regression.cu              |  2 +-
 cpp/test/sg/linear_svm_test.cu                |  2 +-
 cpp/test/sg/rf_test.cu                        | 12 ++++-----
 27 files changed, 76 insertions(+), 82 deletions(-)

diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index 43bc81221b..71ec2d5b69 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -336,7 +336,7 @@ __global__ void sum_of_squares_kernel(const DataT* d_y,
   // Compute log-likelihood and write it to global memory
   if (threadIdx.x == 0) {
     d_loglike[blockIdx.x] =
-      -0.5 * static_cast<DataT>(n_obs) * raft::myLog(ssq / static_cast<DataT>(n_obs - start_sum));
+      -0.5 * static_cast<DataT>(n_obs) * raft::log(ssq / static_cast<DataT>(n_obs - start_sum));
   }
 }
 
@@ -1000,4 +1000,4 @@ void estimate_x0(raft::handle_t& handle,
   _start_params(handle, params, bm_yd, bm_exog_diff, order);
 }
 
-}  // namespace ML
\ No newline at end of file
+}  // namespace ML
diff --git a/cpp/src/dbscan/vertexdeg/algo.cuh b/cpp/src/dbscan/vertexdeg/algo.cuh
index df6a248c89..ac67664207 100644
--- a/cpp/src/dbscan/vertexdeg/algo.cuh
+++ b/cpp/src/dbscan/vertexdeg/algo.cuh
@@ -122,7 +122,7 @@ void launcher(const raft::handle_t& handle,
     stream,
     false,
     [] __device__(bool adj_ij, index_t idx) { return static_cast<index_t>(adj_ij); },
-    raft::Sum<index_t>(),
+    raft::add_op(),
     [d_nnz] __device__(index_t degree) {
       atomicAdd(d_nnz, degree);
       return degree;
@@ -143,7 +143,7 @@ void launcher(const raft::handle_t& handle,
       [sample_weight] __device__(bool adj_ij, index_t j) {
         return adj_ij ? sample_weight[j] : (value_t)0;
       },
-      raft::Sum<value_t>());
+      raft::add_op());
     RAFT_CUDA_TRY(cudaPeekAtLastError());
   }
 }
diff --git a/cpp/src/dbscan/vertexdeg/precomputed.cuh b/cpp/src/dbscan/vertexdeg/precomputed.cuh
index b2cf9fac2c..9dbd7605c9 100644
--- a/cpp/src/dbscan/vertexdeg/precomputed.cuh
+++ b/cpp/src/dbscan/vertexdeg/precomputed.cuh
@@ -75,7 +75,7 @@ void launcher(const raft::handle_t& handle,
     stream,
     false,
     [] __device__(bool adj_ij, long_index_t idx) { return static_cast<index_t>(adj_ij); },
-    raft::Sum<index_t>(),
+    raft::add_op(),
     [d_nnz] __device__(index_t degree) {
       atomicAdd(d_nnz, degree);
       return degree;
@@ -96,7 +96,7 @@ void launcher(const raft::handle_t& handle,
       [sample_weight] __device__(bool adj_ij, long_index_t j) {
         return adj_ij ? sample_weight[j] : (value_t)0;
       },
-      raft::Sum<value_t>());
+      raft::add_op());
     RAFT_CUDA_TRY(cudaPeekAtLastError());
   }
 }
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
index fef69b12f7..7b6c02bc1d 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -414,8 +414,8 @@ struct Builder {
       // unique samples from 'n' is given by the following equation: log(1 - k/n)/log(1 - 1/n) ref:
       // https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
       IdxT n_parallel_samples =
-        std::ceil(raft::myLog(1 - double(dataset.n_sampled_cols) / double(dataset.N)) /
-                  (raft::myLog(1 - 1.f / double(dataset.N))));
+        std::ceil(raft::log(1 - double(dataset.n_sampled_cols) / double(dataset.N)) /
+                  (raft::log(1 - 1.f / double(dataset.N))));
       // maximum sampling work possible by all threads in a block :
       // `max_samples_per_thread * block_thread`
       // dynamically calculated sampling work to be done per block:
diff --git a/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh b/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh
index 7daf5341b7..29aaea2c00 100644
--- a/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh
@@ -280,7 +280,7 @@ __global__ void algo_L_sample_kernel(int* colids,
   IdxT int_uniform_val;
   // fp_uniform_val will have a random value between 0 and 1
   gen.next(fp_uniform_val);
-  double W = raft::myExp(raft::myLog(fp_uniform_val) / k);
+  double W = raft::exp(raft::log(fp_uniform_val) / k);
 
   size_t col(0);
   // initially fill the reservoir array in increasing order of cols till k
@@ -295,14 +295,14 @@ __global__ void algo_L_sample_kernel(int* colids,
   while (col < n) {
     // fp_uniform_val will have a random value between 0 and 1
     gen.next(fp_uniform_val);
-    col += static_cast<int>(raft::myLog(fp_uniform_val) / raft::myLog(1 - W)) + 1;
+    col += static_cast<int>(raft::log(fp_uniform_val) / raft::log(1 - W)) + 1;
     if (col < n) {
       // int_uniform_val will now have a random value between 0...k
       raft::random::custom_next(gen, &int_uniform_val, uniform_int_dist_params, IdxT(0), IdxT(0));
       colids[tid * k + int_uniform_val] = col;  // the bad memory coalescing here is hidden
       // fp_uniform_val will have a random value between 0 and 1
       gen.next(fp_uniform_val);
-      W *= raft::myExp(raft::myLog(fp_uniform_val) / k);
+      W *= raft::exp(raft::log(fp_uniform_val) / k);
     }
   }
 }
diff --git a/cpp/src/decisiontree/batched-levelalgo/objectives.cuh b/cpp/src/decisiontree/batched-levelalgo/objectives.cuh
index 59b44b3619..8ecebf90f4 100644
--- a/cpp/src/decisiontree/batched-levelalgo/objectives.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/objectives.cuh
@@ -143,7 +143,7 @@ class EntropyObjectiveFunction {
         auto lval_i = hist[n_bins * c + i].x;
         if (lval_i != 0) {
           auto lval = DataT(lval_i);
-          gain += raft::myLog(lval * invLeft) / raft::myLog(DataT(2)) * lval * invLen;
+          gain += raft::log(lval * invLeft) / raft::log(DataT(2)) * lval * invLen;
         }
 
         val_i += lval_i;
@@ -151,13 +151,13 @@ class EntropyObjectiveFunction {
         auto rval_i    = total_sum - lval_i;
         if (rval_i != 0) {
           auto rval = DataT(rval_i);
-          gain += raft::myLog(rval * invRight) / raft::myLog(DataT(2)) * rval * invLen;
+          gain += raft::log(rval * invRight) / raft::log(DataT(2)) * rval * invLen;
         }
 
         val_i += rval_i;
         if (val_i != 0) {
           auto val = DataT(val_i) * invLen;
-          gain -= val * raft::myLog(val) / raft::myLog(DataT(2));
+          gain -= val * raft::log(val) / raft::log(DataT(2));
         }
       }
 
@@ -313,9 +313,9 @@ class PoissonObjectiveFunction {
       return -std::numeric_limits<DataT>::max();
 
     // compute the gain to be
-    DataT parent_obj = -label_sum * raft::myLog(label_sum * invLen);
-    DataT left_obj   = -left_label_sum * raft::myLog(left_label_sum / nLeft);
-    DataT right_obj  = -right_label_sum * raft::myLog(right_label_sum / nRight);
+    DataT parent_obj = -label_sum * raft::log(label_sum * invLen);
+    DataT left_obj   = -left_label_sum * raft::log(left_label_sum / nLeft);
+    DataT right_obj  = -right_label_sum * raft::log(right_label_sum / nRight);
     DataT gain       = parent_obj - (left_obj + right_obj);
     gain             = gain * invLen;
 
@@ -392,9 +392,9 @@ class GammaObjectiveFunction {
       return -std::numeric_limits<DataT>::max();
 
     // compute the gain to be
-    DataT parent_obj = len * raft::myLog(label_sum * invLen);
-    DataT left_obj   = nLeft * raft::myLog(left_label_sum / nLeft);
-    DataT right_obj  = nRight * raft::myLog(right_label_sum / nRight);
+    DataT parent_obj = len * raft::log(label_sum * invLen);
+    DataT left_obj   = nLeft * raft::log(left_label_sum / nLeft);
+    DataT right_obj  = nRight * raft::log(right_label_sum / nRight);
     DataT gain       = parent_obj - (left_obj + right_obj);
     gain             = gain * invLen;
 
diff --git a/cpp/src/genetic/fitness.cuh b/cpp/src/genetic/fitness.cuh
index 78593ce956..ea6ac4109b 100644
--- a/cpp/src/genetic/fitness.cuh
+++ b/cpp/src/genetic/fitness.cuh
@@ -126,8 +126,8 @@ void weightedPearson(const raft::handle_t& h,
     stream,
     false,
     [W] __device__(math_t v, int i) { return v * v * W[i]; },
-    raft::Sum<math_t>(),
-    [] __device__(math_t in) { return raft::mySqrt(in); });
+    raft::add_op(),
+    [] __device__(math_t in) { return raft::sqrt(in); });
   math_t HYstd = y_std.element(0, stream);
 
   // Find x_std
@@ -140,8 +140,8 @@ void weightedPearson(const raft::handle_t& h,
     stream,
     false,
     [W] __device__(math_t v, int i) { return v * v * W[i]; },
-    raft::Sum<math_t>(),
-    [] __device__(math_t in) { return raft::mySqrt(in); });
+    raft::add_op(),
+    [] __device__(math_t in) { return raft::sqrt(in); });
 
   // Cross covariance
   raft::linalg::matrixVectorOp(
@@ -273,9 +273,7 @@ void meanAbsoluteError(const raft::handle_t& h,
     n_samples,
     false,
     false,
-    [N, WS] __device__(math_t y_p, math_t y, math_t w) {
-      return N * w * raft::myAbs(y - y_p) / WS;
-    },
+    [N, WS] __device__(math_t y_p, math_t y, math_t w) { return N * w * raft::abs(y - y_p) / WS; },
     stream);
 
   // Average along rows
diff --git a/cpp/src/glm/preprocess.cuh b/cpp/src/glm/preprocess.cuh
index 8b4aa09e45..226040f98b 100644
--- a/cpp/src/glm/preprocess.cuh
+++ b/cpp/src/glm/preprocess.cuh
@@ -77,7 +77,7 @@ void preProcessData(const raft::handle_t& handle,
         norm2_input,
         norm2_input,
         n_cols,
-        [] __device__(math_t v) { return raft::mySqrt(v); },
+        [] __device__(math_t v) { return raft::sqrt(v); },
         stream);
       raft::matrix::linewiseOp(
         input,
@@ -105,7 +105,7 @@ void preProcessData(const raft::handle_t& handle,
                               raft::linalg::L2Norm,
                               false,
                               stream,
-                              [] __device__(math_t v) { return raft::mySqrt(v); });
+                              [] __device__(math_t v) { return raft::sqrt(v); });
         raft::matrix::matrixVectorBinaryDivSkipZero(
           input, norm2_input, n_rows, n_cols, false, true, stream, true);
       }
diff --git a/cpp/src/glm/qn/glm_linear.cuh b/cpp/src/glm/qn/glm_linear.cuh
index 09de8eb6fe..9f39018187 100644
--- a/cpp/src/glm/qn/glm_linear.cuh
+++ b/cpp/src/glm/qn/glm_linear.cuh
@@ -57,7 +57,7 @@ struct AbsLoss : GLMBase<T, AbsLoss<T>> {
   typedef GLMBase<T, AbsLoss<T>> Super;
 
   const struct Lz {
-    inline __device__ T operator()(const T y, const T z) const { return raft::myAbs<T>(z - y); }
+    inline __device__ T operator()(const T y, const T z) const { return raft::abs<T>(z - y); }
   } lz;
 
   const struct Dlz {
diff --git a/cpp/src/glm/qn/glm_logistic.cuh b/cpp/src/glm/qn/glm_logistic.cuh
index 70edf11aca..1c57b6fadf 100644
--- a/cpp/src/glm/qn/glm_logistic.cuh
+++ b/cpp/src/glm/qn/glm_logistic.cuh
@@ -33,7 +33,7 @@ struct LogisticLoss : GLMBase<T, LogisticLoss<T>> {
     inline __device__ T log_sigmoid(const T x) const
     {
       // To avoid floating point overflow in the exp function
-      T temp = raft::myLog(1 + raft::myExp(x < 0 ? x : -x));
+      T temp = raft::log(1 + raft::exp(x < 0 ? x : -x));
       return x < 0 ? x - temp : -temp;
     }
 
@@ -48,7 +48,7 @@ struct LogisticLoss : GLMBase<T, LogisticLoss<T>> {
     inline __device__ T operator()(const T y, const T z) const
     {
       // To avoid fp overflow with exp(z) when abs(z) is large
-      T ez        = raft::myExp(z < 0 ? z : -z);
+      T ez        = raft::exp(z < 0 ? z : -z);
       T numerator = z < 0 ? ez : T(1.0);
       return numerator / (T(1.0) + ez) - y;
     }
diff --git a/cpp/src/glm/qn/glm_softmax.cuh b/cpp/src/glm/qn/glm_softmax.cuh
index 762e62e0a9..44483a9103 100644
--- a/cpp/src/glm/qn/glm_softmax.cuh
+++ b/cpp/src/glm/qn/glm_softmax.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,10 +24,6 @@
 namespace ML {
 namespace GLM {
 namespace detail {
-using raft::ceildiv;
-using raft::myExp;
-using raft::myLog;
-using raft::myMax;
 
 // Input: matrix Z (dims: CxN)
 // Computes softmax cross entropy loss across columns, i.e. normalization
@@ -84,7 +80,7 @@ __global__ void logSoftmaxKernel(
         delta = true;
         eta_y = myEta;
       }
-      etaMax = myMax<T>(myEta, etaMax);
+      etaMax = raft::max<T>(myEta, etaMax);
     }
   }
   T tmpMax = WarpRed(shm.warpStore[threadIdx.y]).Reduce(etaMax, cub::Max());
@@ -100,15 +96,15 @@ __global__ void logSoftmaxKernel(
   // TODO there must be a better way to do this...
   if (C <= BX) {  // this means one block covers a column and myEta is valid
     int idx = threadIdx.x + y * C;
-    if (threadIdx.x < C && idx < len) { lse = myExp<T>(myEta - etaMax); }
+    if (threadIdx.x < C && idx < len) { lse = raft::exp<T>(myEta - etaMax); }
   } else {
     for (int x = threadIdx.x; x < C; x += BX) {
       int idx = x + y * C;
-      if (x < C && idx < len) { lse += myExp<T>(in[idx] - etaMax); }
+      if (x < C && idx < len) { lse += raft::exp<T>(in[idx] - etaMax); }
     }
   }
   T tmpLse = WarpRed(shm.warpStore[threadIdx.y]).Sum(lse);
-  if (threadIdx.x == 0) { shm.sh_val[threadIdx.y] = etaMax + myLog<T>(tmpLse); }
+  if (threadIdx.x == 0) { shm.sh_val[threadIdx.y] = etaMax + raft::log<T>(tmpLse); }
   __syncthreads();
   lse = shm.sh_val[threadIdx.y];
   __syncthreads();
@@ -123,14 +119,14 @@ __global__ void logSoftmaxKernel(
   if (C <= BX) {  // this means one block covers a column and myEta is valid
     int idx = threadIdx.x + y * C;
     if (threadIdx.x < C && idx < len) {
-      dZ[idx] = (myExp<T>(myEta - lse) - (getDerivative ? (threadIdx.x == label) : T(0)));
+      dZ[idx] = (raft::exp<T>(myEta - lse) - (getDerivative ? (threadIdx.x == label) : T(0)));
     }
   } else {
     for (int x = threadIdx.x; x < C; x += BX) {
       int idx = x + y * C;
       if (x < C && idx < len) {
         T logP  = in[idx] - lse;
-        dZ[idx] = (myExp<T>(logP) - (getDerivative ? (x == label) : T(0)));
+        dZ[idx] = (raft::exp<T>(logP) - (getDerivative ? (x == label) : T(0)));
       }
     }
   }
@@ -156,19 +152,19 @@ void launchLogsoftmax(
   raft::interruptible::synchronize(stream);
   if (C <= 4) {
     dim3 bs(4, 64);
-    dim3 gs(ceildiv(N, 64));
+    dim3 gs(raft::ceildiv(N, 64));
     logSoftmaxKernel<T, 4, 64><<<gs, bs, 0, stream>>>(loss_val, dldZ, Z, labels, C, N);
   } else if (C <= 8) {
     dim3 bs(8, 32);
-    dim3 gs(ceildiv(N, 32));
+    dim3 gs(raft::ceildiv(N, 32));
     logSoftmaxKernel<T, 8, 32><<<gs, bs, 0, stream>>>(loss_val, dldZ, Z, labels, C, N);
   } else if (C <= 16) {
     dim3 bs(16, 16);
-    dim3 gs(ceildiv(N, 16));
+    dim3 gs(raft::ceildiv(N, 16));
     logSoftmaxKernel<T, 16, 16><<<gs, bs, 0, stream>>>(loss_val, dldZ, Z, labels, C, N);
   } else {
     dim3 bs(32, 8);
-    dim3 gs(ceildiv(N, 8));
+    dim3 gs(raft::ceildiv(N, 8));
     logSoftmaxKernel<T, 32, 8><<<gs, bs, 0, stream>>>(loss_val, dldZ, Z, labels, C, N);
   }
   RAFT_CUDA_TRY(cudaPeekAtLastError());
diff --git a/cpp/src/glm/qn/glm_svm.cuh b/cpp/src/glm/qn/glm_svm.cuh
index 8f81a42a74..24bad5e301 100644
--- a/cpp/src/glm/qn/glm_svm.cuh
+++ b/cpp/src/glm/qn/glm_svm.cuh
@@ -33,7 +33,7 @@ struct SVCL1Loss : GLMBase<T, SVCL1Loss<T>> {
     inline __device__ T operator()(const T y, const T z) const
     {
       T s = 2 * y - 1;
-      return raft::myMax<T>(0, 1 - s * z);
+      return raft::max<T>(0, 1 - s * z);
     }
   } lz;
 
@@ -64,7 +64,7 @@ struct SVCL2Loss : GLMBase<T, SVCL2Loss<T>> {
     inline __device__ T operator()(const T y, const T z) const
     {
       T s = 2 * y - 1;
-      T t = raft::myMax<T>(0, 1 - s * z);
+      T t = raft::max<T>(0, 1 - s * z);
       return t * t;
     }
   } lz;
diff --git a/cpp/src/glm/qn/simple_mat/dense.hpp b/cpp/src/glm/qn/simple_mat/dense.hpp
index f9b15809d2..d09cd0f6b0 100644
--- a/cpp/src/glm/qn/simple_mat/dense.hpp
+++ b/cpp/src/glm/qn/simple_mat/dense.hpp
@@ -303,8 +303,8 @@ inline T squaredNorm(const SimpleVec<T>& u, T* tmp_dev, cudaStream_t stream)
 template <typename T>
 inline T nrmMax(const SimpleVec<T>& u, T* tmp_dev, cudaStream_t stream)
 {
-  auto f = [] __device__(const T x) { return raft::myAbs<T>(x); };
-  auto r = [] __device__(const T x, const T y) { return raft::myMax<T>(x, y); };
+  auto f = [] __device__(const T x) { return raft::abs<T>(x); };
+  auto r = [] __device__(const T x, const T y) { return raft::max<T>(x, y); };
   raft::linalg::mapThenReduce(tmp_dev, u.len, T(0), f, r, stream, u.data);
   T tmp_host;
   raft::update_host(&tmp_host, tmp_dev, 1, stream);
diff --git a/cpp/src/solver/cd.cuh b/cpp/src/solver/cd.cuh
index 29e8322130..78e54d62e2 100644
--- a/cpp/src/solver/cd.cuh
+++ b/cpp/src/solver/cd.cuh
@@ -74,9 +74,9 @@ __global__ void __launch_bounds__(1, 1) cdUpdateCoefKernel(math_t* coefLoc,
   auto r       = coef > l1_alpha ? coef - l1_alpha : (coef < -l1_alpha ? coef + l1_alpha : 0);
   auto squared = *squaredLoc;
   r            = squared > math_t(1e-5) ? r / squared : math_t(0);
-  auto diff    = raft::myAbs(convStateLoc->coef - r);
+  auto diff    = raft::abs(convStateLoc->coef - r);
   if (convStateLoc->diffMax < diff) convStateLoc->diffMax = diff;
-  auto absv = raft::myAbs(r);
+  auto absv = raft::abs(r);
   if (convStateLoc->coefMax < absv) convStateLoc->coefMax = absv;
   convStateLoc->coef = -r;
   *coefLoc           = r;
diff --git a/cpp/src/svm/linear.cu b/cpp/src/svm/linear.cu
index 2fa13fab0c..18f584c0c3 100644
--- a/cpp/src/svm/linear.cu
+++ b/cpp/src/svm/linear.cu
@@ -144,12 +144,12 @@ __global__ void predictProba(T* out, const T* z, const int nRows, const int nCla
   int j    = threadIdx.x;
   if constexpr (Binary) {
     t      = rowIn[0];
-    maxVal = raft::myMax<T>(t, 0);
+    maxVal = raft::max<T>(t, T{0});
     t      = T(j) * t;  // set z[0] = 0, z[1] = t
   } else {
     for (; j < nClasses; j += BX) {
       t      = rowIn[j];
-      maxVal = raft::myMax<T>(maxVal, t);
+      maxVal = raft::max<T>(maxVal, t);
     }
     j -= BX;
     maxVal = WarpRed(warpStore).Reduce(maxVal, cub::Max());
@@ -164,7 +164,7 @@ __global__ void predictProba(T* out, const T* z, const int nRows, const int nCla
   T et;         // Numerator of the softmax.
   T smSum = 0;  // Denominator of the softmax.
   while (j >= 0) {
-    et = raft::myExp<T>(t - maxVal);
+    et = raft::exp<T>(t - maxVal);
     smSum += et;
     if (j < BX) break;
     j -= BX;
@@ -178,13 +178,13 @@ __global__ void predictProba(T* out, const T* z, const int nRows, const int nCla
   // Traverse in the forward direction again to save the results.
   // Note, no extra memory reads when BX >= nClasses!
   if (j < 0) return;
-  T d = Log ? -maxVal - raft::myLog<T>(smSum) : 1 / smSum;
+  T d = Log ? -maxVal - raft::log<T>(smSum) : 1 / smSum;
   while (j < nClasses) {
     rowOut[j] = Log ? t + d : et * d;
     j += BX;
     if (j >= nClasses) break;
     t = rowIn[j];
-    if constexpr (!Log) et = raft::myExp<T>(t - maxVal);
+    if constexpr (!Log) et = raft::exp<T>(t - maxVal);
   }
 }
 
diff --git a/cpp/src_prims/functions/log.cuh b/cpp/src_prims/functions/log.cuh
index 4dbaf7b218..1cb3e60ad6 100644
--- a/cpp/src_prims/functions/log.cuh
+++ b/cpp/src_prims/functions/log.cuh
@@ -25,7 +25,7 @@ template <typename T, typename IdxType = int>
 void f_log(T* out, T* in, T scalar, IdxType len, cudaStream_t stream)
 {
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(T in) { return raft::myLog(in) * scalar; }, stream);
+    out, in, len, [scalar] __device__(T in) { return raft::log(in) * scalar; }, stream);
 }
 
 };  // end namespace Functions
diff --git a/cpp/src_prims/functions/sigmoid.cuh b/cpp/src_prims/functions/sigmoid.cuh
index d4d2b75c6c..9eaab0dfd5 100644
--- a/cpp/src_prims/functions/sigmoid.cuh
+++ b/cpp/src_prims/functions/sigmoid.cuh
@@ -27,7 +27,7 @@ void sigmoid(T* out, T* in, IdxType len, cudaStream_t stream)
 {
   T one = T(1);
   raft::linalg::unaryOp(
-    out, in, len, [one] __device__(T in) { return one / (one + raft::myExp(-in)); }, stream);
+    out, in, len, [one] __device__(T in) { return one / (one + raft::exp(-in)); }, stream);
 }
 
 };  // end namespace Functions
diff --git a/cpp/src_prims/functions/softThres.cuh b/cpp/src_prims/functions/softThres.cuh
index 8088c2f042..7370cd281c 100644
--- a/cpp/src_prims/functions/softThres.cuh
+++ b/cpp/src_prims/functions/softThres.cuh
@@ -30,9 +30,9 @@ void softThres(
     in,
     len,
     [thres] __device__(math_t in) {
-      if (in > math_t(0) && thres < raft::myAbs(in))
+      if (in > math_t(0) && thres < raft::abs(in))
         return in - thres;
-      else if (in < math_t(0) && thres < raft::myAbs(in))
+      else if (in < math_t(0) && thres < raft::abs(in))
         return in + thres;
       else
         return math_t(0);
diff --git a/cpp/src_prims/linalg/batched/gemv.cuh b/cpp/src_prims/linalg/batched/gemv.cuh
index 7412ce4cac..69c74daa1e 100644
--- a/cpp/src_prims/linalg/batched/gemv.cuh
+++ b/cpp/src_prims/linalg/batched/gemv.cuh
@@ -176,7 +176,7 @@ void gemvImplAx(DataT* y,
  * @param stream cuda stream
  * @param op epilogue operation
  */
-template <typename DataT, typename IdxT, typename EpilogueOp = raft::Nop<DataT, IdxT>>
+template <typename DataT, typename IdxT, typename EpilogueOp = raft::identity_op>
 void gemv(DataT* y,
           const DataT* A,
           const DataT* x,
@@ -187,7 +187,7 @@ void gemv(DataT* y,
           IdxT n,
           IdxT batchSize,
           cudaStream_t stream,
-          EpilogueOp op = raft::Nop<DataT, IdxT>())
+          EpilogueOp op = raft::identity_op())
 {
   size_t bytes = n * sizeof(DataT);
   if (16 / sizeof(DataT) && bytes % 16 == 0) {
diff --git a/cpp/src_prims/linalg/batched/make_symm.cuh b/cpp/src_prims/linalg/batched/make_symm.cuh
index 47c8f2ee59..9b6405ed32 100644
--- a/cpp/src_prims/linalg/batched/make_symm.cuh
+++ b/cpp/src_prims/linalg/batched/make_symm.cuh
@@ -71,13 +71,13 @@ __global__ void symmKernel(DataT* out, const DataT* in, IdxT batchSize, IdxT n,
  * @param stream cuda stream
  * @param op custom epilogue functor
  */
-template <typename DataT, typename IdxT, typename EpilogueOp = raft::Nop<DataT, IdxT>>
+template <typename DataT, typename IdxT, typename EpilogueOp = raft::identity_op>
 void make_symm(DataT* out,
                const DataT* in,
                IdxT batchSize,
                IdxT n,
                cudaStream_t stream,
-               EpilogueOp op = raft::Nop<DataT, IdxT>())
+               EpilogueOp op = raft::identity_op())
 {
   dim3 blk(TileDim, BlockRows);
   auto nblks = raft::ceildiv<int>(n, TileDim);
diff --git a/cpp/src_prims/matrix/reverse.cuh b/cpp/src_prims/matrix/reverse.cuh
index 7cae75d780..2064821a2c 100644
--- a/cpp/src_prims/matrix/reverse.cuh
+++ b/cpp/src_prims/matrix/reverse.cuh
@@ -119,7 +119,7 @@ void reverseImpl(math_t* out,
  * @param op the device-lambda to perform an optional final unary operation on
  *  each element after the reverse
  */
-template <typename math_t, typename Lambda = raft::Nop<math_t>, int TPB = 256>
+template <typename math_t, typename Lambda = raft::identity_op, int TPB = 256>
 void reverse(math_t* out,
              const math_t* in,
              int nrows,
@@ -127,7 +127,7 @@ void reverse(math_t* out,
              bool rowMajor,
              bool alongRows,
              cudaStream_t stream,
-             Lambda op = raft::Nop<math_t>())
+             Lambda op = raft::identity_op())
 {
   size_t bytes = (rowMajor ? ncols : nrows) * sizeof(math_t);
   if (16 / sizeof(math_t) && bytes % 16 == 0) {
diff --git a/cpp/src_prims/timeSeries/jones_transform.cuh b/cpp/src_prims/timeSeries/jones_transform.cuh
index b98bd796e6..81d4c693b3 100644
--- a/cpp/src_prims/timeSeries/jones_transform.cuh
+++ b/cpp/src_prims/timeSeries/jones_transform.cuh
@@ -39,7 +39,7 @@ namespace TimeSeries {
  */
 template <typename Type>
 struct PAC {
-  HDI Type operator()(Type in) { return raft::myTanh(in * 0.5); }
+  HDI Type operator()(Type in) { return raft::tanh(in * 0.5); }
 };
 
 /**
@@ -137,7 +137,7 @@ inline __device__ void invtransform(DataT* tmp, DataT* myNewParams, bool isAr)
   }
 
   for (int i = 0; i < VALUE; ++i) {
-    myNewParams[i] = 2 * raft::myATanh(myNewParams[i]);
+    myNewParams[i] = 2 * raft::atanh(myNewParams[i]);
   }
 }
 
diff --git a/cpp/src_prims/timeSeries/stationarity.cuh b/cpp/src_prims/timeSeries/stationarity.cuh
index 32654ee410..5c0f284386 100644
--- a/cpp/src_prims/timeSeries/stationarity.cuh
+++ b/cpp/src_prims/timeSeries/stationarity.cuh
@@ -241,7 +241,7 @@ static void _kpss_test(const DataT* d_y,
                        stream,
                        false,
                        raft::L2Op<DataT>(),
-                       raft::Sum<DataT>());
+                       raft::add_op());
 
   // From Kwiatkowski et al. referencing Schwert (1989)
   DataT lags_f = ceil(12.0 * pow(n_obs_f / 100.0, 0.25));
@@ -295,7 +295,7 @@ static void _kpss_test(const DataT* d_y,
                        stream,
                        false,
                        raft::L2Op<DataT>(),
-                       raft::Sum<DataT>());
+                       raft::add_op());
 
   /* The following kernel will decide whether each series is stationary based on
    * s^2 and eta */
diff --git a/cpp/test/prims/distance_base.cuh b/cpp/test/prims/distance_base.cuh
index 10b8ed72ae..09f2d09724 100644
--- a/cpp/test/prims/distance_base.cuh
+++ b/cpp/test/prims/distance_base.cuh
@@ -48,7 +48,7 @@ __global__ void naiveDistanceKernel(DataType* dist,
   }
   if (type == raft::distance::DistanceType::L2SqrtExpanded ||
       type == raft::distance::DistanceType::L2SqrtUnexpanded)
-    acc = raft::mySqrt(acc);
+    acc = raft::sqrt(acc);
   int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
@@ -100,7 +100,7 @@ __global__ void naiveCosineDistanceKernel(
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
 
   // Use 1.0 - (cosine similarity) to calc the distance
-  dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b));
+  dist[outidx] = (DataType)1.0 - acc_ab / (raft::sqrt(acc_a) * raft::sqrt(acc_b));
 }
 
 template <typename DataType>
diff --git a/cpp/test/prims/knn_regression.cu b/cpp/test/prims/knn_regression.cu
index a726fd93b7..c2a6b700af 100644
--- a/cpp/test/prims/knn_regression.cu
+++ b/cpp/test/prims/knn_regression.cu
@@ -73,7 +73,7 @@ void generate_data(
     stream,
     false,
     [=] __device__(float in, int n) { return in * in; },
-    raft::Sum<float>(),
+    raft::add_op(),
     [=] __device__(float in) { return sqrt(in); });
 
   thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(out_labels);
diff --git a/cpp/test/sg/linear_svm_test.cu b/cpp/test/sg/linear_svm_test.cu
index 77acd49810..bf2a678b3e 100644
--- a/cpp/test/sg/linear_svm_test.cu
+++ b/cpp/test/sg/linear_svm_test.cu
@@ -161,7 +161,7 @@ struct LinearSVMTest : public ::testing::TestWithParam<typename ParamsReader::Pa
       errorBuf.data(),
       params.nRowsTest,
       T(0),
-      [] __device__(const T yOut) { return raft::myAbs<T>(1.0 - yOut); },
+      [] __device__(const T yOut) { return raft::abs<T>(1.0 - yOut); },
       cub::Max(),
       stream,
       yOut.data());
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index de8ef17010..64bbae26e3 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -983,8 +983,8 @@ class ObjectiveTest : public ::testing::TestWithParam<ObjectiveTestParameters> {
     DataT ghd(0);  // gamma half deviance
 
     std::for_each(data.begin(), data.end(), [&](auto& element) {
-      auto log_y = raft::myLog(element ? element : DataT(1.0));
-      ghd += raft::myLog(mean) - log_y + element / mean - 1;
+      auto log_y = raft::log(element ? element : DataT(1.0));
+      ghd += raft::log(mean) - log_y + element / mean - 1;
     });
 
     ghd /= data.size();
@@ -1022,8 +1022,8 @@ class ObjectiveTest : public ::testing::TestWithParam<ObjectiveTestParameters> {
     auto poisson_half_deviance{DataT(0.0)};
 
     std::for_each(data.begin(), data.end(), [&](auto d) {
-      auto log_y = raft::myLog(d ? d : DataT(1.0));  // we don't want nans
-      poisson_half_deviance += d * (log_y - raft::myLog(mean)) + mean - d;
+      auto log_y = raft::log(d ? d : DataT(1.0));  // we don't want nans
+      poisson_half_deviance += d * (log_y - raft::log(mean)) + mean - d;
     });
 
     poisson_half_deviance /= data.size();
@@ -1061,8 +1061,8 @@ class ObjectiveTest : public ::testing::TestWithParam<ObjectiveTestParameters> {
         if (d == DataT(c)) ++sum;
       });
       DataT class_proba = DataT(sum) / data.size();
-      entropy += -class_proba * raft::myLog(class_proba ? class_proba : DataT(1)) /
-                 raft::myLog(DataT(2));  // adding gain
+      entropy += -class_proba * raft::log(class_proba ? class_proba : DataT(1)) /
+                 raft::log(DataT(2));  // adding gain
     }
     return entropy;
   }

From c52455304bf597b9518b34f0a60cb4e6a677061c Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Fri, 8 Dec 2023 17:07:07 -0800
Subject: [PATCH 04/24] Add cuML devcontainers (#5568)

This PR adds some [devcontainers](https://containers.dev/) to help simplify building the cuML C++ and Python libraries.

It also adds an optional job to the `pr.yaml` to [build the cuML libs in each devcontainer](https://github.com/trxcllnt/cuml/blob/fea/devcontainers/.github/workflows/pr.yaml#L113-L118), so the build caches are populated for devs by CI.

A devcontainer can be launched by clicking the "Reopen in Container" button that VSCode shows when opening the repo (or by using the "Rebuild and Reopen in Container" command from the command palette):
![image](https://user-images.githubusercontent.com/178183/221771999-97ab29d5-e718-4e5f-b32f-2cdd51bba25c.png)

Clicking this button will cause VSCode to prompt the user to select one of these devcontainer variants:
![image](https://github.com/rapidsai/rmm/assets/178183/68d4b264-4fc2-4008-92b6-cb4bdd19b29f)

On startup, the devcontainer creates or updates the conda/pip environment using `cuml/dependencies.yaml`. The envs/package caches are cached on the host via volume mounts, which are described in more detail in [`.devcontainer/README.md`](https://github.com/trxcllnt/cuml/blob/fea/devcontainers/.devcontainer/README.md).

The container includes convenience functions to clean, configure, and build the various cuML components:

```shell
$ clean-cuml-cpp # only cleans the C++ build dir
$ clean-cuml-python # only cleans the Python build dir
$ clean-cuml # cleans both C++ and Python build dirs

$ configure-cuml-cpp # only configures cuml C++ lib

$ build-cuml-cpp # only builds cuml C++ lib
$ build-cuml-python # only builds cuml Python lib
$ build-cuml # builds both C++ and Python libs
```

* The C++ build script is a small wrapper around `cmake -S ~/cuml/cpp -B ~/cuml/cpp/build` and `cmake --build ~/cuml/cpp/build`
* The Python build script is a small wrapper around `pip install --editable ~/cuml/python`

Unlike `build.sh`, these convenience scripts *don't* install the libraries after building them. Instead, they automatically inject the correct arguments to build the C++ libraries from source and use their build dirs as package roots:

```shell
$ cmake -S ~/cuml/cpp -B ~/cuml/cpp/build
$ CMAKE_ARGS="-Dcuml_ROOT=~/cuml/cpp/build" \
  pip install -e ~/cuml/python # <-- the CMAKE_ARGS value above is injected automatically
```

Authors:
  - Paul Taylor (https://github.com/trxcllnt)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)
  - Divye Gala (https://github.com/divyegala)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/5568
---
 .devcontainer/Dockerfile                      | 30 +++++++
 .devcontainer/README.md                       | 35 ++++++++
 .../cuda11.8-conda/devcontainer.json          | 37 +++++++++
 .devcontainer/cuda11.8-pip/devcontainer.json  | 37 +++++++++
 .../cuda12.0-conda/devcontainer.json          | 37 +++++++++
 .devcontainer/cuda12.0-pip/devcontainer.json  | 37 +++++++++
 .github/workflows/pr.yaml                     | 12 ++-
 .gitignore                                    |  6 +-
 ci/release/update-version.sh                  | 11 ++-
 cpp/.clangd                                   | 65 +++++++++++++++
 cpp/CMakeLists.txt                            |  3 +-
 cpp/cmake/thirdparty/get_libcudacxx.cmake     | 35 ++++++++
 dependencies.yaml                             | 83 +++++++++++++++----
 python/CMakeLists.txt                         |  2 +-
 python/pyproject.toml                         |  4 +-
 15 files changed, 410 insertions(+), 24 deletions(-)
 create mode 100644 .devcontainer/Dockerfile
 create mode 100644 .devcontainer/README.md
 create mode 100644 .devcontainer/cuda11.8-conda/devcontainer.json
 create mode 100644 .devcontainer/cuda11.8-pip/devcontainer.json
 create mode 100644 .devcontainer/cuda12.0-conda/devcontainer.json
 create mode 100644 .devcontainer/cuda12.0-pip/devcontainer.json
 create mode 100644 cpp/.clangd
 create mode 100644 cpp/cmake/thirdparty/get_libcudacxx.cmake

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000000..9d35e3f97f
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,30 @@
+# syntax=docker/dockerfile:1.5
+
+ARG BASE
+ARG PYTHON_PACKAGE_MANAGER=conda
+
+FROM ${BASE} as pip-base
+
+ENV DEFAULT_VIRTUAL_ENV=rapids
+
+FROM ${BASE} as conda-base
+
+ENV DEFAULT_CONDA_ENV=rapids
+
+FROM ${PYTHON_PACKAGE_MANAGER}-base
+
+ARG CUDA
+ENV CUDAARCHS="RAPIDS"
+ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}"
+
+ARG PYTHON_PACKAGE_MANAGER
+ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}"
+
+ENV PYTHONSAFEPATH="1"
+ENV PYTHONUNBUFFERED="1"
+ENV PYTHONDONTWRITEBYTECODE="1"
+
+ENV SCCACHE_REGION="us-east-2"
+ENV SCCACHE_BUCKET="rapids-sccache-devs"
+ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV HISTFILE="/home/coder/.cache/._bash_history"
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
new file mode 100644
index 0000000000..b748bbb3bf
--- /dev/null
+++ b/.devcontainer/README.md
@@ -0,0 +1,35 @@
+# cuML Development Containers
+
+This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces).
+
+This container is a turnkey development environment for building and testing the cuML C++ and Python libraries.
+
+## Table of Contents
+
+* [Prerequisites](#prerequisites)
+* [Host bind mounts](#host-bind-mounts)
+* [Launch a Dev Container](#launch-a-dev-container)
+
+## Prerequisites
+
+* [VSCode](https://code.visualstudio.com/download)
+* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+
+## Host bind mounts
+
+By default, the following directories are bind-mounted into the devcontainer:
+
+* `${repo}:/home/coder/cuml`
+* `${repo}/../.aws:/home/coder/.aws`
+* `${repo}/../.local:/home/coder/.local`
+* `${repo}/../.cache:/home/coder/.cache`
+* `${repo}/../.conda:/home/coder/.conda`
+* `${repo}/../.config:/home/coder/.config`
+
+This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs.
+
+## Launch a Dev Container
+
+To launch a devcontainer from VSCode, open the cuML repo and select the "Reopen in Container" button in the bottom right:<br/><img src="https://user-images.githubusercontent.com/178183/221771999-97ab29d5-e718-4e5f-b32f-2cdd51bba25c.png"/>
+
+Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command.
diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
new file mode 100644
index 0000000000..b2783add05
--- /dev/null
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -0,0 +1,37 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "11.8",
+      "PYTHON_PACKAGE_MANAGER": "conda",
+      "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuml,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
new file mode 100644
index 0000000000..f208ea86a5
--- /dev/null
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -0,0 +1,37 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "11.8",
+      "PYTHON_PACKAGE_MANAGER": "pip",
+      "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {"version": "1.14.1"},
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuml,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json
new file mode 100644
index 0000000000..7445963cfd
--- /dev/null
+++ b/.devcontainer/cuda12.0-conda/devcontainer.json
@@ -0,0 +1,37 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "12.0",
+      "PYTHON_PACKAGE_MANAGER": "conda",
+      "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuml,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json
new file mode 100644
index 0000000000..284ee66fa2
--- /dev/null
+++ b/.devcontainer/cuda12.0-pip/devcontainer.json
@@ -0,0 +1,37 @@
+{
+  "build": {
+    "context": "${localWorkspaceFolder}/.devcontainer",
+    "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
+    "args": {
+      "CUDA": "12.0",
+      "PYTHON_PACKAGE_MANAGER": "pip",
+      "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04"
+    }
+  },
+  "hostRequirements": {"gpu": "optional"},
+  "features": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {"version": "1.14.1"},
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {}
+  },
+  "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
+  ],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"],
+  "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
+  "workspaceFolder": "/home/coder",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuml,type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "ms-python.flake8",
+        "nvidia.nsight-vscode-edition"
+      ]
+    }
+  }
+}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 0e786e2c8f..e4006a7ac2 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -23,6 +23,7 @@ jobs:
       - docs-build
       - wheel-build-cuml
       - wheel-tests-cuml
+      - devcontainer
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02
   checks:
@@ -100,7 +101,7 @@ jobs:
       build_type: pull-request
       script: ci/build_wheel.sh
       extra-repo: rapidsai/cumlprims_mg
-      extra-repo-sha: branch-23.02
+      extra-repo-sha: branch-24.02
       extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
   wheel-tests-cuml:
     needs: wheel-build-cuml
@@ -109,3 +110,12 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel.sh
+  devcontainer:
+    secrets: inherit
+    uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02
+    with:
+      extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
+      build_command: |
+        sccache -z;
+        build-all --verbose;
+        sccache -s;
diff --git a/.gitignore b/.gitignore
index 56188bfad4..dc0b98d735 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,4 +63,8 @@ doxygen_check/
 
 ## Doxygen
 cpp/html
-cpp/Doxyfile
\ No newline at end of file
+cpp/Doxyfile
+
+# clang tooling
+compile_commands.json
+.clangd/
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index e1f9463703..b895f8a2f8 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -85,11 +85,18 @@ done
 sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" README.md
 sed_runner "s|/branch-[^/]*/|/branch-${NEXT_SHORT_TAG}/|g" python/README.md
 
-# Wheel builds clone cumlprims_mg, update its branch
-sed_runner "s/extra-repo-sha: branch-.*/extra-repo-sha: branch-${NEXT_SHORT_TAG}/g" .github/workflows/*.yaml
 
 # CI files
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
+  # Wheel builds clone cumlprims_mg, update its branch
+  sed_runner "s/extra-repo-sha: branch-.*/extra-repo-sha: branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
 sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh
+
+# .devcontainer files
+find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
+    sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+done
diff --git a/cpp/.clangd b/cpp/.clangd
new file mode 100644
index 0000000000..7c4fe036dd
--- /dev/null
+++ b/cpp/.clangd
@@ -0,0 +1,65 @@
+# https://clangd.llvm.org/config
+
+# Apply a config conditionally to all C files
+If:
+  PathMatch: .*\.(c|h)$
+
+---
+
+# Apply a config conditionally to all C++ files
+If:
+  PathMatch: .*\.(c|h)pp
+
+---
+
+# Apply a config conditionally to all CUDA files
+If:
+  PathMatch: .*\.cuh?
+CompileFlags:
+  Add:
+    - "-x"
+    - "cuda"
+    # No error on unknown CUDA versions
+    - "-Wno-unknown-cuda-version"
+    # Allow variadic CUDA functions
+    - "-Xclang=-fcuda-allow-variadic-functions"
+Diagnostics:
+  Suppress:
+    - "variadic_device_fn"
+    - "attributes_not_allowed"
+
+---
+
+# Tweak the clangd parse settings for all files
+CompileFlags:
+  Add:
+    # report all errors
+    - "-ferror-limit=0"
+    - "-fmacro-backtrace-limit=0"
+    - "-ftemplate-backtrace-limit=0"
+    # Skip the CUDA version check
+    - "--no-cuda-version-check"
+  Remove:
+    # remove gcc's -fcoroutines
+    - -fcoroutines
+    # remove nvc++ flags unknown to clang
+    - "-gpu=*"
+    - "-stdpar*"
+    # remove nvcc flags unknown to clang
+    - "-arch*"
+    - "-gencode*"
+    - "--generate-code*"
+    - "-ccbin*"
+    - "-t=*"
+    - "--threads*"
+    - "-Xptxas*"
+    - "-Xcudafe*"
+    - "-Xfatbin*"
+    - "-Xcompiler*"
+    - "--diag-suppress*"
+    - "--diag_suppress*"
+    - "--compiler-options*"
+    - "--expt-extended-lambda"
+    - "--expt-relaxed-constexpr"
+    - "-forward-unknown-to-host-compiler"
+    - "-Werror=cross-execution-space-call"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 29ee669a85..e83f31e3a4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -56,7 +56,7 @@ option(BUILD_CUML_MPI_COMMS "Build the MPI+NCCL Communicator (used for testing)"
 option(CUDA_ENABLE_KERNEL_INFO "Enable kernel resource usage info" OFF)
 option(CUDA_ENABLE_LINE_INFO "Enable lineinfo in nvcc" OFF)
 option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON)
-option(DISABLE_DEPRECATION_WARNINGS "Disable deprecation warnings " OFF)
+option(DISABLE_DEPRECATION_WARNINGS "Disable deprecation warnings " ON)
 option(DISABLE_OPENMP "Disable OpenMP" OFF)
 option(ENABLE_CUMLPRIMS_MG "Enable algorithms that use libcumlprims_mg" ON)
 option(NVTX "Enable nvtx markers" OFF)
@@ -220,6 +220,7 @@ if(BUILD_CUML_TESTS OR BUILD_PRIMS_TESTS)
   find_package(Threads)
 endif()
 
+include(cmake/thirdparty/get_libcudacxx.cmake)
 include(cmake/thirdparty/get_raft.cmake)
 
 if(LINK_TREELITE)
diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake
new file mode 100644
index 0000000000..54184ced34
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake
@@ -0,0 +1,35 @@
+# =============================================================================
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# This function finds libcudacxx and sets any additional necessary environment variables.
+function(find_and_configure_libcudacxx)
+  # Make sure we install libcudacxx beside our patched version of thrust
+  include(GNUInstallDirs)
+  set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcuml")
+  set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib")
+
+  include(${rapids-cmake-dir}/cpm/libcudacxx.cmake)
+  rapids_cpm_libcudacxx(BUILD_EXPORT_SET cuml-exports INSTALL_EXPORT_SET cuml-exports)
+
+  # Store where CMake can find our custom libcudacxx install
+  include("${rapids-cmake-dir}/export/find_package_root.cmake")
+  rapids_export_find_package_root(
+    INSTALL libcudacxx
+    [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcuml/lib/rapids/cmake/libcudacxx]=]
+    EXPORT_SET cuml-exports
+    CONDITION libcudacxx_SOURCE_DIR
+  )
+endfunction()
+
+find_and_configure_libcudacxx()
diff --git a/dependencies.yaml b/dependencies.yaml
index fbb1f0107a..dc579d48fa 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -154,9 +154,17 @@ dependencies:
           - scikit-build>=0.13.1
           - cython>=3.0.0
           - &treelite treelite==3.9.1
-          - pylibraft==24.2.*
-          - rmm==24.2.*
-      - output_types: pyproject
+      - output_types: conda
+        packages:
+          - &pylibraft_conda pylibraft==24.2.*
+          - &rmm_conda rmm==24.2.*
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for pylibraft and rmm.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+      - output_types: [pyproject, requirements]
         packages:
           - wheel
           - setuptools
@@ -171,40 +179,81 @@ dependencies:
           - matrix: # All CUDA 11 versions
             packages:
               - &cuda_python11 cuda-python>=11.7.1,<12.0a0
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.0"}
+            packages:
+              - pylibraft-cu12==24.2.*
+              - rmm-cu12==24.2.*
+          - matrix: {cuda: "11.8"}
+            packages: &py_build_packages_cu11
+              - &pylibraft_cu11 pylibraft-cu11==24.2.*
+              - &rmm_cu11 rmm-cu11==24.2.*
+          - {matrix: {cuda: "11.5"}, packages: *py_build_packages_cu11}
+          - {matrix: {cuda: "11.4"}, packages: *py_build_packages_cu11}
+          - {matrix: {cuda: "11.2"}, packages: *py_build_packages_cu11}
+          - {matrix: null, packages: [*pylibraft_conda, *rmm_conda] }
+
   py_run:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - cudf==24.2.*
           - dask-cuda==24.2.*
-          - dask-cudf==24.2.*
           - joblib>=0.11
           - numba>=0.57
             # TODO: Is scipy really a hard dependency, or should
             # we make it optional (i.e. an extra for pip
             # installation/run_constrained for conda)?
           - scipy>=1.8.0
-          - raft-dask==24.2.*
           - rapids-dask-dependency==24.2.*
           - *treelite
-      - output_types: [conda, requirements]
+      - output_types: conda
         packages:
-          - cupy>=12.0.0
-      - output_types: pyproject
+          - &cudf_conda cudf==24.2.*
+          - &cupy_conda cupy>=12.0.0
+          - &dask_cudf_conda dask-cudf==24.2.*
+          - &raft_dask_conda raft-dask==24.2.*
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for cudf and rmm.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+      - output_types: [pyproject, requirements]
         packages:
           - *treelite_runtime
-          - cupy-cuda11x>=12.0.0
     specific:
-      - output_types: requirements
+      - output_types: [requirements, pyproject]
         matrices:
-          - matrix:
-              arch: x86_64
+          - matrix: {cuda: "12.0"}
             packages:
-              - cupy-cuda115>=12.0.0
-          - matrix:
-              arch: aarch64
+              - cudf-cu12==24.2.*
+              - cupy-cuda12x>=12.0.0
+              - dask-cudf-cu12==24.2.*
+              - pylibraft-cu12==24.2.*
+              - raft-dask-cu12==24.2.*
+              - rmm-cu12==24.2.*
+          # All CUDA 11 versions
+          - matrix: {cuda: "11.8"}
+            packages: &py_run_packages_cu11
+              - cudf-cu11==24.2.*
+              - cupy-cuda11x>=12.0.0
+              - dask-cudf-cu11==24.2.*
+              - *pylibraft_cu11
+              - raft-dask-cu11==24.2.*
+              - *rmm_cu11
+          - {matrix: {cuda: "11.5"}, packages: *py_run_packages_cu11}
+          - {matrix: {cuda: "11.4"}, packages: *py_run_packages_cu11}
+          - {matrix: {cuda: "11.2"}, packages: *py_run_packages_cu11}
+          - matrix: null
             packages:
-              - cupy-cuda11x -f https://pip.cupy.dev/aarch64 # TODO: Verify that this works.
+              - *cudf_conda
+              - *cupy_conda
+              - *dask_cudf_conda
+              - *pylibraft_conda
+              - *raft_dask_conda
+              - *pylibraft_conda
+              - *rmm_conda
   cudatoolkit:
     specific:
       - output_types: conda
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index e639f8a71c..6bfa48ab11 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -69,8 +69,8 @@ if(FIND_CUML_CPP)
   include(rapids-cpm)
   include(rapids-export)
   rapids_cpm_init()
-  include(../cpp/cmake/thirdparty/get_treelite.cmake)
   find_package(cuml ${CUML_VERSION} REQUIRED)
+  include(../cpp/cmake/thirdparty/get_treelite.cmake)
 else()
   set(cuml_FOUND OFF)
 endif()
diff --git a/python/pyproject.toml b/python/pyproject.toml
index c605faf8c8..dd2a930e65 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -58,13 +58,15 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.2.*",
-    "cupy-cuda11x>=12.0.0",
+    "cupy>=12.0.0",
     "dask-cuda==24.2.*",
     "dask-cudf==24.2.*",
     "joblib>=0.11",
     "numba>=0.57",
+    "pylibraft==24.2.*",
     "raft-dask==24.2.*",
     "rapids-dask-dependency==24.2.*",
+    "rmm==24.2.*",
     "scipy>=1.8.0",
     "treelite==3.9.1",
     "treelite_runtime==3.9.1",

From cb45b27ffd5b812c14821e0b22db3afb54bb5213 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 11 Dec 2023 22:14:58 -0600
Subject: [PATCH 05/24] Use cuda::proclaim_return_type on device lambdas.
 (#5696)

This PR updates parts of the code that require `cuda::proclaim_return_type` for compatibility with CCCL 2.2.0 (Thrust). This pulls out part of the diff of #5623. I left out the part that is needed to upgrade to CUB 2.2.0, because those changes will have to go into a separate PR that updates to CCCL 2.2.0.

I also added explicit CMake dependencies on Thrust and RMM. Without these, cuml is reliant on RAFT for transitive dependencies, which makes it very difficult to test upstream changes to Thrust and RMM.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/5696
---
 cpp/CMakeLists.txt                    |  3 ++
 cpp/cmake/thirdparty/get_rmm.cmake    | 23 ++++++++++++
 cpp/cmake/thirdparty/get_thrust.cmake | 23 ++++++++++++
 cpp/src/kmeans/kmeans_mg_impl.cuh     | 54 +++++++++++++++------------
 4 files changed, 79 insertions(+), 24 deletions(-)
 create mode 100644 cpp/cmake/thirdparty/get_rmm.cmake
 create mode 100644 cpp/cmake/thirdparty/get_thrust.cmake

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e83f31e3a4..c2b5437abf 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -220,7 +220,10 @@ if(BUILD_CUML_TESTS OR BUILD_PRIMS_TESTS)
   find_package(Threads)
 endif()
 
+# thrust before rmm, rmm before raft so we get the right version of thrust/rmm
+include(cmake/thirdparty/get_thrust.cmake)
 include(cmake/thirdparty/get_libcudacxx.cmake)
+include(cmake/thirdparty/get_rmm.cmake)
 include(cmake/thirdparty/get_raft.cmake)
 
 if(LINK_TREELITE)
diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake
new file mode 100644
index 0000000000..35968f7245
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_rmm.cmake
@@ -0,0 +1,23 @@
+#=============================================================================
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_rmm)
+    include(${rapids-cmake-dir}/cpm/rmm.cmake)
+    rapids_cpm_rmm(BUILD_EXPORT_SET cuml-exports
+                   INSTALL_EXPORT_SET cuml-exports)
+endfunction()
+
+find_and_configure_rmm()
diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake
new file mode 100644
index 0000000000..1477a8b397
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_thrust.cmake
@@ -0,0 +1,23 @@
+# =============================================================================
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Use CPM to find or clone thrust
+function(find_and_configure_thrust)
+    include(${rapids-cmake-dir}/cpm/thrust.cmake)
+    rapids_cpm_thrust(NAMESPACE cuml
+                      BUILD_EXPORT_SET cuml-exports
+                      INSTALL_EXPORT_SET cuml-exports)
+endfunction()
+
+find_and_configure_thrust()
diff --git a/cpp/src/kmeans/kmeans_mg_impl.cuh b/cpp/src/kmeans/kmeans_mg_impl.cuh
index f1a0470652..c53c346df8 100644
--- a/cpp/src/kmeans/kmeans_mg_impl.cuh
+++ b/cpp/src/kmeans/kmeans_mg_impl.cuh
@@ -26,6 +26,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <cuda/functional>
 #include <ml_cuda_utils.h>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
@@ -241,7 +242,8 @@ void initKMeansPlusPlus(const raft::handle_t& handle,
     minClusterDistance.view(),
     workspace,
     clusterCost.view(),
-    [] __device__(const DataT& a, const DataT& b) { return a + b; });
+    cuda::proclaim_return_type<DataT>(
+      [] __device__(const DataT& a, const DataT& b) { return a + b; }));
 
   // compute total cluster cost by accumulating the partial cost from all the
   // ranks
@@ -291,7 +293,8 @@ void initKMeansPlusPlus(const raft::handle_t& handle,
       minClusterDistance.view(),
       workspace,
       clusterCost.view(),
-      [] __device__(const DataT& a, const DataT& b) { return a + b; });
+      cuda::proclaim_return_type<DataT>(
+        [] __device__(const DataT& a, const DataT& b) { return a + b; }));
     comm.allreduce(
       clusterCost.data_handle(), clusterCost.data_handle(), 1, raft::comms::op_t::SUM, stream);
     raft::copy(&psi, clusterCost.data_handle(), 1, stream);
@@ -481,7 +484,7 @@ void checkWeights(const raft::handle_t& handle,
       weight.data_handle(),
       weight.data_handle(),
       weight.size(),
-      [=] __device__(const DataT& wt) { return wt * scale; },
+      cuda::proclaim_return_type<DataT>([=] __device__(const DataT& wt) { return wt * scale; }),
       stream);
   }
 }
@@ -621,12 +624,12 @@ void fit(const raft::handle_t& handle,
       newCentroids.extent(0),
       true,
       false,
-      [=] __device__(DataT mat, DataT vec) {
+      cuda::proclaim_return_type<DataT>([=] __device__(DataT mat, DataT vec) {
         if (vec == 0)
           return DataT(0);
         else
           return mat / vec;
-      },
+      }),
       stream);
 
     // copy the centroids[i] to newCentroids[i] when wtInCluster[i] is 0
@@ -639,16 +642,18 @@ void fit(const raft::handle_t& handle,
       itr_wt,
       wtInCluster.extent(0),
       newCentroids.data_handle(),
-      [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // predicate
-        // copy when the # of samples in the cluster is 0
-        if (map.value == 0)
-          return true;
-        else
-          return false;
-      },
-      [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // map
-        return map.key;
-      },
+      cuda::proclaim_return_type<bool>(
+        [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // predicate
+          // copy when the # of samples in the cluster is 0
+          if (map.value == 0)
+            return true;
+          else
+            return false;
+        }),
+      cuda::proclaim_return_type<ptrdiff_t>(
+        [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // map
+          return map.key;
+        }),
       stream);
 
     // compute the squared norm between the newCentroids and the original
@@ -657,10 +662,10 @@ void fit(const raft::handle_t& handle,
     raft::linalg::mapThenSumReduce(
       sqrdNorm.data_handle(),
       newCentroids.size(),
-      [=] __device__(const DataT a, const DataT b) {
+      cuda::proclaim_return_type<DataT>([=] __device__(const DataT a, const DataT b) {
         DataT diff = a - b;
         return diff * diff;
-      },
+      }),
       stream,
       centroids.data_handle(),
       newCentroids.data_handle());
@@ -680,13 +685,14 @@ void fit(const raft::handle_t& handle,
         minClusterAndDistance.view(),
         workspace,
         raft::make_device_scalar_view(clusterCostD.data()),
-        [] __device__(const raft::KeyValuePair<IndexT, DataT>& a,
-                      const raft::KeyValuePair<IndexT, DataT>& b) {
-          raft::KeyValuePair<IndexT, DataT> res;
-          res.key   = 0;
-          res.value = a.value + b.value;
-          return res;
-        });
+        cuda::proclaim_return_type<raft::KeyValuePair<IndexT, DataT>>(
+          [] __device__(const raft::KeyValuePair<IndexT, DataT>& a,
+                        const raft::KeyValuePair<IndexT, DataT>& b) {
+            raft::KeyValuePair<IndexT, DataT> res;
+            res.key   = 0;
+            res.value = a.value + b.value;
+            return res;
+          }));
 
       // Cluster cost phi_x(C) from all ranks
       comm.allreduce(&(clusterCostD.data()->value),

From 3dfdb6e92ea20c3fee6f0f786cccd4b9f429435e Mon Sep 17 00:00:00 2001
From: Daniel <dcolinmorgan@users.noreply.github.com>
Date: Wed, 13 Dec 2023 00:58:35 +0800
Subject: [PATCH 06/24] move _process_generic to base_return_types, avoid
 circular import (#5695)

Authors:
  - Daniel (https://github.com/dcolinmorgan)

Approvers:
  - William Hicks (https://github.com/wphicks)

URL: https://github.com/rapidsai/cuml/pull/5695
---
 python/cuml/internals/base_helpers.py      | 39 ----------------------
 python/cuml/internals/base_return_types.py | 39 ++++++++++++++++++++++
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/python/cuml/internals/base_helpers.py b/python/cuml/internals/base_helpers.py
index be768819dc..b544f428ea 100644
--- a/python/cuml/internals/base_helpers.py
+++ b/python/cuml/internals/base_helpers.py
@@ -31,45 +31,6 @@
 from cuml.internals.constants import CUML_WRAPPED_FLAG
 
 
-def _process_generic(gen_type):
-
-    # Check if the type is not a generic. If not, must return "generic" if
-    # subtype is CumlArray otherwise None
-    if not isinstance(gen_type, typing._GenericAlias):
-        if issubclass(gen_type, CumlArray):
-            return "generic"
-
-        # We don't handle SparseCumlArray at this time
-        if issubclass(gen_type, SparseCumlArray):
-            raise NotImplementedError(
-                "Generic return types with SparseCumlArray are not supported "
-                "at this time"
-            )
-
-        # Otherwise None (keep processing)
-        return None
-
-    # Its a generic type by this point. Support Union, Tuple, Dict and List
-    supported_gen_types = [
-        tuple,
-        dict,
-        list,
-        typing.Union,
-    ]
-
-    if gen_type.__origin__ in supported_gen_types:
-        # Check for a CumlArray type in the args
-        for arg in gen_type.__args__:
-            inner_type = _process_generic(arg)
-
-            if inner_type is not None:
-                return inner_type
-    else:
-        raise NotImplementedError("Unknow generic type: {}".format(gen_type))
-
-    return None
-
-
 def _wrap_attribute(class_name: str, attribute_name: str, attribute, **kwargs):
 
     # Skip items marked with autowrap_ignore
diff --git a/python/cuml/internals/base_return_types.py b/python/cuml/internals/base_return_types.py
index 5aa0d7f75d..00c796a64c 100644
--- a/python/cuml/internals/base_return_types.py
+++ b/python/cuml/internals/base_return_types.py
@@ -21,6 +21,45 @@
 from cuml.internals.array_sparse import SparseCumlArray
 
 
+def _process_generic(gen_type):
+
+    # Check if the type is not a generic. If not, must return "generic" if
+    # subtype is CumlArray otherwise None
+    if not isinstance(gen_type, typing._GenericAlias):
+        if issubclass(gen_type, CumlArray):
+            return "generic"
+
+        # We don't handle SparseCumlArray at this time
+        if issubclass(gen_type, SparseCumlArray):
+            raise NotImplementedError(
+                "Generic return types with SparseCumlArray are not supported "
+                "at this time"
+            )
+
+        # Otherwise None (keep processing)
+        return None
+
+    # It's a generic type by this point. Support Union, Tuple, Dict and List
+    supported_gen_types = [
+        tuple,
+        dict,
+        list,
+        typing.Union,
+    ]
+
+    if gen_type.__origin__ in supported_gen_types:
+        # Check for a CumlArray type in the args
+        for arg in gen_type.__args__:
+            inner_type = _process_generic(arg)
+
+            if inner_type is not None:
+                return inner_type
+    else:
+        raise NotImplementedError("Unknow generic type: {}".format(gen_type))
+
+    return None
+
+
 def _get_base_return_type(class_name, attr):
 
     if (

From 5a6ab96e10e0312ccb63c5c3ae46c312d090efe3 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 13 Dec 2023 16:38:43 -0800
Subject: [PATCH 07/24] Switch to scikit-build-core (#5693)

Contributes to rapidsai/build-planning#2

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/5693
---
 build.sh                                      |  13 +-
 ci/build_wheel.sh                             |   2 +-
 .../all_cuda-118_arch-x86_64.yaml             |   2 +-
 .../all_cuda-120_arch-x86_64.yaml             |   2 +-
 conda/recipes/cuml-cpu/meta.yaml              |   2 +-
 conda/recipes/cuml/meta.yaml                  |   2 +-
 dependencies.yaml                             |   5 +-
 python/CMakeLists.txt                         |  10 +-
 python/pyproject.toml                         |  25 ++--
 python/setup.py                               | 114 ------------------
 10 files changed, 32 insertions(+), 145 deletions(-)
 delete mode 100644 python/setup.py

diff --git a/build.sh b/build.sh
index 429f949647..378248e649 100755
--- a/build.sh
+++ b/build.sh
@@ -280,13 +280,15 @@ fi
 
 # Build and (optionally) install the cuml Python package
 if (! hasArg --configure-only) && (completeBuild || hasArg cuml || hasArg pydocs); then
+    # Replace spaces with semicolons in SKBUILD_EXTRA_CMAKE_ARGS
+    SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${SKBUILD_EXTRA_CMAKE_ARGS} | sed 's/ /;/g')
+
     # Append `-DFIND_CUML_CPP=ON` to CUML_EXTRA_CMAKE_ARGS unless a user specified the option.
-    SKBUILD_EXTRA_CMAKE_ARGS="${CUML_EXTRA_CMAKE_ARGS}"
-    if [[ "${CUML_EXTRA_CMAKE_ARGS}" != *"DFIND_CUML_CPP"* ]]; then
-        SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_CUML_CPP=ON"
+    if [[ "${SKBUILD_EXTRA_CMAKE_ARGS}" != *"DFIND_CUML_CPP"* ]]; then
+        SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS};-DFIND_CUML_CPP=ON"
     fi
 
-    SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} ${SKBUILD_EXTRA_CMAKE_ARGS}" \
+    SKBUILD_CMAKE_ARGS="-DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL};${SKBUILD_EXTRA_CMAKE_ARGS}" \
         SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \
         python -m pip install --no-build-isolation --no-deps ${REPODIR}/python
 
@@ -297,7 +299,6 @@ if (! hasArg --configure-only) && (completeBuild || hasArg cuml || hasArg pydocs
 fi
 
 if hasArg cuml-cpu; then
-    SKBUILD_CONFIGURE_OPTIONS="-DCUML_CPU=ON -DCMAKE_MESSAGE_LOG_LEVEL=VERBOSE" \
-        SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \
+    SKBUILD_CMAKE_ARGS="-DCUML_CPU=ON;-DCMAKE_MESSAGE_LOG_LEVEL=VERBOSE" \
         python -m pip install --no-build-isolation --no-deps -v ${REPODIR}/python
 fi
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 8b15323b33..d74acc744a 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -49,7 +49,7 @@ fi
 
 cd ${package_dir}
 
-SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DDISABLE_DEPRECATION_WARNINGS=ON -DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/" \
+SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DDISABLE_DEPRECATION_WARNINGS=ON;-DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/" \
   python -m pip wheel . \
     -w dist \
     -vvv \
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 0211c1d7f7..0af9ec2184 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -63,7 +63,7 @@ dependencies:
 - rapids-dask-dependency==24.2.*
 - recommonmark
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - scikit-learn==1.2
 - scipy>=1.8.0
 - seaborn
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index e2768aa83f..cb2c047dc3 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -59,7 +59,7 @@ dependencies:
 - rapids-dask-dependency==24.2.*
 - recommonmark
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - scikit-learn==1.2
 - scipy>=1.8.0
 - seaborn
diff --git a/conda/recipes/cuml-cpu/meta.yaml b/conda/recipes/cuml-cpu/meta.yaml
index cb88ac22b7..efe9d6449d 100644
--- a/conda/recipes/cuml-cpu/meta.yaml
+++ b/conda/recipes/cuml-cpu/meta.yaml
@@ -29,7 +29,7 @@ requirements:
   host:
     - python x.x
     - setuptools
-    - scikit-build>=0.13.1
+    - scikit-build-core >=0.7.0
     - cython>=3.0.0
   run:
     - python x.x
diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml
index bcafb63bb6..e134dd1363 100644
--- a/conda/recipes/cuml/meta.yaml
+++ b/conda/recipes/cuml/meta.yaml
@@ -65,7 +65,7 @@ requirements:
     - pylibraft ={{ minor_version }}
     - python x.x
     - raft-dask ={{ minor_version }}
-    - scikit-build >=0.13.1
+    - scikit-build-core >=0.7.0
     - setuptools
     - treelite {{ treelite_version }}
   run:
diff --git a/dependencies.yaml b/dependencies.yaml
index dc579d48fa..c53387c04c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -151,13 +151,13 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - scikit-build>=0.13.1
           - cython>=3.0.0
           - &treelite treelite==3.9.1
       - output_types: conda
         packages:
           - &pylibraft_conda pylibraft==24.2.*
           - &rmm_conda rmm==24.2.*
+          - scikit-build-core>=0.7.0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -166,8 +166,7 @@ dependencies:
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
       - output_types: [pyproject, requirements]
         packages:
-          - wheel
-          - setuptools
+          - scikit-build-core[pyproject]>=0.7.0
           - &treelite_runtime treelite_runtime==3.9.1
     specific:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 6bfa48ab11..d71e83a9e1 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -19,7 +19,7 @@ include(../fetch_rapids.cmake)
 set(CUML_VERSION 24.02.00)
 
 option(CUML_CPU "Build only cuML CPU Python components." OFF)
-set(language_list "C;CXX")
+set(language_list "CXX")
 
 if(NOT CUML_CPU)
   # We always need CUDA for cuML GPU because the raft dependency brings in a
@@ -32,11 +32,7 @@ endif()
 project(
   cuml-python
   VERSION ${CUML_VERSION}
-  LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C
-            # language to be enabled here. The test project that is built in scikit-build to verify
-            # various linking options for the python library is hardcoded to build with C, so until
-            # that is fixed we need to keep C.
-            ${language_list}
+  LANGUAGES ${language_list}
 )
 
 ################################################################################
@@ -75,7 +71,7 @@ else()
   set(cuml_FOUND OFF)
 endif()
 
-include(rapids-cython)
+include(rapids-cython-core)
 
 set(CUML_PYTHON_TREELITE_TARGET treelite::treelite)
 
diff --git a/python/pyproject.toml b/python/pyproject.toml
index dd2a930e65..c7b5c03160 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -20,13 +20,11 @@ requires = [
     "ninja",
     "pylibraft==24.2.*",
     "rmm==24.2.*",
-    "scikit-build>=0.13.1",
-    "setuptools",
+    "scikit-build-core[pyproject]>=0.7.0",
     "treelite==3.9.1",
     "treelite_runtime==3.9.1",
-    "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`.
-build-backend = "setuptools.build_meta"
+build-backend = "scikit_build_core.build"
 
 [tool.pytest.ini_options]
 markers = [
@@ -102,12 +100,6 @@ test = [
 Homepage = "https://github.com/rapidsai/cuml"
 Documentation = "https://docs.rapids.ai/api/cuml/stable/"
 
-[tool.setuptools]
-license-files = ["LICENSE"]
-
-[tool.setuptools.dynamic]
-version = {file = "cuml/VERSION"}
-
 [tool.black]
 line-length = 79
 target-version = ["py39"]
@@ -131,3 +123,16 @@ versioneer\.py |
     thirdparty
 )/
 '''
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.minimum-version = "3.26.4"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.packages = ["cuml"]
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "cuml/VERSION"
+regex = "(?P<value>.*)"
diff --git a/python/setup.py b/python/setup.py
deleted file mode 100644
index 5a30d78201..0000000000
--- a/python/setup.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import glob
-import os
-import shutil
-import sys
-from pathlib import Path
-
-from setuptools import find_packages
-
-from skbuild import setup
-
-
-##############################################################################
-# - Helper functions
-def get_cli_option(name):
-    if name in sys.argv:
-        print("-- Detected " + str(name) + " build option.")
-        return True
-
-    else:
-        return False
-
-
-def clean_folder(path):
-    """
-    Function to clean all Cython and Python artifacts and cache folders. It
-    cleans the folder as well as its direct children recursively.
-
-    Parameters
-    ----------
-    path : String
-        Path to the folder to be cleaned.
-    """
-    shutil.rmtree(path + "/__pycache__", ignore_errors=True)
-
-    folders = glob.glob(path + "/*/")
-    for folder in folders:
-        shutil.rmtree(folder + "/__pycache__", ignore_errors=True)
-
-        clean_folder(folder)
-
-        cython_exts = glob.glob(folder + "/*.cpp")
-        cython_exts.extend(glob.glob(folder + "/*.cpython*"))
-        for file in cython_exts:
-            os.remove(file)
-
-
-##############################################################################
-# - Print of build options used by setup.py  --------------------------------
-
-clean_artifacts = get_cli_option("clean")
-
-
-##############################################################################
-# - Clean target -------------------------------------------------------------
-
-if clean_artifacts:
-    print("-- Cleaning all Python and Cython build artifacts...")
-
-    # Reset these paths since they may be deleted below
-    treelite_path = False
-
-    try:
-        setup_file_path = str(Path(__file__).parent.absolute())
-        shutil.rmtree(setup_file_path + "/.pytest_cache", ignore_errors=True)
-        shutil.rmtree(
-            setup_file_path + "/_external_repositories", ignore_errors=True
-        )
-        shutil.rmtree(setup_file_path + "/cuml.egg-info", ignore_errors=True)
-        shutil.rmtree(setup_file_path + "/__pycache__", ignore_errors=True)
-
-        clean_folder(setup_file_path + "/cuml")
-        shutil.rmtree(setup_file_path + "/build", ignore_errors=True)
-        shutil.rmtree(setup_file_path + "/_skbuild", ignore_errors=True)
-        shutil.rmtree(setup_file_path + "/dist", ignore_errors=True)
-
-    except IOError:
-        pass
-
-    # need to terminate script so cythonizing doesn't get triggered after
-    # cleanup unintendedly
-    sys.argv.remove("clean")
-
-    if "--all" in sys.argv:
-        sys.argv.remove("--all")
-
-    if len(sys.argv) == 1:
-        sys.exit(0)
-
-
-##############################################################################
-# - Python package generation ------------------------------------------------
-
-packages = find_packages(include=["cuml*"])
-setup(
-    packages=packages,
-    package_data={key: ["VERSION", "*.pxd"] for key in packages},
-    zip_safe=False,
-)

From 546bcb5d59a9b4ad88b0b33173d54a5eac1689aa Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 19 Dec 2023 17:51:24 -0600
Subject: [PATCH 08/24] Update to CCCL 2.2.0. (#5702)

This PR updates cuml to CCCL 2.2.0. Do not merge until all of RAPIDS is ready to update.

Replaces #5623.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Divye Gala (https://github.com/divyegala)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cuml/pull/5702
---
 ci/build_docs.sh                              |  1 +
 ci/build_python.sh                            |  2 +-
 ci/test_cpp.sh                                |  1 +
 cpp/CMakeLists.txt                            |  5 ++-
 .../{get_thrust.cmake => get_cccl.cmake}      | 13 ++++---
 cpp/cmake/thirdparty/get_libcudacxx.cmake     | 35 -------------------
 cpp/src/hdbscan/detail/membership.cuh         | 19 ++++++----
 cpp/src/hdbscan/detail/stabilities.cuh        | 23 ++++++++----
 cpp/src/hdbscan/prediction_data.cu            | 16 +++++----
 fetch_rapids.cmake                            |  2 +-
 10 files changed, 49 insertions(+), 68 deletions(-)
 rename cpp/cmake/thirdparty/{get_thrust.cmake => get_cccl.cmake} (72%)
 delete mode 100644 cpp/cmake/thirdparty/get_libcudacxx.cmake

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 22c2ba5cfe..61999d5c9b 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# Copyright (c) 2023, NVIDIA CORPORATION.
 set -euo pipefail
 
 rapids-logger "Create test conda environment"
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 1332062770..7642e894e5 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -14,7 +14,7 @@ package_dir="python"
 
 version=$(rapids-generate-version)
 git_commit=$(git rev-parse HEAD)
-export RAPIDS_PACKAGE_VERSION=${version} 
+export RAPIDS_PACKAGE_VERSION=${version}
 
 echo "${version}" > VERSION
 sed -i "/^__git_commit__/ s/= .*/= \"${git_commit}\"/g" "${package_dir}/${package_name}/_version.py"
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index ebed9c9bdc..a076887545 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -19,6 +19,7 @@ conda activate test
 set -u
 
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
+
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/
 mkdir -p "${RAPIDS_TESTS_DIR}"
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c2b5437abf..029f7a7405 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -220,9 +220,8 @@ if(BUILD_CUML_TESTS OR BUILD_PRIMS_TESTS)
   find_package(Threads)
 endif()
 
-# thrust before rmm, rmm before raft so we get the right version of thrust/rmm
-include(cmake/thirdparty/get_thrust.cmake)
-include(cmake/thirdparty/get_libcudacxx.cmake)
+# CCCL before RMM, and RMM before RAFT
+include(cmake/thirdparty/get_cccl.cmake)
 include(cmake/thirdparty/get_rmm.cmake)
 include(cmake/thirdparty/get_raft.cmake)
 
diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_cccl.cmake
similarity index 72%
rename from cpp/cmake/thirdparty/get_thrust.cmake
rename to cpp/cmake/thirdparty/get_cccl.cmake
index 1477a8b397..0c126e320e 100644
--- a/cpp/cmake/thirdparty/get_thrust.cmake
+++ b/cpp/cmake/thirdparty/get_cccl.cmake
@@ -12,12 +12,11 @@
 # the License.
 # =============================================================================
 
-# Use CPM to find or clone thrust
-function(find_and_configure_thrust)
-    include(${rapids-cmake-dir}/cpm/thrust.cmake)
-    rapids_cpm_thrust(NAMESPACE cuml
-                      BUILD_EXPORT_SET cuml-exports
-                      INSTALL_EXPORT_SET cuml-exports)
+# Use CPM to find or clone CCCL
+function(find_and_configure_cccl)
+        include(${rapids-cmake-dir}/cpm/cccl.cmake)
+        rapids_cpm_cccl(BUILD_EXPORT_SET cuml-exports
+                        INSTALL_EXPORT_SET cuml-exports)
 endfunction()
 
-find_and_configure_thrust()
+find_and_configure_cccl()
diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake
deleted file mode 100644
index 54184ced34..0000000000
--- a/cpp/cmake/thirdparty/get_libcudacxx.cmake
+++ /dev/null
@@ -1,35 +0,0 @@
-# =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-# This function finds thrust and sets any additional necessary environment variables.
-function(find_and_configure_libcudacxx)
-  # Make sure we install libcudacxx beside our patched version of thrust
-  include(GNUInstallDirs)
-  set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcuml")
-  set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib")
-
-  include(${rapids-cmake-dir}/cpm/libcudacxx.cmake)
-  rapids_cpm_libcudacxx(BUILD_EXPORT_SET cuml-exports INSTALL_EXPORT_SET cuml-exports)
-
-  # Store where CMake can find our custom Thrust install
-  include("${rapids-cmake-dir}/export/find_package_root.cmake")
-  rapids_export_find_package_root(
-    INSTALL libcudacxx
-    [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcuml/lib/rapids/cmake/libcudacxx]=]
-    EXPORT_SET cuml-exports
-    CONDITION libcudacxx_SOURCE_DIR
-  )
-endfunction()
-
-find_and_configure_libcudacxx()
diff --git a/cpp/src/hdbscan/detail/membership.cuh b/cpp/src/hdbscan/detail/membership.cuh
index 6a67ae2d51..9fe1633990 100644
--- a/cpp/src/hdbscan/detail/membership.cuh
+++ b/cpp/src/hdbscan/detail/membership.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -76,13 +76,18 @@ void get_probabilities(const raft::handle_t& handle,
   rmm::device_uvector<value_t> deaths(n_clusters, stream);
   thrust::fill(exec_policy, deaths.begin(), deaths.end(), 0.0f);
 
+  cudaError_t (*reduce_func)(void*,
+                             size_t&,
+                             const value_t*,
+                             value_t*,
+                             int,
+                             const value_idx*,
+                             const value_idx*,
+                             cudaStream_t,
+                             bool) =
+    cub::DeviceSegmentedReduce::Max<const value_t*, value_t*, const value_idx*, const value_idx*>;
   Utils::cub_segmented_reduce(
-    lambdas,
-    deaths.data(),
-    n_clusters,
-    sorted_parents_offsets.data(),
-    stream,
-    cub::DeviceSegmentedReduce::Max<const value_t*, value_t*, const value_idx*, const value_idx*>);
+    lambdas, deaths.data(), n_clusters, sorted_parents_offsets.data(), stream, reduce_func);
 
   // Calculate probability per point
   thrust::fill(exec_policy, probabilities, probabilities + n_leaves, 0.0f);
diff --git a/cpp/src/hdbscan/detail/stabilities.cuh b/cpp/src/hdbscan/detail/stabilities.cuh
index 734814725d..27d7d00c4e 100644
--- a/cpp/src/hdbscan/detail/stabilities.cuh
+++ b/cpp/src/hdbscan/detail/stabilities.cuh
@@ -100,13 +100,22 @@ void compute_stabilities(const raft::handle_t& handle,
                    thrust::make_counting_iterator(n_edges),
                    births_init_op);
 
-  Utils::cub_segmented_reduce(
-    lambdas,
-    births_parent_min.data() + 1,
-    n_clusters - 1,
-    sorted_parents_offsets.data() + 1,
-    stream,
-    cub::DeviceSegmentedReduce::Min<const value_t*, value_t*, const value_idx*, const value_idx*>);
+  cudaError_t (*reduce_func)(void*,
+                             size_t&,
+                             const value_t*,
+                             value_t*,
+                             int,
+                             const value_idx*,
+                             const value_idx*,
+                             cudaStream_t,
+                             bool) =
+    cub::DeviceSegmentedReduce::Min<const value_t*, value_t*, const value_idx*, const value_idx*>;
+  Utils::cub_segmented_reduce(lambdas,
+                              births_parent_min.data() + 1,
+                              n_clusters - 1,
+                              sorted_parents_offsets.data() + 1,
+                              stream,
+                              reduce_func);
   // finally, we find minimum between initialized births where parent=child
   // and births of parents for their children
   auto births_zip =
diff --git a/cpp/src/hdbscan/prediction_data.cu b/cpp/src/hdbscan/prediction_data.cu
index 45b360482b..99b2d22e0b 100644
--- a/cpp/src/hdbscan/prediction_data.cu
+++ b/cpp/src/hdbscan/prediction_data.cu
@@ -130,13 +130,15 @@ void generate_prediction_data(const raft::handle_t& handle,
   prediction_data.set_n_clusters(handle, n_clusters);
 
   // this is to find maximum lambdas of all children under a parent
-  detail::Utils::cub_segmented_reduce(
-    lambdas,
-    prediction_data.get_deaths(),
-    n_clusters,
-    sorted_parents_offsets.data(),
-    stream,
-    cub::DeviceSegmentedReduce::Max<const float*, float*, const int*, const int*>);
+  cudaError_t (*reduce_func)(
+    void*, size_t&, const float*, float*, int, const int*, const int*, cudaStream_t, bool) =
+    cub::DeviceSegmentedReduce::Max<const float*, float*, const int*, const int*>;
+  detail::Utils::cub_segmented_reduce(lambdas,
+                                      prediction_data.get_deaths(),
+                                      n_clusters,
+                                      sorted_parents_offsets.data(),
+                                      stream,
+                                      reduce_func);
 
   rmm::device_uvector<int> is_leaf_cluster(n_clusters, stream);
   thrust::fill(exec_policy, is_leaf_cluster.begin(), is_leaf_cluster.end(), 1);
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index 98ce6888ba..284e2200fb 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at

From b8daf81a8c6f4d7c0fb4efe4b961afaa0dd0f3f1 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 20 Dec 2023 07:31:01 -0800
Subject: [PATCH 09/24] Fix cupy dependency in pyproject.toml (#5705)

The cupy dependency in pyproject.toml is currently wrong due to changes made in the devcontainers PR. In conda environments, installing cupy from both pip and conda leaves the two copies coexisting until one clobbers the other at import time. Here, by contrast, both installations come from pip (one from a wheel and one built from source), so one simply overwrites the other, which is why we haven't seen serious issues yet. The only manifestation at present is slower CI due to cupy being compiled during the wheel builds.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/5705
---
 dependencies.yaml     | 5 ++---
 python/pyproject.toml | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index c53387c04c..3d3aa029a6 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -236,7 +236,7 @@ dependencies:
           - matrix: {cuda: "11.8"}
             packages: &py_run_packages_cu11
               - cudf-cu11==24.2.*
-              - cupy-cuda11x>=12.0.0
+              - &cupy_pyproject_cu11 cupy-cuda11x>=12.0.0
               - dask-cudf-cu11==24.2.*
               - *pylibraft_cu11
               - raft-dask-cu11==24.2.*
@@ -247,11 +247,10 @@ dependencies:
           - matrix: null
             packages:
               - *cudf_conda
-              - *cupy_conda
+              - *cupy_pyproject_cu11
               - *dask_cudf_conda
               - *pylibraft_conda
               - *raft_dask_conda
-              - *pylibraft_conda
               - *rmm_conda
   cudatoolkit:
     specific:
diff --git a/python/pyproject.toml b/python/pyproject.toml
index c7b5c03160..c347b91b9e 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -56,7 +56,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.2.*",
-    "cupy>=12.0.0",
+    "cupy-cuda11x>=12.0.0",
     "dask-cuda==24.2.*",
     "dask-cudf==24.2.*",
     "joblib>=0.11",

From c8b990127a650837c731c327aeff84e3ce56a118 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 9 Jan 2024 14:42:33 -0500
Subject: [PATCH 10/24] Only cufft offers a static_nocallback version of the
 library (#5703)

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuml/pull/5703
---
 cpp/CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 029f7a7405..0f5cf2a1b1 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -141,8 +141,10 @@ endif()
 # - compiler options ---------------------------------------------------------
 
 set(_ctk_static_suffix "")
+set(_ctk_fft_static_suffix "")
 if(CUDA_STATIC_RUNTIME)
-  set(_ctk_static_suffix "_static_nocallback")
+  set(_ctk_static_suffix "_static")
+  set(_ctk_fft_static_suffix "_static_nocallback")
 endif()
 
 if (NOT DISABLE_OPENMP)
@@ -612,7 +614,7 @@ if(BUILD_CUML_CPP_LIBRARY)
   list(APPEND _cuml_cpp_private_libs
     raft::raft
     $<TARGET_NAME_IF_EXISTS:GPUTreeShap::GPUTreeShap>
-    $<$<BOOL:${LINK_CUFFT}>:CUDA::cufft${_ctk_static_suffix}>
+    $<$<BOOL:${LINK_CUFFT}>:CUDA::cufft${_ctk_fft_static_suffix}>
     ${TREELITE_LIBS}
     ${OpenMP_CXX_LIB_NAMES}
     $<$<OR:$<BOOL:${BUILD_CUML_STD_COMMS}>,$<BOOL:${BUILD_CUML_MPI_COMMS}>>:NCCL::NCCL>

From d9fdf50cadc8e6d40f0a440c23fccc844e43ee38 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 11 Jan 2024 12:12:46 -0600
Subject: [PATCH 11/24] refactor CUDA versions in dependencies.yaml (#5712)

Contributes to https://github.com/rapidsai/build-planning/issues/7.

Proposes splitting the `cuda-version` dependency in `dependencies.yaml` out into its own dependency group (`cuda_version`), separate from the CUDA Toolkit components this project needs (which move to a `cuda` group).

### Benefits of this change

* prevents accidental inclusion of multiple `cuda-version` versions in environments
* reduces update effort (via enabling more use of globs like `"12.*"`)
* improves the chance that errors like "`conda` recipe is missing a dependency" are caught in CI

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cuml/pull/5712
---
 .pre-commit-config.yaml |  2 +-
 dependencies.yaml       | 47 +++++++++++++++++++++++++++++------------
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5156a91ef6..b443c19fcc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -66,7 +66,7 @@ repos:
             pass_filenames: false
             language: python
     - repo: https://github.com/rapidsai/dependency-file-generator
-      rev: v1.5.1
+      rev: v1.8.0
       hooks:
           - id: rapids-dependency-file-generator
             args: ["--clean"]
diff --git a/dependencies.yaml b/dependencies.yaml
index 3d3aa029a6..d76d3f4cd3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -7,7 +7,8 @@ files:
       arch: [x86_64]
     includes:
       - common_build
-      - cudatoolkit
+      - cuda
+      - cuda_version
       - docs
       - py_build
       - py_run
@@ -20,7 +21,8 @@ files:
       arch: [x86_64]
     includes:
       - common_build
-      - cudatoolkit
+      - cuda
+      - cuda_version
   checks:
     output: none
     includes:
@@ -34,28 +36,29 @@ files:
     includes:
       - clang_tidy
       - common_build
-      - cudatoolkit
+      - cuda
+      - cuda_version
   docs:
     output: none
     includes:
-      - cudatoolkit
+      - cuda_version
       - docs
       - py_version
   test_cpp:
     output: none
     includes:
-      - cudatoolkit
+      - cuda_version
       - test_cpp
   test_python:
     output: none
     includes:
-      - cudatoolkit
+      - cuda_version
       - py_version
       - test_python
   test_notebooks:
     output: none
     includes:
-      - cudatoolkit
+      - cuda_version
       - py_run
       - py_version
       - test_notebooks
@@ -146,7 +149,6 @@ dependencies:
               cuda: "12.0"
             packages:
               - cuda-nvcc
-              - cuda-version=12.0
   py_build:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -252,14 +254,37 @@ dependencies:
               - *pylibraft_conda
               - *raft_dask_conda
               - *rmm_conda
-  cudatoolkit:
+  cuda_version:
     specific:
       - output_types: conda
         matrices:
+          - matrix:
+              cuda: "11.2"
+            packages:
+              - cuda-version=11.2
+          - matrix:
+              cuda: "11.4"
+            packages:
+              - cuda-version=11.4
+          - matrix:
+              cuda: "11.5"
+            packages:
+              - cuda-version=11.5
+          - matrix:
+              cuda: "11.8"
+            packages:
+              - cuda-version=11.8
           - matrix:
               cuda: "12.0"
             packages:
               - cuda-version=12.0
+  cuda:
+    specific:
+      - output_types: conda
+        matrices:
+          - matrix:
+              cuda: "12.*"
+            packages:
               - cuda-cudart-dev
               - cuda-profiler-api
               - libcublas-dev
@@ -270,7 +295,6 @@ dependencies:
           - matrix:
               cuda: "11.8"
             packages:
-              - cuda-version=11.8
               - cudatoolkit
               - libcublas-dev=11.11.3.6
               - libcublas=11.11.3.6
@@ -285,7 +309,6 @@ dependencies:
           - matrix:
               cuda: "11.5"
             packages:
-              - cuda-version=11.5
               - cudatoolkit
               - libcublas-dev>=11.7.3.1,<=11.7.4.6
               - libcublas>=11.7.3.1,<=11.7.4.6
@@ -300,7 +323,6 @@ dependencies:
           - matrix:
               cuda: "11.4"
             packages:
-              - cuda-version=11.4
               - cudatoolkit
               - &libcublas_dev114 libcublas-dev>=11.5.2.43,<=11.6.5.2
               - &libcublas114 libcublas>=11.5.2.43,<=11.6.5.2
@@ -315,7 +337,6 @@ dependencies:
           - matrix:
               cuda: "11.2"
             packages:
-              - cuda-version=11.2
               - cudatoolkit
               # The NVIDIA channel doesn't publish pkgs older than 11.4 for these libs,
               # so 11.2 uses 11.4 packages (the oldest available).

From 4858fc20acbe5be9e0b0613711ebcd3574f1a507 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 11 Jan 2024 14:40:01 -0800
Subject: [PATCH 12/24] Remove extraneous SKBUILD_BUILD_OPTIONS (#5714)

The leftover `SKBUILD_BUILD_OPTIONS` setting in `build.sh` should have been removed in #5693.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/5714
---
 build.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/build.sh b/build.sh
index 378248e649..1132dcaddf 100755
--- a/build.sh
+++ b/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 # cuml build script
 
@@ -289,7 +289,6 @@ if (! hasArg --configure-only) && (completeBuild || hasArg cuml || hasArg pydocs
     fi
 
     SKBUILD_CMAKE_ARGS="-DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL};${SKBUILD_EXTRA_CMAKE_ARGS}" \
-        SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \
         python -m pip install --no-build-isolation --no-deps ${REPODIR}/python
 
     if hasArg pydocs; then

From bb09e545c6f1dc531e2de360820dc6684951fa34 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 12 Jan 2024 12:22:41 -0500
Subject: [PATCH 13/24] Remove usages of rapids-env-update (#5716)

Reference: https://github.com/rapidsai/ops/issues/2766

Replace rapids-env-update with rapids-configure-conda-channels,
rapids-configure-sccache, and rapids-date-string.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/cuml/pull/5716
---
 ci/build_cpp.sh    | 6 +++++-
 ci/build_python.sh | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 0a6a649fd0..eac581ccef 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -3,7 +3,11 @@
 
 set -euo pipefail
 
-source rapids-env-update
+rapids-configure-conda-channels
+
+source rapids-configure-sccache
+
+source rapids-date-string
 
 export CMAKE_GENERATOR=Ninja
 
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 7642e894e5..fcf0f7fe26 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -3,7 +3,11 @@
 
 set -euo pipefail
 
-source rapids-env-update
+rapids-configure-conda-channels
+
+source rapids-configure-sccache
+
+source rapids-date-string
 
 export CMAKE_GENERATOR=Ninja
 

From a93040c19f214d7f2a8e47e0cdd74cdbe3b83eeb Mon Sep 17 00:00:00 2001
From: Ray Douglass <3107146+raydouglass@users.noreply.github.com>
Date: Sat, 20 Jan 2024 11:46:30 -0500
Subject: [PATCH 14/24] Fix shared-workflows repo name (#5723)

Fix the repo name for `shared-workflows`

See: https://github.com/rapidsai/cudf/pull/14784

Authors:
  - Ray Douglass (https://github.com/raydouglass)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/cuml/pull/5723
---
 .github/workflows/pr.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index e4006a7ac2..c91c649105 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -112,7 +112,7 @@ jobs:
       script: ci/test_wheel.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02
     with:
       extra-repo-deploy-key: CUMLPRIMS_SSH_PRIVATE_DEPLOY_KEY
       build_command: |

From e1b6ce77d4355dcea247cd15abcaa64f751a8a67 Mon Sep 17 00:00:00 2001
From: William Hicks <wphicks@users.noreply.github.com>
Date: Wed, 24 Jan 2024 18:41:44 -0500
Subject: [PATCH 15/24] Synchronize stream in SVC memory test (#5729)

Authors:
  - William Hicks (https://github.com/wphicks)

Approvers:
  - Micka (https://github.com/lowener)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/5729
---
 cpp/test/sg/svc_test.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/test/sg/svc_test.cu b/cpp/test/sg/svc_test.cu
index fb53eb6374..709e48de18 100644
--- a/cpp/test/sg/svc_test.cu
+++ b/cpp/test/sg/svc_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1449,6 +1449,7 @@ TYPED_TEST(SmoSolverTest, MemoryLeak)
       }
     }
   }
+  raft::interruptible::synchronize(stream);
   RAFT_CUDA_TRY(cudaMemGetInfo(&free2, &total));
   float delta = (free1 - free2);
   EXPECT_EQ(delta, 0);

From dd50df5fd7e4f46c471065b6e68c654e2770c4fb Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <phcho@nvidia.com>
Date: Mon, 29 Jan 2024 15:15:18 -0800
Subject: [PATCH 16/24] Disable hnswlib feature in RAFT; pin pytest (#5733)

Disable RAFT's hnswlib feature (`BUILD_CAGRA_HNSWLIB OFF`) to resolve CMake failures such as the following:
```
CMake Error at /home/coder/cuml/cpp/build/release/_deps/raft-build/raft-targets.cmake:56 (set_target_properties):
      The link interface of target "raft::raft" contains:
        hnswlib::hnswlib
```

Also pin pytest to 7.x to avoid breaking changes from pytest 8. See https://github.com/rapidsai/cudf/pull/14920

Authors:
  - Philip Hyunsu Cho (https://github.com/hcho3)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cuml/pull/5733
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +-
 cpp/cmake/thirdparty/get_raft.cmake              | 3 ++-
 dependencies.yaml                                | 2 +-
 python/pyproject.toml                            | 2 +-
 5 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 0af9ec2184..929bd9b94d 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -53,11 +53,11 @@ dependencies:
 - pydata-sphinx-theme!=0.14.2
 - pylibraft==24.2.*
 - pynndescent==0.5.8
-- pytest
 - pytest-benchmark
 - pytest-cases
 - pytest-cov
 - pytest-xdist
+- pytest==7.*
 - python>=3.9,<3.11
 - raft-dask==24.2.*
 - rapids-dask-dependency==24.2.*
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index cb2c047dc3..1dad68e41a 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -49,11 +49,11 @@ dependencies:
 - pydata-sphinx-theme!=0.14.2
 - pylibraft==24.2.*
 - pynndescent==0.5.8
-- pytest
 - pytest-benchmark
 - pytest-cases
 - pytest-cov
 - pytest-xdist
+- pytest==7.*
 - python>=3.9,<3.11
 - raft-dask==24.2.*
 - rapids-dask-dependency==24.2.*
diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 8919feb45d..7bc860eed8 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -65,6 +65,7 @@ function(find_and_configure_raft)
         OPTIONS
           "BUILD_TESTS OFF"
           "BUILD_BENCH OFF"
+          "BUILD_CAGRA_HNSWLIB OFF"
           "RAFT_COMPILE_LIBRARY ${PKG_COMPILE_LIBRARY}"
     )
 
diff --git a/dependencies.yaml b/dependencies.yaml
index d76d3f4cd3..8f6262f497 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -397,7 +397,7 @@ dependencies:
           - hypothesis>=6.0,<7
           - nltk
           - numpydoc
-          - pytest
+          - pytest==7.*
           - pytest-benchmark
           - pytest-cases
           - pytest-cov
diff --git a/python/pyproject.toml b/python/pyproject.toml
index c347b91b9e..c064b65fff 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -85,11 +85,11 @@ test = [
     "nltk",
     "numpydoc",
     "pynndescent==0.5.8",
-    "pytest",
     "pytest-benchmark",
     "pytest-cases",
     "pytest-cov",
     "pytest-xdist",
+    "pytest==7.*",
     "scikit-learn==1.2",
     "seaborn",
     "statsmodels",

From 1ee99d70ba3a87c46272e51ceac2bf6907f2b7cb Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 30 Jan 2024 16:27:14 -0800
Subject: [PATCH 17/24] Migrate to Treelite 4.0 (#5701)

Test cuML with Treelite 4.0 RC1.

This PR should be merged only once the Treelite 4.0 release is finalized.

Supersedes #5650

Authors:
  - Philip Hyunsu Cho (https://github.com/hcho3)

Approvers:
  - William Hicks (https://github.com/wphicks)
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cuml/pull/5701
---
 .../all_cuda-118_arch-x86_64.yaml             |   2 +-
 .../all_cuda-120_arch-x86_64.yaml             |   2 +-
 conda/recipes/cuml/conda_build_config.yaml    |   2 +-
 conda/recipes/libcuml/conda_build_config.yaml |   2 +-
 cpp/CMakeLists.txt                            |  10 -
 cpp/bench/sg/fil.cu                           |   4 +-
 cpp/bench/sg/filex.cu                         |   4 +-
 cpp/cmake/thirdparty/get_treelite.cmake       |  26 +--
 cpp/include/cuml/ensemble/randomforest.hpp    |   6 +-
 cpp/include/cuml/ensemble/treelite_defs.hpp   |   8 +-
 .../experimental/fil/treelite_importer.hpp    | 169 +++++++++-------
 cpp/include/cuml/explainer/tree_shap.hpp      |   6 +-
 cpp/include/cuml/fil/fil.h                    |   4 +-
 cpp/src/decisiontree/decisiontree.cuh         |  11 +-
 cpp/src/explainer/tree_shap.cu                | 188 ++++++++----------
 cpp/src/fil/internal.cuh                      |   8 +-
 cpp/src/fil/treelite_import.cu                | 162 ++++++++-------
 cpp/src/randomforest/randomforest.cu          | 135 ++++---------
 cpp/test/sg/fil_test.cu                       | 153 +++++++-------
 cpp/test/sg/rf_test.cu                        |   6 +-
 dependencies.yaml                             |   6 +-
 python/cuml/benchmark/algorithms.py           |  14 +-
 python/cuml/benchmark/bench_helper_funcs.py   |  28 +--
 python/cuml/ensemble/CMakeLists.txt           |   2 +-
 python/cuml/ensemble/randomforest_common.pyx  |  12 +-
 python/cuml/ensemble/randomforest_shared.pxd  |  13 +-
 python/cuml/ensemble/randomforest_shared.pyx  |  28 +--
 python/cuml/experimental/fil/fil.pyx          |   6 +-
 python/cuml/experimental/fil/infer_kind.pxd   |   2 +-
 python/cuml/explainer/tree_shap.pyx           |  88 ++++++--
 python/cuml/fil/fil.pyx                       | 129 ++++++++----
 python/cuml/tests/test_random_forest.py       |   6 +-
 python/pyproject.toml                         |   6 +-
 33 files changed, 636 insertions(+), 612 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 929bd9b94d..0986715d47 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -72,7 +72,7 @@ dependencies:
 - sphinx<6
 - statsmodels
 - sysroot_linux-64==2.17
-- treelite==3.9.1
+- treelite==4.0.0
 - umap-learn==0.5.3
 - pip:
   - dask-glm==0.3.0
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 1dad68e41a..5f23df82ef 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -68,7 +68,7 @@ dependencies:
 - sphinx<6
 - statsmodels
 - sysroot_linux-64==2.17
-- treelite==3.9.1
+- treelite==4.0.0
 - umap-learn==0.5.3
 - pip:
   - dask-glm==0.3.0
diff --git a/conda/recipes/cuml/conda_build_config.yaml b/conda/recipes/cuml/conda_build_config.yaml
index d7140c479f..bdd8239202 100644
--- a/conda/recipes/cuml/conda_build_config.yaml
+++ b/conda/recipes/cuml/conda_build_config.yaml
@@ -17,4 +17,4 @@ sysroot_version:
   - "=2.17"
 
 treelite_version:
-  - "=3.9.1"
+  - "=4.0.0"
diff --git a/conda/recipes/libcuml/conda_build_config.yaml b/conda/recipes/libcuml/conda_build_config.yaml
index 873ad647e1..ea5b7a8058 100644
--- a/conda/recipes/libcuml/conda_build_config.yaml
+++ b/conda/recipes/libcuml/conda_build_config.yaml
@@ -17,7 +17,7 @@ cmake_version:
   - ">=3.26.4"
 
 treelite_version:
-  - "=3.9.1"
+  - "=4.0.0"
 
 gtest_version:
   - ">=1.13.0"
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 0f5cf2a1b1..6b3bb88b07 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -596,11 +596,7 @@ if(BUILD_CUML_CPP_LIBRARY)
   endif()
 
   if(CUML_USE_TREELITE_STATIC AND (TARGET treelite::treelite_static))
-    # By default, TREELITE_LIBS will contain both treelite::treelite_static and
-    # treelite::treelite_runtime_static if we are linking statically, but these
-    # two targets have duplicate symbols so we can only link to one of them.
     set(TREELITE_LIBS treelite::treelite_static)
-
     copy_interface_excludes(INCLUDED_TARGET treelite::treelite_static TARGET ${CUML_CPP_TARGET})
   elseif(CUML_EXPORT_TREELITE_LINKAGE)
     list(APPEND _cuml_cpp_public_libs ${TREELITE_LIBS})
@@ -744,9 +740,6 @@ if (TARGET treelite::treelite)
 if (TARGET treelite::treelite AND (NOT TARGET treelite))
     add_library(treelite ALIAS treelite::treelite)
 endif()
-if (TARGET treelite::treelite_runtime AND (NOT TARGET treelite_runtime))
-    add_library(treelite_runtime ALIAS treelite::treelite_runtime)
-endif()
 ]=])
 else()
     string(APPEND code_string
@@ -754,9 +747,6 @@ else()
 if (TARGET treelite::treelite_static AND (NOT TARGET treelite_static))
     add_library(treelite_static ALIAS treelite::treelite_static)
 endif()
-if (TARGET treelite::treelite_runtime_static AND (NOT TARGET treelite_runtime_static))
-    add_library(treelite_runtime_static ALIAS treelite::treelite_runtime_static)
-endif()
 ]=])
 
 endif()
diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu
index 09efc1dfa3..348ea2f6b9 100644
--- a/cpp/bench/sg/fil.cu
+++ b/cpp/bench/sg/fil.cu
@@ -31,7 +31,7 @@ namespace fil {
 struct Params {
   DatasetParams data;
   RegressionParams blobs;
-  ModelHandle model;
+  TreeliteModelHandle model;
   ML::fil::storage_type_t storage;
   ML::fil::algo_t algo;
   RF_params rf;
@@ -120,7 +120,7 @@ class FIL : public RegressionFixture<float> {
 
  private:
   ML::fil::forest_t<float> forest;
-  ModelHandle model;
+  TreeliteModelHandle model;
   Params p_rest;
 };
 
diff --git a/cpp/bench/sg/filex.cu b/cpp/bench/sg/filex.cu
index 048d89c3d9..aa47195cf9 100644
--- a/cpp/bench/sg/filex.cu
+++ b/cpp/bench/sg/filex.cu
@@ -37,7 +37,7 @@ namespace filex {
 struct Params {
   DatasetParams data;
   RegressionParams blobs;
-  ModelHandle model;
+  TreeliteModelHandle model;
   ML::fil::storage_type_t storage;
   bool use_experimental;
   RF_params rf;
@@ -220,7 +220,7 @@ class FILEX : public RegressionFixture<float> {
 
  private:
   ML::fil::forest_t<float> forest;
-  ModelHandle model;
+  TreeliteModelHandle model;
   Params p_rest;
 };
 
diff --git a/cpp/cmake/thirdparty/get_treelite.cmake b/cpp/cmake/thirdparty/get_treelite.cmake
index 8cfd69be02..6fda64c080 100644
--- a/cpp/cmake/thirdparty/get_treelite.cmake
+++ b/cpp/cmake/thirdparty/get_treelite.cmake
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,9 +22,9 @@ function(find_and_configure_treelite)
                           "${multiValueArgs}" ${ARGN} )
 
     if(NOT PKG_BUILD_STATIC_LIBS)
-      list(APPEND TREELITE_LIBS treelite::treelite treelite::treelite_runtime)
+      list(APPEND TREELITE_LIBS treelite::treelite)
     else()
-      list(APPEND TREELITE_LIBS treelite::treelite_static treelite::treelite_runtime_static)
+      list(APPEND TREELITE_LIBS treelite::treelite_static)
     endif()
 
     rapids_cpm_find(Treelite ${PKG_VERSION}
@@ -40,9 +40,9 @@ function(find_and_configure_treelite)
     )
 
 
-    list(APPEND TREELITE_LIBS_NO_PREFIX treelite treelite_runtime)
+    list(APPEND TREELITE_LIBS_NO_PREFIX treelite)
     if(Treelite_ADDED AND PKG_BUILD_STATIC_LIBS)
-        list(APPEND TREELITE_LIBS_NO_PREFIX treelite_static treelite_runtime_static)
+        list(APPEND TREELITE_LIBS_NO_PREFIX treelite_static)
     endif()
 
     set(Treelite_ADDED ${Treelite_ADDED} PARENT_SCOPE)
@@ -52,28 +52,16 @@ function(find_and_configure_treelite)
             target_include_directories(treelite
                 PUBLIC $<BUILD_INTERFACE:${Treelite_SOURCE_DIR}/include>
                        $<BUILD_INTERFACE:${Treelite_BINARY_DIR}/include>)
-            target_include_directories(treelite_runtime
-                PUBLIC $<BUILD_INTERFACE:${Treelite_SOURCE_DIR}/include>
-                       $<BUILD_INTERFACE:${Treelite_BINARY_DIR}/include>)
             if(NOT TARGET treelite::treelite)
                 add_library(treelite::treelite ALIAS treelite)
             endif()
-            if(NOT TARGET treelite::treelite_runtime)
-                add_library(treelite::treelite_runtime ALIAS treelite_runtime)
-            endif()
         else()
             target_include_directories(treelite_static
                 PUBLIC $<BUILD_INTERFACE:${Treelite_SOURCE_DIR}/include>
                        $<BUILD_INTERFACE:${Treelite_BINARY_DIR}/include>)
-            target_include_directories(treelite_runtime_static
-                PUBLIC $<BUILD_INTERFACE:${Treelite_SOURCE_DIR}/include>
-                       $<BUILD_INTERFACE:${Treelite_BINARY_DIR}/include>)
             if(NOT TARGET treelite::treelite_static)
                 add_library(treelite::treelite_static ALIAS treelite_static)
             endif()
-            if(NOT TARGET treelite::treelite_runtime_static)
-                add_library(treelite::treelite_runtime_static ALIAS treelite_runtime_static)
-            endif()
         endif()
 
         rapids_export(BUILD Treelite
@@ -90,7 +78,7 @@ function(find_and_configure_treelite)
     rapids_export_find_package_root(BUILD Treelite [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cuml-exports)
 endfunction()
 
-find_and_configure_treelite(VERSION     3.9.1
-                        PINNED_TAG  346d92547295417676f499ce2dd4fff946b9042a
+find_and_configure_treelite(VERSION     4.0.0
+                        PINNED_TAG  e878556d29336d2242fd926beb659b9dec41be3a
                         EXCLUDE_FROM_ALL  ${CUML_EXCLUDE_TREELITE_FROM_ALL}
                         BUILD_STATIC_LIBS ${CUML_USE_TREELITE_STATIC})
diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp
index bccc02bac2..2df7929cd5 100644
--- a/cpp/include/cuml/ensemble/randomforest.hpp
+++ b/cpp/include/cuml/ensemble/randomforest.hpp
@@ -128,14 +128,12 @@ template <class T, class L>
 std::string get_rf_json(const RandomForestMetaData<T, L>* forest);
 
 template <class T, class L>
-void build_treelite_forest(ModelHandle* model,
+void build_treelite_forest(TreeliteModelHandle* model,
                            const RandomForestMetaData<T, L>* forest,
                            int num_features);
 
-ModelHandle concatenate_trees(std::vector<ModelHandle> treelite_handles);
+TreeliteModelHandle concatenate_trees(std::vector<TreeliteModelHandle> treelite_handles);
 
-void compare_concat_forest_to_subforests(ModelHandle concat_tree_handle,
-                                         std::vector<ModelHandle> treelite_handles);
 // ----------------------------- Classification ----------------------------------- //
 
 typedef RandomForestMetaData<float, int> RandomForestClassifierF;
diff --git a/cpp/include/cuml/ensemble/treelite_defs.hpp b/cpp/include/cuml/ensemble/treelite_defs.hpp
index 76139b2080..5026d7b31b 100644
--- a/cpp/include/cuml/ensemble/treelite_defs.hpp
+++ b/cpp/include/cuml/ensemble/treelite_defs.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 
 #pragma once
 
-// Same definition as ModelHandle in treelite, to avoid dependencies
+// Same definition as TreeliteModelHandle in treelite, to avoid dependencies
 // of cuML C++ headers on treelite headers.
 // Original definition here:
-// https://github.com/dmlc/treelite/blob/fca738770d2b09be1c0842fac9c0f5e3f6126c40/include/treelite/c_api.h#L25
-typedef void* ModelHandle;
+// https://github.com/dmlc/treelite/blob/6ca4eb5e699aa73d3721638fc1a3a43bf658a48b/include/treelite/c_api.h#L38
+typedef void* TreeliteModelHandle;
diff --git a/cpp/include/cuml/experimental/fil/treelite_importer.hpp b/cpp/include/cuml/experimental/fil/treelite_importer.hpp
index 5858cb4a62..f083f30cf7 100644
--- a/cpp/include/cuml/experimental/fil/treelite_importer.hpp
+++ b/cpp/include/cuml/experimental/fil/treelite_importer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,8 +27,11 @@
 #include <queue>
 #include <stack>
 #include <treelite/c_api.h>
+#include <treelite/enum/task_type.h>
+#include <treelite/enum/tree_node_type.h>
+#include <treelite/enum/typeinfo.h>
 #include <treelite/tree.h>
-#include <treelite/typeinfo.h>
+#include <variant>
 
 namespace ML {
 namespace experimental {
@@ -113,13 +116,13 @@ struct treelite_importer {
       return result;
     }
 
-    auto get_categories() { return tree.MatchingCategories(node_id); }
+    auto get_categories() { return tree.CategoryList(node_id); }
 
     auto get_feature() { return tree.SplitIndex(node_id); }
 
     auto is_categorical()
     {
-      return tree.SplitType(node_id) == treelite::SplitFeatureType::kCategorical;
+      return tree.NodeType(node_id) == treelite::TreeNodeType::kCategoricalTestNode;
     }
 
     auto default_distant()
@@ -127,7 +130,7 @@ struct treelite_importer {
       auto result        = false;
       auto default_child = tree.DefaultChild(node_id);
       if (is_categorical()) {
-        if (tree.CategoriesListRightChild(node_id)) {
+        if (tree.CategoryListRightChild(node_id)) {
           result = (default_child == tree.RightChild(node_id));
         } else {
           result = (default_child == tree.LeftChild(node_id));
@@ -147,8 +150,8 @@ struct treelite_importer {
 
     auto categories()
     {
-      auto result = decltype(tree.MatchingCategories(node_id)){};
-      if (is_categorical()) { result = tree.MatchingCategories(node_id); }
+      auto result = decltype(tree.CategoryList(node_id)){};
+      if (is_categorical()) { result = tree.CategoryList(node_id); }
       return result;
     }
 
@@ -192,7 +195,7 @@ struct treelite_importer {
             throw model_import_error("Unrecognized Treelite operator");
           }
         } else {
-          if (tl_tree.CategoriesListRightChild(node_id)) {
+          if (tl_tree.CategoryListRightChild(node_id)) {
             to_be_visited.add(tl_left_id, tl_right_id);
           } else {
             to_be_visited.add(tl_right_id, tl_left_id);
@@ -254,20 +257,25 @@ struct treelite_importer {
   template <typename lambda_t>
   void tree_for_each(treelite::Model const& tl_model, lambda_t&& lambda)
   {
-    tl_model.Dispatch([&lambda](auto&& concrete_tl_model) {
-      std::for_each(std::begin(concrete_tl_model.trees), std::end(concrete_tl_model.trees), lambda);
-    });
+    std::visit(
+      [&lambda](auto&& concrete_tl_model) {
+        std::for_each(
+          std::begin(concrete_tl_model.trees), std::end(concrete_tl_model.trees), lambda);
+      },
+      tl_model.variant_);
   }
 
   template <typename iter_t, typename lambda_t>
   void tree_transform(treelite::Model const& tl_model, iter_t output_iter, lambda_t&& lambda)
   {
-    tl_model.Dispatch([&output_iter, &lambda](auto&& concrete_tl_model) {
-      std::transform(std::begin(concrete_tl_model.trees),
-                     std::end(concrete_tl_model.trees),
-                     output_iter,
-                     lambda);
-    });
+    std::visit(
+      [&output_iter, &lambda](auto&& concrete_tl_model) {
+        std::transform(std::begin(concrete_tl_model.trees),
+                       std::end(concrete_tl_model.trees),
+                       output_iter,
+                       lambda);
+      },
+      tl_model.variant_);
   }
 
   template <typename T, typename lambda_t>
@@ -281,8 +289,8 @@ struct treelite_importer {
   auto num_trees(treelite::Model const& tl_model)
   {
     auto result = index_type{};
-    tl_model.Dispatch(
-      [&result](auto&& concrete_tl_model) { result = concrete_tl_model.trees.size(); });
+    std::visit([&result](auto&& concrete_tl_model) { result = concrete_tl_model.trees.size(); },
+               tl_model.variant_);
     return result;
   }
 
@@ -305,18 +313,12 @@ struct treelite_importer {
 
   auto get_num_class(treelite::Model const& tl_model)
   {
-    auto result = index_type{};
-    tl_model.Dispatch(
-      [&result](auto&& concrete_tl_model) { result = concrete_tl_model.task_param.num_class; });
-    return result;
+    return static_cast<index_type>(tl_model.num_class[0]);
   }
 
   auto get_num_feature(treelite::Model const& tl_model)
   {
-    auto result = index_type{};
-    tl_model.Dispatch(
-      [&result](auto&& concrete_tl_model) { result = concrete_tl_model.num_feature; });
-    return result;
+    return static_cast<index_type>(tl_model.num_feature);
   }
 
   auto get_max_num_categories(treelite::Model const& tl_model)
@@ -353,62 +355,56 @@ struct treelite_importer {
   auto get_average_factor(treelite::Model const& tl_model)
   {
     auto result = double{};
-    tl_model.Dispatch([&result](auto&& concrete_tl_model) {
-      if (concrete_tl_model.average_tree_output) {
-        if (concrete_tl_model.task_type == treelite::TaskType::kMultiClfGrovePerClass) {
-          result = concrete_tl_model.trees.size() / concrete_tl_model.task_param.num_class;
-        } else {
-          result = concrete_tl_model.trees.size();
-        }
+    if (tl_model.average_tree_output) {
+      if (tl_model.task_type == treelite::TaskType::kMultiClf &&
+          tl_model.leaf_vector_shape[1] == 1) {  // grove-per-class
+        result = num_trees(tl_model) / tl_model.num_class[0];
       } else {
-        result = 1.0;
+        result = num_trees(tl_model);
       }
-    });
+    } else {
+      result = 1.0;
+    }
     return result;
   }
 
   auto get_bias(treelite::Model const& tl_model)
   {
-    auto result = double{};
-    tl_model.Dispatch(
-      [&result](auto&& concrete_tl_model) { result = concrete_tl_model.param.global_bias; });
-    return result;
+    return static_cast<double>(tl_model.base_scores[0]);
   }
 
   auto get_postproc_params(treelite::Model const& tl_model)
   {
-    auto result = detail::postproc_params_t{};
-    tl_model.Dispatch([&result](auto&& concrete_tl_model) {
-      auto tl_pred_transform = std::string{concrete_tl_model.param.pred_transform};
-      if (tl_pred_transform == std::string{"identity"} ||
-          tl_pred_transform == std::string{"identity_multiclass"}) {
-        result.element = element_op::disable;
-        result.row     = row_op::disable;
-      } else if (tl_pred_transform == std::string{"signed_square"}) {
-        result.element = element_op::signed_square;
-      } else if (tl_pred_transform == std::string{"hinge"}) {
-        result.element = element_op::hinge;
-      } else if (tl_pred_transform == std::string{"sigmoid"}) {
-        result.constant = concrete_tl_model.param.sigmoid_alpha;
-        result.element  = element_op::sigmoid;
-      } else if (tl_pred_transform == std::string{"exponential"}) {
-        result.element = element_op::exponential;
-      } else if (tl_pred_transform == std::string{"exponential_standard_ratio"}) {
-        result.constant = -concrete_tl_model.param.ratio_c / std::log(2);
-        result.element  = element_op::exponential;
-      } else if (tl_pred_transform == std::string{"logarithm_one_plus_exp"}) {
-        result.element = element_op::logarithm_one_plus_exp;
-      } else if (tl_pred_transform == std::string{"max_index"}) {
-        result.row = row_op::max_index;
-      } else if (tl_pred_transform == std::string{"softmax"}) {
-        result.row = row_op::softmax;
-      } else if (tl_pred_transform == std::string{"multiclass_ova"}) {
-        result.constant = concrete_tl_model.param.sigmoid_alpha;
-        result.element  = element_op::sigmoid;
-      } else {
-        throw model_import_error{"Unrecognized Treelite pred_transform string"};
-      }
-    });
+    auto result            = detail::postproc_params_t{};
+    auto tl_pred_transform = tl_model.postprocessor;
+    if (tl_pred_transform == std::string{"identity"} ||
+        tl_pred_transform == std::string{"identity_multiclass"}) {
+      result.element = element_op::disable;
+      result.row     = row_op::disable;
+    } else if (tl_pred_transform == std::string{"signed_square"}) {
+      result.element = element_op::signed_square;
+    } else if (tl_pred_transform == std::string{"hinge"}) {
+      result.element = element_op::hinge;
+    } else if (tl_pred_transform == std::string{"sigmoid"}) {
+      result.constant = tl_model.sigmoid_alpha;
+      result.element  = element_op::sigmoid;
+    } else if (tl_pred_transform == std::string{"exponential"}) {
+      result.element = element_op::exponential;
+    } else if (tl_pred_transform == std::string{"exponential_standard_ratio"}) {
+      result.constant = -tl_model.ratio_c / std::log(2);
+      result.element  = element_op::exponential;
+    } else if (tl_pred_transform == std::string{"logarithm_one_plus_exp"}) {
+      result.element = element_op::logarithm_one_plus_exp;
+    } else if (tl_pred_transform == std::string{"max_index"}) {
+      result.row = row_op::max_index;
+    } else if (tl_pred_transform == std::string{"softmax"}) {
+      result.row = row_op::softmax;
+    } else if (tl_pred_transform == std::string{"multiclass_ova"}) {
+      result.constant = tl_model.sigmoid_alpha;
+      result.element  = element_op::sigmoid;
+    } else {
+      throw model_import_error{"Unrecognized Treelite pred_transform string"};
+    }
     return result;
   }
 
@@ -563,6 +559,33 @@ struct treelite_importer {
               int device                               = 0,
               raft_proto::cuda_stream stream           = raft_proto::cuda_stream{})
   {
+    ASSERT(tl_model.num_target == 1, "FIL does not support multi-target model");
+    // Check tree annotation (assignment)
+    if (tl_model.task_type == treelite::TaskType::kMultiClf) {
+      // Must be either vector leaf or grove-per-class
+      if (tl_model.leaf_vector_shape[1] > 1) {  // vector-leaf
+        ASSERT(tl_model.leaf_vector_shape[1] == tl_model.num_class[0],
+               "Vector leaf must be equal to num_class = %d",
+               tl_model.num_class[0]);
+        auto tree_count = num_trees(tl_model);
+        for (decltype(tree_count) tree_id = 0; tree_id < tree_count; ++tree_id) {
+          ASSERT(tl_model.class_id[tree_id] == -1, "Tree %d has invalid class assignment", tree_id);
+        }
+      } else {  // grove-per-class
+        auto tree_count = num_trees(tl_model);
+        for (decltype(tree_count) tree_id = 0; tree_id < tree_count; ++tree_id) {
+          ASSERT(tl_model.class_id[tree_id] == tree_id % tl_model.num_class[0],
+                 "Tree %d has invalid class assignment",
+                 tree_id);
+        }
+      }
+    }
+    // Check base_scores
+    for (std::int32_t class_id = 1; class_id < tl_model.num_class[0]; ++class_id) {
+      ASSERT(tl_model.base_scores[0] == tl_model.base_scores[class_id],
+             "base_scores must be identical for all classes");
+    }
+
     auto result                = decision_forest_variant{};
     auto num_feature           = get_num_feature(tl_model);
     auto max_num_categories    = get_max_num_categories(tl_model);
@@ -675,7 +698,7 @@ auto import_from_treelite_model(treelite::Model const& tl_model,
  * @param stream The CUDA stream to use for loading this model (can be
  * omitted for CPU).
  */
-auto import_from_treelite_handle(ModelHandle tl_handle,
+auto import_from_treelite_handle(TreeliteModelHandle tl_handle,
                                  tree_layout layout                       = preferred_tree_layout,
                                  index_type align_bytes                   = index_type{},
                                  std::optional<bool> use_double_precision = std::nullopt,
diff --git a/cpp/include/cuml/explainer/tree_shap.hpp b/cpp/include/cuml/explainer/tree_shap.hpp
index b7e838bdfe..6b8b0f75d9 100644
--- a/cpp/include/cuml/explainer/tree_shap.hpp
+++ b/cpp/include/cuml/explainer/tree_shap.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,7 +33,7 @@ using TreePathHandle =
 
 using FloatPointer = std::variant<float*, double*>;
 
-TreePathHandle extract_path_info(ModelHandle model);
+TreePathHandle extract_path_info(TreeliteModelHandle model);
 
 void gpu_treeshap(TreePathHandle path_info,
                   const FloatPointer data,
@@ -67,4 +67,4 @@ void gpu_treeshap_taylor_interactions(TreePathHandle path_info,
                                       std::size_t out_preds_size);
 
 }  // namespace Explainer
-}  // namespace ML
\ No newline at end of file
+}  // namespace ML
diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h
index a5b0b6b2aa..4058590c49 100644
--- a/cpp/include/cuml/fil/fil.h
+++ b/cpp/include/cuml/fil/fil.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -137,7 +137,7 @@ struct treelite_params_t {
  */
 void from_treelite(const raft::handle_t& handle,
                    forest_variant* pforest,
-                   ModelHandle model,
+                   TreeliteModelHandle model,
                    const treelite_params_t* tl_params);
 
 /** free deletes forest and all resources held by it; after this, forest is no longer usable
diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh
index eac66f1e16..25143337e7 100644
--- a/cpp/src/decisiontree/decisiontree.cuh
+++ b/cpp/src/decisiontree/decisiontree.cuh
@@ -162,6 +162,7 @@ tl::Tree<T, T> build_treelite_tree(const DT::TreeMetaDataNode<T, L>& rf_tree,
 
   tl::Tree<T, T> tl_tree;
   tl_tree.Init();
+  tl_tree.AllocNode();  // Allocate the root node
 
   // Track head and tail of bounded "queues" (implemented as vectors for
   // performance)
@@ -185,18 +186,20 @@ tl::Tree<T, T> build_treelite_tree(const DT::TreeMetaDataNode<T, L>& rf_tree,
       ++cur_front;
 
       if (!q_node.IsLeaf()) {
-        tl_tree.AddChilds(tl_node_id);
+        const int cleft  = tl_tree.AllocNode();
+        const int cright = tl_tree.AllocNode();
+        tl_tree.SetChildren(tl_node_id, cleft, cright);
 
         // Push left child to next_level queue.
-        next_level_queue[next_end] = {q_node.LeftChildId(), tl_tree.LeftChild(tl_node_id)};
+        next_level_queue[next_end] = {q_node.LeftChildId(), cleft};
         ++next_end;
 
         // Push right child to next_level queue.
-        next_level_queue[next_end] = {q_node.RightChildId(), tl_tree.RightChild(tl_node_id)};
+        next_level_queue[next_end] = {q_node.RightChildId(), cright};
         ++next_end;
 
         // Set node from current level as numerical node. Children IDs known.
-        tl_tree.SetNumericalSplit(
+        tl_tree.SetNumericalTest(
           tl_node_id, q_node.ColumnId(), q_node.QueryValue(), true, tl::Operator::kLE);
 
       } else {
diff --git a/cpp/src/explainer/tree_shap.cu b/cpp/src/explainer/tree_shap.cu
index 926125d3ab..b05a6f7c3c 100644
--- a/cpp/src/explainer/tree_shap.cu
+++ b/cpp/src/explainer/tree_shap.cu
@@ -33,6 +33,9 @@
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/memory.h>
+#include <treelite/enum/operator.h>
+#include <treelite/enum/task_type.h>
+#include <treelite/enum/tree_node_type.h>
 #include <treelite/tree.h>
 #include <type_traits>
 #include <variant>
@@ -109,11 +112,11 @@ template <bool is_device>
 using CatBitField = BitField<CatBitFieldStorageT, is_device>;
 using CatT        = std::uint32_t;
 
-template <typename ThresholdType>
+template <typename ThresholdT>
 struct SplitCondition {
   SplitCondition() = default;
-  SplitCondition(ThresholdType feature_lower_bound,
-                 ThresholdType feature_upper_bound,
+  SplitCondition(ThresholdT feature_lower_bound,
+                 ThresholdT feature_upper_bound,
                  bool is_missing_branch,
                  tl::Operator comparison_op,
                  CatBitField<false> categories)
@@ -134,8 +137,8 @@ struct SplitCondition {
   }
 
   // Lower and upper bounds on feature values flowing down this path
-  ThresholdType feature_lower_bound;
-  ThresholdType feature_upper_bound;
+  ThresholdT feature_lower_bound;
+  ThresholdT feature_upper_bound;
   bool is_missing_branch;
   // Comparison operator used in the test. For now only < (kLT) and <= (kLE)
   // are supported.
@@ -145,16 +148,16 @@ struct SplitCondition {
   CatBitField<true> d_categories;
 
   // Does this instance flow down this path?
-  __host__ __device__ bool EvaluateSplit(ThresholdType x) const
+  __host__ __device__ bool EvaluateSplit(ThresholdT x) const
   {
 #ifdef __CUDA_ARCH__
     constexpr bool is_device = true;
 #else  // __CUDA_ARCH__
     constexpr bool is_device = false;
 #endif
-    static_assert(std::is_floating_point<ThresholdType>::value, "x must be a floating point type");
+    static_assert(std::is_floating_point<ThresholdT>::value, "x must be a floating point type");
     auto max_representable_int =
-      static_cast<ThresholdType>(uint64_t(1) << std::numeric_limits<ThresholdType>::digits);
+      static_cast<ThresholdT>(uint64_t(1) << std::numeric_limits<ThresholdT>::digits);
     if (isnan(x)) { return is_missing_branch; }
     if constexpr (is_device) {
       if (d_categories.Size() != 0) {
@@ -200,12 +203,11 @@ struct SplitCondition {
     is_missing_branch = is_missing_branch && other.is_missing_branch;
   }
 
-  static_assert(std::is_same<ThresholdType, float>::value ||
-                  std::is_same<ThresholdType, double>::value,
-                "ThresholdType must be a float or double");
+  static_assert(std::is_same<ThresholdT, float>::value || std::is_same<ThresholdT, double>::value,
+                "ThresholdT must be a float or double");
 };
 
-template <typename ThresholdType, typename LeafType>
+template <typename ThresholdT, typename LeafT>
 struct CategoricalSplitCounter {
   int n_features;
   std::vector<CatT> n_categories;
@@ -219,12 +221,12 @@ struct CategoricalSplitCounter {
   {
   }
 
-  void node_handler(const tl::Tree<ThresholdType, LeafType>& tree, int, int parent_idx, int, float)
+  void node_handler(const tl::Tree<ThresholdT, LeafT>& tree, int, int parent_idx, int, float)
   {
     const auto split_index = tree.SplitIndex(parent_idx);
-    if (tree.SplitType(parent_idx) == tl::SplitFeatureType::kCategorical) {
+    if (tree.NodeType(parent_idx) == tl::TreeNodeType::kCategoricalTestNode) {
       CatT max_cat = 0;
-      for (CatT cat : tree.MatchingCategories(parent_idx)) {
+      for (CatT cat : tree.CategoryList(parent_idx)) {
         if (cat > max_cat) { max_cat = cat; }
       }
       n_categories[split_index] = std::max(n_categories[split_index], max_cat + 1);
@@ -232,7 +234,7 @@ struct CategoricalSplitCounter {
     feature_id.push_back(split_index);
   }
 
-  void root_handler(const tl::Tree<ThresholdType, LeafType>&, int, int, float)
+  void root_handler(const tl::Tree<ThresholdT, LeafT>&, int, int, float)
   {
     feature_id.push_back(-1);
   }
@@ -240,16 +242,16 @@ struct CategoricalSplitCounter {
   void new_path_handler() {}
 };
 
-template <typename ThresholdType, typename LeafType>
+template <typename ThresholdT, typename LeafT>
 struct PathSegmentExtractor {
-  using PathElementT = gpu_treeshap::PathElement<SplitCondition<ThresholdType>>;
+  using PathElementT = gpu_treeshap::PathElement<SplitCondition<ThresholdT>>;
   std::vector<PathElementT>& path_segments;
   std::size_t& path_idx;
   std::vector<CatBitFieldStorageT>& categorical_bitfields;
   const std::vector<std::size_t>& bitfield_segments;
   std::size_t path_segment_idx;
 
-  static constexpr ThresholdType inf{std::numeric_limits<ThresholdType>::infinity()};
+  static constexpr ThresholdT inf{std::numeric_limits<ThresholdT>::infinity()};
 
   PathSegmentExtractor(std::vector<PathElementT>& path_segments,
                        std::size_t& path_idx,
@@ -263,11 +265,8 @@ struct PathSegmentExtractor {
   {
   }
 
-  void node_handler(const tl::Tree<ThresholdType, LeafType>& tree,
-                    int child_idx,
-                    int parent_idx,
-                    int group_id,
-                    float v)
+  void node_handler(
+    const tl::Tree<ThresholdT, LeafT>& tree, int child_idx, int parent_idx, int group_id, float v)
   {
     double zero_fraction = 1.0;
     bool has_count_info  = false;
@@ -283,11 +282,11 @@ struct PathSegmentExtractor {
     // Encode the range of feature values that flow down this path
     bool is_left_path      = tree.LeftChild(parent_idx) == child_idx;
     bool is_missing_branch = tree.DefaultChild(parent_idx) == child_idx;
-    auto split_type        = tree.SplitType(parent_idx);
-    ThresholdType lower_bound, upper_bound;
+    auto node_type         = tree.NodeType(parent_idx);
+    ThresholdT lower_bound, upper_bound;
     tl::Operator comparison_op;
     CatBitField<false> categories;
-    if (split_type == tl::SplitFeatureType::kCategorical) {
+    if (node_type == tl::TreeNodeType::kCategoricalTestNode) {
       /* Create bit fields to store the list of categories associated with this path.
          The bit fields will be used to quickly decide whether a feature value should
          flow down down this path or not.
@@ -297,13 +296,13 @@ struct PathSegmentExtractor {
       categories = CatBitField<false>(raft::span<CatBitFieldStorageT, false>(
                                         categorical_bitfields.data(), categorical_bitfields.size())
                                         .subspan(bitfield_segments[path_segment_idx], n_bitfields));
-      for (CatT cat : tree.MatchingCategories(parent_idx)) {
+      for (CatT cat : tree.CategoryList(parent_idx)) {
         categories.Set(static_cast<std::size_t>(cat));
       }
       // If this path is not the path that's taken when the categorical test evaluates to be true,
       // then flip all the bits in the bit fields. This step is needed because we first built
       // the bit fields according to the list given in the categorical test.
-      bool use_right = tree.CategoriesListRightChild(parent_idx);
+      bool use_right = tree.CategoryListRightChild(parent_idx);
       if ((use_right && is_left_path) || (!use_right && !is_left_path)) {
         for (std::size_t i = bitfield_segments[path_segment_idx];
              i < bitfield_segments[path_segment_idx + 1];
@@ -315,16 +314,16 @@ struct PathSegmentExtractor {
       upper_bound   = inf;
       comparison_op = tl::Operator::kNone;
     } else {
-      if (split_type != tl::SplitFeatureType::kNumerical) {
+      if (node_type != tl::TreeNodeType::kNumericalTestNode) {
         // Assume: split is either numerical or categorical
-        RAFT_FAIL("Unexpected split type: %d", static_cast<int>(split_type));
+        RAFT_FAIL("Unexpected node type: %d", static_cast<int>(node_type));
       }
       categories    = CatBitField<false>{};
       lower_bound   = is_left_path ? -inf : tree.Threshold(parent_idx);
       upper_bound   = is_left_path ? tree.Threshold(parent_idx) : inf;
       comparison_op = tree.ComparisonOp(parent_idx);
     }
-    path_segments.push_back(gpu_treeshap::PathElement<SplitCondition<ThresholdType>>{
+    path_segments.push_back(gpu_treeshap::PathElement<SplitCondition<ThresholdT>>{
       path_idx,
       tree.SplitIndex(parent_idx),
       group_id,
@@ -334,14 +333,11 @@ struct PathSegmentExtractor {
     ++path_segment_idx;
   }
 
-  void root_handler(const tl::Tree<ThresholdType, LeafType>& tree,
-                    int child_idx,
-                    int group_id,
-                    float v)
+  void root_handler(const tl::Tree<ThresholdT, LeafT>& tree, int child_idx, int group_id, float v)
   {
     // Root node has feature -1
     auto comparison_op = tree.ComparisonOp(child_idx);
-    path_segments.push_back(gpu_treeshap::PathElement<SplitCondition<ThresholdType>>{
+    path_segments.push_back(gpu_treeshap::PathElement<SplitCondition<ThresholdT>>{
       path_idx, -1, group_id, SplitCondition{-inf, inf, false, comparison_op, {}}, 1.0, v});
     ++path_segment_idx;
   }
@@ -352,23 +348,21 @@ struct PathSegmentExtractor {
 };  // namespace
 namespace ML {
 namespace Explainer {
-template <typename ThresholdType>
+template <typename ThresholdT>
 class TreePathInfo {
  public:
   int num_tree;
   float global_bias;
   std::size_t num_groups = 1;
   tl::TaskType task_type;
-  tl::TaskParam task_param;
   bool average_tree_output;
-  thrust::device_vector<gpu_treeshap::PathElement<SplitCondition<ThresholdType>>> path_segments;
+  thrust::device_vector<gpu_treeshap::PathElement<SplitCondition<ThresholdT>>> path_segments;
   thrust::device_vector<CatBitFieldStorageT> categorical_bitfields;
   // bitfield_segments[I]: cumulative total count of all bit fields for path segments
   //                       0, 1, ..., I-1
 
-  static_assert(std::is_same<ThresholdType, float>::value ||
-                  std::is_same<ThresholdType, double>::value,
-                "ThresholdType must be a float or double");
+  static_assert(std::is_same<ThresholdT, float>::value || std::is_same<ThresholdT, double>::value,
+                "ThresholdT must be a float or double");
 };
 }  // namespace Explainer
 }  // namespace ML
@@ -540,8 +534,8 @@ namespace ML {
 namespace Explainer {
 // Traverse a path from the root node to a leaf node and call the handler functions for each node.
 // The fields group_id and v (leaf value) will be passed to the handler.
-template <typename ThresholdType, typename LeafType, typename PathHandler>
-void traverse_towards_leaf_node(const tl::Tree<ThresholdType, LeafType>& tree,
+template <typename ThresholdT, typename LeafT, typename PathHandler>
+void traverse_towards_leaf_node(const tl::Tree<ThresholdT, LeafT>& tree,
                                 int leaf_node_id,
                                 int group_id,
                                 float v,
@@ -559,8 +553,8 @@ void traverse_towards_leaf_node(const tl::Tree<ThresholdType, LeafType>& tree,
 }
 
 // Visit every path segments in a single tree and call handler functions for each segment.
-template <typename ThresholdType, typename LeafType, typename PathHandler>
-void visit_path_segments_in_tree(const std::vector<tl::Tree<ThresholdType, LeafType>>& tree_list,
+template <typename ThresholdT, typename LeafT, typename PathHandler>
+void visit_path_segments_in_tree(const std::vector<tl::Tree<ThresholdT, LeafT>>& tree_list,
                                  std::size_t tree_idx,
                                  bool use_vector_leaf,
                                  int num_groups,
@@ -568,7 +562,7 @@ void visit_path_segments_in_tree(const std::vector<tl::Tree<ThresholdType, LeafT
 {
   if (num_groups < 1) { RAFT_FAIL("num_groups must be at least 1"); }
 
-  const tl::Tree<ThresholdType, LeafType>& tree = tree_list[tree_idx];
+  const tl::Tree<ThresholdT, LeafT>& tree = tree_list[tree_idx];
 
   // Compute parent ID of each node
   std::vector<int> parent_id(tree.num_nodes, -1);
@@ -606,40 +600,38 @@ void visit_path_segments_in_tree(const std::vector<tl::Tree<ThresholdType, LeafT
 }
 
 // Visit every path segments in the whole tree ensemble model
-template <typename ThresholdType, typename LeafType, typename PathHandler>
-void visit_path_segments_in_model(const tl::ModelImpl<ThresholdType, LeafType>& model,
+template <typename ThresholdT, typename LeafT, typename PathHandler>
+void visit_path_segments_in_model(const tl::Model& model,
+                                  const tl::ModelPreset<ThresholdT, LeafT>& model_preset,
                                   PathHandler& path_handler)
 {
   int num_groups = 1;
   bool use_vector_leaf;
-  if (model.task_param.num_class > 1) { num_groups = model.task_param.num_class; }
-  if (model.task_type == tl::TaskType::kBinaryClfRegr ||
-      model.task_type == tl::TaskType::kMultiClfGrovePerClass) {
+  ASSERT(model.num_target == 1, "TreeExplainer currently does not support multi-target models");
+  if (model.num_class[0] > 1) { num_groups = model.num_class[0]; }
+  if (model.leaf_vector_shape[0] == 1 && model.leaf_vector_shape[1] == 1) {
     use_vector_leaf = false;
-  } else if (model.task_type == tl::TaskType::kMultiClfProbDistLeaf) {
-    use_vector_leaf = true;
   } else {
-    RAFT_FAIL("Unsupported task_type: %d", static_cast<int>(model.task_type));
+    use_vector_leaf = true;
   }
 
-  for (std::size_t tree_idx = 0; tree_idx < model.trees.size(); ++tree_idx) {
-    visit_path_segments_in_tree(model.trees, tree_idx, use_vector_leaf, num_groups, path_handler);
+  for (std::size_t tree_idx = 0; tree_idx < model_preset.trees.size(); ++tree_idx) {
+    visit_path_segments_in_tree(
+      model_preset.trees, tree_idx, use_vector_leaf, num_groups, path_handler);
   }
 }
 
 // Traverse a path from the root node to a leaf node and return the list of the path segments
 // Note: the path segments will have missing values in path_idx, group_id and v (leaf value).
 //       The caller is responsible for filling in these fields.
-template <typename ThresholdType, typename LeafType>
-std::vector<gpu_treeshap::PathElement<SplitCondition<ThresholdType>>> traverse_towards_leaf_node(
-  const tl::Tree<ThresholdType, LeafType>& tree,
-  int leaf_node_id,
-  const std::vector<int>& parent_id)
+template <typename ThresholdT, typename LeafT>
+std::vector<gpu_treeshap::PathElement<SplitCondition<ThresholdT>>> traverse_towards_leaf_node(
+  const tl::Tree<ThresholdT, LeafT>& tree, int leaf_node_id, const std::vector<int>& parent_id)
 {
-  std::vector<gpu_treeshap::PathElement<SplitCondition<ThresholdType>>> path_segments;
+  std::vector<gpu_treeshap::PathElement<SplitCondition<ThresholdT>>> path_segments;
   int child_idx              = leaf_node_id;
   int parent_idx             = parent_id[child_idx];
-  constexpr auto inf         = std::numeric_limits<ThresholdType>::infinity();
+  constexpr auto inf         = std::numeric_limits<ThresholdT>::infinity();
   tl::Operator comparison_op = tl::Operator::kNone;
   while (parent_idx != -1) {
     double zero_fraction = 1.0;
@@ -655,15 +647,15 @@ std::vector<gpu_treeshap::PathElement<SplitCondition<ThresholdType>>> traverse_t
     if (!has_count_info) { RAFT_FAIL("Tree model doesn't have data count information"); }
     // Encode the range of feature values that flow down this path
     bool is_left_path = tree.LeftChild(parent_idx) == child_idx;
-    if (tree.SplitType(parent_idx) == tl::SplitFeatureType::kCategorical) {
+    if (tree.NodeType(parent_idx) == tl::TreeNodeType::kCategoricalTestNode) {
       RAFT_FAIL(
         "Only trees with numerical splits are supported. "
         "Trees with categorical splits are not supported yet.");
     }
-    ThresholdType lower_bound = is_left_path ? -inf : tree.Threshold(parent_idx);
-    ThresholdType upper_bound = is_left_path ? tree.Threshold(parent_idx) : inf;
-    comparison_op             = tree.ComparisonOp(parent_idx);
-    path_segments.push_back(gpu_treeshap::PathElement<SplitCondition<ThresholdType>>{
+    ThresholdT lower_bound = is_left_path ? -inf : tree.Threshold(parent_idx);
+    ThresholdT upper_bound = is_left_path ? tree.Threshold(parent_idx) : inf;
+    comparison_op          = tree.ComparisonOp(parent_idx);
+    path_segments.push_back(gpu_treeshap::PathElement<SplitCondition<ThresholdT>>{
       ~std::size_t(0),
       tree.SplitIndex(parent_idx),
       -1,
@@ -676,31 +668,25 @@ std::vector<gpu_treeshap::PathElement<SplitCondition<ThresholdType>>> traverse_t
   // Root node has feature -1
   comparison_op = tree.ComparisonOp(child_idx);
   // Build temporary path segments with unknown path_idx, group_id and leaf value
-  path_segments.push_back(gpu_treeshap::PathElement<SplitCondition<ThresholdType>>{
-    ~std::size_t(0),
-    -1,
-    -1,
-    SplitCondition{-inf, inf, comparison_op},
-    1.0,
-    std::numeric_limits<float>::quiet_NaN()});
+  path_segments.push_back(
+    gpu_treeshap::PathElement<SplitCondition<ThresholdT>>{~std::size_t(0),
+                                                          -1,
+                                                          -1,
+                                                          SplitCondition{-inf, inf, comparison_op},
+                                                          1.0,
+                                                          std::numeric_limits<float>::quiet_NaN()});
   return path_segments;
 }
 
-template <typename ThresholdType, typename LeafType>
-TreePathHandle extract_path_info_impl(const tl::ModelImpl<ThresholdType, LeafType>& model)
+template <typename ThresholdT, typename LeafT>
+TreePathHandle extract_path_info_impl(const tl::Model& model,
+                                      const tl::ModelPreset<ThresholdT, LeafT>& model_preset)
 {
-  if (!std::is_same<ThresholdType, LeafType>::value) {
-    RAFT_FAIL("ThresholdType and LeafType must be identical");
-  }
-  if (!std::is_same<ThresholdType, float>::value && !std::is_same<ThresholdType, double>::value) {
-    RAFT_FAIL("ThresholdType must be either float32 or float64");
-  }
-
-  auto path_info = std::make_shared<TreePathInfo<ThresholdType>>();
+  auto path_info = std::make_shared<TreePathInfo<ThresholdT>>();
 
   /* 1. Scan the model for categorical splits and pre-allocate bit fields. */
-  CategoricalSplitCounter<ThresholdType, LeafType> cat_counter{model.num_feature};
-  visit_path_segments_in_model(model, cat_counter);
+  CategoricalSplitCounter<ThresholdT, LeafT> cat_counter{model.num_feature};
+  visit_path_segments_in_model(model, model_preset, cat_counter);
 
   std::size_t n_path_segments = cat_counter.feature_id.size();
   std::vector<std::size_t> n_bitfields(n_path_segments, 0);
@@ -723,10 +709,10 @@ TreePathHandle extract_path_info_impl(const tl::ModelImpl<ThresholdType, LeafTyp
   // Each path segment will have path_idx field, which uniquely identifies the path to which the
   // segment belongs.
   std::size_t path_idx = 0;
-  std::vector<gpu_treeshap::PathElement<SplitCondition<ThresholdType>>> path_segments;
-  PathSegmentExtractor<ThresholdType, LeafType> path_extractor{
+  std::vector<gpu_treeshap::PathElement<SplitCondition<ThresholdT>>> path_segments;
+  PathSegmentExtractor<ThresholdT, LeafT> path_extractor{
     path_segments, path_idx, categorical_bitfields, bitfield_segments};
-  visit_path_segments_in_model(model, path_extractor);
+  visit_path_segments_in_model(model, model_preset, path_extractor);
 
   // Marshall bit fields to GPU memory
   path_info->categorical_bitfields = thrust::device_vector<CatBitFieldStorageT>(
@@ -741,26 +727,22 @@ TreePathHandle extract_path_info_impl(const tl::ModelImpl<ThresholdType, LeafTyp
   }
 
   path_info->path_segments       = path_segments;
-  path_info->global_bias         = model.param.global_bias;
+  path_info->global_bias         = model.base_scores[0];
   path_info->task_type           = model.task_type;
-  path_info->task_param          = model.task_param;
   path_info->average_tree_output = model.average_tree_output;
-  path_info->num_tree            = static_cast<int>(model.trees.size());
-  if (path_info->task_param.num_class > 1) {
-    path_info->num_groups = static_cast<std::size_t>(path_info->task_param.num_class);
-  }
+  path_info->num_tree            = static_cast<int>(model_preset.trees.size());
+  path_info->num_groups          = static_cast<std::size_t>(model.num_class[0]);
 
   return path_info;
 }
 
-TreePathHandle extract_path_info(ModelHandle model)
+TreePathHandle extract_path_info(TreeliteModelHandle model)
 {
   const tl::Model& model_ref = *static_cast<tl::Model*>(model);
 
-  return model_ref.Dispatch([&](const auto& model_inner) {
-    // model_inner is of the concrete type tl::ModelImpl<threshold_t, leaf_t>
-    return extract_path_info_impl(model_inner);
-  });
+  return std::visit(
+    [&](auto&& model_preset) { return extract_path_info_impl(model_ref, model_preset); },
+    model_ref.variant_);
 }
 
 template <typename VariantT, typename... Targs>
diff --git a/cpp/src/fil/internal.cuh b/cpp/src/fil/internal.cuh
index 0d10feac31..a7ed967af1 100644
--- a/cpp/src/fil/internal.cuh
+++ b/cpp/src/fil/internal.cuh
@@ -37,7 +37,7 @@ class handle_t;
 // needed for node_traits<...>
 namespace treelite {
 template <typename, typename>
-struct ModelImpl;
+struct ModelPreset;
 }
 
 namespace ML {
@@ -244,7 +244,8 @@ struct node_traits {
   static constexpr storage_type_t storage_type_enum =
     std::is_same_v<sparse_node16<real_type>, node_t> ? SPARSE : SPARSE8;
   template <typename threshold_t, typename leaf_t>
-  static void check(const treelite::ModelImpl<threshold_t, leaf_t>& model);
+  static void check(const treelite::Model& model,
+                    const treelite::ModelPreset<threshold_t, leaf_t>& model_preset);
 };
 
 template <typename real_t>
@@ -254,7 +255,8 @@ struct node_traits<dense_node<real_t>> {
   static const bool IS_DENSE                    = true;
   static const storage_type_t storage_type_enum = DENSE;
   template <typename threshold_t, typename leaf_t>
-  static void check(const treelite::ModelImpl<threshold_t, leaf_t>& model)
+  static void check(const treelite::Model& model,
+                    const treelite::ModelPreset<threshold_t, leaf_t>& model_preset)
   {
   }
 };
diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu
index 905e282e84..59dc21132b 100644
--- a/cpp/src/fil/treelite_import.cu
+++ b/cpp/src/fil/treelite_import.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,9 +28,10 @@
 #include <raft/core/handle.hpp>        // for handle_t
 #include <raft/util/cudart_utils.hpp>  // for RAFT_CUDA_TRY
 
-#include <treelite/base.h>   // for Operator, SplitFeatureType, kGE, kGT, kLE, kLT, kNumerical
-#include <treelite/c_api.h>  // for ModelHandle
-#include <treelite/tree.h>   // for Tree, Model, ModelImpl, ModelParam
+#include <treelite/c_api.h>                // for TreeliteModelHandle
+#include <treelite/enum/operator.h>        // for Operator
+#include <treelite/enum/tree_node_type.h>  // for TreeNodeType
+#include <treelite/tree.h>                 // for Tree, Model, ModelPreset
 
 #include <omp.h>  // for omp
 
@@ -44,6 +45,7 @@
 #include <stack>        // for std::stack
 #include <string>       // for std::string
 #include <type_traits>  // for std::is_same
+#include <variant>      // for std::variant, std::visit
 
 namespace ML {
 namespace fil {
@@ -140,10 +142,10 @@ inline int max_depth(const tl::Tree<T, L>& tree)
 }
 
 template <typename T, typename L>
-int max_depth(const tl::ModelImpl<T, L>& model)
+int max_depth(const tl::ModelPreset<T, L>& model_preset)
 {
   int depth         = 0;
-  const auto& trees = model.trees;
+  const auto& trees = model_preset.trees;
 #pragma omp parallel for reduction(max : depth)
   for (size_t i = 0; i < trees.size(); ++i) {
     const auto& tree = trees[i];
@@ -165,8 +167,8 @@ inline std::vector<cat_feature_counters> cat_counter_vec(const tl::Tree<T, L>& t
 {
   std::vector<cat_feature_counters> res(n_cols);
   walk_tree(tree, [&](int node_id) {
-    if (tree.SplitType(node_id) == tl::SplitFeatureType::kCategorical) {
-      std::vector<std::uint32_t> mmv = tree.MatchingCategories(node_id);
+    if (tree.NodeType(node_id) == tl::TreeNodeType::kCategoricalTestNode) {
+      std::vector<std::uint32_t> mmv = tree.CategoryList(node_id);
       int max_matching_cat;
       if (mmv.size() > 0) {
         // in `struct cat_feature_counters` and GPU structures, int(max_matching_cat) is safe
@@ -193,8 +195,8 @@ inline std::size_t bit_pool_size(const tl::Tree<T, L>& tree, const categorical_s
 {
   std::size_t size = 0;
   walk_tree(tree, [&](int node_id) {
-    if (tree.SplitType(node_id) == tl::SplitFeatureType::kCategorical &&
-        tree.MatchingCategories(node_id).size() > 0) {
+    if (tree.NodeType(node_id) == tl::TreeNodeType::kCategoricalTestNode &&
+        tree.CategoryList(node_id).size() > 0) {
       size += cat_sets.sizeof_mask(tree.SplitIndex(node_id));
     }
   });
@@ -202,12 +204,13 @@ inline std::size_t bit_pool_size(const tl::Tree<T, L>& tree, const categorical_s
 }
 
 template <typename T, typename L>
-cat_sets_owner allocate_cat_sets_owner(const tl::ModelImpl<T, L>& model)
+cat_sets_owner allocate_cat_sets_owner(const tl::Model& model,
+                                       const tl::ModelPreset<T, L>& model_preset)
 {
 #pragma omp declare reduction(                                                     \
     cat_counter_vec_red : std::vector<cat_feature_counters> : elementwise_combine( \
         omp_out, omp_in)) initializer(omp_priv = omp_orig)
-  const auto& trees = model.trees;
+  const auto& trees = model_preset.trees;
   cat_sets_owner cat_sets;
   std::vector<cat_feature_counters> counters(model.num_feature);
 #pragma omp parallel for reduction(cat_counter_vec_red : counters)
@@ -329,22 +332,22 @@ conversion_state<fil_node_t> tl2fil_inner_node(int fil_left_child,
   int tl_left = tree.LeftChild(tl_node_id), tl_right = tree.RightChild(tl_node_id);
   val_t<real_t> split = {.f = std::numeric_limits<real_t>::quiet_NaN()};
   int feature_id      = tree.SplitIndex(tl_node_id);
-  bool is_categorical = tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical &&
-                        tree.MatchingCategories(tl_node_id).size() > 0;
+  bool is_categorical = tree.NodeType(tl_node_id) == tl::TreeNodeType::kCategoricalTestNode &&
+                        tree.CategoryList(tl_node_id).size() > 0;
   bool swap_child_nodes = false;
-  if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kNumerical) {
+  if (tree.NodeType(tl_node_id) == tl::TreeNodeType::kNumericalTestNode) {
     split.f = static_cast<real_t>(tree.Threshold(tl_node_id));
     adjust_threshold(&split.f, &swap_child_nodes, tree.ComparisonOp(tl_node_id));
-  } else if (tree.SplitType(tl_node_id) == tl::SplitFeatureType::kCategorical) {
+  } else if (tree.NodeType(tl_node_id) == tl::TreeNodeType::kCategoricalTestNode) {
     // for FIL, the list of categories is always for the right child
-    swap_child_nodes = !tree.CategoriesListRightChild(tl_node_id);
-    if (tree.MatchingCategories(tl_node_id).size() > 0) {
+    swap_child_nodes = !tree.CategoryListRightChild(tl_node_id);
+    if (tree.CategoryList(tl_node_id).size() > 0) {
       int sizeof_mask = cat_sets->accessor().sizeof_mask(feature_id);
       split.idx       = *bit_pool_offset;
       *bit_pool_offset += sizeof_mask;
       // cat_sets->bits have been zero-initialized
       uint8_t* bits = &cat_sets->bits[split.idx];
-      for (std::uint32_t category : tree.MatchingCategories(tl_node_id)) {
+      for (std::uint32_t category : tree.CategoryList(tl_node_id)) {
         bits[category / BITS_PER_BYTE] |= 1 << (category % BITS_PER_BYTE);
       }
     } else {
@@ -420,11 +423,11 @@ inline void node_depth_hist(const tl::Tree<T, L>& tree, std::vector<level_entry>
 }
 
 template <typename T, typename L>
-std::stringstream depth_hist_and_max(const tl::ModelImpl<T, L>& model)
+std::stringstream depth_hist_and_max(const tl::ModelPreset<T, L>& model_preset)
 {
   using namespace std;
   vector<level_entry> hist;
-  for (const auto& tree : model.trees)
+  for (const auto& tree : model_preset.trees)
     node_depth_hist(tree, hist);
 
   int min_leaf_depth = -1, leaves_times_depth = 0, total_branches = 0, total_leaves = 0;
@@ -464,22 +467,18 @@ std::stringstream depth_hist_and_max(const tl::ModelImpl<T, L>& model)
   return forest_shape;
 }
 
-template <typename T, typename L>
-size_t tl_leaf_vector_size(const tl::ModelImpl<T, L>& model)
+size_t tl_leaf_vector_size(const tl::Model& model)
 {
-  const tl::Tree<T, L>& tree = model.trees[0];
-  int node_key;
-  for (node_key = tree_root(tree); !tree.IsLeaf(node_key); node_key = tree.RightChild(node_key))
-    ;
-  if (tree.HasLeafVector(node_key)) return tree.LeafVector(node_key).size();
-  return 0;
+  auto size = static_cast<size_t>(model.leaf_vector_shape[0] * model.leaf_vector_shape[1]);
+  return (size == 1 ? 0 : size);  // 0 tells callers the leaves are scalar, not vectors
 }
 
 // tl2fil_common is the part of conversion from a treelite model
 // common for dense and sparse forests
 template <typename T, typename L>
 void tl2fil_common(forest_params_t* params,
-                   const tl::ModelImpl<T, L>& model,
+                   const tl::Model& model,
+                   const tl::ModelPreset<T, L>& model_preset,
                    const treelite_params_t* tl_params)
 {
   // fill in forest-independent params
@@ -487,31 +486,38 @@ void tl2fil_common(forest_params_t* params,
   params->threshold = tl_params->threshold;
 
   // fill in forest-dependent params
-  params->depth = max_depth(model);  // also checks for cycles
+  params->depth = max_depth(model_preset);  // also checks for cycles
 
-  const tl::ModelParam& param = model.param;
+  ASSERT(model.num_target == 1, "FIL does not support multi-target models");
 
   // assuming either all leaves use the .leaf_vector() or all leaves use .leaf_value()
   size_t leaf_vec_size = tl_leaf_vector_size(model);
-  std::string pred_transform(param.pred_transform);
+  std::string pred_transform(model.postprocessor);
   if (leaf_vec_size > 0) {
-    ASSERT(leaf_vec_size == model.task_param.num_class, "treelite model inconsistent");
+    ASSERT(leaf_vec_size == model.num_class[0], "treelite model inconsistent");
     params->num_classes = leaf_vec_size;
     params->leaf_algo   = leaf_algo_t::VECTOR_LEAF;
 
     ASSERT(pred_transform == "max_index" || pred_transform == "identity_multiclass",
            "only max_index and identity_multiclass values of pred_transform "
-           "are supported for multi-class models");
-
+           "are supported for multi-class models. pred_transform = %s",
+           pred_transform.c_str());
   } else {
-    if (model.task_param.num_class > 1) {
-      params->num_classes = static_cast<int>(model.task_param.num_class);
+    if (model.num_class[0] > 1) {
+      params->num_classes = static_cast<int>(model.num_class[0]);
       ASSERT(tl_params->output_class, "output_class==true is required for multi-class models");
       ASSERT(pred_transform == "identity_multiclass" || pred_transform == "max_index" ||
                pred_transform == "softmax" || pred_transform == "multiclass_ova",
              "only identity_multiclass, max_index, multiclass_ova and softmax "
              "values of pred_transform are supported for xgboost-style "
              "multi-class classification models.");
+      // Ensure that the trees follow the grove-per-class layout.
+      for (size_t tree_id = 0; tree_id < model_preset.trees.size(); ++tree_id) {
+        ASSERT(model.target_id[tree_id] == 0, "FIL does not support multi-target models");
+        ASSERT(model.class_id[tree_id] == tree_id % static_cast<size_t>(model.num_class[0]),
+               "The tree model is not compatible with FIL; the trees must be laid out "
+               "such that tree i's output contributes towards class (i %% num_class).");
+      }
       // this function should not know how many threads per block will be used
       params->leaf_algo = leaf_algo_t::GROVE_PER_CLASS;
     } else {
@@ -525,8 +531,13 @@ void tl2fil_common(forest_params_t* params,
 
   params->num_cols = model.num_feature;
 
-  ASSERT(param.sigmoid_alpha == 1.0f, "sigmoid_alpha not supported");
-  params->global_bias = param.global_bias;
+  ASSERT(model.sigmoid_alpha == 1.0f, "sigmoid_alpha not supported");
+  // Check base_scores
+  for (std::int32_t class_id = 1; class_id < model.num_class[0]; ++class_id) {
+    ASSERT(model.base_scores[0] == model.base_scores[class_id],
+           "base_scores must be identical for all classes");
+  }
+  params->global_bias = model.base_scores[0];
   params->output      = output_t::RAW;
   /** output_t::CLASS denotes using a threshold in FIL, when
       predict_proba == false. For all multiclass models, the best class is
@@ -543,7 +554,7 @@ void tl2fil_common(forest_params_t* params,
     params->output = output_t(params->output | output_t::SIGMOID);
   }
   if (pred_transform == "softmax") params->output = output_t(params->output | output_t::SOFTMAX);
-  params->num_trees        = model.trees.size();
+  params->num_trees        = model_preset.trees.size();
   params->blocks_per_sm    = tl_params->blocks_per_sm;
   params->threads_per_tree = tl_params->threads_per_tree;
   params->n_items          = tl_params->n_items;
@@ -551,7 +562,8 @@ void tl2fil_common(forest_params_t* params,
 
 template <typename node_t>
 template <typename threshold_t, typename leaf_t>
-void node_traits<node_t>::check(const treelite::ModelImpl<threshold_t, leaf_t>& model)
+void node_traits<node_t>::check(const treelite::Model& model,
+                                const treelite::ModelPreset<threshold_t, leaf_t>& model_preset)
 {
   if constexpr (!std::is_same<node_t, sparse_node8>()) return;
   const int MAX_FEATURES   = 1 << sparse_node8::FID_NUM_BITS;
@@ -565,7 +577,7 @@ void node_traits<node_t>::check(const treelite::ModelImpl<threshold_t, leaf_t>&
          MAX_FEATURES);
 
   // check the number of tree nodes
-  const std::vector<tl::Tree<threshold_t, leaf_t>>& trees = model.trees;
+  const std::vector<tl::Tree<threshold_t, leaf_t>>& trees = model_preset.trees;
   for (std::size_t i = 0; i < trees.size(); ++i) {
     int num_nodes = trees[i].num_nodes;
     ASSERT(num_nodes <= MAX_TREE_NODES,
@@ -585,25 +597,28 @@ struct tl2fil_t {
   std::vector<real_t> vector_leaf_;
   forest_params_t params_;
   cat_sets_owner cat_sets_;
-  const tl::ModelImpl<threshold_t, leaf_t>& model_;
+  const tl::Model& model_;
+  const tl::ModelPreset<threshold_t, leaf_t>& model_preset_;
   const treelite_params_t& tl_params_;
 
-  tl2fil_t(const tl::ModelImpl<threshold_t, leaf_t>& model_, const treelite_params_t& tl_params_)
-    : model_(model_), tl_params_(tl_params_)
+  tl2fil_t(const tl::Model& model,
+           const tl::ModelPreset<threshold_t, leaf_t>& model_preset,
+           const treelite_params_t& tl_params_)
+    : model_(model), model_preset_(model_preset), tl_params_(tl_params_)
   {
   }
 
   void init()
   {
     static const bool IS_DENSE = node_traits<fil_node_t>::IS_DENSE;
-    tl2fil_common(&params_, model_, &tl_params_);
-    node_traits<fil_node_t>::check(model_);
+    tl2fil_common(&params_, model_, model_preset_, &tl_params_);
+    node_traits<fil_node_t>::check(model_, model_preset_);
 
-    std::size_t num_trees = model_.trees.size();
+    std::size_t num_trees = model_preset_.trees.size();
 
     std::size_t total_nodes = 0;
     roots_.reserve(num_trees);
-    for (auto& tree : model_.trees) {
+    for (auto& tree : model_preset_.trees) {
       roots_.push_back(total_nodes);
       total_nodes += IS_DENSE ? tree_num_nodes(params_.depth) : tree.num_nodes;
     }
@@ -614,7 +629,7 @@ struct tl2fil_t {
       vector_leaf_.resize(max_leaves * params_.num_classes);
     }
 
-    cat_sets_ = allocate_cat_sets_owner(model_);
+    cat_sets_ = allocate_cat_sets_owner(model_, model_preset_);
     nodes_.resize(total_nodes);
 
 // convert the nodes_
@@ -624,7 +639,7 @@ struct tl2fil_t {
       size_t leaf_counter = (roots_[tree_idx] + tree_idx) / 2;
       tree2fil(nodes_,
                roots_[tree_idx],
-               model_.trees[tree_idx],
+               model_preset_.trees[tree_idx],
                tree_idx,
                params_,
                &vector_leaf_,
@@ -644,7 +659,7 @@ struct tl2fil_t {
     // but destructed at the end of this function
     handle.sync_stream(handle.get_stream());
     if (tl_params_.pforest_shape_str) {
-      *tl_params_.pforest_shape_str = sprintf_shape(model_, nodes_, roots_, cat_sets_);
+      *tl_params_.pforest_shape_str = sprintf_shape(model_preset_, nodes_, roots_, cat_sets_);
     }
   }
 };
@@ -652,10 +667,11 @@ struct tl2fil_t {
 template <typename fil_node_t, typename threshold_t, typename leaf_t>
 void convert(const raft::handle_t& handle,
              forest_t<typename fil_node_t::real_type>* pforest,
-             const tl::ModelImpl<threshold_t, leaf_t>& model,
+             const tl::Model& model,  // model-wide metadata (classes, postprocessor, ...)
+             const tl::ModelPreset<threshold_t, leaf_t>& model_preset,  // the typed trees
              const treelite_params_t& tl_params)
 {
-  tl2fil_t<fil_node_t, threshold_t, leaf_t> tl2fil(model, tl_params);
+  tl2fil_t<fil_node_t, threshold_t, leaf_t> tl2fil(model, model_preset, tl_params);
   tl2fil.init();
   tl2fil.init_forest(handle, pforest);
 }
@@ -670,7 +686,8 @@ constexpr bool type_supported()
 template <typename threshold_t, typename leaf_t>
 void from_treelite(const raft::handle_t& handle,
                    forest_variant* pforest_variant,
-                   const tl::ModelImpl<threshold_t, leaf_t>& model,
+                   const tl::Model& model,
+                   const tl::ModelPreset<threshold_t, leaf_t>& model_preset,
                    const treelite_params_t* tl_params)
 {
   precision_t precision = tl_params->precision;
@@ -684,13 +701,13 @@ void from_treelite(const raft::handle_t& handle,
     case PRECISION_FLOAT32: {
       *pforest_variant         = (forest_t<float>)nullptr;
       forest_t<float>* pforest = &std::get<forest_t<float>>(*pforest_variant);
-      from_treelite(handle, pforest, model, tl_params);
+      from_treelite(handle, pforest, model, model_preset, tl_params);
       break;
     }
     case PRECISION_FLOAT64: {
       *pforest_variant          = (forest_t<double>)nullptr;
       forest_t<double>* pforest = &std::get<forest_t<double>>(*pforest_variant);
-      from_treelite(handle, pforest, model, tl_params);
+      from_treelite(handle, pforest, model, model_preset, tl_params);
       break;
     }
     default:
@@ -703,7 +720,8 @@ void from_treelite(const raft::handle_t& handle,
 template <typename threshold_t, typename leaf_t, typename real_t>
 void from_treelite(const raft::handle_t& handle,
                    forest_t<real_t>* pforest,
-                   const tl::ModelImpl<threshold_t, leaf_t>& model,
+                   const tl::Model& model,
+                   const tl::ModelPreset<threshold_t, leaf_t>& model_preset,
                    const treelite_params_t* tl_params)
 {
   // Invariants on threshold and leaf types
@@ -715,11 +733,11 @@ void from_treelite(const raft::handle_t& handle,
   // build dense trees by default
   if (storage_type == storage_type_t::AUTO) {
     if (tl_params->algo == algo_t::ALGO_AUTO || tl_params->algo == algo_t::NAIVE) {
-      int depth = max_depth(model);
+      int depth = max_depth(model_preset);
       // max 2**25 dense nodes, 256 MiB dense model size. Categorical mask size is unlimited and not
       // affected by storage format.
       const int LOG2_MAX_DENSE_NODES = 25;
-      int log2_num_dense_nodes       = depth + 1 + int(ceil(std::log2(model.trees.size())));
+      int log2_num_dense_nodes       = depth + 1 + int(ceil(std::log2(model_preset.trees.size())));
       storage_type = log2_num_dense_nodes > LOG2_MAX_DENSE_NODES ? storage_type_t::SPARSE
                                                                  : storage_type_t::DENSE;
     } else {
@@ -730,15 +748,15 @@ void from_treelite(const raft::handle_t& handle,
 
   switch (storage_type) {
     case storage_type_t::DENSE:
-      convert<dense_node<real_t>>(handle, pforest, model, *tl_params);
+      convert<dense_node<real_t>>(handle, pforest, model, model_preset, *tl_params);
       break;
     case storage_type_t::SPARSE:
-      convert<sparse_node16<real_t>>(handle, pforest, model, *tl_params);
+      convert<sparse_node16<real_t>>(handle, pforest, model, model_preset, *tl_params);
       break;
     case storage_type_t::SPARSE8:
       // SPARSE8 is only supported for float32
       if constexpr (std::is_same_v<real_t, float>) {
-        convert<sparse_node8>(handle, pforest, model, *tl_params);
+        convert<sparse_node8>(handle, pforest, model, model_preset, *tl_params);
       } else {
         ASSERT(false, "SPARSE8 is only supported for float32 treelite models");
       }
@@ -749,24 +767,26 @@ void from_treelite(const raft::handle_t& handle,
 
 void from_treelite(const raft::handle_t& handle,
                    forest_variant* pforest,
-                   ModelHandle model,
+                   TreeliteModelHandle model,
                    const treelite_params_t* tl_params)
 {
   const tl::Model& model_ref = *(tl::Model*)model;
-  model_ref.Dispatch([&](const auto& model_inner) {
-    // model_inner is of the concrete type tl::ModelImpl<threshold_t, leaf_t>
-    from_treelite(handle, pforest, model_inner, tl_params);
-  });
+  std::visit(
+    [&](auto&& model_preset) {
+      // model_preset: the concrete tl::ModelPreset<threshold_t, leaf_t> held by variant_
+      from_treelite(handle, pforest, model_ref, model_preset, tl_params);
+    },
+    model_ref.variant_);  // dispatches on whichever preset alternative is active
 }
 
 // allocates caller-owned char* using malloc()
 template <typename threshold_t, typename leaf_t, typename node_t>
-char* sprintf_shape(const tl::ModelImpl<threshold_t, leaf_t>& model,
+char* sprintf_shape(const tl::ModelPreset<threshold_t, leaf_t>& model_preset,
                     const std::vector<node_t>& nodes,
                     const std::vector<int>& trees,
                     const cat_sets_owner cat_sets)
 {
-  std::stringstream forest_shape = depth_hist_and_max(model);
+  std::stringstream forest_shape = depth_hist_and_max(model_preset);
   double size_mb = (trees.size() * sizeof(trees.front()) + nodes.size() * sizeof(nodes.front()) +
                     cat_sets.bits.size()) /
                    1e6;
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index a97422d841..8ab48e2e9c 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,17 +20,22 @@
 #include <raft/core/handle.hpp>
 
 #include <treelite/c_api.h>
+#include <treelite/enum/task_type.h>
 #include <treelite/tree.h>
 
 #include <raft/core/error.hpp>
 
+#include <algorithm>
 #include <cstddef>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
 #include <iostream>
+#include <iterator>
+#include <memory>
 #include <string>
 #include <type_traits>
+#include <variant>
 #include <vector>
 
 #include "randomforest.cuh"
@@ -242,13 +247,12 @@ std::string get_rf_json(const RandomForestMetaData<T, L>* forest)
 }
 
 template <class T, class L>
-void build_treelite_forest(ModelHandle* model_handle,
+void build_treelite_forest(TreeliteModelHandle* model_handle,
                            const RandomForestMetaData<T, L>* forest,
                            int num_features)
 {
-  auto parent_model          = tl::Model::Create<T, T>();
-  tl::ModelImpl<T, T>* model = dynamic_cast<tl::ModelImpl<T, T>*>(parent_model.get());
-  ASSERT(model != nullptr, "Invalid downcast to tl::ModelImpl");
+  auto model                          = tl::Model::Create<T, T>();
+  tl::ModelPreset<T, T>& model_preset = std::get<tl::ModelPreset<T, T>>(model->variant_);
 
   // Determine number of outputs
   ASSERT(forest->trees.size() == forest->rf_params.n_trees, "Inconsistent number of trees.");
@@ -261,16 +265,23 @@ void build_treelite_forest(ModelHandle* model_handle,
 
   if constexpr (std::is_integral_v<L>) {
     ASSERT(num_outputs > 1, "More than one variable expected for classification problem.");
-    model->task_type = tl::TaskType::kMultiClfProbDistLeaf;
-    std::strncpy(model->param.pred_transform, "max_index", sizeof(model->param.pred_transform));
+    model->task_type     = tl::TaskType::kMultiClf;
+    model->postprocessor = "identity_multiclass";
   } else {
-    model->task_type = tl::TaskType::kBinaryClfRegr;
+    ASSERT(num_outputs == 1, "Only one variable expected for regression problem.");
+    model->task_type     = tl::TaskType::kRegressor;
+    model->postprocessor = "identity";
   }
 
-  model->task_param = tl::TaskParam{
-    tl::TaskParam::OutputType::kFloat, false, (unsigned int)num_outputs, (unsigned int)num_outputs};
+  model->num_target        = 1;
+  model->num_class         = std::vector<std::int32_t>{static_cast<std::int32_t>(num_outputs)};
+  model->leaf_vector_shape = std::vector<std::int32_t>{1, static_cast<std::int32_t>(num_outputs)};
+  model->target_id         = std::vector<std::int32_t>(forest->rf_params.n_trees, 0);
+  model->class_id =
+    std::vector<std::int32_t>(forest->rf_params.n_trees, (std::is_integral_v<L> ? -1 : 0));
   model->num_feature         = num_features;
   model->average_tree_output = true;
+  model->base_scores         = std::vector<double>(num_outputs, 0.0);
   model->SetTreeLimit(forest->rf_params.n_trees);
 
 #pragma omp parallel for
@@ -278,11 +289,11 @@ void build_treelite_forest(ModelHandle* model_handle,
     auto rf_tree = forest->trees[i];
 
     if (rf_tree->sparsetree.size() != 0) {
-      model->trees[i] = DT::build_treelite_tree<T, L>(*rf_tree, num_outputs);
+      model_preset.trees[i] = DT::build_treelite_tree<T, L>(*rf_tree, num_outputs);
     }
   }
 
-  *model_handle = static_cast<ModelHandle>(parent_model.release());
+  *model_handle = static_cast<TreeliteModelHandle>(model.release());
 }
 
 /**
@@ -327,65 +338,6 @@ void compare_trees(tl::Tree<T, L>& tree_from_concatenated_forest,
   }
 }
 
-/**
- * @brief Compares the concatenated treelite model with the information of the forest
- *   present in the different workers. If there is a difference in the two then an error
- *   statement will be thrown.
- * @param[in] concat_tree_handle: ModelHandle for the concatenated forest.
- * @param[in] treelite_handles: List containing ModelHandles for the forest present in
- *   each worker.
- */
-void compare_concat_forest_to_subforests(ModelHandle concat_tree_handle,
-                                         std::vector<ModelHandle> treelite_handles)
-{
-  size_t concat_forest;
-  size_t total_num_trees = 0;
-  for (std::size_t forest_idx = 0; forest_idx < treelite_handles.size(); forest_idx++) {
-    size_t num_trees_each_forest;
-    TREELITE_CHECK_RET(TreeliteQueryNumTree(treelite_handles[forest_idx], &num_trees_each_forest));
-    total_num_trees = total_num_trees + num_trees_each_forest;
-  }
-
-  TREELITE_CHECK_RET(TreeliteQueryNumTree(concat_tree_handle, &concat_forest));
-
-  ASSERT(concat_forest == total_num_trees,
-         "Error! the number of trees in the concatenated forest and the sum "
-         "of the trees present in the forests present in each worker are not equal");
-
-  int concat_mod_tree_num = 0;
-  tl::Model& concat_model = *(tl::Model*)(concat_tree_handle);
-  for (std::size_t forest_idx = 0; forest_idx < treelite_handles.size(); forest_idx++) {
-    tl::Model& model = *(tl::Model*)(treelite_handles[forest_idx]);
-
-    ASSERT(concat_model.GetThresholdType() == model.GetThresholdType(),
-           "Error! Concatenated forest does not have the same threshold type as "
-           "the individual forests");
-    ASSERT(concat_model.GetLeafOutputType() == model.GetLeafOutputType(),
-           "Error! Concatenated forest does not have the same leaf output type as "
-           "the individual forests");
-    ASSERT(concat_model.num_feature == model.num_feature,
-           "Error! number of features mismatch between concatenated forest and the"
-           " individual forests");
-    ASSERT(concat_model.task_param.num_class == model.task_param.num_class,
-           "Error! number of classes mismatch between concatenated forest "
-           "and the individual forests ");
-    ASSERT(concat_model.average_tree_output == model.average_tree_output,
-           "Error! average_tree_output flag value mismatch between "
-           "concatenated forest and the individual forests");
-
-    model.Dispatch([&concat_mod_tree_num, &concat_model](auto& model_inner) {
-      // model_inner is of the concrete type tl::ModelImpl<T, L>
-      using model_type         = std::remove_reference_t<decltype(model_inner)>;
-      auto& concat_model_inner = dynamic_cast<model_type&>(concat_model);
-      for (std::size_t indiv_trees = 0; indiv_trees < model_inner.trees.size(); indiv_trees++) {
-        compare_trees(concat_model_inner.trees[concat_mod_tree_num + indiv_trees],
-                      model_inner.trees[indiv_trees]);
-      }
-      concat_mod_tree_num = concat_mod_tree_num + model_inner.trees.size();
-    });
-  }
-}
-
 /**
  * @brief Concatenates the forest information present in different workers to
  *  create a single forest. This concatenated forest is stored in a new treelite model.
@@ -394,33 +346,16 @@ void compare_concat_forest_to_subforests(ModelHandle concat_tree_handle,
  * @param[in] treelite_handles: List containing ModelHandles for the forest present in
  *   each worker.
  */
-ModelHandle concatenate_trees(std::vector<ModelHandle> treelite_handles)
+TreeliteModelHandle concatenate_trees(std::vector<TreeliteModelHandle> treelite_handles)
 {
-  /* TODO(hcho3): Use treelite::ConcatenateModelObjects(),
-     once https://github.com/dmlc/treelite/issues/474 is fixed. */
   if (treelite_handles.empty()) { return nullptr; }
-  tl::Model& first_model  = *static_cast<tl::Model*>(treelite_handles[0]);
-  tl::Model* concat_model = first_model.Dispatch([&treelite_handles](auto& first_model_inner) {
-    // first_model_inner is of the concrete type tl::ModelImpl<T, L>
-    using model_type   = std::remove_reference_t<decltype(first_model_inner)>;
-    auto* concat_model = dynamic_cast<model_type*>(
-      tl::Model::Create(first_model_inner.GetThresholdType(), first_model_inner.GetLeafOutputType())
-        .release());
-    for (std::size_t forest_idx = 0; forest_idx < treelite_handles.size(); forest_idx++) {
-      tl::Model& model  = *static_cast<tl::Model*>(treelite_handles[forest_idx]);
-      auto& model_inner = dynamic_cast<model_type&>(model);
-      for (const auto& tree : model_inner.trees) {
-        concat_model->trees.push_back(tree.Clone());
-      }
-    }
-    concat_model->num_feature         = first_model_inner.num_feature;
-    concat_model->task_type           = first_model_inner.task_type;
-    concat_model->task_param          = first_model_inner.task_param;
-    concat_model->average_tree_output = first_model_inner.average_tree_output;
-    concat_model->param               = first_model_inner.param;
-    return static_cast<tl::Model*>(concat_model);
-  });
-  return concat_model;
+  std::vector<tl::Model const*> model_objs;  // non-owning views of the input handles
+  std::transform(treelite_handles.begin(),
+                 treelite_handles.end(),
+                 std::back_inserter(model_objs),
+                 [](TreeliteModelHandle handle) { return static_cast<tl::Model const*>(handle); });
+  std::unique_ptr<tl::Model> concat_model = tl::ConcatenateModelObjects(model_objs);
+  return static_cast<TreeliteModelHandle>(concat_model.release());  // caller takes ownership
 }
 
 /**
@@ -776,15 +711,15 @@ template void delete_rf_metadata<double, int>(RandomForestClassifierD* forest);
 template void delete_rf_metadata<float, float>(RandomForestRegressorF* forest);
 template void delete_rf_metadata<double, double>(RandomForestRegressorD* forest);
 
-template void build_treelite_forest<float, int>(ModelHandle* model,
+template void build_treelite_forest<float, int>(TreeliteModelHandle* model,
                                                 const RandomForestMetaData<float, int>* forest,
                                                 int num_features);
-template void build_treelite_forest<double, int>(ModelHandle* model,
+template void build_treelite_forest<double, int>(TreeliteModelHandle* model,
                                                  const RandomForestMetaData<double, int>* forest,
                                                  int num_features);
-template void build_treelite_forest<float, float>(ModelHandle* model,
+template void build_treelite_forest<float, float>(TreeliteModelHandle* model,
                                                   const RandomForestMetaData<float, float>* forest,
                                                   int num_features);
 template void build_treelite_forest<double, double>(
-  ModelHandle* model, const RandomForestMetaData<double, double>* forest, int num_features);
+  TreeliteModelHandle* model, const RandomForestMetaData<double, double>* forest, int num_features);
 }  // End namespace ML
diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu
index 9cf3accd9c..d85b097e47 100644
--- a/cpp/test/sg/fil_test.cu
+++ b/cpp/test/sg/fil_test.cu
@@ -31,7 +31,10 @@
 #include <thrust/transform.h>
 
 #include <treelite/c_api.h>
-#include <treelite/frontend.h>
+#include <treelite/enum/operator.h>
+#include <treelite/enum/task_type.h>
+#include <treelite/enum/typeinfo.h>
+#include <treelite/model_builder.h>
 #include <treelite/tree.h>
 
 #include <gtest/gtest.h>
@@ -49,7 +52,7 @@
 namespace ML {
 
 namespace tl  = treelite;
-namespace tlf = treelite::frontend;
+namespace tlm = treelite::model_builder;
 using namespace fil;
 
 struct FilTestParams {
@@ -123,7 +126,7 @@ std::ostream& operator<<(std::ostream& os, const FilTestParams& ps)
      << ", output = " << output2str(ps.output) << ", threshold = " << ps.threshold
      << ", threads_per_tree = " << ps.threads_per_tree << ", n_items = " << ps.n_items
      << ", blocks_per_sm = " << ps.blocks_per_sm << ", algo = " << ps.algo << ", seed = " << ps.seed
-     << ", tolerance = " << ps.tolerance << ", op = " << tl::OpName(ps.op)
+     << ", tolerance = " << ps.tolerance << ", op = " << tl::OperatorToString(ps.op)
      << ", global_bias = " << ps.global_bias << ", leaf_algo = " << ps.leaf_algo
      << ", num_classes = " << ps.num_classes
      << ", node_categorical_prob = " << ps.node_categorical_prob
@@ -200,7 +203,7 @@ __global__ void floats_to_bit_stream_k(uint8_t* dst, real_t* src, std::size_t si
 
 template <typename real_t>
 void adjust_threshold_to_treelite(
-  real_t* pthreshold, int* tl_left, int* tl_right, bool* default_left, tl::Operator comparison_op)
+  real_t* pthreshold, int* left, int* right, bool* default_left, tl::Operator comparison_op)
 {
   // in treelite (take left node if val [op] threshold),
   // the meaning of the condition is reversed compared to FIL;
@@ -208,7 +211,7 @@ void adjust_threshold_to_treelite(
   // https://github.com/dmlc/treelite/blob/master/include/treelite/tree.h#L243
   // TODO(levsnv): remove workaround once confirmed to work with empty category lists in Treelite
   if (isnan(*pthreshold)) {
-    std::swap(*tl_left, *tl_right);
+    std::swap(*left, *right);
     *default_left = !*default_left;
     return;
   }
@@ -224,7 +227,7 @@ void adjust_threshold_to_treelite(
       *pthreshold = std::nextafterf(*pthreshold, -std::numeric_limits<real_t>::infinity());
     case tl::Operator::kGE:
       // swap left and right
-      std::swap(*tl_left, *tl_right);
+      std::swap(*left, *right);
       *default_left = !*default_left;
       break;
     default: ASSERT(false, "only <, >, <= and >= comparisons are supported");
@@ -751,38 +754,34 @@ using PredictSparse8FilTest         = BasePredictFilTest<fil::sparse_node8>;
 template <typename real_t>
 class TreeliteFilTest : public BaseFilTest<real_t> {
  protected:
-  /** adds nodes[node] of tree starting at index root to builder
-      at index at *pkey, increments *pkey,
-      and returns the treelite key of the node */
-  int node_to_treelite(tlf::TreeBuilder* builder, int* pkey, int root, int node)
+  /** adds nodes[node] of tree starting at index root to builder */
+  void node_to_treelite(tlm::ModelBuilder* builder, int root, int node)
   {
-    int key = (*pkey)++;
-    builder->CreateNode(key);
+    builder->StartNode(node);
     const fil::dense_node<real_t>& dense_node = this->nodes[node];
-    std::vector<std::uint32_t> left_categories;
+    std::vector<std::uint32_t> right_categories;
     if (dense_node.is_leaf()) {
       switch (this->ps.leaf_algo) {
         case fil::leaf_algo_t::FLOAT_UNARY_BINARY:
         case fil::leaf_algo_t::GROVE_PER_CLASS:
           // default is fil::FLOAT_UNARY_BINARY
-          builder->SetLeafNode(key, tlf::Value::Create(dense_node.template output<real_t>()));
+          builder->LeafScalar(dense_node.template output<real_t>());
           break;
         case fil::leaf_algo_t::CATEGORICAL_LEAF: {
-          std::vector<tlf::Value> vec(this->ps.num_classes);
+          std::vector<real_t> vec(this->ps.num_classes);
           for (int i = 0; i < this->ps.num_classes; ++i) {
-            vec[i] =
-              tlf::Value::Create(i == dense_node.template output<int>() ? real_t(1) : real_t(0));
+            vec[i] = (i == dense_node.template output<int>() ? real_t(1) : real_t(0));
           }
-          builder->SetLeafVectorNode(key, vec);
+          builder->LeafVector(vec);
           break;
         }
         case fil::leaf_algo_t::VECTOR_LEAF: {
-          std::vector<tlf::Value> vec(this->ps.num_classes);
+          std::vector<real_t> vec(this->ps.num_classes);
           for (int i = 0; i < this->ps.num_classes; ++i) {
             auto idx = dense_node.template output<int>();
-            vec[i]   = tlf::Value::Create(this->vector_leaf[idx * this->ps.num_classes + i]);
+            vec[i]   = this->vector_leaf[idx * this->ps.num_classes + i];
           }
-          builder->SetLeafVectorNode(key, vec);
+          builder->LeafVector(vec);
           break;
         }
         case fil::leaf_algo_t::GROVE_PER_CLASS_FEW_CLASSES:
@@ -803,35 +802,24 @@ class TreeliteFilTest : public BaseFilTest<real_t> {
             byte = this->cat_sets_h.bits[dense_node.set() + category / BITS_PER_BYTE];
           }
           if ((byte & (1 << (category % BITS_PER_BYTE))) != 0) {
-            left_categories.push_back(category);
+            right_categories.push_back(category);
           }
         }
       }
-      int left_key  = node_to_treelite(builder, pkey, root, left);
-      int right_key = node_to_treelite(builder, pkey, root, right);
+      node_to_treelite(builder, root, left);
+      node_to_treelite(builder, root, right);
       // TODO(levsnv): remove workaround once confirmed to work with empty category lists in
       // Treelite
-      if (!left_categories.empty() && dense_node.is_categorical()) {
-        // Treelite builder APIs don't allow to set categorical_split_right_child
-        // (which child the categories pertain to). Only the Tree API allows that.
-        // in FIL, categories always pertain to the right child, and the default in treelite
-        // is left categories in SetCategoricalTestNode
-        std::swap(left_key, right_key);
-        default_left = !default_left;
-        builder->SetCategoricalTestNode(
-          key, dense_node.fid(), left_categories, default_left, left_key, right_key);
+      if (!right_categories.empty() && dense_node.is_categorical()) {
+        // in FIL, categories always pertain to the right child
+        builder->CategoricalTest(
+          dense_node.fid(), default_left, right_categories, true, left, right);
       } else {
-        adjust_threshold_to_treelite(&threshold, &left_key, &right_key, &default_left, this->ps.op);
-        builder->SetNumericalTestNode(key,
-                                      dense_node.fid(),
-                                      this->ps.op,
-                                      tlf::Value::Create(threshold),
-                                      default_left,
-                                      left_key,
-                                      right_key);
+        adjust_threshold_to_treelite(&threshold, &left, &right, &default_left, this->ps.op);
+        builder->NumericalTest(dense_node.fid(), threshold, default_left, this->ps.op, left, right);
       }
     }
-    return key;
+    builder->EndNode();
   }
 
   void init_forest_impl(fil::forest_t<real_t>* pforest, fil::storage_type_t storage_type)
@@ -840,42 +828,71 @@ class TreeliteFilTest : public BaseFilTest<real_t> {
     bool random_forest_flag = (this->ps.output & fil::output_t::AVG) != 0;
     tl::TypeInfo tl_type_info =
       std::is_same_v<real_t, float> ? tl::TypeInfo::kFloat32 : tl::TypeInfo::kFloat64;
-    int treelite_num_classes =
+    std::int32_t treelite_num_classes =
       this->ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY ? 1 : this->ps.num_classes;
-    std::unique_ptr<tlf::ModelBuilder> model_builder(new tlf::ModelBuilder(
-      this->ps.num_cols, treelite_num_classes, random_forest_flag, tl_type_info, tl_type_info));
+    tl::TaskType task_type;
+    if (this->ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY) {
+      if (this->ps.num_classes == 1 && (this->ps.output & fil::output_t::SIGMOID) != 0) {
+        task_type = tl::TaskType::kBinaryClf;
+      } else {
+        task_type = tl::TaskType::kRegressor;
+      }
+    } else {
+      task_type = tl::TaskType::kMultiClf;
+    }
+    tlm::Metadata metadata{
+      static_cast<std::int32_t>(this->ps.num_cols),
+      task_type,
+      random_forest_flag,
+      1,
+      {treelite_num_classes},
+      {1, (this->ps.leaf_algo == GROVE_PER_CLASS ? 1 : treelite_num_classes)},
+    };
+    std::vector<std::int32_t> class_id(this->ps.num_trees);
+    // class_id[i]: the class Tree i is associated with
+    if (this->ps.leaf_algo == fil::leaf_algo_t::GROVE_PER_CLASS) {
+      for (int tree_id = 0; tree_id < this->ps.num_trees; ++tree_id) {
+        class_id[tree_id] = tree_id % treelite_num_classes;
+      }
+    } else if (this->ps.leaf_algo == fil::leaf_algo_t::FLOAT_UNARY_BINARY) {
+      for (int tree_id = 0; tree_id < this->ps.num_trees; ++tree_id) {
+        class_id[tree_id] = 0;
+      }
+    } else {  // vector leaf
+      for (int tree_id = 0; tree_id < this->ps.num_trees; ++tree_id) {
+        class_id[tree_id] = -1;
+      }
+    }
+    tlm::TreeAnnotation tree_annotation{
+      this->ps.num_trees, std::vector<std::int32_t>(this->ps.num_trees, 0), class_id};
 
     // prediction transform
+    std::string postprocessor_name;
     if ((this->ps.output & fil::output_t::SIGMOID) != 0) {
-      if (this->ps.num_classes > 2)
-        model_builder->SetModelParam("pred_transform", "multiclass_ova");
-      else
-        model_builder->SetModelParam("pred_transform", "sigmoid");
+      if (this->ps.num_classes > 2) {
+        postprocessor_name = "multiclass_ova";
+      } else {
+        postprocessor_name = "sigmoid";
+      }
     } else if (this->ps.leaf_algo != fil::leaf_algo_t::FLOAT_UNARY_BINARY) {
-      model_builder->SetModelParam("pred_transform", "max_index");
-      this->ps.output = fil::output_t(this->ps.output | fil::output_t::CLASS);
+      postprocessor_name = "softmax";
+      this->ps.output    = fil::output_t(this->ps.output | fil::output_t::SOFTMAX);
     } else if (this->ps.leaf_algo == GROVE_PER_CLASS) {
-      model_builder->SetModelParam("pred_transform", "identity_multiclass");
+      postprocessor_name = "identity_multiclass";
     } else {
-      model_builder->SetModelParam("pred_transform", "identity");
+      postprocessor_name = "identity";
     }
-
-    // global bias
-    char* global_bias_str = nullptr;
-    ASSERT(asprintf(&global_bias_str, "%f", double(this->ps.global_bias)) > 0,
-           "cannot convert global_bias into a string");
-    model_builder->SetModelParam("global_bias", global_bias_str);
-    ::free(global_bias_str);
+    tlm::PostProcessorFunc postprocessor{postprocessor_name};
+    std::vector<double> base_scores(treelite_num_classes, this->ps.global_bias);
+    std::unique_ptr<tlm::ModelBuilder> model_builder = tlm::GetModelBuilder(
+      tl_type_info, tl_type_info, metadata, tree_annotation, postprocessor, base_scores);
 
     // build the trees
     for (int i_tree = 0; i_tree < this->ps.num_trees; ++i_tree) {
-      tlf::TreeBuilder* tree_builder = new tlf::TreeBuilder(tl_type_info, tl_type_info);
-      int key_counter                = 0;
-      int root                       = i_tree * this->tree_num_nodes();
-      int root_key                   = node_to_treelite(tree_builder, &key_counter, root, root);
-      tree_builder->SetRootNode(root_key);
-      // InsertTree() consumes tree_builder
-      TL_CPP_CHECK(model_builder->InsertTree(tree_builder));
+      int root = i_tree * this->tree_num_nodes();
+      model_builder->StartTree();
+      node_to_treelite(model_builder.get(), root, root);
+      model_builder->EndTree();
     }
 
     // commit the model
@@ -894,7 +911,7 @@ class TreeliteFilTest : public BaseFilTest<real_t> {
     params.pforest_shape_str = this->ps.print_forest_shape ? &forest_shape_str : nullptr;
     params.precision         = fil::PRECISION_NATIVE;
     fil::forest_variant forest_variant;
-    fil::from_treelite(this->handle, &forest_variant, (ModelHandle)model.get(), &params);
+    fil::from_treelite(this->handle, &forest_variant, (TreeliteModelHandle)model.get(), &params);
     *pforest = std::get<fil::forest_t<real_t>>(forest_variant);
     this->handle.sync_stream(stream);
     if (this->ps.print_forest_shape) {
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 64bbae26e3..44c57c9852 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -168,7 +168,7 @@ auto FilPredict(const raft::handle_t& handle,
                 RandomForestMetaData<DataT, LabelT>* forest)
 {
   auto pred = std::make_shared<thrust::device_vector<float>>(params.n_rows);
-  ModelHandle model;
+  TreeliteModelHandle model;
   std::size_t num_outputs = 1;
   if constexpr (std::is_integral_v<LabelT>) { num_outputs = params.n_labels; }
   build_treelite_forest(&model, forest, params.n_cols);
@@ -195,7 +195,7 @@ auto FilPredictProba(const raft::handle_t& handle,
 {
   std::size_t num_outputs = params.n_labels;
   auto pred = std::make_shared<thrust::device_vector<float>>(params.n_rows * num_outputs);
-  ModelHandle model;
+  TreeliteModelHandle model;
   static_assert(std::is_integral_v<LabelT>, "Must be classification");
   build_treelite_forest(&model, forest, params.n_cols);
   fil::treelite_params_t tl_params{
@@ -555,7 +555,7 @@ TEST(RfTests, IntegerOverflow)
 
   // See if fil overflows
   thrust::device_vector<float> pred(m);
-  ModelHandle model;
+  TreeliteModelHandle model;
   build_treelite_forest(&model, forest_ptr, n);
 
   std::size_t num_outputs = 1;
diff --git a/dependencies.yaml b/dependencies.yaml
index 8f6262f497..dd9fd0faa3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -154,7 +154,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - cython>=3.0.0
-          - &treelite treelite==3.9.1
+          - &treelite treelite==4.0.0
       - output_types: conda
         packages:
           - &pylibraft_conda pylibraft==24.2.*
@@ -169,7 +169,6 @@ dependencies:
       - output_types: [pyproject, requirements]
         packages:
           - scikit-build-core[pyproject]>=0.7.0
-          - &treelite_runtime treelite_runtime==3.9.1
     specific:
       - output_types: [conda, requirements, pyproject]
         matrices:
@@ -220,9 +219,6 @@ dependencies:
           # This index is needed for cudf and rmm.
           - --extra-index-url=https://pypi.nvidia.com
           - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-      - output_types: [pyproject, requirements]
-        packages:
-          - *treelite_runtime
     specific:
       - output_types: [requirements, pyproject]
         matrices:
diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
index 79214787c7..be9b5ab841 100644
--- a/python/cuml/benchmark/algorithms.py
+++ b/python/cuml/benchmark/algorithms.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import treelite_runtime
 import treelite
 from cuml.benchmark.bench_helper_funcs import (
     fit,
@@ -27,7 +26,6 @@
     _build_fil_classifier,
     _build_gtil_classifier,
     _build_optimized_fil_classifier,
-    _build_treelite_classifier,
     _treelite_fil_accuracy_score,
     _training_data_to_numpy,
     _build_mnmg_umap,
@@ -218,7 +216,7 @@ def _labels_to_int_hook(data):
 def _treelite_format_hook(data):
     """Helper function converting data into treelite format"""
     data = _training_data_to_numpy(data[0], data[1])
-    return treelite_runtime.DMatrix(data[0]), data[1]
+    return data[0], data[1]
 
 
 def _numpy_format_hook(data):
@@ -466,7 +464,7 @@ def all_algorithms():
             ),
             name="FIL",
             accepts_labels=False,
-            setup_cpu_func=_build_treelite_classifier,
+            setup_cpu_func=_build_gtil_classifier,
             setup_cuml_func=_build_fil_classifier,
             cpu_data_prep_hook=_treelite_format_hook,
             accuracy_function=_treelite_fil_accuracy_score,
@@ -497,7 +495,7 @@ def all_algorithms():
             cuml_args=dict(output_class=False),
             name="FILEX",
             accepts_labels=False,
-            setup_cpu_func=_build_treelite_classifier,
+            setup_cpu_func=_build_gtil_classifier,
             setup_cuml_func=_build_fil_classifier,
             cpu_data_prep_hook=_treelite_format_hook,
             accuracy_function=_treelite_fil_accuracy_score,
@@ -516,7 +514,7 @@ def all_algorithms():
             ),
             name="FILEX-Optimized",
             accepts_labels=False,
-            setup_cpu_func=_build_treelite_classifier,
+            setup_cpu_func=_build_gtil_classifier,
             setup_cuml_func=_build_optimized_fil_classifier,
             cpu_data_prep_hook=_treelite_format_hook,
             accuracy_function=_treelite_fil_accuracy_score,
@@ -535,7 +533,7 @@ def all_algorithms():
             ),
             name="FIL-Optimized",
             accepts_labels=False,
-            setup_cpu_func=_build_treelite_classifier,
+            setup_cpu_func=_build_gtil_classifier,
             setup_cuml_func=_build_optimized_fil_classifier,
             cpu_data_prep_hook=_treelite_format_hook,
             accuracy_function=_treelite_fil_accuracy_score,
diff --git a/python/cuml/benchmark/bench_helper_funcs.py b/python/cuml/benchmark/bench_helper_funcs.py
index 5292d0f9fa..979a4df921 100644
--- a/python/cuml/benchmark/bench_helper_funcs.py
+++ b/python/cuml/benchmark/bench_helper_funcs.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -392,32 +392,6 @@ def _build_gtil_classifier(m, data, args, tmpdir):
     return GtilWrapper(tl_model, infer_type=infer_type)
 
 
-def _build_treelite_classifier(m, data, args, tmpdir):
-    """Setup function for treelite classification benchmarking"""
-    from cuml.internals.import_utils import has_xgboost
-    import treelite_runtime
-
-    max_depth = args["max_depth"]
-    num_rounds = args["num_rounds"]
-    n_feature = data[0].shape[1]
-    train_size = data[0].shape[0]
-    model_name = f"xgb_{max_depth}_{num_rounds}_{n_feature}_{train_size}.model"
-    model_path = os.path.join(tmpdir, model_name)
-
-    bst = xgb.Booster()
-    bst.load_model(model_path)
-    tl_model = treelite.Model.from_xgboost(bst)
-    tl_model.export_lib(
-        toolchain="gcc",
-        libpath=os.path.join(tmpdir, "treelite.so"),
-        params={"parallel_comp": 40},
-        verbose=False,
-    )
-    return treelite_runtime.Predictor(
-        os.path.join(tmpdir, "treelite.so"), verbose=False
-    )
-
-
 def _treelite_fil_accuracy_score(y_true, y_pred):
     """Function to get correct accuracy for FIL (returns class index)"""
     # convert the input if necessary
diff --git a/python/cuml/ensemble/CMakeLists.txt b/python/cuml/ensemble/CMakeLists.txt
index 3231c322b1..e3732c1577 100644
--- a/python/cuml/ensemble/CMakeLists.txt
+++ b/python/cuml/ensemble/CMakeLists.txt
@@ -20,7 +20,7 @@ add_module_gpu_default("randomforestregressor.pyx" ${randomforestregressor_algo}
 
 set(linked_libraries
     ${cuml_sg_libraries}
-    ${TREELITE_LIBS})
+    ${CUML_PYTHON_TREELITE_TARGET})
 
 rapids_cython_create_modules(
   CXX
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index a799de25a9..eb71f0c78d 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -208,12 +208,12 @@ class BaseRandomForestModel(Base):
             raise NotFittedError(
                     "Attempting to create treelite from un-fit forest.")
 
-        cdef ModelHandle tl_handle = NULL
+        cdef TreeliteModelHandle tl_handle = NULL
         if self.treelite_handle:
             return self.treelite_handle  # Use cached version
 
         elif self.treelite_serialized_model:  # bytes -> Treelite
-            tl_handle = <ModelHandle><uintptr_t>treelite_deserialize(
+            tl_handle = <TreeliteModelHandle><uintptr_t>treelite_deserialize(
                 self.treelite_serialized_model)
 
         else:
@@ -317,14 +317,14 @@ class BaseRandomForestModel(Base):
         return treelite_deserialize(treelite_serialized_model)
 
     def _concatenate_treelite_handle(self, treelite_handle):
-        cdef ModelHandle concat_model_handle = NULL
-        cdef vector[ModelHandle] *model_handles \
-            = new vector[ModelHandle]()
+        cdef TreeliteModelHandle concat_model_handle = NULL
+        cdef vector[TreeliteModelHandle] *model_handles \
+            = new vector[TreeliteModelHandle]()
         cdef uintptr_t mod_ptr
         for i in treelite_handle:
             mod_ptr = <uintptr_t>i
             model_handles.push_back((
-                <ModelHandle> mod_ptr))
+                <TreeliteModelHandle> mod_ptr))
 
         self._reset_forest_data()
         concat_model_handle = concatenate_trees(deref(model_handles))
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index bd4e8ca0b0..e32e520c28 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,8 +32,7 @@ from pylibraft.common.handle cimport handle_t
 cimport cuml.common.cuda
 
 cdef extern from "treelite/c_api.h":
-    ctypedef void* ModelHandle
-    ctypedef void* ModelBuilderHandle
+    ctypedef void* TreeliteModelHandle
     cdef const char* TreeliteGetLastError()
 
 cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
@@ -78,7 +77,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
     #
     # Treelite handling
     #
-    cdef void build_treelite_forest[T, L](ModelHandle*,
+    cdef void build_treelite_forest[T, L](TreeliteModelHandle*,
                                           RandomForestMetaData[T, L]*,
                                           int
                                           ) except +
@@ -108,7 +107,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                                  int,
                                  int) except +
 
-    cdef vector[unsigned char] save_model(ModelHandle)
+    cdef vector[unsigned char] save_model(TreeliteModelHandle)
 
-    cdef ModelHandle concatenate_trees(
-        vector[ModelHandle] &treelite_handles) except +
+    cdef TreeliteModelHandle concatenate_trees(
+        vector[TreeliteModelHandle] &treelite_handles) except +
diff --git a/python/cuml/ensemble/randomforest_shared.pyx b/python/cuml/ensemble/randomforest_shared.pyx
index dcaae34da6..4e8c86341b 100644
--- a/python/cuml/ensemble/randomforest_shared.pyx
+++ b/python/cuml/ensemble/randomforest_shared.pyx
@@ -24,22 +24,24 @@ from typing import Dict, List, Union
 from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 
-cdef extern from "treelite/tree.h" namespace "treelite":
-    cdef struct PyBufferFrame:
+cdef extern from "treelite/c_api.h":
+    cdef struct TreelitePyBufferFrame:
         void* buf
         char* format
         size_t itemsize
         size_t nitem
+
+cdef extern from "treelite/tree.h" namespace "treelite":
     cdef cppclass Model:
-        vector[PyBufferFrame] GetPyBuffer() except +
+        vector[TreelitePyBufferFrame] SerializeToPyBuffer() except +
         @staticmethod
-        unique_ptr[Model] CreateFromPyBuffer(vector[PyBufferFrame]) except +
+        unique_ptr[Model] DeserializeFromPyBuffer(const vector[TreelitePyBufferFrame] &) except +
 
 cdef extern from "Python.h":
     Py_buffer* PyMemoryView_GET_BUFFER(PyObject* mview)
 
 cdef class PyBufferFrameWrapper:
-    cdef PyBufferFrame _handle
+    cdef TreelitePyBufferFrame _handle
     cdef Py_ssize_t shape[1]
     cdef Py_ssize_t strides[1]
 
@@ -70,28 +72,28 @@ cdef class PyBufferFrameWrapper:
     def __releasebuffer__(self, Py_buffer *buffer):
         pass
 
-cdef PyBufferFrameWrapper MakePyBufferFrameWrapper(PyBufferFrame handle):
+cdef PyBufferFrameWrapper MakePyBufferFrameWrapper(TreelitePyBufferFrame handle):
     cdef PyBufferFrameWrapper wrapper = PyBufferFrameWrapper()
     wrapper._handle = handle
     return wrapper
 
-cdef list _get_frames(ModelHandle model):
+cdef list _get_frames(TreeliteModelHandle model):
     return [memoryview(MakePyBufferFrameWrapper(v))
-            for v in (<Model*>model).GetPyBuffer()]
+            for v in (<Model*>model).SerializeToPyBuffer()]
 
-cdef ModelHandle _init_from_frames(vector[PyBufferFrame] frames) except *:
-    return <ModelHandle>Model.CreateFromPyBuffer(frames).release()
+cdef TreeliteModelHandle _init_from_frames(vector[TreelitePyBufferFrame] frames) except *:
+    return <TreeliteModelHandle>Model.DeserializeFromPyBuffer(frames).release()
 
 
 def get_frames(model: uintptr_t) -> List[memoryview]:
-    return _get_frames(<ModelHandle> model)
+    return _get_frames(<TreeliteModelHandle> model)
 
 
 def init_from_frames(frames: List[np.ndarray],
                      format_str: List[str], itemsize: List[int]) -> uintptr_t:
-    cdef vector[PyBufferFrame] cpp_frames
+    cdef vector[TreelitePyBufferFrame] cpp_frames
     cdef Py_buffer* buf
-    cdef PyBufferFrame cpp_frame
+    cdef TreelitePyBufferFrame cpp_frame
     format_bytes = [s.encode('utf-8') for s in format_str]
     for i, frame in enumerate(frames):
         x = memoryview(frame)
diff --git a/python/cuml/experimental/fil/fil.pyx b/python/cuml/experimental/fil/fil.pyx
index 067b6e1eba..b652d3e07d 100644
--- a/python/cuml/experimental/fil/fil.pyx
+++ b/python/cuml/experimental/fil/fil.pyx
@@ -54,7 +54,7 @@ from cuml.internals.safe_imports import (
 nvtx_annotate = gpu_only_import_from("nvtx", "annotate", alt=null_decorator)
 
 cdef extern from "treelite/c_api.h":
-    ctypedef void* ModelHandle
+    ctypedef void* TreeliteModelHandle
 
 
 cdef raft_proto_device_t get_device_type(arr):
@@ -94,7 +94,7 @@ cdef extern from "cuml/experimental/fil/forest_model.hpp" namespace "ML::experim
 
 cdef extern from "cuml/experimental/fil/treelite_importer.hpp" namespace "ML::experimental::fil":
     forest_model import_from_treelite_handle(
-        ModelHandle,
+        TreeliteModelHandle,
         fil_tree_layout,
         uint32_t,
         optional[bool],
@@ -160,7 +160,7 @@ cdef class ForestInference_impl():
             tree_layout = fil_tree_layout.depth_first
 
         self.model = import_from_treelite_handle(
-            <ModelHandle><uintptr_t>model_handle,
+            <TreeliteModelHandle><uintptr_t>model_handle,
             tree_layout,
             align_bytes,
             use_double_precision_c,
diff --git a/python/cuml/experimental/fil/infer_kind.pxd b/python/cuml/experimental/fil/infer_kind.pxd
index b2dc6df246..c4b5e91016 100644
--- a/python/cuml/experimental/fil/infer_kind.pxd
+++ b/python/cuml/experimental/fil/infer_kind.pxd
@@ -15,7 +15,7 @@
 #
 
 cdef extern from "treelite/c_api.h":
-    ctypedef void* ModelHandle
+    ctypedef void* TreeliteModelHandle
 
 cdef extern from "cuml/experimental/fil/infer_kind.hpp" namespace "ML::experimental::fil":
     # TODO(hcho3): Switch to new syntax for scoped enum when we adopt Cython 3.0
diff --git a/python/cuml/explainer/tree_shap.pyx b/python/cuml/explainer/tree_shap.pyx
index 16c611ff36..076f59b0c6 100644
--- a/python/cuml/explainer/tree_shap.pyx
+++ b/python/cuml/explainer/tree_shap.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -36,10 +36,14 @@ else:
     sklrfc = object
 
 cdef extern from "treelite/c_api.h":
-    ctypedef void * ModelHandle
-    cdef int TreeliteQueryNumClass(ModelHandle handle, size_t * out)
-
-cdef extern from "treelite/c_api_common.h":
+    cdef struct TreelitePyBufferFrame:
+        void* buf
+        char* format
+        size_t itemsize
+        size_t nitem
+    ctypedef void * TreeliteModelHandle
+    cdef int TreeliteGetHeaderField(
+            TreeliteModelHandle model, const char * name, TreelitePyBufferFrame* out_frame) except +
     cdef const char * TreeliteGetLastError()
 
 cdef extern from "cuml/explainer/tree_shap.hpp" namespace "ML::Explainer":
@@ -49,7 +53,7 @@ cdef extern from "cuml/explainer/tree_shap.hpp" namespace "ML::Explainer":
     cdef cppclass FloatPointer:
         pass
 
-    cdef TreePathHandle extract_path_info(ModelHandle model) except +
+    cdef TreePathHandle extract_path_info(TreeliteModelHandle model) except +
     cdef void gpu_treeshap(TreePathHandle  path_info,
                            const FloatPointer data,
                            size_t n_rows,
@@ -90,6 +94,43 @@ cdef FloatPointer type_erase_float_ptr(array):
         raise ValueError("Unsupported dtype")
     return ptr
 
+cdef class PyBufferFrameWrapper:  # exposes one TreelitePyBufferFrame via the buffer protocol
+    cdef TreelitePyBufferFrame _handle
+    cdef Py_ssize_t shape[1]  # backing storage for buffer.shape (always 1-D)
+    cdef Py_ssize_t strides[1]  # backing storage for buffer.strides
+
+    def __cinit__(self):
+        pass
+
+    def __dealloc__(self):
+        pass  # nothing is released here; this class never frees _handle.buf
+
+    def __getbuffer__(self, Py_buffer* buffer, int flags):
+        cdef Py_ssize_t itemsize = self._handle.itemsize
+
+        self.shape[0] = self._handle.nitem
+        self.strides[0] = itemsize  # consecutive items are itemsize bytes apart
+
+        buffer.buf = self._handle.buf
+        buffer.format = self._handle.format
+        buffer.internal = NULL
+        buffer.itemsize = itemsize
+        buffer.len = self._handle.nitem * itemsize  # total length in bytes
+        buffer.ndim = 1
+        buffer.obj = self
+        buffer.readonly = 0  # always exported as writable; `flags` is not consulted
+        buffer.shape = self.shape
+        buffer.strides = self.strides
+        buffer.suboffsets = NULL
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
+cdef PyBufferFrameWrapper MakePyBufferFrameWrapper(TreelitePyBufferFrame handle):
+    cdef PyBufferFrameWrapper wrapper = PyBufferFrameWrapper()
+    wrapper._handle = handle  # struct assigned by value; the buf pointer inside is shared
+    return wrapper
+
 cdef class TreeExplainer:
     """
     Model explainer that calculates Shapley values for the predictions of
@@ -164,7 +205,7 @@ cdef class TreeExplainer:
     """
     cdef public object expected_value
     cdef TreePathHandle path_info
-    cdef size_t num_class
+    cdef object num_class
     cdef object data
 
     def __init__(self, *, model, data=None):
@@ -205,13 +246,20 @@ cdef class TreeExplainer:
         else:
             raise ValueError('Unrecognized model object type')
 
-        cdef ModelHandle model_ptr = <ModelHandle > <uintptr_t > handle
-        self.num_class = 0
-        if TreeliteQueryNumClass(model_ptr, & self.num_class) != 0:
-            raise RuntimeError('Treelite error: {}'.format(
-                TreeliteGetLastError()))
+        cdef TreeliteModelHandle model_ptr = <TreeliteModelHandle > <uintptr_t > handle
+        # Get num_class: read as a header-field array (its length is checked further below)
+        cdef TreelitePyBufferFrame frame
+        res = TreeliteGetHeaderField(<TreeliteModelHandle> model_ptr, "num_class", &frame)
+        if res < 0:
+            err = TreeliteGetLastError().decode("utf-8", "replace")  # C string -> str
+            raise RuntimeError(f"Failed to fetch num_class: {err}")
+        view = memoryview(MakePyBufferFrameWrapper(frame))
+        self.num_class = np.asarray(view).copy()  # own the data; the wrapper does not own buf
         self.path_info = extract_path_info(model_ptr)
 
+        if len(self.num_class) > 1:
+            raise NotImplementedError("TreeExplainer does not support multi-target models")
+
     def _prepare_input(self, X):
         try:
             return input_to_cuml_array(
@@ -252,7 +300,7 @@ cdef class TreeExplainer:
         # Storing a C-order 3D array in a CumlArray leads to cryptic error
         # ValueError: len(shape) != len(strides)
         # So we use 2D array here
-        pred_shape = (n_rows, self.num_class * (n_cols + 1))
+        pred_shape = (n_rows, self.num_class[0] * (n_cols + 1))
         preds = CumlArray.empty(
             shape=pred_shape, dtype=dtype, order='C')
 
@@ -278,14 +326,14 @@ cdef class TreeExplainer:
         # 2. Transpose SHAP values in dimension (group_id, row_id, feature_id)
         preds = preds.to_output(
             output_type=self._determine_output_type(X))
-        if self.num_class > 1:
+        if self.num_class[0] > 1:
             preds = preds.reshape(
-                (n_rows, self.num_class, n_cols + 1))
+                (n_rows, self.num_class[0], n_cols + 1))
             preds = preds.transpose((1, 0, 2))
             self.expected_value = preds[:, 0, -1]
             return preds[:, :, :-1]
         else:
-            assert self.num_class == 1
+            assert self.num_class[0] == 1
             self.expected_value = preds[0, -1]
             return preds[:, :-1]
 
@@ -320,7 +368,7 @@ cdef class TreeExplainer:
         # Storing a C-order 3D array in a CumlArray leads to cryptic error
         # ValueError: len(shape) != len(strides)
         # So we use 2D array here
-        pred_shape = (n_rows, self.num_class * (n_cols + 1)**2)
+        pred_shape = (n_rows, self.num_class[0] * (n_cols + 1)**2)
         preds = CumlArray.empty(
             shape=pred_shape, dtype=dtype, order='C')
 
@@ -345,14 +393,14 @@ cdef class TreeExplainer:
 
         preds = preds.to_output(
             output_type=self._determine_output_type(X))
-        if self.num_class > 1:
+        if self.num_class[0] > 1:
             preds = preds.reshape(
-                (n_rows, self.num_class, n_cols + 1, n_cols + 1))
+                (n_rows, self.num_class[0], n_cols + 1, n_cols + 1))
             preds = preds.transpose((1, 0, 2, 3))
             self.expected_value = preds[:, 0, -1, -1]
             return preds[:, :, :-1, :-1]
         else:
-            assert self.num_class == 1
+            assert self.num_class[0] == 1
             preds = preds.reshape(
                 (n_rows,  n_cols + 1, n_cols + 1))
             self.expected_value = preds[0, -1, -1]
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 078bdc6fcc..16413b34ac 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -44,27 +44,70 @@ import treelite.sklearn as tl_skl
 cimport cuml.common.cuda
 
 cdef extern from "treelite/c_api.h":
-    ctypedef void* ModelHandle
-    cdef int TreeliteLoadXGBoostModelEx(const char* filename,
-                                        const char* config_json,
-                                        ModelHandle* out) except +
-    cdef int TreeliteLoadXGBoostJSONEx(const char* filename,
+    cdef struct TreelitePyBufferFrame:
+        void* buf
+        char* format
+        size_t itemsize
+        size_t nitem
+    ctypedef void* TreeliteModelHandle
+    ctypedef void* TreeliteGTILConfigHandle
+    cdef int TreeliteLoadXGBoostModelLegacyBinary(const char* filename,
+                                                  const char* config_json,
+                                                  TreeliteModelHandle* out) except +
+    cdef int TreeliteLoadXGBoostModel(const char* filename,
+                                      const char* config_json,
+                                      TreeliteModelHandle* out) except +
+    cdef int TreeliteFreeModel(TreeliteModelHandle handle) except +
+    cdef int TreeliteQueryNumTree(TreeliteModelHandle handle, size_t* out) except +
+    cdef int TreeliteQueryNumFeature(TreeliteModelHandle handle, int* out) except +
+    cdef int TreeliteLoadLightGBMModel(const char* filename,
                                        const char* config_json,
-                                       ModelHandle* out) except +
-    cdef int TreeliteFreeModel(ModelHandle handle) except +
-    cdef int TreeliteQueryNumTree(ModelHandle handle, size_t* out) except +
-    cdef int TreeliteQueryNumFeature(ModelHandle handle, size_t* out) except +
-    cdef int TreeliteQueryNumClass(ModelHandle handle, size_t* out) except +
-    cdef int TreeliteLoadLightGBMModelEx(const char* filename,
-                                         const char* config_json,
-                                         ModelHandle* out) except +
-    cdef int TreeliteSerializeModel(const char* filename,
-                                    ModelHandle handle) except +
-    cdef int TreeliteDeserializeModel(const char* filename,
-                                      ModelHandle handle) except +
+                                       TreeliteModelHandle* out) except +
+    cdef int TreeliteSerializeModelToFile(TreeliteModelHandle handle,
+                                          const char* filename) except +
+    cdef int TreeliteGetHeaderField(
+            TreeliteModelHandle model, const char * name, TreelitePyBufferFrame* out_frame) except +
     cdef const char* TreeliteGetLastError()
 
 
+cdef class PyBufferFrameWrapper:
+    cdef TreelitePyBufferFrame _handle
+    cdef Py_ssize_t shape[1]
+    cdef Py_ssize_t strides[1]
+
+    def __cinit__(self):
+        pass
+
+    def __dealloc__(self):
+        pass
+
+    def __getbuffer__(self, Py_buffer* buffer, int flags):
+        cdef Py_ssize_t itemsize = self._handle.itemsize
+
+        self.shape[0] = self._handle.nitem
+        self.strides[0] = itemsize
+
+        buffer.buf = self._handle.buf
+        buffer.format = self._handle.format
+        buffer.internal = NULL
+        buffer.itemsize = itemsize
+        buffer.len = self._handle.nitem * itemsize
+        buffer.ndim = 1
+        buffer.obj = self
+        buffer.readonly = 0
+        buffer.shape = self.shape
+        buffer.strides = self.strides
+        buffer.suboffsets = NULL
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        pass
+
+cdef PyBufferFrameWrapper MakePyBufferFrameWrapper(TreelitePyBufferFrame handle):
+    cdef PyBufferFrameWrapper wrapper = PyBufferFrameWrapper()
+    wrapper._handle = handle
+    return wrapper
+
+
 cdef class TreeliteModel():
     """
     Wrapper for Treelite-loaded forest
@@ -76,22 +119,22 @@ cdef class TreeliteModel():
     Attributes
     ----------
 
-    handle : ModelHandle
+    handle : TreeliteModelHandle
         Opaque pointer to Treelite model
     """
-    cdef ModelHandle handle
+    cdef TreeliteModelHandle handle
     cdef bool owns_handle
 
     def __cinit__(self, owns_handle=True):
         """If owns_handle is True, free the handle's model in destructor.
         Set this to False if another owner will free the model."""
-        self.handle = <ModelHandle>NULL
+        self.handle = <TreeliteModelHandle>NULL
         self.owns_handle = owns_handle
 
-    cdef set_handle(self, ModelHandle new_handle):
+    cdef set_handle(self, TreeliteModelHandle new_handle):
         self.handle = new_handle
 
-    cdef ModelHandle get_handle(self):
+    cdef TreeliteModelHandle get_handle(self):
         return self.handle
 
     @property
@@ -112,14 +155,14 @@ cdef class TreeliteModel():
     @property
     def num_features(self):
         assert self.handle != NULL
-        cdef size_t out
+        cdef int out
         TreeliteQueryNumFeature(self.handle, &out)
         return out
 
     @classmethod
     def free_treelite_model(cls, model_handle):
         cdef uintptr_t model_ptr = <uintptr_t>model_handle
-        TreeliteFreeModel(<ModelHandle> model_ptr)
+        TreeliteFreeModel(<TreeliteModelHandle> model_ptr)
 
     @classmethod
     def from_filename(cls, filename, model_type="xgboost"):
@@ -136,14 +179,14 @@ cdef class TreeliteModel():
         """
         filename_bytes = filename.encode("UTF-8")
         config_bytes = "{}".encode("UTF-8")
-        cdef ModelHandle handle
+        cdef TreeliteModelHandle handle
         if model_type == "xgboost":
-            res = TreeliteLoadXGBoostModelEx(filename_bytes, config_bytes, &handle)
+            res = TreeliteLoadXGBoostModelLegacyBinary(filename_bytes, config_bytes, &handle)
             if res < 0:
                 err = TreeliteGetLastError()
                 raise RuntimeError("Failed to load %s (%s)" % (filename, err))
         elif model_type == "xgboost_json":
-            res = TreeliteLoadXGBoostJSONEx(filename_bytes, config_bytes, &handle)
+            res = TreeliteLoadXGBoostModel(filename_bytes, config_bytes, &handle)
             if res < 0:
                 err = TreeliteGetLastError()
                 raise RuntimeError("Failed to load %s (%s)" % (filename, err))
@@ -151,7 +194,7 @@ cdef class TreeliteModel():
             logger.warn("Treelite currently does not support float64 model"
                         " parameters. Accuracy may degrade slightly relative"
                         " to native LightGBM invocation.")
-            res = TreeliteLoadLightGBMModelEx(filename_bytes, config_bytes, &handle)
+            res = TreeliteLoadLightGBMModel(filename_bytes, config_bytes, &handle)
             if res < 0:
                 err = TreeliteGetLastError()
                 raise RuntimeError("Failed to load %s (%s)" % (filename, err))
@@ -172,13 +215,13 @@ cdef class TreeliteModel():
         """
         assert self.handle != NULL
         filename_bytes = filename.encode("UTF-8")
-        TreeliteSerializeModel(filename_bytes, self.handle)
+        TreeliteSerializeModelToFile(self.handle, filename_bytes)
 
     @classmethod
     def from_treelite_model_handle(cls,
                                    treelite_handle,
                                    take_handle_ownership=False):
-        cdef ModelHandle handle = <ModelHandle> <size_t> treelite_handle
+        cdef TreeliteModelHandle handle = <TreeliteModelHandle> <size_t> treelite_handle
         model = TreeliteModel(owns_handle=take_handle_ownership)
         model.set_handle(handle)
         return model
@@ -253,14 +296,14 @@ cdef extern from "cuml/fil/fil.h" namespace "ML::fil":
 
     cdef void from_treelite(handle_t& handle,
                             forest_variant*,
-                            ModelHandle,
+                            TreeliteModelHandle,
                             treelite_params_t*) except +
 
 cdef class ForestInference_impl():
 
     cdef object handle
     cdef forest_variant forest_data
-    cdef size_t num_class
+    cdef object num_class
     cdef bool output_class
     cdef char* shape_str
     cdef DeviceMemoryResource mr
@@ -390,10 +433,10 @@ cdef class ForestInference_impl():
         if preds is None:
             shape = (n_rows, )
             if predict_proba:
-                if self.num_class <= 2:
+                if self.num_class[0] <= 2:
                     shape += (2,)
                 else:
-                    shape += (self.num_class,)
+                    shape += (self.num_class[0],)
             preds = CumlArray.empty(shape=shape, dtype=fil_dtype, order='C',
                                     index=X_m.index)
         else:
@@ -467,10 +510,18 @@ cdef class ForestInference_impl():
 
         from_treelite(handle_[0],
                       &self.forest_data,
-                      <ModelHandle> model_ptr,
+                      <TreeliteModelHandle> model_ptr,
                       &treelite_params)
-        TreeliteQueryNumClass(<ModelHandle> model_ptr,
-                              &self.num_class)
+        # Get num_class
+        cdef TreelitePyBufferFrame frame
+        res = TreeliteGetHeaderField(<TreeliteModelHandle> model_ptr, "num_class", &frame)
+        if res < 0:
+            err = TreeliteGetLastError()
+            raise RuntimeError(f"Failed to fetch num_class: {err}")
+        view = memoryview(MakePyBufferFrameWrapper(frame))
+        self.num_class = np.asarray(view).copy()
+        if len(self.num_class) > 1:
+            raise NotImplementedError("FIL does not support multi-target models")
         return self
 
     def __dealloc__(self):
@@ -964,7 +1015,7 @@ class ForestInference(Base,
                                    ):
         """
         Returns a FIL instance by converting a treelite model to
-        FIL model by using the treelite ModelHandle passed.
+        FIL model by using the treelite TreeliteModelHandle passed.
 
         Parameters
         ----------
diff --git a/python/cuml/tests/test_random_forest.py b/python/cuml/tests/test_random_forest.py
index 1a77b14d05..76c2fff608 100644
--- a/python/cuml/tests/test_random_forest.py
+++ b/python/cuml/tests/test_random_forest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1327,7 +1327,7 @@ def test_rf_regressor_gtil_integration(tmpdir):
     X, y = X.astype(np.float32), y.astype(np.float32)
     clf = curfr(max_depth=3, random_state=0, n_estimators=10)
     clf.fit(X, y)
-    expected_pred = clf.predict(X)
+    expected_pred = clf.predict(X).reshape((-1, 1))
 
     checkpoint_path = os.path.join(tmpdir, "checkpoint.tl")
     clf.convert_to_treelite_model().to_treelite_checkpoint(checkpoint_path)
@@ -1342,7 +1342,7 @@ def test_rf_binary_classifier_gtil_integration(tmpdir):
     X, y = X.astype(np.float32), y.astype(np.int32)
     clf = curfc(max_depth=3, random_state=0, n_estimators=10)
     clf.fit(X, y)
-    expected_pred = clf.predict(X)
+    expected_pred = clf.predict_proba(X)
 
     checkpoint_path = os.path.join(tmpdir, "checkpoint.tl")
     clf.convert_to_treelite_model().to_treelite_checkpoint(checkpoint_path)
diff --git a/python/pyproject.toml b/python/pyproject.toml
index c064b65fff..e08bedd774 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -21,8 +21,7 @@ requires = [
     "pylibraft==24.2.*",
     "rmm==24.2.*",
     "scikit-build-core[pyproject]>=0.7.0",
-    "treelite==3.9.1",
-    "treelite_runtime==3.9.1",
+    "treelite==4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`.
 build-backend = "scikit_build_core.build"
 
@@ -66,8 +65,7 @@ dependencies = [
     "rapids-dask-dependency==24.2.*",
     "rmm==24.2.*",
     "scipy>=1.8.0",
-    "treelite==3.9.1",
-    "treelite_runtime==3.9.1",
+    "treelite==4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From e2f8da499a286d8f86923d30712c8a6208f0ea43 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Fri, 2 Feb 2024 06:49:06 -0800
Subject: [PATCH 18/24] Disable HistGradientBoosting support for now (#5744)

Treelite 4.0 added support for `HistGradientBoostingClassifier` and `HistGradientBoostingRegressor`. However, that support does not yet work with the latest scikit-learn (1.4.0), which changed the internals of the `HistGradientBoostingClassifier` / `HistGradientBoostingRegressor` estimators.

For now, raise a `NotImplementedError` when a HistGradientBoosting estimator is passed in.

Authors:
  - Philip Hyunsu Cho (https://github.com/hcho3)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/cuml/pull/5744
---
 python/cuml/experimental/fil/fil.pyx | 13 ++++++++++++-
 python/cuml/fil/fil.pyx              | 11 +++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/python/cuml/experimental/fil/fil.pyx b/python/cuml/experimental/fil/fil.pyx
index b652d3e07d..302cc55cc2 100644
--- a/python/cuml/experimental/fil/fil.pyx
+++ b/python/cuml/experimental/fil/fil.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@ import warnings
 from libcpp cimport bool
 from libc.stdint cimport uint32_t, uintptr_t
 
+from cuml.internals.import_utils import has_sklearn
 from cuml.common.device_selection import using_device_type
 from cuml.internals.input_utils import input_to_cuml_array
 from cuml.internals.safe_imports import (
@@ -966,6 +967,16 @@ class ForestInference(UniversalBase, CMajorInputTagMixin):
             For GPU execution, the RAFT handle containing the stream or stream
             pool to use during loading and inference.
         """
+        # TODO(hcho3): Remove this check when https://github.com/dmlc/treelite/issues/544 is fixed
+        if has_sklearn():
+            from sklearn.ensemble import (
+                HistGradientBoostingClassifier as HistGradientBoostingC,
+            )
+            from sklearn.ensemble import (
+                HistGradientBoostingRegressor as HistGradientBoostingR,
+            )
+            if isinstance(skl_model, (HistGradientBoostingR, HistGradientBoostingC)):
+                raise NotImplementedError("HistGradientBoosting estimators are not yet supported")
         tl_model = treelite.sklearn.import_model(skl_model)
         if default_chunk_size is None:
             default_chunk_size = threads_per_tree
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 16413b34ac..1230280b83 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -29,6 +29,7 @@ from libc.stdint cimport uintptr_t
 from libc.stdlib cimport free
 
 import cuml.internals
+from cuml.internals.import_utils import has_sklearn
 from cuml.internals.array import CumlArray
 from cuml.internals.base import Base
 from pylibraft.common.handle cimport handle_t
@@ -881,6 +882,16 @@ class ForestInference(Base,
         logger.warn("Treelite currently does not support float64 model"
                     " parameters. Accuracy may degrade slightly relative to"
                     " native sklearn invocation.")
+        # TODO(hcho3): Remove this check when https://github.com/dmlc/treelite/issues/544 is fixed
+        if has_sklearn():
+            from sklearn.ensemble import (
+                HistGradientBoostingClassifier as HistGradientBoostingC,
+            )
+            from sklearn.ensemble import (
+                HistGradientBoostingRegressor as HistGradientBoostingR,
+            )
+            if isinstance(skl_model, (HistGradientBoostingR, HistGradientBoostingC)):
+                raise NotImplementedError("HistGradientBoosting estimators are not yet supported")
         tl_model = tl_skl.import_model(skl_model)
         cuml_fm.load_from_treelite_model(
             model=tl_model,

From dc02a3fc75d59c172cc4a1fa7f695018add8ae1e Mon Sep 17 00:00:00 2001
From: Jinfeng Li <jinfengl@nvidia.com>
Date: Mon, 5 Feb 2024 07:33:37 -0800
Subject: [PATCH 19/24] [LogisticRegressionMG] Support standardization with no
 data modification (#5724)

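The key idea is to leave the input data untouched: the coefficients are adapted before linearFwd so that it yields the same predictions as it would on standardized data, and the gradients are adapted after linearBwd so that they match the gradients computed on standardized data.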

Authors:
   - Jinfeng Li (https://github.com/lijinf2)

Approvers:
   - Corey J. Nolet (https://github.com/cjnolet)
---
 cpp/include/cuml/linear_model/qn_mg.hpp       |   6 +-
 cpp/src/glm/qn/mg/glm_base_mg.cuh             |  38 ++-
 cpp/src/glm/qn/mg/qn_mg.cuh                   |  19 +-
 cpp/src/glm/qn/mg/standardization.cuh         | 173 +++++++++++
 cpp/src/glm/qn_mg.cu                          |  67 ++++-
 .../dask/linear_model/logistic_regression.py  |   3 +-
 .../linear_model/logistic_regression_mg.pyx   |  14 +-
 .../dask/test_dask_logistic_regression.py     | 279 +++++++++++++++++-
 8 files changed, 576 insertions(+), 23 deletions(-)
 create mode 100644 cpp/src/glm/qn/mg/standardization.cuh

diff --git a/cpp/include/cuml/linear_model/qn_mg.hpp b/cpp/include/cuml/linear_model/qn_mg.hpp
index 21d35584be..2370814fab 100644
--- a/cpp/include/cuml/linear_model/qn_mg.hpp
+++ b/cpp/include/cuml/linear_model/qn_mg.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,6 +48,7 @@ std::vector<float> getUniquelabelsMG(const raft::handle_t& handle,
  * @param[out] coef: learned coefficients
  * @param[in] pams: model parameters
  * @param[in] X_col_major: true if X is stored column-major
+ * @param[in] standardization: whether to standardize the dataset before training
  * @param[in] n_classes: number of outputs (number of classes or `1` for regression)
  * @param[out] f: host pointer holding the final objective value
  * @param[out] num_iters: host pointer holding the actual number of iterations taken
@@ -59,6 +60,7 @@ void qnFit(raft::handle_t& handle,
            float* coef,
            const qn_params& pams,
            bool X_col_major,
+           bool standardization,
            int n_classes,
            float* f,
            int* num_iters);
@@ -73,6 +75,7 @@ void qnFit(raft::handle_t& handle,
  * @param[in] input_row_ids: vector holding row pointers of non-zero values of all partitions for
  * that rank
  * @param[in] X_nnz: the number of non-zero values of that rank
+ * @param[in] standardization: whether to standardize the dataset before training
  * @param[in] input_desc: PartDescriptor object for the input
  * @param[in] labels: labels data
  * @param[out] coef: learned coefficients
@@ -90,6 +93,7 @@ void qnFitSparse(raft::handle_t& handle,
                  std::vector<Matrix::Data<float>*>& labels,
                  float* coef,
                  const qn_params& pams,
+                 bool standardization,
                  int n_classes,
                  float* f,
                  int* num_iters);
diff --git a/cpp/src/glm/qn/mg/glm_base_mg.cuh b/cpp/src/glm/qn/mg/glm_base_mg.cuh
index 094d7197b6..eac3398b94 100644
--- a/cpp/src/glm/qn/mg/glm_base_mg.cuh
+++ b/cpp/src/glm/qn/mg/glm_base_mg.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,12 +20,17 @@
 #include <raft/linalg/multiply.cuh>
 #include <raft/util/cudart_utils.hpp>
 
+#include <glm/qn/mg/standardization.cuh>
+
 #include <glm/qn/glm_base.cuh>
 #include <glm/qn/glm_logistic.cuh>
 #include <glm/qn/glm_regularizer.cuh>
 #include <glm/qn/qn_solvers.cuh>
 #include <glm/qn/qn_util.cuh>
 
+#include <iostream>
+#include <vector>
+
 namespace ML {
 namespace GLM {
 namespace opg {
@@ -91,6 +96,7 @@ struct GLMWithDataMG : ML::GLM::detail::GLMWithData<T, GLMObjective> {
   int rank;
   int64_t n_samples;
   int n_ranks;
+  const Standardizer<T>* stder_p;
 
   GLMWithDataMG(raft::handle_t const& handle,
                 int rank,
@@ -99,13 +105,15 @@ struct GLMWithDataMG : ML::GLM::detail::GLMWithData<T, GLMObjective> {
                 GLMObjective* obj,
                 const SimpleMat<T>& X,
                 const SimpleVec<T>& y,
-                SimpleDenseMat<T>& Z)
+                SimpleDenseMat<T>& Z,
+                const Standardizer<T>* stder_p = NULL)
     : ML::GLM::detail::GLMWithData<T, GLMObjective>(obj, X, y, Z)
   {
     this->handle_p  = &handle;
     this->rank      = rank;
     this->n_ranks   = n_ranks;
     this->n_samples = n_samples;
+    this->stder_p   = stder_p;
   }
 
   inline T operator()(const SimpleVec<T>& wFlat,
@@ -134,6 +142,26 @@ struct GLMWithDataMG : ML::GLM::detail::GLMWithData<T, GLMObjective> {
       raft::resource::sync_stream(*(this->handle_p));
     }
 
+    // with a standardizer: back up the coefficients, then adapt them in place for linearFwd
+    std::vector<T> wFlatOrigin(this->C * this->dims);
+    if (stder_p != NULL) {
+      raft::copy(wFlatOrigin.data(), wFlat.data, this->C * this->dims, stream);
+
+      stder_p->adapt_model_for_linearFwd(
+        *handle_p, wFlat.data, this->C, (this->X)->n, (this->X)->n != G.n);
+
+      // scale reg part of the gradient for the upcoming adapt_gradient_for_linearBwd
+      raft::linalg::matrixVectorOp(G.data,
+                                   G.data,
+                                   stder_p->std.data,
+                                   stder_p->std.len,
+                                   G.m,
+                                   false,
+                                   true,
+                                   raft::mul_op(),
+                                   stream);
+    }
+
     // apply linearFwd, getLossAndDz, linearBwd
     ML::GLM::detail::linearFwd(
       lossFunc->handle, *(this->Z), *(this->X), W);  // linear part: forward pass
@@ -163,6 +191,12 @@ struct GLMWithDataMG : ML::GLM::detail::GLMWithData<T, GLMObjective> {
     communicator.allreduce(G.data, G.data, this->C * this->dims, raft::comms::op_t::SUM, stream);
     communicator.sync_stream(stream);
 
+    if (stder_p != NULL) {
+      stder_p->adapt_gradient_for_linearBwd(
+        *handle_p, G, *(this->Z), (this->X)->n != G.n, n_samples);
+      raft::copy(wFlat.data, wFlatOrigin.data(), this->C * this->dims, stream);
+    }
+
     T loss_host;
     raft::update_host(&loss_host, dev_scalar, 1, stream);
     raft::resource::sync_stream(*(this->handle_p));
diff --git a/cpp/src/glm/qn/mg/qn_mg.cuh b/cpp/src/glm/qn/mg/qn_mg.cuh
index 177eb17b1b..e9178dd6ac 100644
--- a/cpp/src/glm/qn/mg/qn_mg.cuh
+++ b/cpp/src/glm/qn/mg/qn_mg.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@
  */
 
 #include "glm_base_mg.cuh"
+#include "standardization.cuh"
+
 #include <glm/qn/glm_logistic.cuh>
 #include <glm/qn/glm_regularizer.cuh>
 #include <glm/qn/glm_softmax.cuh>
@@ -42,7 +44,8 @@ int qn_fit_mg(const raft::handle_t& handle,
               int* num_iters,
               size_t n_samples,
               int rank,
-              int n_ranks)
+              int n_ranks,
+              const Standardizer<T>* stder_p = NULL)
 {
   cudaStream_t stream = handle.get_stream();
   LBFGSParam<T> opt_param(pams);
@@ -59,7 +62,8 @@ int qn_fit_mg(const raft::handle_t& handle,
   ML::GLM::detail::Tikhonov<T> reg(l2);
   ML::GLM::detail::RegularizedGLM<T, LossFunction, decltype(reg)> regularizer_obj(&loss, &reg);
 
-  auto obj_function = GLMWithDataMG(handle, rank, n_ranks, n_samples, &regularizer_obj, X, y, Z);
+  auto obj_function =
+    GLMWithDataMG(handle, rank, n_ranks, n_samples, &regularizer_obj, X, y, Z, stder_p);
   return ML::GLM::detail::qn_minimize(
     handle, w0, fx, num_iters, obj_function, l1, opt_param, pams.verbose);
 }
@@ -76,8 +80,9 @@ inline void qn_fit_x_mg(const raft::handle_t& handle,
                         int64_t n_samples,
                         int rank,
                         int n_ranks,
-                        T* sample_weight = nullptr,
-                        T svr_eps        = 0)
+                        const Standardizer<T>* stder_p = NULL,
+                        T* sample_weight               = nullptr,
+                        T svr_eps                      = 0)
 {
   /*
    NB:
@@ -104,13 +109,13 @@ inline void qn_fit_x_mg(const raft::handle_t& handle,
       ASSERT(C > 0, "qn_mg.cuh: logistic loss invalid C");
       ML::GLM::detail::LogisticLoss<T> loss(handle, D, pams.fit_intercept);
       ML::GLM::opg::qn_fit_mg<T, decltype(loss)>(
-        handle, pams, loss, X, y, Z, w0_data, f, num_iters, n_samples, rank, n_ranks);
+        handle, pams, loss, X, y, Z, w0_data, f, num_iters, n_samples, rank, n_ranks, stder_p);
     } break;
     case QN_LOSS_SOFTMAX: {
       ASSERT(C > 2, "qn_mg.cuh: softmax invalid C");
       ML::GLM::detail::Softmax<T> loss(handle, D, C, pams.fit_intercept);
       ML::GLM::opg::qn_fit_mg<T, decltype(loss)>(
-        handle, pams, loss, X, y, Z, w0_data, f, num_iters, n_samples, rank, n_ranks);
+        handle, pams, loss, X, y, Z, w0_data, f, num_iters, n_samples, rank, n_ranks, stder_p);
     } break;
     default: {
       ASSERT(false, "qn_mg.cuh: unknown loss function type (id = %d).", pams.loss);
diff --git a/cpp/src/glm/qn/mg/standardization.cuh b/cpp/src/glm/qn/mg/standardization.cuh
new file mode 100644
index 0000000000..8b3eaeada0
--- /dev/null
+++ b/cpp/src/glm/qn/mg/standardization.cuh
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include <glm/qn/simple_mat/dense.hpp>
+#include <glm/qn/simple_mat/sparse.hpp>
+
+#include <raft/core/comms.hpp>
+#include <raft/core/handle.hpp>
+#include <raft/core/operators.hpp>
+
+#include <raft/linalg/binary_op.cuh>
+#include <raft/linalg/divide.cuh>
+#include <raft/linalg/multiply.cuh>
+#include <raft/linalg/sqrt.cuh>
+#include <raft/matrix/math.hpp>
+#include <raft/sparse/op/row_op.cuh>
+#include <raft/stats/stddev.cuh>
+#include <raft/stats/sum.cuh>
+
+#include <cuml/common/logger.hpp>
+
+namespace ML {
+namespace GLM {
+namespace opg {
+
+template <typename T>
+void mean_stddev(const raft::handle_t& handle,
+                 const SimpleDenseMat<T>& X,
+                 int n_samples,
+                 T* mean_vector,
+                 T* stddev_vector)
+{
+  const T* input_data = X.data;
+  int D               = X.n;
+  int num_rows        = X.m;
+  bool col_major      = (X.ord == COL_MAJOR);
+  auto stream         = handle.get_stream();
+  auto& comm          = handle.get_comms();
+
+  raft::stats::sum(mean_vector, input_data, D, num_rows, !col_major, stream);
+  T weight = T(1) / T(n_samples);
+  raft::linalg::multiplyScalar(mean_vector, mean_vector, weight, D, stream);
+  comm.allreduce(mean_vector, mean_vector, D, raft::comms::op_t::SUM, stream);
+  comm.sync_stream(stream);
+
+  raft::stats::vars(stddev_vector, input_data, mean_vector, D, num_rows, false, !col_major, stream);
+  weight = n_samples < 1 ? T(0) : T(1) * num_rows / T(n_samples - 1);
+  raft::linalg::multiplyScalar(stddev_vector, stddev_vector, weight, D, stream);
+  comm.allreduce(stddev_vector, stddev_vector, D, raft::comms::op_t::SUM, stream);
+  comm.sync_stream(stream);
+
+  raft::linalg::sqrt(stddev_vector, stddev_vector, D, handle.get_stream());
+}
+
+struct inverse_op {
+  template <typename T>
+  constexpr RAFT_INLINE_FUNCTION auto operator()(const T& a) const
+  {
+    return a == T(0.0) ? a : T(1.0) / a;
+  }
+};
+
+template <typename T>
+struct Standardizer {
+  SimpleVec<T> mean;
+  SimpleVec<T> std;
+  SimpleVec<T> std_inv;
+  SimpleVec<T> scaled_mean;
+
+  Standardizer(const raft::handle_t& handle,
+               const SimpleDenseMat<T>& X,
+               int n_samples,
+               rmm::device_uvector<T>& mean_std_buff)
+  {
+    int D = X.n;
+    ASSERT(mean_std_buff.size() == 4 * D, "buff size must be four times the dimension");
+
+    auto stream = handle.get_stream();
+
+    mean.reset(mean_std_buff.data(), D);
+    std.reset(mean_std_buff.data() + D, D);
+    std_inv.reset(mean_std_buff.data() + 2 * D, D);
+    scaled_mean.reset(mean_std_buff.data() + 3 * D, D);
+
+    mean_stddev(handle, X, n_samples, mean.data, std.data);
+
+    raft::linalg::unaryOp(std_inv.data, std.data, D, inverse_op(), stream);
+
+    // scaled_mean = mean * std_inv, i.e. mean / std (std_inv is 0 where std is 0)
+    raft::linalg::binaryOp(scaled_mean.data, std_inv.data, mean.data, D, raft::mul_op(), stream);
+  }
+
+  void adapt_model_for_linearFwd(
+    const raft::handle_t& handle, T* coef, int n_targets, int D, bool has_bias) const
+  {
+    ASSERT(D == mean.len, "dimension mismatches");
+
+    // adapt coefficients and intercept to avoid actual data standardization
+    SimpleDenseMat<T> W(coef, n_targets, D + has_bias);
+    SimpleDenseMat<T> Wweights;
+    col_slice(W, Wweights, 0, D);
+
+    auto mul_lambda = [] __device__(const T a, const T b) { return a * b; };
+    raft::linalg::matrixVectorOp(Wweights.data,
+                                 Wweights.data,
+                                 std_inv.data,
+                                 Wweights.n,
+                                 Wweights.m,
+                                 false,
+                                 true,
+                                 mul_lambda,
+                                 handle.get_stream());
+
+    if (has_bias) {
+      SimpleVec<T> Wbias;
+
+      col_ref(W, Wbias, D);
+
+      Wbias.assign_gemv(handle, -1, Wweights, false, mean, 1, handle.get_stream());
+    }
+  }
+
+  void adapt_gradient_for_linearBwd(const raft::handle_t& handle,
+                                    SimpleDenseMat<T>& G,
+                                    const SimpleDenseMat<T>& dZ,
+                                    bool has_bias,
+                                    int n_samples) const
+  {
+    auto stream   = handle.get_stream();
+    int D         = mean.len;
+    int n_targets = dZ.m;
+    auto& comm    = handle.get_comms();
+
+    // scale the weight gradients (first D columns of G) by std_inv
+    SimpleDenseMat<T> Gweights;
+    col_slice(G, Gweights, 0, D);
+
+    raft::matrix::matrixVectorBinaryMult(
+      Gweights.data, std_inv.data, Gweights.m, D, false, true, stream);
+
+    if (has_bias) {
+      SimpleVec<T> Gbias;
+      col_ref(G, Gbias, D);
+
+      SimpleDenseMat<T> Gbias_transpose_mat(Gbias.data, Gbias.m, 1);
+      SimpleDenseMat<T> scaled_mean_mat(scaled_mean.data, 1, D);
+
+      Gweights.assign_gemm(
+        handle, -1.0, Gbias_transpose_mat, false, scaled_mean_mat, false, 1.0, stream);
+    }
+  }
+};
+
+};  // namespace opg
+};  // namespace GLM
+};  // namespace ML
\ No newline at end of file
diff --git a/cpp/src/glm/qn_mg.cu b/cpp/src/glm/qn_mg.cu
index ee75316a18..1a128100c8 100644
--- a/cpp/src/glm/qn_mg.cu
+++ b/cpp/src/glm/qn_mg.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 
 #include "qn/mg/qn_mg.cuh"
+#include "qn/mg/standardization.cuh"
 #include "qn/simple_mat/dense.hpp"
 #include <cuda_runtime.h>
 #include <cuml/common/logger.hpp>
@@ -25,11 +26,22 @@
 #include <raft/core/error.hpp>
 #include <raft/core/handle.hpp>
 #include <raft/label/classlabels.cuh>
+#include <raft/linalg/divide.cuh>
+#include <raft/linalg/matrix_vector.cuh>
+#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/linalg/sqrt.cuh>
+#include <raft/matrix/math.hpp>
+#include <raft/stats/mean_center.cuh>
+#include <raft/stats/stddev.cuh>
+#include <raft/stats/sum.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <vector>
 using namespace MLCommon;
 
-#include <iostream>
+#include <cumlprims/opg/matrix/math.hpp>
+#include <cumlprims/opg/stats/mean.hpp>
+#include <cumlprims/opg/stats/mean_center.hpp>
+#include <cumlprims/opg/stats/stddev.hpp>
 
 namespace ML {
 namespace GLM {
@@ -81,6 +93,7 @@ void qnFit_impl(const raft::handle_t& handle,
                 const qn_params& pams,
                 T* X,
                 bool X_col_major,
+                bool standardization,
                 T* y,
                 size_t N,
                 size_t D,
@@ -94,6 +107,10 @@ void qnFit_impl(const raft::handle_t& handle,
 {
   auto X_simple = SimpleDenseMat<T>(X, N, D, X_col_major ? COL_MAJOR : ROW_MAJOR);
 
+  rmm::device_uvector<T> mean_std_buff(4 * D, handle.get_stream());
+  Standardizer<T>* stder = NULL;
+  if (standardization) stder = new Standardizer(handle, X_simple, n_samples, mean_std_buff);
+
   ML::GLM::opg::qn_fit_x_mg(handle,
                             pams,
                             X_simple,
@@ -104,7 +121,15 @@ void qnFit_impl(const raft::handle_t& handle,
                             num_iters,
                             n_samples,
                             rank,
-                            n_ranks);  // ignore sample_weight, svr_eps
+                            n_ranks,
+                            stder);  // ignore sample_weight, svr_eps
+
+  if (standardization) {
+    int n_targets = ML::GLM::detail::qn_is_classification(pams.loss) && C == 2 ? 1 : C;
+    stder->adapt_model_for_linearFwd(handle, w0, n_targets, D, pams.fit_intercept);
+    delete stder;
+  }
+
   return;
 }
 
@@ -116,6 +141,7 @@ void qnFit_impl(raft::handle_t& handle,
                 T* coef,
                 const qn_params& pams,
                 bool X_col_major,
+                bool standardization,
                 int n_classes,
                 T* f,
                 int* num_iters)
@@ -132,10 +158,13 @@ void qnFit_impl(raft::handle_t& handle,
     n_samples += p->size;
   }
 
+  auto stream = handle.get_stream();
+
   qnFit_impl<T>(handle,
                 pams,
                 data_X->ptr,
                 X_col_major,
+                standardization,
                 data_y->ptr,
                 input_desc.totalElementsOwnedBy(input_desc.rank),
                 input_desc.N,
@@ -166,12 +195,22 @@ void qnFit(raft::handle_t& handle,
            float* coef,
            const qn_params& pams,
            bool X_col_major,
+           bool standardization,
            int n_classes,
            float* f,
            int* num_iters)
 {
-  qnFit_impl<float>(
-    handle, input_data, input_desc, labels, coef, pams, X_col_major, n_classes, f, num_iters);
+  qnFit_impl<float>(handle,
+                    input_data,
+                    input_desc,
+                    labels,
+                    coef,
+                    pams,
+                    X_col_major,
+                    standardization,
+                    n_classes,
+                    f,
+                    num_iters);
 }
 
 template <typename T, typename I>
@@ -181,6 +220,7 @@ void qnFitSparse_impl(const raft::handle_t& handle,
                       I* X_cols,
                       I* X_row_ids,
                       I X_nnz,
+                      bool standardization,
                       T* y,
                       size_t N,
                       size_t D,
@@ -192,8 +232,13 @@ void qnFitSparse_impl(const raft::handle_t& handle,
                       int rank,
                       int n_ranks)
 {
+  RAFT_EXPECTS(standardization == false, "standardization for sparse vectors is not supported yet");
+
   auto X_simple = SimpleSparseMat<T>(X_values, X_cols, X_row_ids, X_nnz, N, D);
 
+  rmm::device_uvector<T> mean_std_buff(4 * D, handle.get_stream());
+  Standardizer<T>* stder = NULL;
+
   ML::GLM::opg::qn_fit_x_mg(handle,
                             pams,
                             X_simple,
@@ -204,7 +249,15 @@ void qnFitSparse_impl(const raft::handle_t& handle,
                             num_iters,
                             n_samples,
                             rank,
-                            n_ranks);  // ignore sample_weight, svr_eps
+                            n_ranks,
+                            stder);  // ignore sample_weight, svr_eps
+
+  if (standardization) {
+    int n_targets = ML::GLM::detail::qn_is_classification(pams.loss) && C == 2 ? 1 : C;
+    stder->adapt_model_for_linearFwd(handle, w0, n_targets, D, pams.fit_intercept);
+    delete stder;
+  }
+
   return;
 }
 
@@ -217,6 +270,7 @@ void qnFitSparse(raft::handle_t& handle,
                  std::vector<Matrix::Data<float>*>& labels,
                  float* coef,
                  const qn_params& pams,
+                 bool standardization,
                  int n_classes,
                  float* f,
                  int* num_iters)
@@ -233,6 +287,7 @@ void qnFitSparse(raft::handle_t& handle,
                                input_cols,
                                input_row_ids,
                                X_nnz,
+                               standardization,
                                data_y->ptr,
                                input_desc.totalElementsOwnedBy(input_desc.rank),
                                input_desc.N,
diff --git a/python/cuml/dask/linear_model/logistic_regression.py b/python/cuml/dask/linear_model/logistic_regression.py
index faf194962f..6f3a1e384d 100644
--- a/python/cuml/dask/linear_model/logistic_regression.py
+++ b/python/cuml/dask/linear_model/logistic_regression.py
@@ -140,8 +140,9 @@ class LogisticRegression(LinearRegression):
     <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html>`_.
     """
 
-    def __init__(self, **kwargs):
+    def __init__(self, *, standardization=False, **kwargs):
         super().__init__(**kwargs)
+        self.kwargs["standardization"] = standardization
 
     def fit(self, X, y):
         """
diff --git a/python/cuml/linear_model/logistic_regression_mg.pyx b/python/cuml/linear_model/logistic_regression_mg.pyx
index ae9a2db58b..aff46fb387 100644
--- a/python/cuml/linear_model/logistic_regression_mg.pyx
+++ b/python/cuml/linear_model/logistic_regression_mg.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -75,6 +75,7 @@ cdef extern from "cuml/linear_model/qn_mg.hpp" namespace "ML::GLM::opg" nogil:
         float *coef,
         const qn_params& pams,
         bool X_col_major,
+        bool standardization,
         int n_classes,
         float *f,
         int *num_iters) except +
@@ -94,6 +95,7 @@ cdef extern from "cuml/linear_model/qn_mg.hpp" namespace "ML::GLM::opg" nogil:
         vector[floatData_t *] labels,
         float *coef,
         const qn_params& pams,
+        bool standardization,
         int n_classes,
         float *f,
         int *num_iters) except +
@@ -101,8 +103,9 @@ cdef extern from "cuml/linear_model/qn_mg.hpp" namespace "ML::GLM::opg" nogil:
 
 class LogisticRegressionMG(MGFitMixin, LogisticRegression):
 
-    def __init__(self, **kwargs):
+    def __init__(self, *, standardization=False, **kwargs):
         super(LogisticRegressionMG, self).__init__(**kwargs)
+        self.standardization = standardization
 
     @property
     @cuml.internals.api_base_return_array_skipall
@@ -188,10 +191,12 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression):
         assert len(input_data) == 1, f"Currently support only one (X, y) pair in the list. Received {len(input_data)} pairs."
         self.is_col_major = False
         order = 'F' if self.is_col_major else 'C'
+
         super().fit(input_data, n_rows, n_cols, parts_rank_size, rank, order=order)
 
     @cuml.internals.api_base_return_any_skipall
     def _fit(self, X, y, coef_ptr, input_desc):
+
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
         cdef float objective32
         cdef int num_iters
@@ -213,6 +218,9 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression):
 
         sparse_input = isinstance(X, list)
 
+        if self.standardization:
+            assert not sparse_input, "standardization for sparse vectors is not supported yet"
+
         if self.dtype == np.float32:
             if sparse_input is False:
                 qnFit(
@@ -223,6 +231,7 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression):
                     <float*>mat_coef_ptr,
                     qnpams,
                     self.is_col_major,
+                    self.standardization,
                     self._num_classes,
                     <float*> &objective32,
                     <int*> &num_iters)
@@ -244,6 +253,7 @@ class LogisticRegressionMG(MGFitMixin, LogisticRegression):
                     deref(<vector[floatData_t*]*><uintptr_t>y),
                     <float*>mat_coef_ptr,
                     qnpams,
+                    self.standardization,
                     self._num_classes,
                     <float*> &objective32,
                     <int*> &num_iters)
diff --git a/python/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/tests/dask/test_dask_logistic_regression.py
index c50a9f3978..59934face6 100644
--- a/python/cuml/tests/dask/test_dask_logistic_regression.py
+++ b/python/cuml/tests/dask/test_dask_logistic_regression.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -82,11 +82,14 @@ def cal_chunks(dataset, n_partitions):
     return X_da, y_da
 
 
-def make_classification_dataset(datatype, nrows, ncols, n_info, n_classes=2):
+def make_classification_dataset(
+    datatype, nrows, ncols, n_info, n_redundant=2, n_classes=2
+):
     X, y = make_classification(
         n_samples=nrows,
         n_features=ncols,
         n_informative=n_info,
+        n_redundant=n_redundant,
         n_classes=n_classes,
         random_state=0,
     )
@@ -318,6 +321,7 @@ def test_lbfgs(
     penalty="l2",
     l1_ratio=None,
     C=1.0,
+    standardization=False,
     n_classes=2,
     convert_to_sparse=False,
 ):
@@ -353,6 +357,7 @@ def imp():
         penalty=penalty,
         l1_ratio=l1_ratio,
         C=C,
+        standardization=standardization,
         verbose=True,
     )
     lr.fit(X_dask, y_dask)
@@ -379,7 +384,7 @@ def array_to_numpy(ary):
     sk_model = skLR(
         solver=sk_solver,
         fit_intercept=fit_intercept,
-        penalty=penalty,
+        penalty=penalty if penalty != "none" else None,
         l1_ratio=l1_ratio,
         C=C,
     )
@@ -387,7 +392,7 @@ def array_to_numpy(ary):
     sk_coef = sk_model.coef_
     sk_intercept = sk_model.intercept_
 
-    if sk_solver == "lbfgs":
+    if sk_solver == "lbfgs" and standardization is False:
         assert len(lr_coef) == len(sk_coef)
         assert array_equal(lr_coef, sk_coef, tolerance, with_sign=True)
         assert array_equal(
@@ -653,3 +658,269 @@ def test_exception_one_label(fit_intercept, client):
     lr = LogisticRegression(fit_intercept=fit_intercept)
     with pytest.raises(ValueError, match=err_msg):
         lr.fit(X, y)
+
+
+@pytest.mark.mg
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize(
+    "regularization",
+    [
+        ("none", 1.0, None),
+        ("l2", 2.0, None),
+        ("l1", 2.0, None),
+        ("elasticnet", 2.0, 0.2),
+    ],
+)
+@pytest.mark.parametrize("datatype", [np.float32])
+@pytest.mark.parametrize("delayed", [False])
+@pytest.mark.parametrize("n_classes", [2, 8])
+def test_standardization_on_normal_dataset(
+    fit_intercept, regularization, datatype, delayed, n_classes, client
+):
+    """Run the shared ``test_lbfgs`` body with standardization enabled."""
+    # each regularization case is (penalty, C, l1_ratio)
+    reg_penalty, reg_C, reg_l1_ratio = regularization
+
+    # test_lbfgs only compares coefficients with scikit-learn when
+    # standardization is False, so that comparison is skipped here
+    test_lbfgs(
+        nrows=1e5,
+        ncols=20,
+        n_parts=2,
+        fit_intercept=fit_intercept,
+        datatype=datatype,
+        delayed=delayed,
+        client=client,
+        penalty=reg_penalty,
+        l1_ratio=reg_l1_ratio,
+        C=reg_C,
+        n_classes=n_classes,
+        standardization=True,
+    )
+
+
+@pytest.mark.mg
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize(
+    "regularization",
+    [
+        ("none", 1.0, None),
+        ("l2", 2.0, None),
+        ("l1", 2.0, None),
+        ("elasticnet", 2.0, 0.2),
+    ],
+)
+@pytest.mark.parametrize("datatype", [np.float32])
+@pytest.mark.parametrize("delayed", [False])
+@pytest.mark.parametrize("ncol_and_nclasses", [(2, 2), (6, 4), (100, 10)])
+def test_standardization_on_scaled_dataset(
+    fit_intercept, regularization, datatype, delayed, ncol_and_nclasses, client
+):
+    # standardization=True on raw data vs. CPU and MG fits on pre-scaled data
+    penalty = regularization[0]
+    C = regularization[1]
+    l1_ratio = regularization[2]
+    nrows = int(1e5)
+    ncols = ncol_and_nclasses[0]
+    n_classes = ncol_and_nclasses[1]
+    n_info = ncols
+    n_redundant = 0
+    n_parts = 2
+    tolerance = 0.005
+
+    from sklearn.linear_model import LogisticRegression as CPULR
+    from sklearn.model_selection import train_test_split
+    from cuml.dask.linear_model.logistic_regression import (
+        LogisticRegression as cumlLBFGS_dask,
+    )
+    from sklearn.preprocessing import StandardScaler
+
+    X, y = make_classification_dataset(
+        datatype,
+        nrows,
+        ncols,
+        n_info,
+        n_redundant=n_redundant,
+        n_classes=n_classes,
+    )
+    X[:, 0] *= 1000  # scale the first feature up by 1000
+    X[:, 0] += 50  # shift the first feature by 50
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=0
+    )
+
+    def to_dask_data(X_train, X_test, y_train, y_test):
+        X_train_dask, y_train_dask = _prep_training_data(
+            client, X_train, y_train, n_parts
+        )
+        X_test_dask, y_test_dask = _prep_training_data(
+            client, X_test, y_test, n_parts
+        )
+        return (X_train_dask, X_test_dask, y_train_dask, y_test_dask)
+
+    # fit the multi-GPU model with standardization=True on the raw data
+    mgon = cumlLBFGS_dask(
+        standardization=True,
+        solver="qn",
+        fit_intercept=fit_intercept,
+        penalty=penalty,
+        l1_ratio=l1_ratio,
+        C=C,
+        verbose=True,
+    )
+    X_train_dask, X_test_dask, y_train_dask, _ = to_dask_data(
+        X_train, X_test, y_train, y_test
+    )
+    mgon.fit(X_train_dask, y_train_dask)
+    mgon_preds = (
+        mgon.predict(X_test_dask, delayed=delayed).compute().to_numpy()
+    )
+    mgon_accuracy = accuracy_score(y_test, mgon_preds)
+    # fitting with standardization must leave the input data unchanged
+    assert array_equal(X_train_dask.compute().to_numpy(), X_train)
+
+    # CPU reference: scikit-learn fitted on StandardScaler-transformed data
+    # fit_intercept=True: mean-center, then scale the dataset
+    # fit_intercept=False: scale the dataset without mean-centering
+    scaler = StandardScaler(with_mean=fit_intercept, with_std=True)
+    scaler.fit(X_train)
+    X_train_scaled = scaler.transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    sk_solver = "lbfgs" if penalty == "l2" or penalty == "none" else "saga"
+    cpu = CPULR(
+        solver=sk_solver,
+        fit_intercept=fit_intercept,
+        penalty=penalty if penalty != "none" else None,
+        l1_ratio=l1_ratio,
+        C=C,
+    )
+    cpu.fit(X_train_scaled, y_train)
+    cpu_preds = cpu.predict(X_test_scaled)
+    cpu_accuracy = accuracy_score(y_test, cpu_preds)
+
+    assert len(mgon_preds) == len(cpu_preds)
+    assert (mgon_accuracy >= cpu_accuracy) | (
+        np.abs(mgon_accuracy - cpu_accuracy) < 1e-3
+    )
+
+    # map MG coefficients/intercept into the scaled feature space to compare
+    mgon_coef_origin = mgon.coef_.to_numpy() * scaler.scale_
+    if fit_intercept is True:
+        mgon_intercept_origin = mgon.intercept_.to_numpy() + np.dot(
+            mgon.coef_.to_numpy(), scaler.mean_
+        )
+    else:
+        mgon_intercept_origin = mgon.intercept_.to_numpy()
+
+    if sk_solver == "lbfgs":
+        assert array_equal(mgon_coef_origin, cpu.coef_, tolerance)
+        assert array_equal(mgon_intercept_origin, cpu.intercept_, tolerance)
+
+    # fit the multi-GPU model with standardization=False on pre-scaled data
+    mgoff = cumlLBFGS_dask(
+        standardization=False,
+        solver="qn",
+        fit_intercept=fit_intercept,
+        penalty=penalty,
+        l1_ratio=l1_ratio,
+        C=C,
+        verbose=True,
+    )
+    X_train_ds, X_test_ds, y_train_dask, _ = to_dask_data(
+        X_train_scaled, X_test_scaled, y_train, y_test
+    )
+    mgoff.fit(X_train_ds, y_train_dask)
+    mgoff_preds = (
+        mgoff.predict(X_test_ds, delayed=delayed).compute().to_numpy()
+    )
+    mgoff_accuracy = accuracy_score(y_test, mgoff_preds)
+
+    # compare accuracy and model parameters between the two MG fits
+    assert len(mgon_preds) == len(mgoff_preds)
+    assert (mgon_accuracy >= mgoff_accuracy) | (
+        np.abs(mgon_accuracy - mgoff_accuracy) < 1e-3
+    )
+
+    print(f"mgon_coef_origin: {mgon_coef_origin}")
+    print(f"mgoff.coef_: {mgoff.coef_.to_numpy()}")
+    print(f"mgon_intercept_origin: {mgon_intercept_origin}")
+    print(f"mgoff.intercept_: {mgoff.intercept_.to_numpy()}")
+    assert array_equal(mgon_coef_origin, mgoff.coef_.to_numpy(), tolerance)
+    assert array_equal(
+        mgon_intercept_origin, mgoff.intercept_.to_numpy(), tolerance
+    )
+
+
+@pytest.mark.mg
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize(
+    "regularization",
+    [
+        ("none", 1.0, None),
+        ("l2", 2.0, None),
+        ("l1", 2.0, None),
+        ("elasticnet", 2.0, 0.2),
+    ],
+)
+def test_standardization_example(fit_intercept, regularization, client):
+    """Compare standardization=True on raw data to fits on pre-scaled data."""
+    from sklearn.preprocessing import StandardScaler
+    from cuml.dask.linear_model import LogisticRegression as cumlLBFGS_dask
+    from cuml.linear_model import LogisticRegression as SG
+
+    # each regularization case is (penalty, C, l1_ratio)
+    penalty, C, l1_ratio = regularization
+
+    # dataset shape and partitioning
+    n_rows, n_cols, n_info, n_classes = int(1e5), 20, 10, 4
+    n_parts = 2
+
+    # max_iter cannot be set too large: GPU-specific coefficients were
+    # observed when the objective converges at 0.
+    est_params = dict(
+        penalty=penalty,
+        C=C,
+        l1_ratio=l1_ratio,
+        fit_intercept=fit_intercept,
+        max_iter=5,
+    )
+
+    X, y = make_classification_dataset(
+        np.float32, n_rows, n_cols, n_info, n_classes=n_classes
+    )
+
+    # Host-side reference scaling: center only when an intercept is fitted,
+    # and rescale var_ by n / (n - 1) before taking the square root.
+    n_obs = len(X)
+    scaler = StandardScaler(with_mean=fit_intercept, with_std=True)
+    scaler.fit(X)
+    scaler.scale_ = np.sqrt(scaler.var_ * n_obs / (n_obs - 1))
+    X_scaled = scaler.transform(X)
+
+    # Multi-GPU fit on the raw data with standardization switched on.
+    X_df, y_df = _prep_training_data(client, X, y, n_parts)
+    lr_on = cumlLBFGS_dask(standardization=True, verbose=True, **est_params)
+    lr_on.fit(X_df, y_df)
+
+    # Multiply coefficients by scale_ and, with an intercept, add coef . mean_.
+    on_coef = lr_on.coef_.to_numpy()
+    lron_coef_origin = on_coef * scaler.scale_
+    lron_intercept_origin = lr_on.intercept_.to_numpy()
+    if fit_intercept is True:
+        lron_intercept_origin = lron_intercept_origin + np.dot(
+            on_coef, scaler.mean_
+        )
+
+    # Multi-GPU fit on pre-scaled data without standardization must agree.
+    X_df_scaled, y_df = _prep_training_data(client, X_scaled, y, n_parts)
+    lr_off = cumlLBFGS_dask(standardization=False, **est_params)
+    lr_off.fit(X_df_scaled, y_df)
+    assert array_equal(lron_coef_origin, lr_off.coef_.to_numpy())
+    assert array_equal(lron_intercept_origin, lr_off.intercept_.to_numpy())
+
+    # The single-GPU estimator fitted on the same pre-scaled data must agree.
+    sg = SG(**est_params)
+    sg.fit(X_scaled, y)
+    assert array_equal(lron_coef_origin, sg.coef_)
+    assert array_equal(lron_intercept_origin, sg.intercept_)

From 7627240fdfcdca9b2c602d25eed75b8fe2ac9c0e Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <chohyu01@cs.washington.edu>
Date: Mon, 5 Feb 2024 08:23:45 -0800
Subject: [PATCH 20/24] [Hotfix] Update GPUTreeSHAP to fix ARM build (#5747)

Pick up rapidsai/gputreeshap#42 to fix build on ARM.
---
 cpp/cmake/thirdparty/get_gputreeshap.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_gputreeshap.cmake b/cpp/cmake/thirdparty/get_gputreeshap.cmake
index 6ebe3a067e..4f84cbc475 100644
--- a/cpp/cmake/thirdparty/get_gputreeshap.cmake
+++ b/cpp/cmake/thirdparty/get_gputreeshap.cmake
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -68,4 +68,4 @@ function(find_and_configure_gputreeshap)
 
 endfunction()
 
-find_and_configure_gputreeshap(PINNED_TAG ae946908b4cdc2bf498deefc426a3656761166f5)
+find_and_configure_gputreeshap(PINNED_TAG 9d32df85f822f186b5fbf53a9a1fa0251d0cd755)

From 65302d3a7d474fac1c630190a6c1a56e4495d725 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <phcho@nvidia.com>
Date: Tue, 6 Feb 2024 16:16:57 -0800
Subject: [PATCH 21/24] [Hotfix] Fix FIL gtest (#5755)

Fix failing tests in gtest for FIL.
```
[  FAILED  ] FilTests/TreeliteDenseFloat32FilTest.Import/0, where GetParam() = num_rows = 20000, num_cols = 50, nan_prob = 0.05, depth = 8, num_trees = 50, leaf_prob = 0.05, output = RAW, threshold = 0, threads_per_tree = 1, n_items = 0, blocks_per_sm = 0, algo = 1, seed = 42, tolerance = 0.002, op = <, global_bias = 0, leaf_algo = 0, num_classes = 1, node_categorical_prob = 0, feature_categorical_prob = 0, cat_match_prob = 0.5, max_magnitude_of_matching_cat = 1
```

Authors:
   - Philip Hyunsu Cho (https://github.com/hcho3)

Approvers:
   - William Hicks (https://github.com/wphicks)
---
 cpp/test/sg/fil_test.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/test/sg/fil_test.cu b/cpp/test/sg/fil_test.cu
index d85b097e47..eb8cccdacb 100644
--- a/cpp/test/sg/fil_test.cu
+++ b/cpp/test/sg/fil_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -787,6 +787,7 @@ class TreeliteFilTest : public BaseFilTest<real_t> {
         case fil::leaf_algo_t::GROVE_PER_CLASS_FEW_CLASSES:
         case fil::leaf_algo_t::GROVE_PER_CLASS_MANY_CLASSES: break;
       }
+      builder->EndNode();
     } else {
       int left          = root + 2 * (node - root) + 1;
       int right         = root + 2 * (node - root) + 2;
@@ -806,8 +807,6 @@ class TreeliteFilTest : public BaseFilTest<real_t> {
           }
         }
       }
-      node_to_treelite(builder, root, left);
-      node_to_treelite(builder, root, right);
       // TODO(levsnv): remove workaround once confirmed to work with empty category lists in
       // Treelite
       if (!right_categories.empty() && dense_node.is_categorical()) {
@@ -818,8 +817,10 @@ class TreeliteFilTest : public BaseFilTest<real_t> {
         adjust_threshold_to_treelite(&threshold, &left, &right, &default_left, this->ps.op);
         builder->NumericalTest(dense_node.fid(), threshold, default_left, this->ps.op, left, right);
       }
+      builder->EndNode();
+      node_to_treelite(builder, root, left);
+      node_to_treelite(builder, root, right);
     }
-    builder->EndNode();
   }
 
   void init_forest_impl(fil::forest_t<real_t>* pforest, fil::storage_type_t storage_type)
@@ -875,7 +876,7 @@ class TreeliteFilTest : public BaseFilTest<real_t> {
         postprocessor_name = "sigmoid";
       }
     } else if (this->ps.leaf_algo != fil::leaf_algo_t::FLOAT_UNARY_BINARY) {
-      postprocessor_name = "softmax";
+      postprocessor_name = "identity_multiclass";
       this->ps.output    = fil::output_t(this->ps.output | fil::output_t::SOFTMAX);
     } else if (this->ps.leaf_algo == GROVE_PER_CLASS) {
       postprocessor_name = "identity_multiclass";

From 2d8f1c22262c7d63bcf8635f7cfdd33c3c5d3b7c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 7 Feb 2024 05:52:18 -0800
Subject: [PATCH 22/24] Exclude tests from builds (#5754)

---
 python/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index e08bedd774..9b77eb34c6 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -127,6 +127,7 @@ build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
 cmake.minimum-version = "3.26.4"
 ninja.make-fallback = true
+sdist.exclude = ["*tests*"]
 sdist.reproducible = true
 wheel.packages = ["cuml"]
 

From d3349c791f0c978aa5443ca77661709ddebdc7db Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 9 Feb 2024 08:13:55 -0600
Subject: [PATCH 23/24] Fix ctest directory to ensure tests are executed
 (#5753)

C++ tests were not being run because the `test_cpp.sh` script was not executing `ctest` from the tests directory. This PR fixes that problem and adds the flag `--no-tests=error` to fail if no tests are found in the future.

It seems this was broken in #5487.

Also closes #5757 (includes the same changes).

Authors:
   - Dante Gama Dessavre (https://github.com/dantegd)
   - Bradley Dice (https://github.com/bdice)
   - Ray Douglass (https://github.com/raydouglass)

Approvers:
   - Ray Douglass (https://github.com/raydouglass)
---
 ci/test_cpp.sh          | 10 +++++++---
 cpp/test/CMakeLists.txt |  4 ++--
 cpp/test/sg/svc_test.cu |  2 ++
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index a076887545..5cfa376667 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -37,8 +37,12 @@ trap "EXITCODE=1" ERR
 set +e
 
 # Run libcuml gtests from libcuml-tests package
-rapids-logger "Run gtests"
-ctest -j9 --output-on-failure
+export GTEST_OUTPUT=xml:${RAPIDS_TESTS_DIR}/
+
+pushd $CONDA_PREFIX/bin/gtests/libcuml/
+rapids-logger "Run libcuml gtests"
+ctest -j9 --output-on-failure --no-tests=error
+popd
 
 rapids-logger "Test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 2ff1ebc5b4..0033c844ae 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -182,7 +182,7 @@ if(all_algo OR solvers_algo)
 endif()
 
 if(all_algo OR svm_algo)
-  ConfigureTest(PREFIX SG NAME SVC_TEST  sg/svc_test.cu ML_INCLUDE)
+  ConfigureTest(PREFIX SG NAME SVC_TEST  sg/svc_test.cu ML_INCLUDE GPUS 1 PERCENT 100)
   # The SVC Test tries to verify it has no memory leaks by checking
   # how much free memory on the GPU exists after execution. This
   # check requires no other GPU tests to be running or it fails
diff --git a/cpp/test/sg/svc_test.cu b/cpp/test/sg/svc_test.cu
index 709e48de18..cb141ae874 100644
--- a/cpp/test/sg/svc_test.cu
+++ b/cpp/test/sg/svc_test.cu
@@ -1397,6 +1397,8 @@ TYPED_TEST(SmoSolverTest, BlobPredict)
 
 TYPED_TEST(SmoSolverTest, MemoryLeak)
 {
+  GTEST_SKIP();  // Skip the tests in CI for release 24.02
+                 // https://github.com/rapidsai/cuml/issues/5763
   auto stream = this->handle.get_stream();
   // We measure that we have the same amount of free memory available on the GPU
   // before and after we call SVM. This can help catch memory leaks, but it is

From e8fe5e36ad898db48c553950039ec8656e386990 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Mon, 12 Feb 2024 15:43:59 -0500
Subject: [PATCH 24/24] Update Changelog [skip ci]

---
 CHANGELOG.md | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 037b314f90..301b2cee51 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,39 @@
+# cuML 24.02.00 (12 Feb 2024)
+
+## 🚨 Breaking Changes
+
+- Update to CCCL 2.2.0. ([#5702](https://github.com/rapidsai/cuml/pull/5702)) [@bdice](https://github.com/bdice)
+- Switch to scikit-build-core ([#5693](https://github.com/rapidsai/cuml/pull/5693)) [@vyasr](https://github.com/vyasr)
+
+## 🐛 Bug Fixes
+
+- [Hotfix] Fix FIL gtest ([#5755](https://github.com/rapidsai/cuml/pull/5755)) [@hcho3](https://github.com/hcho3)
+- Exclude tests from builds ([#5754](https://github.com/rapidsai/cuml/pull/5754)) [@vyasr](https://github.com/vyasr)
+- Fix ctest directory to ensure tests are executed ([#5753](https://github.com/rapidsai/cuml/pull/5753)) [@bdice](https://github.com/bdice)
+- Synchronize stream in SVC memory test ([#5729](https://github.com/rapidsai/cuml/pull/5729)) [@wphicks](https://github.com/wphicks)
+- Fix shared-workflows repo name ([#5723](https://github.com/rapidsai/cuml/pull/5723)) [@raydouglass](https://github.com/raydouglass)
+- Fix cupy dependency in pyproject.toml ([#5705](https://github.com/rapidsai/cuml/pull/5705)) [@vyasr](https://github.com/vyasr)
+- Only cufft offers a static_nocallback version of the library ([#5703](https://github.com/rapidsai/cuml/pull/5703)) [@robertmaynard](https://github.com/robertmaynard)
+
+## 🛠️ Improvements
+
+- [Hotfix] Update GPUTreeSHAP to fix ARM build ([#5747](https://github.com/rapidsai/cuml/pull/5747)) [@hcho3](https://github.com/hcho3)
+- Disable HistGradientBoosting support for now ([#5744](https://github.com/rapidsai/cuml/pull/5744)) [@hcho3](https://github.com/hcho3)
+- Disable hnswlib feature in RAFT; pin pytest ([#5733](https://github.com/rapidsai/cuml/pull/5733)) [@hcho3](https://github.com/hcho3)
+- [LogisticRegressionMG] Support standardization with no data modification ([#5724](https://github.com/rapidsai/cuml/pull/5724)) [@lijinf2](https://github.com/lijinf2)
+- Remove usages of rapids-env-update ([#5716](https://github.com/rapidsai/cuml/pull/5716)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Remove extraneous SKBUILD_BUILD_OPTIONS ([#5714](https://github.com/rapidsai/cuml/pull/5714)) [@vyasr](https://github.com/vyasr)
+- Refactor CUDA versions in dependencies.yaml ([#5712](https://github.com/rapidsai/cuml/pull/5712)) [@jameslamb](https://github.com/jameslamb)
+- Update to CCCL 2.2.0. ([#5702](https://github.com/rapidsai/cuml/pull/5702)) [@bdice](https://github.com/bdice)
+- Migrate to Treelite 4.0 ([#5701](https://github.com/rapidsai/cuml/pull/5701)) [@hcho3](https://github.com/hcho3)
+- Use cuda::proclaim_return_type on device lambdas. ([#5696](https://github.com/rapidsai/cuml/pull/5696)) [@bdice](https://github.com/bdice)
+- Move _process_generic to base_return_types to avoid a circular import ([#5695](https://github.com/rapidsai/cuml/pull/5695)) [@dcolinmorgan](https://github.com/dcolinmorgan)
+- Switch to scikit-build-core ([#5693](https://github.com/rapidsai/cuml/pull/5693)) [@vyasr](https://github.com/vyasr)
+- Fix all deprecated function calls in TUs where warnings are errors ([#5692](https://github.com/rapidsai/cuml/pull/5692)) [@vyasr](https://github.com/vyasr)
+- Remove CUML_BUILD_WHEELS and standardize Python builds ([#5689](https://github.com/rapidsai/cuml/pull/5689)) [@vyasr](https://github.com/vyasr)
+- Forward-merge branch-23.12 to branch-24.02 ([#5657](https://github.com/rapidsai/cuml/pull/5657)) [@bdice](https://github.com/bdice)
+- Add cuML devcontainers ([#5568](https://github.com/rapidsai/cuml/pull/5568)) [@trxcllnt](https://github.com/trxcllnt)
+
 # cuML 23.12.00 (6 Dec 2023)
 
 ## 🚨 Breaking Changes