From 682966f604b0577f0f662812d26dd4a2bec1d1e6 Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Tue, 11 Feb 2025 18:30:50 -0800
Subject: [PATCH] [fbgemm_gpu] Break down CMake module further, pt 2

Break down `fbgemm_gpu_tbe_training_backward` module further, pt 2
---
 .github/scripts/utils_cuda.bash    |  2 +-
 fbgemm_gpu/cmake/TbeTraining.cmake | 16 ++++
 fbgemm_gpu/cmake/tbe_sources.py    | 78 +++++++++++++++++++-----------
 fbgemm_gpu/fbgemm_gpu/__init__.py  |  1 +
 4 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash
index a3147dec4c..d006bfdd36 100644
--- a/.github/scripts/utils_cuda.bash
+++ b/.github/scripts/utils_cuda.bash
@@ -192,7 +192,7 @@ install_cuda () {
   # in the future, we will be using conda-forge for installing all CUDA versions
   # (except for versions 11.8 and below, which are only available through
   # nvidia/label/cuda-*)
-  if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
+  if [[ "$cuda_version" =~ ^12.6.*$ ]]; then
     # shellcheck disable=SC2086
     (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
       cuda=${cuda_version}) || return 1
diff --git a/fbgemm_gpu/cmake/TbeTraining.cmake b/fbgemm_gpu/cmake/TbeTraining.cmake
index 8ce711cdcd..0393a64a4b 100644
--- a/fbgemm_gpu/cmake/TbeTraining.cmake
+++ b/fbgemm_gpu/cmake/TbeTraining.cmake
@@ -28,10 +28,12 @@ get_tbe_sources_list(gen_cpu_files_training)
 get_tbe_sources_list(gen_gpu_files_training)
 get_tbe_sources_list(gen_gpu_files_training_pt2)
 get_tbe_sources_list(gen_gpu_files_training_dense)
+get_tbe_sources_list(gen_gpu_files_training_split_host)
 handle_genfiles_rocm(gen_cpu_files_training)
 handle_genfiles_rocm(gen_gpu_files_training)
 handle_genfiles_rocm(gen_gpu_files_training_pt2)
 handle_genfiles_rocm(gen_gpu_files_training_dense)
+handle_genfiles_rocm(gen_gpu_files_training_split_host)
 
 # Index Select
 get_tbe_sources_list(static_cpu_files_index_select)
@@ -119,6 +121,7 @@ gpu_cpp_library(
     ${TORCH_CUDA_OPTIONS}
   DEPS
     fbgemm
+    fbgemm_gpu_config
   DESTINATION
     fbgemm_gpu)
 
@@ -215,6 +218,19 @@ gpu_cpp_library(
   DESTINATION
     fbgemm_gpu)
 
+gpu_cpp_library(
+  PREFIX
+    fbgemm_gpu_tbe_training_backward_split_host
+  TYPE
+    SHARED
+  INCLUDE_DIRS
+    ${fbgemm_sources_include_directories}
+  GPU_SRCS
+    ${gen_gpu_files_training_split_host}
+  NVCC_FLAGS
+    ${TORCH_CUDA_OPTIONS}
+  DESTINATION
+    fbgemm_gpu)
 
 gpu_cpp_library(
   PREFIX
diff --git a/fbgemm_gpu/cmake/tbe_sources.py b/fbgemm_gpu/cmake/tbe_sources.py
index 4a00acbbe8..77b7b5ad41 100644
--- a/fbgemm_gpu/cmake/tbe_sources.py
+++ b/fbgemm_gpu/cmake/tbe_sources.py
@@ -364,19 +364,41 @@
     ]
 )
 
-gen_gpu_files_training_dense = [
-    # Dense host and kernel, and forward-quantized host src files
-    fstring.format(wdesc)
-    for wdesc in WEIGHT_OPTIONS
-    for fstring in [
-        "gen_embedding_backward_dense_split_{}_cuda.cu",
-        "gen_embedding_backward_dense_split_{}_meta.cpp",
-        "gen_embedding_backward_dense_split_{}_kernel_cta.cu",
-        "gen_embedding_backward_dense_split_{}_kernel_warp.cu",
+gen_gpu_files_training_dense = (
+    [
+        # Dense host and kernel, and forward-quantized host src files
+        fstring.format(wdesc)
+        for wdesc in WEIGHT_OPTIONS
+        for fstring in [
+            "gen_embedding_backward_dense_split_{}_cuda.cu",
+            "gen_embedding_backward_dense_split_{}_meta.cpp",
+            "gen_embedding_backward_dense_split_{}_kernel_cta.cu",
+            "gen_embedding_backward_dense_split_{}_kernel_warp.cu",
+        ]
+    ]
+    + [
+        "gen_embedding_backward_split_dense.cpp",
     ]
-] + [
-    "gen_embedding_backward_split_dense.cpp",
-]
+)
+
+gen_gpu_files_training_split_host = (
+    [
+        "gen_embedding_backward_split_{}.cpp".format(optimizer)
+        for optimizer in ALL_OPTIMIZERS
+    ]
+    + [
+        "gen_embedding_backward_ssd_{}.cpp".format(optimizer)
+        for optimizer in SSD_OPTIMIZERS
+    ]
+    + [
+        "gen_embedding_backward_{}_split_{}_meta.cpp".format(optimizer, wdesc)
+        for optimizer in GPU_OPTIMIZERS
+        for wdesc in [
+            "weighted",
+            "unweighted",
+        ]
+    ]
+)
 
 gen_gpu_files_training = (
     [
@@ -461,22 +483,22 @@
             else []
         )
     ]
-    + [
-        "gen_embedding_backward_split_{}.cpp".format(optimizer)
-        for optimizer in ALL_OPTIMIZERS
-    ]
-    + [
-        "gen_embedding_backward_ssd_{}.cpp".format(optimizer)
-        for optimizer in SSD_OPTIMIZERS
-    ]
-    + [
-        "gen_embedding_backward_{}_split_{}_meta.cpp".format(optimizer, wdesc)
-        for optimizer in GPU_OPTIMIZERS
-        for wdesc in [
-            "weighted",
-            "unweighted",
-        ]
-    ]
+    # + [
+    #     "gen_embedding_backward_split_{}.cpp".format(optimizer)
+    #     for optimizer in ALL_OPTIMIZERS
+    # ]
+    # + [
+    #     "gen_embedding_backward_ssd_{}.cpp".format(optimizer)
+    #     for optimizer in SSD_OPTIMIZERS
+    # ]
+    # + [
+    #     "gen_embedding_backward_{}_split_{}_meta.cpp".format(optimizer, wdesc)
+    #     for optimizer in GPU_OPTIMIZERS
+    #     for wdesc in [
+    #         "weighted",
+    #         "unweighted",
+    #     ]
+    # ]
 )
 
 gen_hip_files_training = [
diff --git a/fbgemm_gpu/fbgemm_gpu/__init__.py b/fbgemm_gpu/fbgemm_gpu/__init__.py
index 84a3a71943..c400772239 100644
--- a/fbgemm_gpu/fbgemm_gpu/__init__.py
+++ b/fbgemm_gpu/fbgemm_gpu/__init__.py
@@ -45,6 +45,7 @@ def _load_library(filename: str) -> None:
     "fbgemm_gpu_tbe_training_backward",
     "fbgemm_gpu_tbe_training_backward_pt2",
     "fbgemm_gpu_tbe_training_backward_dense",
+    "fbgemm_gpu_tbe_training_backward_split_host",
     "fbgemm_gpu_py",
 ]
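
Note: as a sanity check on the tbe_sources.py change, the sketch below
shows what the new gen_gpu_files_training_split_host list expands to.
The real ALL_OPTIMIZERS, SSD_OPTIMIZERS, and GPU_OPTIMIZERS lists are
defined earlier in tbe_sources.py; the optimizer names used here are
illustrative stand-ins, not the actual set.

# Assumed sample values; the real lists live in tbe_sources.py.
ALL_OPTIMIZERS = ["sgd", "rowwise_adagrad"]
SSD_OPTIMIZERS = ["rowwise_adagrad"]
GPU_OPTIMIZERS = ["rowwise_adagrad"]

gen_gpu_files_training_split_host = (
    [
        "gen_embedding_backward_split_{}.cpp".format(optimizer)
        for optimizer in ALL_OPTIMIZERS
    ]
    + [
        "gen_embedding_backward_ssd_{}.cpp".format(optimizer)
        for optimizer in SSD_OPTIMIZERS
    ]
    + [
        "gen_embedding_backward_{}_split_{}_meta.cpp".format(optimizer, wdesc)
        for optimizer in GPU_OPTIMIZERS
        for wdesc in ["weighted", "unweighted"]
    ]
)

for filename in gen_gpu_files_training_split_host:
    print(filename)

# Output with the sample values above:
#   gen_embedding_backward_split_sgd.cpp
#   gen_embedding_backward_split_rowwise_adagrad.cpp
#   gen_embedding_backward_ssd_rowwise_adagrad.cpp
#   gen_embedding_backward_rowwise_adagrad_split_weighted_meta.cpp
#   gen_embedding_backward_rowwise_adagrad_split_unweighted_meta.cpp

These are the per-optimizer host and meta sources that the patch moves
out of fbgemm_gpu_tbe_training_backward (where the old list entries are
left commented out) and into the new
fbgemm_gpu_tbe_training_backward_split_host target, which __init__.py
now loads alongside the other TBE modules.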