From 984dd762dd45b610d6f5a31f96a2a87d366b0ced Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Tue, 11 Feb 2025 10:07:44 -0800 Subject: [PATCH] Break down CMake module further (#3673) Summary: - Break down `fbgemm_gpu_tbe_training_backward` module further Differential Revision: D69443056 Pulled By: q10 --- fbgemm_gpu/cmake/TbeTraining.cmake | 42 ++++++++++++++++++++ fbgemm_gpu/cmake/tbe_sources.py | 63 ++++++++++++++++-------------- fbgemm_gpu/fbgemm_gpu/__init__.py | 2 + 3 files changed, 77 insertions(+), 30 deletions(-) diff --git a/fbgemm_gpu/cmake/TbeTraining.cmake b/fbgemm_gpu/cmake/TbeTraining.cmake index b3d8abd704..8ce711cdcd 100644 --- a/fbgemm_gpu/cmake/TbeTraining.cmake +++ b/fbgemm_gpu/cmake/TbeTraining.cmake @@ -26,8 +26,12 @@ handle_genfiles_rocm(gen_gpu_files_forward_split) get_tbe_sources_list(static_cpu_files_training) get_tbe_sources_list(gen_cpu_files_training) get_tbe_sources_list(gen_gpu_files_training) +get_tbe_sources_list(gen_gpu_files_training_pt2) +get_tbe_sources_list(gen_gpu_files_training_dense) handle_genfiles_rocm(gen_cpu_files_training) handle_genfiles_rocm(gen_gpu_files_training) +handle_genfiles_rocm(gen_gpu_files_training_pt2) +handle_genfiles_rocm(gen_gpu_files_training_dense) # Index Select get_tbe_sources_list(static_cpu_files_index_select) @@ -148,6 +152,27 @@ gpu_cpp_library( DESTINATION fbgemm_gpu) +gpu_cpp_library( + PREFIX + fbgemm_gpu_tbe_training_backward_pt2 + TYPE + SHARED + INCLUDE_DIRS + ${fbgemm_sources_include_directories} + GPU_SRCS + ${gen_gpu_files_training_pt2} + NVCC_FLAGS + ${TORCH_CUDA_OPTIONS} + DEPS + fbgemm + fbgemm_gpu_config + fbgemm_gpu_tbe_cache + fbgemm_gpu_tbe_common + fbgemm_gpu_tbe_utils + fbgemm_gpu_sparse_async_cumsum + DESTINATION + fbgemm_gpu) + gpu_cpp_library( PREFIX fbgemm_gpu_tbe_training_backward @@ -174,6 +199,23 @@ gpu_cpp_library( DESTINATION fbgemm_gpu) +gpu_cpp_library( + PREFIX + fbgemm_gpu_tbe_training_backward_dense + TYPE + SHARED + INCLUDE_DIRS + ${fbgemm_sources_include_directories} + GPU_SRCS + ${gen_gpu_files_training_dense} + NVCC_FLAGS + ${TORCH_CUDA_OPTIONS} + DEPS + fbgemm_gpu_tbe_training_backward + DESTINATION + fbgemm_gpu) + + gpu_cpp_library( PREFIX fbgemm_gpu_tbe_index_select diff --git a/fbgemm_gpu/cmake/tbe_sources.py b/fbgemm_gpu/cmake/tbe_sources.py index a32be1effa..4a00acbbe8 100644 --- a/fbgemm_gpu/cmake/tbe_sources.py +++ b/fbgemm_gpu/cmake/tbe_sources.py @@ -345,20 +345,42 @@ ] ) -gen_gpu_files_training = ( +gen_gpu_files_training_pt2 = ( [ - "gen_embedding_backward_split_grad_embedding_ops.cu", + "gen_embedding_split_{}_pt2_autograd.cpp".format(optimizer) + for optimizer in ALL_OPTIMIZERS ] + [ - # Dense host and kernel, and forward-quantized host src files - fstring.format(wdesc) - for wdesc in WEIGHT_OPTIONS - for fstring in [ - "gen_embedding_backward_dense_split_{}_cuda.cu", - "gen_embedding_backward_dense_split_{}_meta.cpp", - "gen_embedding_backward_dense_split_{}_kernel_cta.cu", - "gen_embedding_backward_dense_split_{}_kernel_warp.cu", - ] + "gen_embedding_ssd_{}_pt2_autograd.cpp".format(optimizer) + for optimizer in SSD_OPTIMIZERS + ] + + [ + "gen_embedding_backward_split_{}_pt2_cuda_wrapper.cpp".format(optimizer) + for optimizer in ALL_OPTIMIZERS + ] + + [ + "gen_embedding_backward_ssd_{}_pt2_cuda_wrapper.cpp".format(optimizer) + for optimizer in SSD_OPTIMIZERS + ] +) + +gen_gpu_files_training_dense = [ + # Dense host and kernel, and forward-quantized host src files + fstring.format(wdesc) + for wdesc in WEIGHT_OPTIONS + for fstring in [ + "gen_embedding_backward_dense_split_{}_cuda.cu", + "gen_embedding_backward_dense_split_{}_meta.cpp", + "gen_embedding_backward_dense_split_{}_kernel_cta.cu", + "gen_embedding_backward_dense_split_{}_kernel_warp.cu", + ] +] + [ + "gen_embedding_backward_split_dense.cpp", +] + +gen_gpu_files_training = ( + [ + "gen_embedding_backward_split_grad_embedding_ops.cu", ] + [ # Backward-split positional weights and forward src files @@ -447,14 +469,6 @@ "gen_embedding_backward_ssd_{}.cpp".format(optimizer) for optimizer in SSD_OPTIMIZERS ] - + [ - "gen_embedding_split_{}_pt2_autograd.cpp".format(optimizer) - for optimizer in ALL_OPTIMIZERS - ] - + [ - "gen_embedding_ssd_{}_pt2_autograd.cpp".format(optimizer) - for optimizer in SSD_OPTIMIZERS - ] + [ "gen_embedding_backward_{}_split_{}_meta.cpp".format(optimizer, wdesc) for optimizer in GPU_OPTIMIZERS @@ -463,17 +477,6 @@ "unweighted", ] ] - + [ - "gen_embedding_backward_split_{}_pt2_cuda_wrapper.cpp".format(optimizer) - for optimizer in ALL_OPTIMIZERS - ] - + [ - "gen_embedding_backward_ssd_{}_pt2_cuda_wrapper.cpp".format(optimizer) - for optimizer in SSD_OPTIMIZERS - ] - + [ - "gen_embedding_backward_split_dense.cpp", - ] ) gen_hip_files_training = [ diff --git a/fbgemm_gpu/fbgemm_gpu/__init__.py b/fbgemm_gpu/fbgemm_gpu/__init__.py index 84df9b705e..84a3a71943 100644 --- a/fbgemm_gpu/fbgemm_gpu/__init__.py +++ b/fbgemm_gpu/fbgemm_gpu/__init__.py @@ -43,6 +43,8 @@ def _load_library(filename: str) -> None: "fbgemm_gpu_tbe_inference", "fbgemm_gpu_tbe_training_forward", "fbgemm_gpu_tbe_training_backward", + "fbgemm_gpu_tbe_training_backward_pt2", + "fbgemm_gpu_tbe_training_backward_dense", "fbgemm_gpu_py", ]