Renaming from #3263 part2 (#3362)
Follow-up to #3349. A before/after sketch of the renamed calls follows the list below.

`KernelExecutor::compileFusion` -> `KernelExecutor::compile`
`KernelExecutor::runFusion` -> `KernelExecutor::run`
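
For illustration, a minimal before/after sketch of a typical call site (the `fusion`, `inputs`, and `heuristic_params` variables are assumed to already exist, as in the benchmark diffs below):

```cpp
KernelExecutor ke;

// Before this commit:
//   ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
//   auto outputs = ke.runFusion(inputs, heuristic_params->lparams);

// After this commit (same behavior, shorter names):
ke.compile(&fusion, inputs, heuristic_params->lparams);
auto outputs = ke.run(inputs, heuristic_params->lparams);
```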
naoyam authored Nov 7, 2024
1 parent ba4f7d4 commit 951dde6
Showing 55 changed files with 1,447 additions and 1,468 deletions.
16 changes: 8 additions & 8 deletions benchmarks/cpp/gelu_backward.cpp
@@ -163,7 +163,7 @@ static void NvFuserScheduler_GeluBackward_Compile(
 
   for (auto _ : benchmark_state) {
     KernelExecutor ke;
-    ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
+    ke.compile(&fusion, inputs, heuristic_params->lparams);
   }
 }
 
@@ -188,13 +188,13 @@ static void NvFuserScheduler_GeluBackward_RunFusion(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
+  ke.compile(&fusion, inputs, heuristic_params->lparams);
 
   C10_CUDA_CHECK(cudaDeviceSynchronize());
 
   for (auto _ : benchmark_state) {
-    outputs = ke.runFusion(
-        c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
+    outputs =
+        ke.run(c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
     C10_CUDA_CHECK(cudaDeviceSynchronize());
     clearL2Cache();
   }
@@ -219,7 +219,7 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
+  ke.compile(&fusion, inputs, heuristic_params->lparams);
 
   runBenchmarkIterations(
       benchmark_state, &ke, inputs, heuristic_params->lparams);
@@ -249,11 +249,11 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly(
 
   KernelExecutor ke;
   ke.setExecuteKernelFlag(false);
-  ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
+  ke.compile(&fusion, inputs, heuristic_params->lparams);
 
   for (auto _ : benchmark_state) {
-    outputs = ke.runFusion(
-        c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
+    outputs =
+        ke.run(c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
   }
 }
 
6 changes: 3 additions & 3 deletions benchmarks/cpp/indexselect.cpp
@@ -133,7 +133,7 @@ static void NvFuserScheduler_IndexSelect_Compile(
 
   for (auto _ : benchmark_state) {
     KernelExecutor ke;
-    ke.compileFusion(
+    ke.compile(
         &fusion, c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
   }
 }
@@ -156,15 +156,15 @@ static void NvFuserScheduler_IndexSelect_RunFusion(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       &fusion, c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
 
   C10_CUDA_CHECK(cudaDeviceSynchronize());
 
   at::Tensor output = at::empty_like(inputs[0].toTensor());
 
   for (auto _ : benchmark_state) {
-    ke.runFusion(
+    ke.run(
         c10::ArrayRef<c10::IValue>(inputs),
         {output},
         heuristic_params->lparams);
16 changes: 8 additions & 8 deletions benchmarks/cpp/lstm_cell.cpp
@@ -156,7 +156,7 @@ static void NvFuserScheduler_LstmCell_Compile(
 
   for (auto _ : benchmark_state) {
     KernelExecutor ke;
-    ke.compileFusion(&fusion, inputs);
+    ke.compile(&fusion, inputs);
   }
 }
 
@@ -183,13 +183,13 @@ static void NvFuserScheduler_LstmCell_RunFusion(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
+  ke.compile(&fusion, inputs);
 
   C10_CUDA_CHECK(cudaDeviceSynchronize());
 
   for (auto _ : benchmark_state) {
-    outputs = ke.runFusion(
-        c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
+    outputs =
+        ke.run(c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
     C10_CUDA_CHECK(cudaDeviceSynchronize());
   }
 }
@@ -221,7 +221,7 @@ static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
+  ke.compile(&fusion, inputs);
 
   runBenchmarkIterations(
       benchmark_state, &ke, inputs, heuristic_params->lparams);
@@ -261,11 +261,11 @@ static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly(
 
   KernelExecutor ke;
   ke.setExecuteKernelFlag(false);
-  ke.compileFusion(&fusion, inputs);
+  ke.compile(&fusion, inputs);
 
   for (auto _ : benchmark_state) {
-    outputs = ke.runFusion(
-        c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
+    outputs =
+        ke.run(c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
   }
 }
 
12 changes: 6 additions & 6 deletions benchmarks/cpp/matmul.cpp
@@ -176,15 +176,15 @@ static void SingleMatmulBase(
   // Compile kernel
   auto launch_constraints = LaunchParams();
   KernelExecutor ke;
-  ke.compileFusion(fusion, args, launch_constraints, cparams);
+  ke.compile(fusion, args, launch_constraints, cparams);
   NVF_CHECK(
       getBankConflictInfo(ke.kernel(), launch_constraints).empty(),
       "Shared memory bank conflict not removed.");
 
   std::vector<c10::IValue> aten_inputs({inputs.first, inputs.second});
 
   // Warm up run
-  auto outputs = ke.runFusion(aten_inputs);
+  auto outputs = ke.run(aten_inputs);
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), k);
 
   runBenchmarkIterations(benchmark_state, &ke, aten_inputs);
@@ -357,13 +357,13 @@ static void SingleMatmulPartitionedK(
   // Compile kernel
   KernelExecutor ke;
   auto lparams = LaunchParams();
-  ke.compileFusion(fusion, args, lparams, cparams);
+  ke.compile(fusion, args, lparams, cparams);
   NVF_CHECK(
       getBankConflictInfo(ke.kernel(), lparams).empty(),
       "Shared memory bank conflict not removed.");
 
   // Warm up run
-  auto outputs = ke.runFusion(aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), Ki);
 
@@ -462,15 +462,15 @@ static void NvFuserScheduler_MatmulSplitKReduction(
 
   // Compile kernel
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       fusion, args, heuristic_params->lparams, heuristic_params->cparams);
 
   NVF_CHECK(
       getBankConflictInfo(ke.kernel(), heuristic_params->lparams).empty(),
       "Shared memory bank conflict not removed.");
 
   // Warm up run
-  auto outputs = ke.runFusion(aten_inputs, heuristic_params->lparams);
+  auto outputs = ke.run(aten_inputs, heuristic_params->lparams);
 
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), splitk_factor);
 
4 changes: 2 additions & 2 deletions benchmarks/cpp/softmax.cpp
@@ -106,7 +106,7 @@ static void NvFuserScheduler_Softmax_WarpReduceReference(
   scheduler->schedule(fusion, heuristic_params.get());
 
   KernelExecutor ke;
-  ke.compileFusion(fusion, aten_inputs);
+  ke.compile(fusion, aten_inputs);
 
   runBenchmarkIterations(benchmark_state, &ke, aten_inputs);
 
@@ -153,7 +153,7 @@ static void NvFuserScheduler_Softmax_WarpReduce(
   }
 
   KernelExecutor ke;
-  ke.compileFusion(fusion, aten_inputs);
+  ke.compile(fusion, aten_inputs);
 
   runBenchmarkIterations(benchmark_state, &ke, aten_inputs);
 
8 changes: 4 additions & 4 deletions benchmarks/cpp/utils.cpp
@@ -230,8 +230,8 @@ int64_t runBenchmarkIterations(
   int64_t io_bytes = getSizeOfInputs(aten_inputs);
   {
     // Warm-up run
-    auto cg_outputs = fusion_executor->runFusion(
-        aten_inputs, launch_constraints, compile_params);
+    auto cg_outputs =
+        fusion_executor->run(aten_inputs, launch_constraints, compile_params);
     io_bytes += getSizeOfOutputs(cg_outputs);
   }
 
@@ -246,8 +246,8 @@
     clearL2Cache();
     FusionProfiler::start();
     FusionProfiler::createSegments(1);
-    auto cg_outputs = fusion_executor->runFusion(
-        aten_inputs, launch_constraints, compile_params);
+    auto cg_outputs =
+        fusion_executor->run(aten_inputs, launch_constraints, compile_params);
     FusionProfiler::stop();
     benchmark_state.SetIterationTime(
         FusionProfiler::profile().kernel_time_ms / 1000.0);
4 changes: 2 additions & 2 deletions csrc/host_ir/executor.cpp
@@ -157,9 +157,9 @@ void HostIrExecutor::handle(PostOnStream* post_ir) {
     if (!ke.isCompiled()) {
       Fusion* fusion = hu->fusion_to_execute();
       DynamicTransform::concretizeFusion(fusion, input_IValues);
-      ke.compileFusion(fusion, input_IValues);
+      ke.compile(fusion, input_IValues);
     }
-    outputs = ke.runFusion(input_IValues);
+    outputs = ke.run(input_IValues);
     if (!params_.cache_fusion_executor) {
      fe_.erase(hu);
     }
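
The host IR change above keeps the usual lazy compile-once, run-many pattern, just spelled with the shorter names. A minimal sketch, assuming a scheduled `Fusion* fusion` and ATen `inputs` are already in scope:

```cpp
KernelExecutor ke;
if (!ke.isCompiled()) {
  // First use: compile the fusion for these concrete inputs.
  ke.compile(fusion, inputs);
}
// Later calls reuse the compiled kernel.
auto outputs = ke.run(inputs);
```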
8 changes: 4 additions & 4 deletions csrc/python_frontend/fusion_definition.cpp
@@ -370,18 +370,18 @@ std::vector<at::Tensor> FusionDefinition::execute(
     if (user_sched.heuristic_params == nullptr) {
       // Manual schedule
       if (!user_sched.executor->isCompiled()) {
-        user_sched.executor->compileFusion(
+        user_sched.executor->compile(
             user_sched.scheduled_fusion.get(),
             inputs,
             user_sched.fusion_id_,
             user_sched.device_id_);
       }
-      outputs = user_sched.executor->runFusion(inputs);
+      outputs = user_sched.executor->run(inputs);
     } else {
       // Automatic scheduler was used for UserSchedule.
       // Pass launch and compile params to compileFusion and runFusion.
       if (!user_sched.executor->isCompiled()) {
-        user_sched.executor->compileFusion(
+        user_sched.executor->compile(
             user_sched.scheduled_fusion.get(),
             KernelArgumentHolder::createKernelArgumentHolder(
                 inputs, getCommonDeviceCUDA(inputs)),
@@ -391,7 +391,7 @@ std::vector<at::Tensor> FusionDefinition::execute(
             user_sched.fusion_id_,
             user_sched.device_id_);
       }
-      outputs = user_sched.executor->runFusion(
+      outputs = user_sched.executor->run(
           inputs,
           user_sched.heuristic_params->lparams,
           user_sched.heuristic_params->cparams);
4 changes: 2 additions & 2 deletions csrc/runtime/executor.cpp
@@ -185,7 +185,7 @@ std::string KernelExecutor::getStructuredCode() const {
   return getStructuredCode(kernelString(), kernel()->indexType());
 }
 
-void KernelExecutor::compileFusion(
+void KernelExecutor::compile(
     Fusion* fusion,
     const KernelArgumentHolder& args,
     const LaunchParams& launch_constraints,
@@ -1137,7 +1137,7 @@ at::Tensor findBufferForFusionOutput(
 }
 } // namespace
 
-std::vector<at::Tensor> KernelExecutor::runFusion(
+std::vector<at::Tensor> KernelExecutor::run(
     KernelArgumentHolder& args,
     const LaunchParams& launch_constraints,
     CompileParams compile_params,
22 changes: 11 additions & 11 deletions csrc/runtime/executor.h
@@ -42,7 +42,7 @@ class KernelExecutor : public NonCopyable {
   //! To compile a fusion with the 32-bit index type, CompileParams
   //! must be passed in. There used to be an index type associated
   //! with KernelArgumentHolder, but it is no longer the case.
-  NVF_API void compileFusion(
+  NVF_API void compile(
       Fusion* fusion,
       const KernelArgumentHolder& args,
       const LaunchParams& launch_constraints,
@@ -56,25 +56,25 @@ class KernelExecutor : public NonCopyable {
   // TODO: merge it with the overload above.
   //! This API is merely here so we don't have to go back and update all cpp
   //! tests.
-  void compileFusion(
+  void compile(
       Fusion* fusion,
       const at::ArrayRef<c10::IValue>& inputs = {},
       const LaunchParams& launch_constraints = LaunchParams(),
       CompileParams compile_params = CompileParams()) {
     KernelArgumentHolder args =
         KernelArgumentHolder::createKernelArgumentHolder(inputs);
-    compileFusion(fusion, args, launch_constraints, compile_params);
+    compile(fusion, args, launch_constraints, compile_params);
   }
 
   //! Used by user defined schedules in python frontend
-  void compileFusion(
+  void compile(
       Fusion* fusion,
       const at::ArrayRef<c10::IValue>& inputs,
       int64_t fusion_id,
       int64_t concrete_id) {
     KernelArgumentHolder args =
         KernelArgumentHolder::createKernelArgumentHolder(inputs);
-    compileFusion(
+    compile(
         fusion,
         args,
         LaunchParams(),
@@ -92,15 +92,15 @@
   // TODO: args shouldn't come in a reference here because we will append the
   // outputs to be able to send it to the kernel. For now none of the users are
   // reconsuming the args, so it is okay. It isn't done now because changing it
-  // from a reference makes a call as runFusion({}) ambiguous, and that is used
+  // from a reference makes a call as run({}) ambiguous, and that is used
   // in some places in the codebase.
-  NVF_API std::vector<at::Tensor> runFusion(
+  NVF_API std::vector<at::Tensor> run(
       KernelArgumentHolder& args,
       const LaunchParams& launch_constraints = LaunchParams(),
      CompileParams compile_params = CompileParams(),
      std::vector<at::Tensor> outputs = {});
 
-  std::vector<at::Tensor> runFusion(
+  std::vector<at::Tensor> run(
      const at::ArrayRef<c10::IValue>& inputs,
      const std::vector<at::Tensor>& outputs,
      const LaunchParams& launch_constraints = LaunchParams(),
@@ -111,15 +111,15 @@
     if (opt_code.has_value()) {
       args.setCacheId(*opt_code);
     }
-    return runFusion(args, launch_constraints, compile_params, outputs);
+    return run(args, launch_constraints, compile_params, outputs);
   }
 
-  std::vector<at::Tensor> runFusion(
+  std::vector<at::Tensor> run(
      const at::ArrayRef<c10::IValue>& inputs,
      const LaunchParams& launch_constraints = LaunchParams(),
      CompileParams compile_params = CompileParams(),
      const std::optional<size_t>& opt_code = std::nullopt) {
-    return runFusion(inputs, {}, launch_constraints, compile_params, opt_code);
+    return run(inputs, {}, launch_constraints, compile_params, opt_code);
   }
 
   // Register a lowering hooks that are called to modify the GpuLower object
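
For reference, the header above exposes one `KernelArgumentHolder`-based `run` plus two convenience overloads. A hedged sketch of how a call site might pick between them (variable names are illustrative; only the signatures shown in this header are used):

```cpp
KernelExecutor ke;
ke.compile(&fusion, inputs);

// Inputs only; the executor allocates the outputs.
auto out1 = ke.run(inputs);

// Inputs plus preallocated output tensors.
auto out2 = ke.run(inputs, outputs);

// KernelArgumentHolder directly, e.g. after tagging it with a cache id.
KernelArgumentHolder args =
    KernelArgumentHolder::createKernelArgumentHolder(inputs);
auto out3 = ke.run(args);
```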
4 changes: 2 additions & 2 deletions csrc/runtime/fusion_kernel_runtime.cpp
@@ -595,7 +595,7 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
   if (executor.groupId() < 0) {
     executor.setGroupId(group_id);
   }
-  auto outputs = executor.runFusion(args, launch_params, compile_params);
+  auto outputs = executor.run(args, launch_params, compile_params);
 
   return outputs;
 }
@@ -625,7 +625,7 @@ void FusionKernelRuntime::compileKernel(
   NVF_ERROR(
       heuristic_params->cparams.index_type.has_value(),
       "Kernel index type is not defined.");
-  executors_.at(group_id).compileFusion(
+  executors_.at(group_id).compile(
       fusion_to_run.get(),
       args,
       heuristic_params->lparams,
4 changes: 2 additions & 2 deletions examples/sinh_extension/main.cpp
@@ -35,8 +35,8 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) {
       SchedulerEntry::scheduleWith(&fusion, SchedulerType::PointWise, {input});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input}, heuristic_params->lparams);
-  auto outputs = ke.runFusion({input}, heuristic_params->lparams);
+  ke.compile(&fusion, {input}, heuristic_params->lparams);
+  auto outputs = ke.run({input}, heuristic_params->lparams);
 
   return outputs[0];
 }
4 changes: 2 additions & 2 deletions examples/sinh_libtorch/main.cpp
@@ -32,8 +32,8 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) {
       &fusion, SchedulerType::PointWise, {input});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input}, heuristic_params->lparams);
-  auto outputs = ke.runFusion({input}, heuristic_params->lparams);
+  ke.compile(&fusion, {input}, heuristic_params->lparams);
+  auto outputs = ke.run({input}, heuristic_params->lparams);
 
   return outputs[0];
 }