Renaming from #3263 part2 (#3362)
Follow-up to #3349. A before/after sketch of the renamed calls follows the list below.

`KernelExecutor::compileFusion` -> `KernelExecutor::compile`
`KernelExecutor::runFusion` -> `KernelExecutor::run`
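
For illustration, a minimal before/after sketch of a typical call site (the `fusion`, `inputs`, and `heuristic_params` variables are assumed to already exist, as in the benchmark diffs below):

```cpp
KernelExecutor ke;

// Before this commit:
//   ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
//   auto outputs = ke.runFusion(inputs, heuristic_params->lparams);

// After this commit (same behavior, shorter names):
ke.compile(&fusion, inputs, heuristic_params->lparams);
auto outputs = ke.run(inputs, heuristic_params->lparams);
```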
naoyam authored Nov 7, 2024
1 parent ba4f7d4 commit 951dde6
Showing 55 changed files with 1,447 additions and 1,468 deletions.
16 changes: 8 additions & 8 deletions benchmarks/cpp/gelu_backward.cpp
@@ -163,7 +163,7 @@ static void NvFuserScheduler_GeluBackward_Compile(
 
   for (auto _ : benchmark_state) {
     KernelExecutor ke;
-    ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
+    ke.compile(&fusion, inputs, heuristic_params->lparams);
   }
 }
 
@@ -188,13 +188,13 @@ static void NvFuserScheduler_GeluBackward_RunFusion(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
+  ke.compile(&fusion, inputs, heuristic_params->lparams);
 
   C10_CUDA_CHECK(cudaDeviceSynchronize());
 
   for (auto _ : benchmark_state) {
-    outputs = ke.runFusion(
-        c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
+    outputs =
+        ke.run(c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
     C10_CUDA_CHECK(cudaDeviceSynchronize());
     clearL2Cache();
   }
@@ -219,7 +219,7 @@ static void NvFuserScheduler_GeluBackward_RunFusion_GpuOnly(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
+  ke.compile(&fusion, inputs, heuristic_params->lparams);
 
   runBenchmarkIterations(
       benchmark_state, &ke, inputs, heuristic_params->lparams);
@@ -249,11 +249,11 @@ static void NvFuserScheduler_GeluBackward_RunFusion_CpuOnly(
 
   KernelExecutor ke;
   ke.setExecuteKernelFlag(false);
-  ke.compileFusion(&fusion, inputs, heuristic_params->lparams);
+  ke.compile(&fusion, inputs, heuristic_params->lparams);
 
   for (auto _ : benchmark_state) {
-    outputs = ke.runFusion(
-        c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
+    outputs =
+        ke.run(c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
   }
 }
 
6 changes: 3 additions & 3 deletions benchmarks/cpp/indexselect.cpp
@@ -133,7 +133,7 @@ static void NvFuserScheduler_IndexSelect_Compile(
 
   for (auto _ : benchmark_state) {
     KernelExecutor ke;
-    ke.compileFusion(
+    ke.compile(
         &fusion, c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
   }
 }
@@ -156,15 +156,15 @@ static void NvFuserScheduler_IndexSelect_RunFusion(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       &fusion, c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
 
   C10_CUDA_CHECK(cudaDeviceSynchronize());
 
   at::Tensor output = at::empty_like(inputs[0].toTensor());
 
   for (auto _ : benchmark_state) {
-    ke.runFusion(
+    ke.run(
         c10::ArrayRef<c10::IValue>(inputs),
         {output},
         heuristic_params->lparams);
16 changes: 8 additions & 8 deletions benchmarks/cpp/lstm_cell.cpp
@@ -156,7 +156,7 @@ static void NvFuserScheduler_LstmCell_Compile(
 
   for (auto _ : benchmark_state) {
     KernelExecutor ke;
-    ke.compileFusion(&fusion, inputs);
+    ke.compile(&fusion, inputs);
   }
 }
 
@@ -183,13 +183,13 @@ static void NvFuserScheduler_LstmCell_RunFusion(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
+  ke.compile(&fusion, inputs);
 
   C10_CUDA_CHECK(cudaDeviceSynchronize());
 
   for (auto _ : benchmark_state) {
-    outputs = ke.runFusion(
-        c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
+    outputs =
+        ke.run(c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
     C10_CUDA_CHECK(cudaDeviceSynchronize());
   }
 }
@@ -221,7 +221,7 @@ static void NvFuserScheduler_LstmCell_RunFusion_GpuOnly(
       &fusion, SchedulerType::PointWise, c10::ArrayRef<c10::IValue>(inputs));
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, inputs);
+  ke.compile(&fusion, inputs);
 
   runBenchmarkIterations(
       benchmark_state, &ke, inputs, heuristic_params->lparams);
@@ -261,11 +261,11 @@ static void NvFuserScheduler_LstmCell_RunFusion_CpuOnly(
 
   KernelExecutor ke;
   ke.setExecuteKernelFlag(false);
-  ke.compileFusion(&fusion, inputs);
+  ke.compile(&fusion, inputs);
 
   for (auto _ : benchmark_state) {
-    outputs = ke.runFusion(
-        c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
+    outputs =
+        ke.run(c10::ArrayRef<c10::IValue>(inputs), heuristic_params->lparams);
   }
 }
 
12 changes: 6 additions & 6 deletions benchmarks/cpp/matmul.cpp
@@ -176,15 +176,15 @@ static void SingleMatmulBase(
   // Compile kernel
   auto launch_constraints = LaunchParams();
   KernelExecutor ke;
-  ke.compileFusion(fusion, args, launch_constraints, cparams);
+  ke.compile(fusion, args, launch_constraints, cparams);
   NVF_CHECK(
       getBankConflictInfo(ke.kernel(), launch_constraints).empty(),
       "Shared memory bank conflict not removed.");
 
   std::vector<c10::IValue> aten_inputs({inputs.first, inputs.second});
 
   // Warm up run
-  auto outputs = ke.runFusion(aten_inputs);
+  auto outputs = ke.run(aten_inputs);
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), k);
 
   runBenchmarkIterations(benchmark_state, &ke, aten_inputs);
@@ -357,13 +357,13 @@ static void SingleMatmulPartitionedK(
   // Compile kernel
   KernelExecutor ke;
   auto lparams = LaunchParams();
-  ke.compileFusion(fusion, args, lparams, cparams);
+  ke.compile(fusion, args, lparams, cparams);
   NVF_CHECK(
       getBankConflictInfo(ke.kernel(), lparams).empty(),
       "Shared memory bank conflict not removed.");
 
   // Warm up run
-  auto outputs = ke.runFusion(aten_inputs);
+  auto outputs = ke.run(aten_inputs);
 
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), Ki);
 
@@ -462,15 +462,15 @@ static void NvFuserScheduler_MatmulSplitKReduction(
 
   // Compile kernel
   KernelExecutor ke;
-  ke.compileFusion(
+  ke.compile(
       fusion, args, heuristic_params->lparams, heuristic_params->cparams);
 
   NVF_CHECK(
       getBankConflictInfo(ke.kernel(), heuristic_params->lparams).empty(),
       "Shared memory bank conflict not removed.");
 
   // Warm up run
-  auto outputs = ke.runFusion(aten_inputs, heuristic_params->lparams);
+  auto outputs = ke.run(aten_inputs, heuristic_params->lparams);
 
   checkMatch(expected_output, outputs.at(0).to(at::kDouble), splitk_factor);
 
4 changes: 2 additions & 2 deletions benchmarks/cpp/softmax.cpp
@@ -106,7 +106,7 @@ static void NvFuserScheduler_Softmax_WarpReduceReference(
   scheduler->schedule(fusion, heuristic_params.get());
 
   KernelExecutor ke;
-  ke.compileFusion(fusion, aten_inputs);
+  ke.compile(fusion, aten_inputs);
 
   runBenchmarkIterations(benchmark_state, &ke, aten_inputs);
 
@@ -153,7 +153,7 @@ static void NvFuserScheduler_Softmax_WarpReduce(
   }
 
   KernelExecutor ke;
-  ke.compileFusion(fusion, aten_inputs);
+  ke.compile(fusion, aten_inputs);
 
   runBenchmarkIterations(benchmark_state, &ke, aten_inputs);
 
8 changes: 4 additions & 4 deletions benchmarks/cpp/utils.cpp
@@ -230,8 +230,8 @@ int64_t runBenchmarkIterations(
   int64_t io_bytes = getSizeOfInputs(aten_inputs);
   {
     // Warm-up run
-    auto cg_outputs = fusion_executor->runFusion(
-        aten_inputs, launch_constraints, compile_params);
+    auto cg_outputs =
+        fusion_executor->run(aten_inputs, launch_constraints, compile_params);
     io_bytes += getSizeOfOutputs(cg_outputs);
   }
 
@@ -246,8 +246,8 @@
     clearL2Cache();
     FusionProfiler::start();
     FusionProfiler::createSegments(1);
-    auto cg_outputs = fusion_executor->runFusion(
-        aten_inputs, launch_constraints, compile_params);
+    auto cg_outputs =
+        fusion_executor->run(aten_inputs, launch_constraints, compile_params);
     FusionProfiler::stop();
     benchmark_state.SetIterationTime(
         FusionProfiler::profile().kernel_time_ms / 1000.0);
4 changes: 2 additions & 2 deletions csrc/host_ir/executor.cpp
@@ -157,9 +157,9 @@ void HostIrExecutor::handle(PostOnStream* post_ir) {
     if (!ke.isCompiled()) {
       Fusion* fusion = hu->fusion_to_execute();
       DynamicTransform::concretizeFusion(fusion, input_IValues);
-      ke.compileFusion(fusion, input_IValues);
+      ke.compile(fusion, input_IValues);
     }
-    outputs = ke.runFusion(input_IValues);
+    outputs = ke.run(input_IValues);
     if (!params_.cache_fusion_executor) {
      fe_.erase(hu);
     }
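
The host IR change above keeps the usual lazy compile-once, run-many pattern, just spelled with the shorter names. A minimal sketch, assuming a scheduled `Fusion* fusion` and ATen `inputs` are already in scope:

```cpp
KernelExecutor ke;
if (!ke.isCompiled()) {
  // First use: compile the fusion for these concrete inputs.
  ke.compile(fusion, inputs);
}
// Later calls reuse the compiled kernel.
auto outputs = ke.run(inputs);
```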
8 changes: 4 additions & 4 deletions csrc/python_frontend/fusion_definition.cpp
@@ -370,18 +370,18 @@ std::vector<at::Tensor> FusionDefinition::execute(
     if (user_sched.heuristic_params == nullptr) {
       // Manual schedule
       if (!user_sched.executor->isCompiled()) {
-        user_sched.executor->compileFusion(
+        user_sched.executor->compile(
             user_sched.scheduled_fusion.get(),
             inputs,
             user_sched.fusion_id_,
             user_sched.device_id_);
       }
-      outputs = user_sched.executor->runFusion(inputs);
+      outputs = user_sched.executor->run(inputs);
     } else {
       // Automatic scheduler was used for UserSchedule.
       // Pass launch and compile params to compileFusion and runFusion.
       if (!user_sched.executor->isCompiled()) {
-        user_sched.executor->compileFusion(
+        user_sched.executor->compile(
             user_sched.scheduled_fusion.get(),
             KernelArgumentHolder::createKernelArgumentHolder(
                 inputs, getCommonDeviceCUDA(inputs)),
@@ -391,7 +391,7 @@ std::vector<at::Tensor> FusionDefinition::execute(
             user_sched.fusion_id_,
             user_sched.device_id_);
       }
-      outputs = user_sched.executor->runFusion(
+      outputs = user_sched.executor->run(
           inputs,
           user_sched.heuristic_params->lparams,
           user_sched.heuristic_params->cparams);
4 changes: 2 additions & 2 deletions csrc/runtime/executor.cpp
@@ -185,7 +185,7 @@ std::string KernelExecutor::getStructuredCode() const {
   return getStructuredCode(kernelString(), kernel()->indexType());
 }
 
-void KernelExecutor::compileFusion(
+void KernelExecutor::compile(
     Fusion* fusion,
     const KernelArgumentHolder& args,
     const LaunchParams& launch_constraints,
@@ -1137,7 +1137,7 @@ at::Tensor findBufferForFusionOutput(
 }
 } // namespace
 
-std::vector<at::Tensor> KernelExecutor::runFusion(
+std::vector<at::Tensor> KernelExecutor::run(
     KernelArgumentHolder& args,
     const LaunchParams& launch_constraints,
     CompileParams compile_params,
22 changes: 11 additions & 11 deletions csrc/runtime/executor.h
@@ -42,7 +42,7 @@ class KernelExecutor : public NonCopyable {
   //! To compile a fusion with the 32-bit index type, CompileParams
   //! must be passed in. There used to be an index type associated
   //! with KernelArgumentHolder, but it is no longer the case.
-  NVF_API void compileFusion(
+  NVF_API void compile(
       Fusion* fusion,
       const KernelArgumentHolder& args,
       const LaunchParams& launch_constraints,
@@ -56,25 +56,25 @@ class KernelExecutor : public NonCopyable {
   // TODO: merge it with the overload above.
   //! This API is merely here so we don't have to go back and update all cpp
   //! tests.
-  void compileFusion(
+  void compile(
       Fusion* fusion,
       const at::ArrayRef<c10::IValue>& inputs = {},
       const LaunchParams& launch_constraints = LaunchParams(),
       CompileParams compile_params = CompileParams()) {
     KernelArgumentHolder args =
         KernelArgumentHolder::createKernelArgumentHolder(inputs);
-    compileFusion(fusion, args, launch_constraints, compile_params);
+    compile(fusion, args, launch_constraints, compile_params);
   }
 
   //! Used by user defined schedules in python frontend
-  void compileFusion(
+  void compile(
       Fusion* fusion,
       const at::ArrayRef<c10::IValue>& inputs,
       int64_t fusion_id,
       int64_t concrete_id) {
     KernelArgumentHolder args =
         KernelArgumentHolder::createKernelArgumentHolder(inputs);
-    compileFusion(
+    compile(
         fusion,
         args,
         LaunchParams(),
@@ -92,15 +92,15 @@
   // TODO: args shouldn't come in a reference here because we will append the
   // outputs to be able to send it to the kernel. For now none of the users are
   // reconsuming the args, so it is okay. It isn't done now because changing it
-  // from a reference makes a call as runFusion({}) ambiguous, and that is used
+  // from a reference makes a call as run({}) ambiguous, and that is used
   // in some places in the codebase.
-  NVF_API std::vector<at::Tensor> runFusion(
+  NVF_API std::vector<at::Tensor> run(
       KernelArgumentHolder& args,
       const LaunchParams& launch_constraints = LaunchParams(),
      CompileParams compile_params = CompileParams(),
      std::vector<at::Tensor> outputs = {});
 
-  std::vector<at::Tensor> runFusion(
+  std::vector<at::Tensor> run(
      const at::ArrayRef<c10::IValue>& inputs,
      const std::vector<at::Tensor>& outputs,
      const LaunchParams& launch_constraints = LaunchParams(),
@@ -111,15 +111,15 @@
     if (opt_code.has_value()) {
       args.setCacheId(*opt_code);
     }
-    return runFusion(args, launch_constraints, compile_params, outputs);
+    return run(args, launch_constraints, compile_params, outputs);
   }
 
-  std::vector<at::Tensor> runFusion(
+  std::vector<at::Tensor> run(
      const at::ArrayRef<c10::IValue>& inputs,
      const LaunchParams& launch_constraints = LaunchParams(),
      CompileParams compile_params = CompileParams(),
      const std::optional<size_t>& opt_code = std::nullopt) {
-    return runFusion(inputs, {}, launch_constraints, compile_params, opt_code);
+    return run(inputs, {}, launch_constraints, compile_params, opt_code);
   }
 
   // Register a lowering hooks that are called to modify the GpuLower object
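
For reference, the header above exposes one `KernelArgumentHolder`-based `run` plus two convenience overloads. A hedged sketch of how a call site might pick between them (variable names are illustrative; only the signatures shown in this header are used):

```cpp
KernelExecutor ke;
ke.compile(&fusion, inputs);

// Inputs only; the executor allocates the outputs.
auto out1 = ke.run(inputs);

// Inputs plus preallocated output tensors.
auto out2 = ke.run(inputs, outputs);

// KernelArgumentHolder directly, e.g. after tagging it with a cache id.
KernelArgumentHolder args =
    KernelArgumentHolder::createKernelArgumentHolder(inputs);
auto out3 = ke.run(args);
```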
4 changes: 2 additions & 2 deletions csrc/runtime/fusion_kernel_runtime.cpp
@@ -595,7 +595,7 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
   if (executor.groupId() < 0) {
     executor.setGroupId(group_id);
   }
-  auto outputs = executor.runFusion(args, launch_params, compile_params);
+  auto outputs = executor.run(args, launch_params, compile_params);
 
   return outputs;
 }
@@ -625,7 +625,7 @@ void FusionKernelRuntime::compileKernel(
   NVF_ERROR(
       heuristic_params->cparams.index_type.has_value(),
       "Kernel index type is not defined.");
-  executors_.at(group_id).compileFusion(
+  executors_.at(group_id).compile(
       fusion_to_run.get(),
       args,
       heuristic_params->lparams,
4 changes: 2 additions & 2 deletions examples/sinh_extension/main.cpp
@@ -35,8 +35,8 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) {
       SchedulerEntry::scheduleWith(&fusion, SchedulerType::PointWise, {input});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input}, heuristic_params->lparams);
-  auto outputs = ke.runFusion({input}, heuristic_params->lparams);
+  ke.compile(&fusion, {input}, heuristic_params->lparams);
+  auto outputs = ke.run({input}, heuristic_params->lparams);
 
   return outputs[0];
 }
4 changes: 2 additions & 2 deletions examples/sinh_libtorch/main.cpp
@@ -32,8 +32,8 @@ at::Tensor sinh_nvfuser(const at::Tensor& input) {
       &fusion, SchedulerType::PointWise, {input});
 
   KernelExecutor ke;
-  ke.compileFusion(&fusion, {input}, heuristic_params->lparams);
-  auto outputs = ke.runFusion({input}, heuristic_params->lparams);
+  ke.compile(&fusion, {input}, heuristic_params->lparams);
+  auto outputs = ke.run({input}, heuristic_params->lparams);
 
   return outputs[0];
 }