Skip to content

Commit

Permalink
Deduplicate some benchmark code. (#3903)
Browse files Browse the repository at this point in the history
  • Loading branch information
csarofeen authored Feb 16, 2025
1 parent 204d795 commit 53bd0a7
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 298 deletions.
98 changes: 0 additions & 98 deletions benchmarks/cpp/heuristic_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,74 +24,6 @@

using namespace nvfuser;

// Builds a layer-norm backward fusion, runs it once through a fresh
// FusionExecutorCache, and returns the kernel runtime recorded for that run.
//
// fusion_ptr:     empty Fusion to build into; ownership moves into the
//                 executor cache created here.
// executor_cache: out-param receiving the newly created executor cache.
// aten_inputs:    out-param receiving the ATen inputs used for the run.
// shape:          full input shape (outer dims followed by normalized dims).
// norm_shape:     trailing dims that layer norm normalizes over.
static auto getLayerBackwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& executor_cache,
    std::vector<c10::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr;

  const size_t kM = shape.size();
  const size_t kN = norm_shape.size();
  const size_t kOuterNumDims = kM - kN;

  // mean/rstd keep the outer dims of `shape` with broadcast (size-1) inner
  // dims, matching what at::native_layer_norm produces.
  std::vector<int64_t> outer_shape;
  outer_shape.reserve(kM);
  for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
    outer_shape.push_back(shape[idx]);
  }
  for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
    outer_shape.push_back(1);
  }

  auto grad_out = makeSymbolicTensor(shape.size());
  auto input = makeSymbolicTensor(shape.size());
  auto mean = makeConcreteTensor(outer_shape);
  auto rstd = makeConcreteTensor(outer_shape);
  auto weight = makeSymbolicTensor(norm_shape.size());
  auto bias = makeSymbolicTensor(norm_shape.size());
  fusion.addInput(grad_out);
  fusion.addInput(input);
  fusion.addInput(mean);
  fusion.addInput(rstd);
  fusion.addInput(weight);
  fusion.addInput(bias);

  // Request all three gradients (input, weight, bias).
  auto grads = layer_norm_backward(
      grad_out,
      input,
      norm_shape,
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});

  fusion.addOutput(grads.grad_input);
  fusion.addOutput(grads.grad_weight);
  fusion.addOutput(grads.grad_bias);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_grad_out = at::randn(shape, options);
  at::Tensor aten_input = at::randn(shape, options);
  at::Tensor aten_weight = at::randn(norm_shape, options);
  at::Tensor aten_bias = at::randn(norm_shape, options);

  const float kEps = 1e-5;
  auto aten_results = at::native_layer_norm(
      aten_input, norm_shape, aten_weight, aten_bias, kEps);
  // Only mean/rstd feed the backward fusion; the forward output is unused.
  auto aten_mean = std::get<1>(aten_results);
  auto aten_rstd = std::get<2>(aten_results);

  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {
      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
  // Run once so the cache compiles the fusion and records a kernel runtime.
  executor_cache->runFusionWithInputs(aten_inputs);

  return executor_cache->getMostRecentKernelRuntime();
}

static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Expand All @@ -117,36 +49,6 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicCache(
}
}

// Builds a layer-norm forward fusion (no affine weight/bias), runs it once
// through a fresh FusionExecutorCache, and returns the kernel runtime
// recorded for that run.
//
// fusion_ptr:     empty Fusion to build into; ownership moves into the
//                 executor cache created here.
// executor_cache: out-param receiving the newly created executor cache.
// aten_inputs:    out-param receiving the ATen inputs used for the run.
// shape:          full input shape of the tensor to normalize.
// norm_shape:     trailing dims that layer norm normalizes over.
static auto getLayerForwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& executor_cache,
    std::vector<c10::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr;

  const float kEps = 1e-5;
  Val* eps_ptr = IrBuilder::create<Val>(kEps);

  auto input = makeSymbolicTensor(shape.size());
  fusion.addInput(input);

  // nullptr weight/bias: normalization only, no affine transform.
  auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);

  fusion.addOutput(result.output);
  fusion.addOutput(result.mean);
  fusion.addOutput(result.invstd);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(shape, options);

  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {aten_input};
  // Run once so the cache compiles the fusion and records a kernel runtime.
  executor_cache->runFusionWithInputs(aten_inputs);

  return executor_cache->getMostRecentKernelRuntime();
}

static void NvFuserScheduler_LayerNormForward_HeuristicCache(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Expand Down
100 changes: 0 additions & 100 deletions benchmarks/cpp/heuristic_lookup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,76 +24,6 @@

using namespace nvfuser;

// Builds a layer-norm backward fusion, runs it once through a fresh
// FusionExecutorCache, and returns the kernel runtime recorded for that run.
//
// fusion_ptr:     empty Fusion to build into; ownership moves into the
//                 executor cache created here.
// executor_cache: out-param receiving the newly created executor cache.
// aten_inputs:    out-param receiving the ATen inputs used for the run.
// shape:          full input shape (outer dims followed by normalized dims).
// norm_shape:     trailing dims that layer norm normalizes over.
static auto getLayerBackwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& executor_cache,
    std::vector<c10::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr;

  const size_t kM = shape.size();
  const size_t kN = norm_shape.size();
  const size_t kOuterNumDims = kM - kN;

  // mean/rstd keep the outer dims of `shape` with broadcast (size-1) inner
  // dims, matching what at::native_layer_norm produces.
  std::vector<int64_t> outer_shape;
  outer_shape.reserve(kM);
  for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
    outer_shape.push_back(shape[idx]);
  }
  for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
    outer_shape.push_back(1);
  }

  auto grad_out = makeSymbolicTensor(shape.size());
  auto input = makeSymbolicTensor(shape.size());
  auto mean = makeConcreteTensor(outer_shape);
  auto rstd = makeConcreteTensor(outer_shape);
  auto weight = makeSymbolicTensor(norm_shape.size());
  auto bias = makeSymbolicTensor(norm_shape.size());
  fusion.addInput(grad_out);
  fusion.addInput(input);
  fusion.addInput(mean);
  fusion.addInput(rstd);
  fusion.addInput(weight);
  fusion.addInput(bias);

  // Request all three gradients (input, weight, bias).
  auto grads = layer_norm_backward(
      grad_out,
      input,
      norm_shape,
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});

  fusion.addOutput(grads.grad_input);
  fusion.addOutput(grads.grad_weight);
  fusion.addOutput(grads.grad_bias);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_grad_out = at::randn(shape, options);
  at::Tensor aten_input = at::randn(shape, options);
  at::Tensor aten_weight = at::randn(norm_shape, options);
  at::Tensor aten_bias = at::randn(norm_shape, options);

  const float kEps = 1e-5;
  // Tensors convert implicitly to c10::optional<at::Tensor>; no explicit
  // wrapper temporaries are needed.
  auto aten_results = at::native_layer_norm(
      aten_input, norm_shape, aten_weight, aten_bias, kEps);
  // Only mean/rstd feed the backward fusion; the forward output is unused.
  auto aten_mean = std::get<1>(aten_results);
  auto aten_rstd = std::get<2>(aten_results);

  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {
      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
  // Run once so the cache compiles the fusion and records a kernel runtime.
  executor_cache->runFusionWithInputs(aten_inputs);

  return executor_cache->getMostRecentKernelRuntime();
}

static void NvFuserScheduler_LayerNormBackward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Expand All @@ -119,36 +49,6 @@ static void NvFuserScheduler_LayerNormBackward_HeuristicLookup(
}
}

// Builds a layer-norm forward fusion (no affine weight/bias), runs it once
// through a fresh FusionExecutorCache, and returns the kernel runtime
// recorded for that run.
//
// fusion_ptr:     empty Fusion to build into; ownership moves into the
//                 executor cache created here.
// executor_cache: out-param receiving the newly created executor cache.
// aten_inputs:    out-param receiving the ATen inputs used for the run.
// shape:          full input shape of the tensor to normalize.
// norm_shape:     trailing dims that layer norm normalizes over.
static auto getLayerForwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& executor_cache,
    std::vector<c10::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr;

  const float kEps = 1e-5;
  Val* eps_ptr = IrBuilder::create<Val>(kEps);

  auto input = makeSymbolicTensor(shape.size());
  fusion.addInput(input);

  // nullptr weight/bias: normalization only, no affine transform.
  auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);

  fusion.addOutput(result.output);
  fusion.addOutput(result.mean);
  fusion.addOutput(result.invstd);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(shape, options);

  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {aten_input};
  // Run once so the cache compiles the fusion and records a kernel runtime.
  executor_cache->runFusionWithInputs(aten_inputs);

  return executor_cache->getMostRecentKernelRuntime();
}

static void NvFuserScheduler_LayerNormForward_HeuristicLookup(
benchmark::State& benchmark_state) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Expand Down
100 changes: 0 additions & 100 deletions benchmarks/cpp/shape_inference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,76 +24,6 @@

using namespace nvfuser;

// Builds a layer-norm backward fusion, runs it once through a fresh
// FusionExecutorCache, and returns the kernel runtime recorded for that run.
//
// fusion_ptr:     empty Fusion to build into; ownership moves into the
//                 executor cache created here.
// executor_cache: out-param receiving the newly created executor cache.
// aten_inputs:    out-param receiving the ATen inputs used for the run.
// shape:          full input shape (outer dims followed by normalized dims).
// norm_shape:     trailing dims that layer norm normalizes over.
static auto getLayerBackwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& executor_cache,
    std::vector<c10::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr;

  const size_t kM = shape.size();
  const size_t kN = norm_shape.size();
  const size_t kOuterNumDims = kM - kN;

  // mean/rstd keep the outer dims of `shape` with broadcast (size-1) inner
  // dims, matching what at::native_layer_norm produces.
  std::vector<int64_t> outer_shape;
  outer_shape.reserve(kM);
  for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
    outer_shape.push_back(shape[idx]);
  }
  for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
    outer_shape.push_back(1);
  }

  auto grad_out = makeSymbolicTensor(shape.size());
  auto input = makeSymbolicTensor(shape.size());
  auto mean = makeConcreteTensor(outer_shape);
  auto rstd = makeConcreteTensor(outer_shape);
  auto weight = makeSymbolicTensor(norm_shape.size());
  auto bias = makeSymbolicTensor(norm_shape.size());
  fusion.addInput(grad_out);
  fusion.addInput(input);
  fusion.addInput(mean);
  fusion.addInput(rstd);
  fusion.addInput(weight);
  fusion.addInput(bias);

  // Request all three gradients (input, weight, bias).
  auto grads = layer_norm_backward(
      grad_out,
      input,
      norm_shape,
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});

  fusion.addOutput(grads.grad_input);
  fusion.addOutput(grads.grad_weight);
  fusion.addOutput(grads.grad_bias);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_grad_out = at::randn(shape, options);
  at::Tensor aten_input = at::randn(shape, options);
  at::Tensor aten_weight = at::randn(norm_shape, options);
  at::Tensor aten_bias = at::randn(norm_shape, options);

  const float kEps = 1e-5;
  // Tensors convert implicitly to c10::optional<at::Tensor>; no explicit
  // wrapper temporaries are needed.
  auto aten_results = at::native_layer_norm(
      aten_input, norm_shape, aten_weight, aten_bias, kEps);
  // Only mean/rstd feed the backward fusion; the forward output is unused.
  auto aten_mean = std::get<1>(aten_results);
  auto aten_rstd = std::get<2>(aten_results);

  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {
      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
  // Run once so the cache compiles the fusion and records a kernel runtime.
  executor_cache->runFusionWithInputs(aten_inputs);

  return executor_cache->getMostRecentKernelRuntime();
}

void LayerNormBackward_ShapeInference_Base(
benchmark::State& benchmark_state,
bool disable_launch_parameter_cache) {
Expand Down Expand Up @@ -137,36 +67,6 @@ static void NvFuserScheduler_LayerNormBackward_NoShapeInferenceCachedBaseline(
LayerNormBackward_ShapeInference_Base(benchmark_state, false);
}

// Builds a layer-norm forward fusion (no affine weight/bias), runs it once
// through a fresh FusionExecutorCache, and returns the kernel runtime
// recorded for that run.
//
// fusion_ptr:     empty Fusion to build into; ownership moves into the
//                 executor cache created here.
// executor_cache: out-param receiving the newly created executor cache.
// aten_inputs:    out-param receiving the ATen inputs used for the run.
// shape:          full input shape of the tensor to normalize.
// norm_shape:     trailing dims that layer norm normalizes over.
static auto getLayerForwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& executor_cache,
    std::vector<c10::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr;

  const float kEps = 1e-5;
  Val* eps_ptr = IrBuilder::create<Val>(kEps);

  auto input = makeSymbolicTensor(shape.size());
  fusion.addInput(input);

  // nullptr weight/bias: normalization only, no affine transform.
  auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);

  fusion.addOutput(result.output);
  fusion.addOutput(result.mean);
  fusion.addOutput(result.invstd);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(shape, options);

  executor_cache = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {aten_input};
  // Run once so the cache compiles the fusion and records a kernel runtime.
  executor_cache->runFusionWithInputs(aten_inputs);

  return executor_cache->getMostRecentKernelRuntime();
}

void LayerNormForward_ShapeInferenceBase(
benchmark::State& benchmark_state,
bool disable_launch_param_cache) {
Expand Down
Loading

0 comments on commit 53bd0a7

Please sign in to comment.