[Codegen][GPU] Stop using redundant transfer hoisting pattern on memr…
qedawkins authored Oct 8, 2024
1 parent 6c095ed commit e8ff07e
Showing 4 changed files with 67 additions and 7 deletions.
@@ -87,11 +87,12 @@ struct OptimizeVectorTransferPass final

     LDBG("after dropping leading unit dims\n" << funcOp);

-    // Workaround, run loop invariant code motion before hoist redundant vector
-    // transfer to workaround a bug upstream.
-    // TODO(thomasraoux): Remove it once the fix is merged.
-    loopInvariantCodeMotion(funcOp);
-    linalg::hoistRedundantVectorTransfers(cast<func::FuncOp>(funcOp));
+    if (redundantHoisting) {
+      // Workaround, run loop invariant code motion before hoist redundant
+      // vector transfer to workaround a bug upstream.
+      loopInvariantCodeMotion(funcOp);
+      linalg::hoistRedundantVectorTransfers(cast<func::FuncOp>(funcOp));
+    }
     IRRewriter rewriter(funcOp->getContext());
     vector::transferOpflowOpt(rewriter, funcOp);

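For context, linalg::hoistRedundantVectorTransfers looks for a loop-invariant transfer_read/transfer_write pair on the same buffer and rewrites it into a loop-carried vector value, with a single read before the loop and a single write after it. The sketch below is illustrative only (function names, shapes, and the loop body are not taken from this commit); it shows the rewrite that is now gated behind redundantHoisting:

// Before hoisting: the read/write pair executes on every iteration.
func.func @before(%out: memref<4xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %f0 = arith.constant 0.0 : f32
  scf.for %i = %c0 to %c10 step %c1 {
    %v = vector.transfer_read %out[%c0], %f0 {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
    %u = arith.addf %v, %v : vector<4xf32>
    vector.transfer_write %u, %out[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
  }
  return
}

// After hoisting: the value is carried through iter_args and the buffer is
// written exactly once, unconditionally, after the loop. If %out is a memref
// that has been distributed across threads and this thread's loop runs zero
// iterations, the trailing read/write still executes and can clobber data
// owned by other threads.
func.func @after(%out: memref<4xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %f0 = arith.constant 0.0 : f32
  %init = vector.transfer_read %out[%c0], %f0 {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
  %res = scf.for %i = %c0 to %c10 step %c1 iter_args(%acc = %init) -> (vector<4xf32>) {
    %u = arith.addf %acc, %acc : vector<4xf32>
    scf.yield %u : vector<4xf32>
  }
  vector.transfer_write %res, %out[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
  return
}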
compiler/src/iree/compiler/Codegen/Common/Passes.td (2 additions, 0 deletions)
@@ -419,6 +419,8 @@ def OptimizeVectorTransferPass :
   let options = [
     Option<"flatten", "flatten", "bool", "false",
            "Flatten the vector type of vector transfers where possible (contiguous row-major data).">,
+    Option<"redundantHoisting", "redundant-hoisting", "bool", "true",
+           "Enables use of redundant vector transfer hoisting.">,
   ];
   let dependentDialects = [
     "memref::MemRefDialect"
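Because the option is declared in TableGen, it also becomes available in the textual pass-pipeline syntax. A hedged usage sketch follows; the iree-opt invocation and the iree-codegen-optimize-vector-transfer mnemonic are assumptions based on IREE's usual naming conventions, and only the redundant-hoisting option name comes from the diff above:

// RUN: iree-opt %s \
// RUN:   --pass-pipeline='builtin.module(func.func(iree-codegen-optimize-vector-transfer{redundant-hoisting=false}))'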
compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp (7 additions, 2 deletions)
@@ -448,8 +448,13 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(createOptimizeVectorTransferPass());
   funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+  {
+    OptimizeVectorTransferPassOptions options;
+    // Disable redundant vector transfer hoisting because it does not
+    // properly consider distributed code on memrefs.
+    options.redundantHoisting = false;
+    funcPassManager.addPass(createOptimizeVectorTransferPass(options));
+  }
   funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
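The "distributed code on memrefs" mentioned in the comment refers to loops like the one in the new test below, where tile-and-fuse distribution leaves each thread with a loop whose trip count is zero for all but a few threads. A minimal sketch of that shape (names and types are illustrative, not output of this pipeline):

func.func @distributed_write(%out: memref<10x1xf32>, %v: vector<1xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %tid = gpu.thread_id x
  // With workgroup_size = [64, 1, 1], only thread 0 enters this loop body.
  scf.for %i = %tid to %c1 step %c64 {
    vector.transfer_write %v, %out[%i, %c0] {in_bounds = [true]} : vector<1xf32>, memref<10x1xf32>
  }
  // Hoisting the transfer_write out of the loop would make all 64 threads
  // write to %out, which is what the new test below guards against.
  return
}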
@@ -941,3 +941,55 @@ hal.executable public @main {
// CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<4xf32>
// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf32>
// CHECK: vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]

// -----

#pipeline_layout = #hal.pipeline.layout<bindings = [
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>
]>
#config = #iree_gpu.lowering_config<{
  workgroup = [1, 64, 0],
  reduction = [0, 0, 2],
  thread = [1, 1, 0]
}>
hal.executable public @main {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
    hal.executable.export public @small_matvec ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @small_matvec()
          attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} {
        %c0 = arith.constant 0 : index
        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x10xf32>>
        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x1xf32>>
        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<10x1xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10x10xf32>> -> tensor<10x10xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10x1xf32>> -> tensor<10x1xf32>
        %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<10x1xf32>> -> tensor<10x1xf32>
        %6 = linalg.matmul {lowering_config = #config}
            ins(%3, %4 : tensor<10x10xf32>, tensor<10x1xf32>)
            outs(%5 : tensor<10x1xf32>) -> tensor<10x1xf32>
        flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : tensor<10x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<10x1xf32>>
        return
      }
    }
  }
}

// Note that the current barrier placement logic is observably poor. Some cleanup
// analysis should be able to simplify the below to just two barriers.

// CHECK-LABEL: func @small_matvec
// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
// CHECK: scf.for %{{.*}} = %{{.*}} to %c1 step %c64

// Verify that the write does not get hoisted out of the single threaded
// for loop.
// CHECK: vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type<storage_buffer>>
// CHECK-NEXT: }
// CHECK-NEXT: return
