diff --git a/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp b/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
index 28db15d92e45..f300291149b2 100644
--- a/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp
@@ -87,11 +87,12 @@ struct OptimizeVectorTransferPass final
 
     LDBG("after dropping leading unit dims\n" << funcOp);
 
-    // Workaround, run loop invariant code motion before hoist redundant vector
-    // transfer to workaround a bug upstream.
-    // TODO(thomasraoux): Remove it once the fix is merged.
-    loopInvariantCodeMotion(funcOp);
-    linalg::hoistRedundantVectorTransfers(cast<func::FuncOp>(funcOp));
+    if (redundantHoisting) {
+      // Workaround, run loop invariant code motion before hoist redundant
+      // vector transfer to workaround a bug upstream.
+      loopInvariantCodeMotion(funcOp);
+      linalg::hoistRedundantVectorTransfers(cast<func::FuncOp>(funcOp));
+    }
 
     IRRewriter rewriter(funcOp->getContext());
     vector::transferOpflowOpt(rewriter, funcOp);
diff --git a/compiler/src/iree/compiler/Codegen/Common/Passes.td b/compiler/src/iree/compiler/Codegen/Common/Passes.td
index f59409fab527..c30dbb13d6db 100644
--- a/compiler/src/iree/compiler/Codegen/Common/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -419,6 +419,8 @@ def OptimizeVectorTransferPass :
   let options = [
     Option<"flatten", "flatten", "bool", "false",
            "Flatten the vector type of vector transfers where possible (contiguous row-major data).">,
+    Option<"redundantHoisting", "redundant-hoisting", "bool", "true",
+           "Enables use of redundant vector transfer hoisting.">,
   ];
   let dependentDialects = [
       "memref::MemRefDialect"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 2d8f56edc3b8..14264d650fd4 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -448,8 +448,13 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(createOptimizeVectorTransferPass());
-  funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+  {
+    OptimizeVectorTransferPassOptions options;
+    // Disable redundant vector transfer hoisting because it does not
+    // properly consider distributed code on memrefs.
+    options.redundantHoisting = false;
+    funcPassManager.addPass(createOptimizeVectorTransferPass(options));
+  }
   funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index e6b21a6f6d4b..08fb78ada09c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -941,3 +941,55 @@ hal.executable public @main {
 // CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<4xf32>
 // CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf32>
 // CHECK: vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, Indirect>
+]>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [1, 64, 0],
+  reduction = [0, 0, 2],
+  thread = [1, 1, 0]
+}>
+hal.executable public @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @small_matvec ordinal(0) layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @small_matvec()
+          attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} {
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x10xf32>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x1xf32>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<10x1xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10x10xf32>> -> tensor<10x10xf32>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10x1xf32>> -> tensor<10x1xf32>
+        %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<10x1xf32>> -> tensor<10x1xf32>
+        %6 = linalg.matmul {lowering_config = #config}
+            ins(%3, %4 : tensor<10x10xf32>, tensor<10x1xf32>)
+            outs(%5 : tensor<10x1xf32>) -> tensor<10x1xf32>
+        flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : tensor<10x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<10x1xf32>>
+        return
+      }
+    }
+  }
+}
+
+// Note that current barrier placement logic is observedly poor. Some cleanup
+// analysis should be able to simplify the below to just two barriers.
+
+// CHECK-LABEL: func @small_matvec
+// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
+// CHECK: scf.for %{{.*}} = %{{.*}} to %c1 step %c64
+
+// Verify that the write does not get hoisted out of the single threaded
+// for loop.
+// CHECK: vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type<storage_buffer>>
+// CHECK-NEXT: }
+// CHECK-NEXT: return
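
For reference, a minimal sketch of how another pass pipeline could opt out of the hoisting through the new redundant-hoisting option. The pipeline name addMyCustomPipeline and the exact include paths are illustrative assumptions; the OptimizeVectorTransferPassOptions struct and the options-taking create function are generated from the Passes.td entry added above.

// Sketch only: disabling redundant vector transfer hoisting from a custom
// pipeline. addMyCustomPipeline is hypothetical and not part of this patch.
#include "iree/compiler/Codegen/Common/Passes.h"
#include "mlir/Pass/PassManager.h"

namespace mlir::iree_compiler {

static void addMyCustomPipeline(OpPassManager &funcPassManager) {
  OptimizeVectorTransferPassOptions options;
  // Keep vector transfers inside distributed loops rather than hoisting them
  // out, mirroring the LLVMGPU tile-and-fuse pipeline change above.
  options.redundantHoisting = false;
  funcPassManager.addPass(createOptimizeVectorTransferPass(options));
}

} // namespace mlir::iree_compiler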