[Codegen][GPU] Stop using redundant transfer hoisting pattern on memr…
qedawkins authored Oct 8, 2024
1 parent 6c095ed commit e8ff07e
Showing 4 changed files with 67 additions and 7 deletions.
@@ -87,11 +87,12 @@ struct OptimizeVectorTransferPass final

     LDBG("after dropping leading unit dims\n" << funcOp);

-    // Workaround, run loop invariant code motion before hoist redundant vector
-    // transfer to workaround a bug upstream.
-    // TODO(thomasraoux): Remove it once the fix is merged.
-    loopInvariantCodeMotion(funcOp);
-    linalg::hoistRedundantVectorTransfers(cast<func::FuncOp>(funcOp));
+    if (redundantHoisting) {
+      // Workaround, run loop invariant code motion before hoist redundant
+      // vector transfer to workaround a bug upstream.
+      loopInvariantCodeMotion(funcOp);
+      linalg::hoistRedundantVectorTransfers(cast<func::FuncOp>(funcOp));
+    }
     IRRewriter rewriter(funcOp->getContext());
     vector::transferOpflowOpt(rewriter, funcOp);

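For context, linalg::hoistRedundantVectorTransfers looks for a loop-invariant transfer_read/transfer_write pair on the same buffer and rewrites it into a loop-carried vector value, with a single read before the loop and a single write after it. The sketch below is illustrative only (function names, shapes, and the loop body are not taken from this commit); it shows the rewrite that is now gated behind redundantHoisting:

// Before hoisting: the read/write pair executes on every iteration.
func.func @before(%out: memref<4xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %f0 = arith.constant 0.0 : f32
  scf.for %i = %c0 to %c10 step %c1 {
    %v = vector.transfer_read %out[%c0], %f0 {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
    %u = arith.addf %v, %v : vector<4xf32>
    vector.transfer_write %u, %out[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
  }
  return
}

// After hoisting: the value is carried through iter_args and the buffer is
// written exactly once, unconditionally, after the loop. If %out is a memref
// that has been distributed across threads and this thread's loop runs zero
// iterations, the trailing read/write still executes and can clobber data
// owned by other threads.
func.func @after(%out: memref<4xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c10 = arith.constant 10 : index
  %f0 = arith.constant 0.0 : f32
  %init = vector.transfer_read %out[%c0], %f0 {in_bounds = [true]} : memref<4xf32>, vector<4xf32>
  %res = scf.for %i = %c0 to %c10 step %c1 iter_args(%acc = %init) -> (vector<4xf32>) {
    %u = arith.addf %acc, %acc : vector<4xf32>
    scf.yield %u : vector<4xf32>
  }
  vector.transfer_write %res, %out[%c0] {in_bounds = [true]} : vector<4xf32>, memref<4xf32>
  return
}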
compiler/src/iree/compiler/Codegen/Common/Passes.td (2 additions, 0 deletions)
@@ -419,6 +419,8 @@ def OptimizeVectorTransferPass :
   let options = [
     Option<"flatten", "flatten", "bool", "false",
            "Flatten the vector type of vector transfers where possible (contiguous row-major data).">,
+    Option<"redundantHoisting", "redundant-hoisting", "bool", "true",
+           "Enables use of redundant vector transfer hoisting.">,
   ];
   let dependentDialects = [
     "memref::MemRefDialect"
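Because the option is declared in TableGen, it also becomes available in the textual pass-pipeline syntax. A hedged usage sketch follows; the iree-opt invocation and the iree-codegen-optimize-vector-transfer mnemonic are assumptions based on IREE's usual naming conventions, and only the redundant-hoisting option name comes from the diff above:

// RUN: iree-opt %s \
// RUN:   --pass-pipeline='builtin.module(func.func(iree-codegen-optimize-vector-transfer{redundant-hoisting=false}))'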
compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp (7 additions, 2 deletions)
@@ -448,8 +448,13 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(createOptimizeVectorTransferPass());
   funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
+  {
+    OptimizeVectorTransferPassOptions options;
+    // Disable redundant vector transfer hoisting because it does not
+    // properly consider distributed code on memrefs.
+    options.redundantHoisting = false;
+    funcPassManager.addPass(createOptimizeVectorTransferPass(options));
+  }
   funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
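The "distributed code on memrefs" mentioned in the comment refers to loops like the one in the new test below, where tile-and-fuse distribution leaves each thread with a loop whose trip count is zero for all but a few threads. A minimal sketch of that shape (names and types are illustrative, not output of this pipeline):

func.func @distributed_write(%out: memref<10x1xf32>, %v: vector<1xf32>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c64 = arith.constant 64 : index
  %tid = gpu.thread_id x
  // With workgroup_size = [64, 1, 1], only thread 0 enters this loop body.
  scf.for %i = %tid to %c1 step %c64 {
    vector.transfer_write %v, %out[%i, %c0] {in_bounds = [true]} : vector<1xf32>, memref<10x1xf32>
  }
  // Hoisting the transfer_write out of the loop would make all 64 threads
  // write to %out, which is what the new test below guards against.
  return
}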
@@ -941,3 +941,55 @@ hal.executable public @main {
// CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read %[[LHS_ALLOC]]{{.*}} vector<4xf32>
// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read %[[RHS_ALLOC]]{{.*}} vector<4x4xf32>
// CHECK: vector.contract {{.*}} %[[LHS_MM]], %[[RHS_MM]]

// -----

#pipeline_layout = #hal.pipeline.layout<bindings = [
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>
]>
#config = #iree_gpu.lowering_config<{
  workgroup = [1, 64, 0],
  reduction = [0, 0, 2],
  thread = [1, 1, 0]
}>
hal.executable public @main {
  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
    hal.executable.export public @small_matvec ordinal(0) layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @small_matvec()
          attributes {translation_info = #iree_codegen.translation_info<LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>} {
        %c0 = arith.constant 0 : index
        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x10xf32>>
        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x1xf32>>
        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<readwrite:tensor<10x1xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [10, 10], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10x10xf32>> -> tensor<10x10xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10x1xf32>> -> tensor<10x1xf32>
        %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : !flow.dispatch.tensor<readwrite:tensor<10x1xf32>> -> tensor<10x1xf32>
        %6 = linalg.matmul {lowering_config = #config}
            ins(%3, %4 : tensor<10x10xf32>, tensor<10x1xf32>)
            outs(%5 : tensor<10x1xf32>) -> tensor<10x1xf32>
        flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [10, 1], strides = [1, 1] : tensor<10x1xf32> -> !flow.dispatch.tensor<readwrite:tensor<10x1xf32>>
        return
      }
    }
  }
}

// Note that the current barrier placement logic is observably poor. Some cleanup
// analysis should be able to simplify the below to just two barriers.

// CHECK-LABEL: func @small_matvec
// CHECK-DAG: %[[B2:.+]] = hal.interface.binding.subspan layout({{.+}}) binding(2)
// CHECK: scf.for %{{.*}} = %{{.*}} to %c1 step %c64

// Verify that the write does not get hoisted out of the single threaded
// for loop.
// CHECK: vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type<storage_buffer>>
// CHECK-NEXT: }
// CHECK-NEXT: return
