TMem allocator (#3803)
This is a follow-up of #3795. Instead
of naively inserting one `tcgen05.alloc` at the beginning of the kernel, we now
do real analysis on the TMem tensors and generate the correct number of
`tcgen05.alloc`s based on that analysis.

As noted in `[Tensor Memory Allocation]`, allocating TMem can be a very
hard problem, and at this stage it does not make sense to start
investing time in writing a perfect allocator. So the goal of this PR is
to provide a solution that is hackable (so that in the future, when we
want to try different allocation strategies, we can easily adapt our
codebase to achieve our goal) and extensible (so that in the future,
when we get a better idea of what a good allocation strategy is, most of
the code developed in this PR can still be reused, instead of abandoning
everything and rewriting a new one from scratch).

With this goal in mind, this PR adds a way to represent "how we want to
allocate TMem" (`struct TMemAlllocationInfo`), a lowering pass that
translates this representation into kernel IR, and a naive heuristic
that generates a simple `TMemAlllocationInfo`.
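
To give a rough picture of the end result, the sketch below shows what the generated kernel prologue amounts to for a fusion with two TMem TensorViews, each given its own 32-column region by the naive heuristic. This is a hypothetical illustration, not the literal generated code: `T2`, `T3`, `T5`, `T6` are made-up names, the `tcgen05` instructions are written as comments instead of the actual inline PTX, and `TMemTensor` is assumed to come from the runtime preamble.

```cpp
// Each region gets a scalar address tensor in shared memory; tcgen05.alloc
// writes the allocated TMem base address into it.
__shared__ uint32_t T5[1];  // address tensor of region 0
__shared__ uint32_t T6[1];  // address tensor of region 1
// tcgen05.alloc [T5], 32   -- allocate 32 columns for region 0
// tcgen05.alloc [T6], 32   -- allocate 32 columns for region 1
// tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned
// Each covered TensorView is then wrapped with its (lane, column) offset
// relative to its region's base address:
TMemTensor T2(T5[0], /*lane_offset=*/0, /*col_offset=*/0);
TMemTensor T3(T6[0], /*lane_offset=*/0, /*col_offset=*/0);
```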

Regarding the topic of "allocating TMem", I believe the only thing
missing after this PR is the insertion of `tcgen05.dealloc`s, which will
come in the next PR. We might want to revisit this topic once we start
looking at perf, but until then, I consider the topic of "allocating
TMem" done after the next PR. Note that the allocation size is
hard-coded to "a whole 32 columns" for now. This is clearly wrong, but I
would categorize fixing it under the topic "the scheduling and indexing
of TMem", which is the next thing I will do after the "allocating TMem"
topic is done.

I suggest starting to review this PR from the code comment in
`csrc/device_lower/analysis/tensor_memory.h`.
zasdfgbnm authored Feb 3, 2025
1 parent e8c2846 commit dbd0d6b
Showing 8 changed files with 354 additions and 53 deletions.
6 changes: 4 additions & 2 deletions csrc/codegen.cpp
@@ -3201,9 +3201,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
}
case MemoryType::Tensor: {
// Generate code like:
// TMemTensor T2(T5[0]);
// TMemTensor T2(T5[0], 0, 0);
indent() << "TMemTensor " << genVariableName(tv) << "("
<< genInline(alloc->address()) << ");\n";
<< genInline(alloc->address()) << ", "
<< genInline(alloc->laneOffset()) << ", "
<< genInline(alloc->colOffset()) << ");\n";
break;
}
default:
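
The `TMemTensor` helper used in the generated code above is part of the runtime, not of this diff. As a mental model only (an assumption based on the TMem address encoding, where the upper 16 bits of the 32-bit address select the lane and the lower 16 bits select the column; the actual runtime helper may differ), the two extra constructor arguments amount to an offset in that packed encoding:

#include <cstdint>
// Sketch only; nvFuser's actual runtime TMemTensor may differ.
struct TMemTensorSketch {
  uint32_t address;
  __device__ TMemTensorSketch(uint32_t base, uint16_t lane, uint16_t col)
      : address(base + (uint32_t(lane) << 16) + col) {}
};
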
80 changes: 68 additions & 12 deletions csrc/device_lower/analysis/tensor_memory.cpp
@@ -13,27 +13,83 @@

namespace nvfuser {

// See note [Tensor Memory Allocation] for the overall design.
TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
bool found = false;
TensorMemoryInfo result;

// Step 1: partition the tensors. Each partition of tensors will become a
// region, so we use the terms partition and region interchangeably. The user
// may have provided full or partial partitioning information. For the
// TensorViews whose region the user has already specified, we will use that
// information. For the rest of the tensors, we will assign each of them to a
// separate region.
using Partition = std::vector<std::vector<TensorView*>>;
Partition partitions;
if (fusion->hasManaged("tmem_regions")) {
partitions = fusion->getManaged<Partition>("tmem_regions");
} else {
partitions = {};
}

// Verify that there is no overlap between user-specified partitions
std::unordered_set<TensorView*> tensors;
for (auto& partition : partitions) {
NVF_ERROR(!partition.empty(), "Empty partition");
for (auto tv : partition) {
NVF_ERROR(
tv->getMemoryType() == MemoryType::Tensor, "Invalid memory type");
NVF_ERROR(
tensors.insert(tv).second, "Tensors cannot be in multiple regions");
}
}

// For all TensorViews whose partition is not specified, assign each of them
// to a separate region.
for (auto tv : fusion->allTvs()) {
if (tv->getMemoryType() == MemoryType::Tensor) {
NVF_ERROR(!found, "Only one tensor on TMem is supported");
found = true;
if (tv->getMemoryType() != MemoryType::Tensor) {
continue;
}
if (tensors.count(tv) == 0) {
partitions.push_back({tv});
}
}

if (found) {
// Step 2: Compute the allocation information for tensor memory. That is, for
// each partition, we create a Region object and fill in the necessary
// information.
using Region = TMemAlllocationInfo::Region;
std::vector<Region>& regions = result.allocation.regions;
for (const auto& partition : partitions) {
regions.emplace_back();
auto& region = regions.back();

// tcgen05.alloc stores the allocated address in shared memory. So we use a
// TensorView with MemoryType::Shared to store this address.
auto allocation_address = TensorViewBuilder()
.shape(std::vector<Val*>{})
.dtype(DataType::UInt32)
.build();
allocation_address->setMemoryType(MemoryType::Shared);
return {allocation_address};
region.address = TensorViewBuilder()
.shape(std::vector<Val*>{})
.dtype(DataType::UInt32)
.build();
region.address->setMemoryType(MemoryType::Shared);

// Assign each tensor in the region a whole 128 lanes and N columns.
region.num_columns = region.address->fusion()->zeroVal(DataType::UInt16);
for (auto tv : partition) {
// TODO: right now we hardcode the number of columns of each tensor to
// be 32. This is definitely not correct.
Val* num_columns = IrBuilder::create<Val>(32, DataType::UInt16);
region.covered_tensors.emplace_back();
auto& covered_tensor = region.covered_tensors.back();
covered_tensor.tensor = tv;
covered_tensor.lane_offset = tv->fusion()->zeroVal(DataType::UInt16);
covered_tensor.column_offset = region.num_columns;
region.num_columns =
SimplifyingIrBuilder::addExpr(region.num_columns, num_columns);
}
region.num_columns =
IrBuilder::maybeCastExpr(DataType::UInt32, region.num_columns);
}

return {nullptr};
return result;
}

} // namespace nvfuser
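
To make the heuristic above concrete, consider a hypothetical fusion with two TMem TensorViews tv0 and tv1 that the user grouped into a single region via the "tmem_regions" managed variable; the code above would then produce roughly:

// regions.size() == 1
// regions[0].address            : scalar UInt32 TensorView in shared memory
// regions[0].num_columns        : 0 + 32 + 32 = 64, cast to UInt32
// regions[0].covered_tensors[0] : {tv0, lane_offset = 0, column_offset = 0}
// regions[0].covered_tensors[1] : {tv1, lane_offset = 0, column_offset = 32}
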
117 changes: 106 additions & 11 deletions csrc/device_lower/analysis/tensor_memory.h
@@ -7,8 +7,11 @@
// clang-format on
#pragma once

#include <vector>

namespace nvfuser {

class Val;
class TensorView;
class Fusion;

@@ -45,20 +48,112 @@ TensorMemoryInfo computeTMemInfo(Fusion* fusion);
// relinquishes the right to allocate, the next CTA that is blocked will be
// unblocked and can acquire the mutex to allocate TMem.
//
// Currently, our TMem allocation strategy is as naive as follows:
// We assume there is at most one TensorView on TMem in the fusion. With this
// assumption, we don't have to worry about where to place different tensors on
// TMem. We will traverse the fusion to look for a TMem TensorView. If we can
// find such a TensorView, we will generate a tcgen05.alloc and
// tcgen05.relinquish_alloc_permit at the beginning of the kernel. We do not
// dealloc TMem for now.
// The tcgen05.alloc instruction looks like the following:
// tcgen05.alloc [dest], nCols
//
// There are three important things to note about this instruction:
//
// 1. The output of this instruction (the allocated TMem address) is written to
// shared memory, at the address dest.
// 2. The unit of allocation is 32 whole columns of tensor memory, and nCols
// must be a power of two.
// 3. The right to allocate is like a mutex and will serialize CTA scheduling.
// The tcgen05.alloc is blocking when there is no space to allocate.
//
// Point 1 above is not much trouble for us: we just need to make sure we
// allocate the address tensor in shared memory before allocating the tensor
// memory. But points 2 and 3 can be a big challenge. There are basically
// two things to worry about when allocating tensor memory:
//
// 1. Fragmentation. When a tensor does not occupy all lanes, or its size is
// not a power-of-two number of columns, or is smaller than 32 columns, naively
// allocating all lanes with 32 or the next higher power of two columns could
// waste some space. In a perfect world, it would be nice to have a 2D allocator
// capable of merging the allocations of multiple tensors into a single
// tcgen05.alloc. For example, if tv0 and tv1 both have 64 rows and 32 columns,
// we can allocate tv0 on the first 64 lanes and tv1 on the next 64 lanes.
// Another example: if tv0 has 128 rows and 31 columns, and tv1 has 128 rows
// and 33 columns, we can pack the two tensors into a single tcgen05.alloc of
// 64 columns.
//
// 2. Latency. We should relinquish the right to allocate as soon as we are
// done allocating, so that other CTAs can grab the "right to allocate" mutex.
// We should also deallocate the tensor memory as soon as we are done using it,
// so that other CTAs' tcgen05.allocs can get unblocked. In a perfect world, it
// would be nice to be able to break one TensorView into multiple
// deallocations. For example, suppose tv0 has 128 rows and 256 columns, and we
// read these 256 columns sequentially, one by one. In that case, instead of
// waiting for the entire 256-iteration loop to finish, it would be nice to
// deallocate the first 128 columns once we are done reading them, so that
// other CTAs have a chance to allocate their memory in the freed space.
//
// From the above analysis, it is important to realize that the allocation of a
// TensorView and the allocation of tensor memory are not in one-to-one
// correspondence. A TensorView could be allocated by multiple tcgen05.allocs,
// and a tcgen05.alloc could be used to allocate multiple TensorViews. For now,
// we restrict ourselves so that a TensorView cannot span multiple
// tcgen05.allocs, and we call a piece of TMem that is allocated by a single
// tcgen05.alloc and may hold multiple TensorViews a "region". This design
// gives a TMem -> region -> TensorView hierarchy.
//
// In practice, it is very difficult to optimize both fragmentation and latency
// perfectly. Although tensor memory was originally designed for matmul, because
// it is a large and fast memory, it would be nice to use it for other purposes,
// such as persistent buffers. This could make it even more difficult to
// allocate tensor memory optimally. Considering the complexity of the problem,
// the development of a tensor memory allocator is likely an incremental
// process. With this in mind, we design the allocation of tensor memory in
// nvFuser to be hackable.
//
// There are three main components in the design:
// 1. A data structure, TMemAlllocationInfo, that describes how we allocate
// tensor memory.
// 2. A heuristic, executed as part of computeTMemInfo, that generates the
// allocation information as an instance of TMemAlllocationInfo.
// 3. A pass, executed as part of insertAllocations, that generates the actual
// IR nodes based on the TMemAlllocationInfo.
//
// The TMemAlllocationInfo data structure and the insertAllocations pass
// support a wider range of allocation strategies than the heuristic in
// computeTMemInfo. This provides some flexibility for prototyping and
// experimentation by manually specifying a TMemAlllocationInfo. To manually
// specify the allocation strategy, the user can set a managed variable
// "tmem_regions" in the fusion. The type of this managed variable is
// vector<vector<TensorView*>>, which specifies which TensorViews should be
// coalesced into the same region.
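//
// For example (an illustrative sketch; tv0, tv1, and tv2 are hypothetical, and
// Fusion::manage is assumed to be the setter counterpart of getManaged):
//   fusion->manage(
//       "tmem_regions",
//       std::vector<std::vector<TensorView*>>{{tv0, tv1}, {tv2}});
// coalesces tv0 and tv1 into one region and gives tv2 its own region.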

// The data structure that describes how we allocate tensor memory. It is
// assumed that:
// 1. TMem allocation is split into regions, with each region described by a
// Region. Each region spans a full 128 lanes and N columns of tensor memory.
// The number of columns must be a power of two, with a minimum of 32. Each
// region is allocated by a single tcgen05.alloc and deallocated by a matching
// tcgen05.dealloc.
// 2. Each kernel can have multiple regions.
// 3. Each region can cover multiple TensorViews, but a TensorView cannot
// span multiple regions.
struct TMemAlllocationInfo {
// Each entry describes a region of 128 rows x N columns of tensor memory
// allocated by a single tcgen05.alloc.
struct Region {
// tcgen05.alloc stores the allocated address in shared memory. So we use a
// TensorView with MemoryType::Shared to store this address.
TensorView* address;
// The number of columns to allocate. Must be >= 32 and a power of two.
Val* num_columns;
// The TMem TensorViews covered by this region. Each region can be used to
// store multiple TensorViews. The (lane_offset, column_offset) specifies
// the starting offset of each TensorView in this region.
struct TVInfo {
TensorView* tensor;
Val* lane_offset;
Val* column_offset;
};
std::vector<TVInfo> covered_tensors;
};
std::vector<Region> regions;
};
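// As an example of what this data structure can express beyond the current
// heuristic (a sketch; tv0 and tv1 are hypothetical): two TensorViews of
// 64 lanes x 32 columns each could share a single 32-column region by packing
// along the lane dimension, i.e.
//   covered_tensors[0] = {tv0, /*lane_offset=*/0,  /*column_offset=*/0};
//   covered_tensors[1] = {tv1, /*lane_offset=*/64, /*column_offset=*/0};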

// The actual definition of TensorMemoryInfo.
struct TensorMemoryInfo {
// The address returned by tcgen05.alloc.
// tcgen05.alloc stores the allocated address in shared memory. So we use a
// TensorView with MemoryType::Shared to store this address.
TensorView* allocation_address = nullptr;
TMemAlllocationInfo allocation;
};

} // namespace nvfuser
60 changes: 36 additions & 24 deletions csrc/device_lower/pass/allocation.cpp
@@ -479,11 +479,25 @@ class AllocationInserter : public kir::ExprMutator {
// Fill in the base address, lane offset, and column offset for tensor
// memory allocations
if (memory_type == MemoryType::Tensor) {
auto allocation_address =
GpuLower::current()->tmemInfo().allocation_address;
auto address_ti = IrBuilder::create<kir::TensorIndex>(
allocation_address, allocation_address->fusion()->zeroVal());
alloc_expr->setAddress(address_ti);
const auto& regions = GpuLower::current()->tmemInfo().allocation.regions;
for (const auto& region : regions) {
auto tv_info_it = std::find_if(
region.covered_tensors.begin(),
region.covered_tensors.end(),
[&](const auto& tv_info) { return tv_info.tensor == info.buffer; });
if (tv_info_it != region.covered_tensors.end()) {
auto address_ti = IrBuilder::create<kir::TensorIndex>(
region.address, region.address->fusion()->zeroVal());
alloc_expr->setAddress(address_ti);
alloc_expr->setLaneOffset(tv_info_it->lane_offset);
alloc_expr->setColOffset(tv_info_it->column_offset);
break;
}
}
NVF_ERROR(
alloc_expr->address() != nullptr,
"Could not find region for tensor memory allocation of ",
info.buffer);
}

return alloc_expr;
@@ -828,33 +842,31 @@

// Insert IR nodes that allocate and deallocate TMem regions.
// See note [Tensor Memory Allocation] for the overall design.
// We insert the tcgen05.alloc and the relinquish of the right to allocate at
// the beginning of the top-level scope of the kernel. We do not tcgen05.dealloc
// yet. The allocation of each TMem TensorView is inserted by
// AllocationInserter::insert, therefore not handled here.
// We insert the tcgen05.allocs of each region and the relinquish of the right
// to allocate at the beginning of the top-level scope of the kernel. We do not
// tcgen05.dealloc for now. The allocation of each TMem TensorView within each
// region is inserted by AllocationInserter::insert, therefore not handled here.
std::vector<Expr*> insertTMemRegionAllocsAndDeallocs(
const std::vector<Expr*>& exprs) {
// Expressions to be inserted at the beginning of the top-level scope.
std::list<Expr*> prologue;
{
if (GpuLower::current()->tmemInfo().allocation_address != nullptr) {
// Allocate the address tensor
auto allocation_address =
GpuLower::current()->tmemInfo().allocation_address;
auto address_alloc_expr = IrBuilder::create<kir::Allocate>(
allocation_address, MemoryType::Shared);
const auto& regions = GpuLower::current()->tmemInfo().allocation.regions;
// For each TMem region, allocate its address in shared memory, and insert
// the tcgen05.alloc for tensor memory allocation.
for (const auto& region : regions) {
// kir::Allocate for the address tensor on shared memory
auto address_alloc_expr =
IrBuilder::create<kir::Allocate>(region.address, MemoryType::Shared);
prologue.push_back(address_alloc_expr);

// the tcgen05.alloc instructions
auto alloc_expr = IrBuilder::create<kir::AllocTMem>(
allocation_address,
IrBuilder::create<Val>(
32,
DataType::UInt32) // TODO: hard code allocation size to 32 for now
);
// the tcgen05.alloc instruction
auto alloc_expr =
IrBuilder::create<kir::AllocTMem>(region.address, region.num_columns);
prologue.push_back(alloc_expr);
}

// Relinquish the right to allocate after we are done with tcgen05.allocs
if (!regions.empty()) {
// Relinquish the right to allocate after all regions have been allocated
auto tcgen05_relinquish_expr = IrBuilder::create<kir::Asm>(
"tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned",
std::vector<Val*>{},
2 changes: 2 additions & 0 deletions csrc/kernel_ir.cpp
@@ -187,6 +187,8 @@ Allocate::Allocate(
addAttribute(alias);
// Always initialize smem/tmem addresses to nullptr
addAttribute(nullptr);
addAttribute(nullptr);
addAttribute(nullptr);

for (auto s : shape) {
addAttribute(s);