From d96a3f09a0416e9d01281c7c34a7b9cd07af5b73 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com>
Date: Mon, 3 Feb 2025 19:10:56 -0600
Subject: [PATCH] [GPU] Match Tile And Fuse skinny matmul bail-out to Vector
 Distribute (#19857)

This PR matches the failure criteria for bailing out on skinny matmuls
to what we already have in SetContractConfig for Vector Distribute.

With this change, the dispatch in
https://github.com/iree-org/iree/issues/19855 takes 0.068 ms, vs 1.64 ms
on the default path: this skinny matmul with multiple dims cannot
currently be supported by the vector reduction and warp reduction
pipelines, but Tile and Fuse can support it using padding. This requires
the flag `--iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true`.

Also, `GPUMatmulShapeType` grew too large once this PR added batch sizes
to it, which produced the following error:

```
error: static_assert failed due to requirement
'sizeof(mlir::iree_compiler::GPUMatmulShapeType) <= 256' "You are trying
to use a default number of inlined elements for `SmallVector<T>`. But
`sizeof(T)` is really big! Please use an explicit number of inlined
elements with `SmallVector<T, N>` to make sure you really want that much
inline storage."
```

This PR fixes the issue by giving the `SmallVector` struct members an
explicit number of inlined elements.

---------

Signed-off-by: Nirvedh Meshram
---
 .../Codegen/Common/GPU/GPUHeuristics.cpp      | 56 ++++++++++++-------
 .../Codegen/Common/GPU/GPUHeuristics.h        | 18 +++---
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 19 +++++--
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 15 ++++-
 .../test/ROCDL/config_tile_and_fuse.mlir      | 10 ++--
 5 files changed, 77 insertions(+), 41 deletions(-)
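As background for the error above, here is a minimal standalone sketch, not part of the patch, of why the defaulted `SmallVector` inline capacity pushes the struct past LLVM's 256-byte cutoff and why explicit inline counts fix it (the struct names and the stand-in pointer members are made up for illustration):

```cpp
#include <cstdint>

#include "llvm/ADT/SmallVector.h"

// Stand-in for the pre-fix GPUMatmulShapeType: a defaulted
// llvm::SmallVector<int64_t> is sized out to 64 bytes, so four of them plus
// three pointer-sized members is 280 bytes, over the 256-byte limit LLVM
// enforces before it will pick a default inline count for
// SmallVector<TooBig>.
struct TooBig {
  llvm::SmallVector<int64_t> m, n, k, batch;
  void *aType, *bType, *cType;
};

// With an explicit inline capacity of 2, each vector is 32 bytes on a typical
// 64-bit target, so the struct drops to 152 bytes and can again be stored in
// a SmallVector with a defaulted inline count.
struct SmallEnough {
  llvm::SmallVector<int64_t, 2> m, n, k, batch;
  void *aType, *bType, *cType;
};

static_assert(sizeof(SmallEnough) <= 256, "fits under LLVM's cutoff");
// llvm::SmallVector<TooBig> vec; // would trip the 'really big' static_assert
```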
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
index f8e30f31a961..669d1d1eb539 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor;
 
 namespace mlir::iree_compiler {
 
+// Threshold used to determine whether a matmul dimension is 'very skinny'.
+constexpr int64_t kVerySkinnyDimThreshold = 4;
+
 template <typename T>
 static llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
                                      const llvm::SmallVectorImpl<T> &vector) {
@@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
 static bool isScheduleAligned(const GPUMatmulShapeType &problem,
                               const GPUMMASchedule &schedule,
                               bool mustBeAligned) {
-  SmallVector<int64_t> alignedMSizes(problem.mSizes);
+  SmallVector<int64_t, 2> alignedMSizes(problem.mSizes);
   alignedMSizes.back() =
       mustBeAligned ? problem.mSizes.back()
                     : llvm::divideCeil(problem.mSizes.back(), schedule.mSize) *
                           schedule.mSize;
-  SmallVector<int64_t> alignedNSizes(problem.nSizes);
+  SmallVector<int64_t, 2> alignedNSizes(problem.nSizes);
   alignedNSizes.back() =
       mustBeAligned ? problem.nSizes.back()
                     : llvm::divideCeil(problem.nSizes.back(), schedule.nSize) *
                           schedule.nSize;
-  SmallVector<int64_t> alignedKSizes(problem.kSizes);
+  SmallVector<int64_t, 2> alignedKSizes(problem.kSizes);
   alignedKSizes.back() =
       mustBeAligned ? problem.kSizes.back()
                     : llvm::divideCeil(problem.kSizes.back(), schedule.kSize) *
                           schedule.kSize;
@@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem,
   };
   // Checks whether the elements of `a` are evenly divisible by the
   // corresponding elements of `b`.
-  auto areAligned = [](SmallVector<int64_t> a, SmallVector<int64_t> b) {
+  auto areAligned = [](SmallVector<int64_t, 2> a, SmallVector<int64_t, 2> b) {
     for (auto [aVal, bVal] : llvm::zip_equal(a, b)) {
       if (aVal % bVal != 0) {
         return false;
@@ -223,6 +226,7 @@ static FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(
 
 static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
                                         const GPUMatmulShapeType &intrinsic,
+                                        int64_t preferredSubgroupSize,
                                         bool canUpcastAcc, bool mustBeAligned) {
   assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 &&
          intrinsic.kSizes.size() == 1 &&
@@ -240,12 +244,17 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
     }
   }
 
-  if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
-                        problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
-                        problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
-    return failure(); // Cannot use this intrinsic for misaligned cases.
+  if (mustBeAligned) {
+    if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
+         problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
+         problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
+      return failure();
+    }
+    return success();
   }
 
+  // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction
+  // pipeline, similar to matvec.
   // TODO: Figure out what the precise cutoff is, this may be machine dependent.
   // In situation when alignment isn't required, we disallow intrinsics to be
   // picked if the tile size is too small. For example, this will force a matmul
@@ -255,10 +264,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
   // established after we sweep the different tile sizes for a problem config.
   // Once a precise threshold is established, replace 4 with the threshold and
   // remove this todo.
-  if (!mustBeAligned &&
-      (problem.mSizes.back() < 4 || problem.nSizes.back() < 4 ||
-       problem.kSizes.back() < 4)) {
-    return failure();
+  if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(),
+                       problem.kSizes.size(), size_t{1}}) &&
+      problem.batchSizes.empty()) {
+    int64_t mSize = problem.mSizes.back();
+    int64_t nSize = problem.nSizes.back();
+    if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) ||
+        (nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) {
+      return failure();
+    }
   }
   return success();
 }
@@ -279,8 +293,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
   //   16x16x16 intrinsic, then:
   //   - mTotalTileCounts would be 4 * (16/16) = 4
   //   - nTotalTileCounts would be 2 * (32/16) = 4
-  SmallVector<int64_t> mTotalTileCounts = problem.mSizes;
-  SmallVector<int64_t> nTotalTileCounts = problem.nSizes;
+  SmallVector<int64_t, 2> mTotalTileCounts = problem.mSizes;
+  SmallVector<int64_t, 2> nTotalTileCounts = problem.nSizes;
   mTotalTileCounts.back() =
       llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]);
   nTotalTileCounts.back() =
@@ -361,7 +375,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
   // For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a
   //   16x16x16 intrinsic, then:
   //   - kTotalTileCounts would be 3 * (128/16) = 24
-  SmallVector<int64_t> kTotalTileCounts = problem.kSizes;
+  SmallVector<int64_t, 2> kTotalTileCounts = problem.kSizes;
   kTotalTileCounts.back() =
       llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]);
   // Compute the ideal number of intrinsics along K per subgroup based on the
@@ -395,8 +409,8 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
     int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
     bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) {
   for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
-    if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
@@ -450,13 +464,13 @@ FailureOr<GPUMMASchedule> deduceAttentionSchedule(
          qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 &&
          "unimplemented: multi M/N/K attention schedule");
   for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
-    if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
-    if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
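To make the new bail-out easier to follow outside the diff, here is a self-contained restatement of the predicate; the helper name and the `main` driver are illustrative and not part of the patch (in the patch the check additionally requires a single static M, N, and K dimension and no batch dimensions):

```cpp
#include <cstdint>
#include <cstdlib>

// Matches the patch's threshold: a dimension of size <= 4 is 'very skinny'.
constexpr int64_t kVerySkinnyDimThreshold = 4;

// The intrinsic is rejected (sending the matmul to the vector reduction
// pipeline) when one of M/N is very skinny while the other exceeds the
// preferred subgroup size, mirroring the {2-4}xNxK and Mx{2-4}xK cases.
static bool isVerySkinnyMatmul(int64_t mSize, int64_t nSize,
                               int64_t preferredSubgroupSize) {
  return (mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
         (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize);
}

int main() {
  // The 2x577 shape from the test below is skinny for a subgroup size of 64,
  // while a 4x4 shape is not.
  bool ok = isVerySkinnyMatmul(2, 577, 64) && !isVerySkinnyMatmul(4, 4, 64);
  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
```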
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
index a35e2b464632..6542d11ebf18 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -10,18 +10,22 @@ namespace mlir::iree_compiler {
 
 /// Struct containing information about a matmul's shape and type.
 struct GPUMatmulShapeType {
-  SmallVector<int64_t> mSizes;
-  SmallVector<int64_t> nSizes;
-  SmallVector<int64_t> kSizes;
+  SmallVector<int64_t, 2> mSizes;
+  SmallVector<int64_t, 2> nSizes;
+  SmallVector<int64_t, 2> kSizes;
+  SmallVector<int64_t, 2> batchSizes;
   Type aType;
   Type bType;
   Type cType;
 
   GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c)
-      : mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {}
-  GPUMatmulShapeType(SmallVector<int64_t> m, SmallVector<int64_t> n,
-                     SmallVector<int64_t> k, Type a, Type b, Type c)
-      : mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {}
+      : mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a),
+        bType(b), cType(c) {}
+  GPUMatmulShapeType(ArrayRef<int64_t> m, ArrayRef<int64_t> n,
+                     ArrayRef<int64_t> k, ArrayRef<int64_t> batch, Type a,
+                     Type b, Type c)
+      : mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b),
+        cType(c) {}
 };
 
 /// Struct containing seed tile sizes for GPU MMA heuristics deduction logic.
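For reference, a hypothetical call site for the new batch-aware constructor (assuming `GPUHeuristics.h` is included; the shapes mirror the 12x2x577x577 `batch_matmul` test added at the end of this patch):

```cpp
#include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h"

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

// Builds a problem description with one batch dim (B=12), M=2, N=577, K=577,
// all f32. The braced lists bind to the new ArrayRef<int64_t> parameters and
// are copied into the struct's SmallVector members.
static mlir::iree_compiler::GPUMatmulShapeType
makeExampleProblem(mlir::MLIRContext &ctx) {
  mlir::Type f32 = mlir::Float32Type::get(&ctx);
  return mlir::iree_compiler::GPUMatmulShapeType(
      /*m=*/{2}, /*n=*/{577}, /*k=*/{577}, /*batch=*/{12},
      /*a=*/f32, /*b=*/f32, /*c=*/f32);
}
```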
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 7b62d6955b10..48bfcc9a7c2a 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -202,23 +202,29 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   // Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic
   // dimensions will be tiled to 1 in workgroup tiling, so they are ignored when
   // computing an MMA schedule.
-  SmallVector<int64_t> mDims, nDims, kDims;
-  for (auto mDim : contractionDims.m) {
+  SmallVector<int64_t> mDims, nDims, kDims, batchDims;
+  for (int64_t mDim : contractionDims.m) {
     if (!ShapedType::isDynamic(bounds[mDim])) {
       mDims.push_back(mDim);
     }
   }
-  for (auto nDim : contractionDims.n) {
+  for (int64_t nDim : contractionDims.n) {
     if (!ShapedType::isDynamic(bounds[nDim])) {
       nDims.push_back(nDim);
     }
   }
-  for (auto kDim : contractionDims.k) {
+  for (int64_t kDim : contractionDims.k) {
     if (!ShapedType::isDynamic(bounds[kDim])) {
       kDims.push_back(kDim);
     }
   }
+  for (int64_t batchDim : contractionDims.batch) {
+    if (!ShapedType::isDynamic(bounds[batchDim])) {
+      batchDims.push_back(batchDim);
+    }
+  }
+
   auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
     return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
   };
@@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   Type initElemType = getElementTypeOrSelf(init);
 
   GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims),
-                             getDimBounds(kDims), lhsElemType,
-                             rhsElemType,         initElemType};
+                             getDimBounds(kDims), getDimBounds(batchDims),
+                             lhsElemType,         rhsElemType,
+                             initElemType};
 
   // Infer if lhs or rhs is transposed to help generate better schedule.
   // TODO: Drop this. This is only a consideration for other pipelines.
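The dimension gathering above repeats one pattern per dimension kind; restated as a standalone helper under the assumption that `bounds` holds the static-or-dynamic loop bounds indexed by dimension (the helper name is made up):

```cpp
#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/BuiltinTypes.h"

// Keeps only the dimensions whose bound is static; dynamic dims are tiled to
// 1 during workgroup tiling, so the MMA schedule never sees them.
static llvm::SmallVector<int64_t>
gatherStaticDims(llvm::ArrayRef<int64_t> dims, llvm::ArrayRef<int64_t> bounds) {
  llvm::SmallVector<int64_t> staticDims;
  for (int64_t dim : dims) {
    if (!mlir::ShapedType::isDynamic(bounds[dim])) {
      staticDims.push_back(dim);
    }
  }
  return staticDims;
}
```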
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 2ee7e8241c17..8833b4203156 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
     rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]);
   }
 
+  SmallVector<int64_t> batchDims;
+  for (int64_t batchDim : contractionDims->batch) {
+    if (!ShapedType::isDynamic(bounds[batchDim])) {
+      batchDims.push_back(batchDim);
+    }
+  }
+  auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
+    return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
+  };
+
   // TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules
   // once the pipeline is able to support it. After adding multiple dimensions,
   // all instances of schedule->m/nSubgroupCounts[0] and
   // schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of
   // just the first element.
-  GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim],
-                             lhsElemType, rhsElemType, initElemType};
+  GPUMatmulShapeType problem{
+      {bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims),
+      lhsElemType,    rhsElemType,    initElemType};
 
   // Helper fn to store mma information.
   auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 910eca3c1768..ec6038f47dee 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -282,12 +282,12 @@ module {
 // -----
 
 module {
-func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> {
+func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> {
   %c0 = arith.constant 0.0 : f32
-  %empty = tensor.empty() : tensor<12x577x577xf32>
-  %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
-  %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
-  return %mm : tensor<12x577x577xf32>
+  %empty = tensor.empty() : tensor<12x2x577xf32>
+  %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
+  %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
+  return %mm : tensor<12x2x577xf32>
 }
 }