From d96a3f09a0416e9d01281c7c34a7b9cd07af5b73 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com>
Date: Mon, 3 Feb 2025 19:10:56 -0600
Subject: [PATCH] [GPU] Match Tile And Fuse skinny matmul bail-out to Vector
 Distribute (#19857)

This PR matches the failure criteria for bailing out on skinny matmuls
to what we already have in SetContractConfig for Vector Distribute.

With this change, the dispatch in
https://github.com/iree-org/iree/issues/19855 takes 0.068 ms, vs 1.64 ms
on the default path: this skinny matmul with multiple dims cannot
currently be supported by the vector reduction and warp reduction
pipelines, but Tile and Fuse can support it using padding. This requires
the flag `--iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true`.

Also, `GPUMatmulShapeType` grew too large once this PR added batch sizes
to it, which produced the following error:

```
error: static_assert failed due to requirement
'sizeof(mlir::iree_compiler::GPUMatmulShapeType) <= 256' "You are trying
to use a default number of inlined elements for `SmallVector<T>`. But
`sizeof(T)` is really big! Please use an explicit number of inlined
elements with `SmallVector<T, N>` to make sure you really want that much
inline storage."
```

This PR fixes the issue by giving the `SmallVector` struct members an
explicit number of inlined elements.

---------

Signed-off-by: Nirvedh Meshram
---
 .../Codegen/Common/GPU/GPUHeuristics.cpp      | 56 ++++++++++++-------
 .../Codegen/Common/GPU/GPUHeuristics.h        | 18 +++---
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 19 +++++--
 .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 15 ++++-
 .../test/ROCDL/config_tile_and_fuse.mlir      | 10 ++--
 5 files changed, 77 insertions(+), 41 deletions(-)
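As background for the error above, here is a minimal standalone sketch, not part of the patch, of why the defaulted `SmallVector` inline capacity pushes the struct past LLVM's 256-byte cutoff and why explicit inline counts fix it (the struct names and the stand-in pointer members are made up for illustration):

```cpp
#include <cstdint>

#include "llvm/ADT/SmallVector.h"

// Stand-in for the pre-fix GPUMatmulShapeType: a defaulted
// llvm::SmallVector<int64_t> is sized out to 64 bytes, so four of them plus
// three pointer-sized members is 280 bytes, over the 256-byte limit LLVM
// enforces before it will pick a default inline count for
// SmallVector<TooBig>.
struct TooBig {
  llvm::SmallVector<int64_t> m, n, k, batch;
  void *aType, *bType, *cType;
};

// With an explicit inline capacity of 2, each vector is 32 bytes on a typical
// 64-bit target, so the struct drops to 152 bytes and can again be stored in
// a SmallVector with a defaulted inline count.
struct SmallEnough {
  llvm::SmallVector<int64_t, 2> m, n, k, batch;
  void *aType, *bType, *cType;
};

static_assert(sizeof(SmallEnough) <= 256, "fits under LLVM's cutoff");
// llvm::SmallVector<TooBig> vec; // would trip the 'really big' static_assert
```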
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
index f8e30f31a961..669d1d1eb539 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor;
 
 namespace mlir::iree_compiler {
 
+// Threshold used to determine whether a matmul dimension is 'very skinny'.
+constexpr int64_t kVerySkinnyDimThreshold = 4;
+
 template <typename T>
 static llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
                                      const llvm::SmallVectorImpl<T> &vector) {
@@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
 static bool isScheduleAligned(const GPUMatmulShapeType &problem,
                               const GPUMMASchedule &schedule,
                               bool mustBeAligned) {
-  SmallVector<int64_t> alignedMSizes(problem.mSizes);
+  SmallVector<int64_t, 2> alignedMSizes(problem.mSizes);
   alignedMSizes.back() =
       mustBeAligned ? problem.mSizes.back()
                     : llvm::divideCeil(problem.mSizes.back(), schedule.mSize) *
                           schedule.mSize;
-  SmallVector<int64_t> alignedNSizes(problem.nSizes);
+  SmallVector<int64_t, 2> alignedNSizes(problem.nSizes);
   alignedNSizes.back() =
       mustBeAligned ? problem.nSizes.back()
                     : llvm::divideCeil(problem.nSizes.back(), schedule.nSize) *
                           schedule.nSize;
-  SmallVector<int64_t> alignedKSizes(problem.kSizes);
+  SmallVector<int64_t, 2> alignedKSizes(problem.kSizes);
   alignedKSizes.back() =
       mustBeAligned ? problem.kSizes.back()
                     : llvm::divideCeil(problem.kSizes.back(), schedule.kSize) *
                           schedule.kSize;
@@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem,
   };
   // Checks whether the elements of `a` are evenly divisible by the
   // corresponding elements of `b`.
-  auto areAligned = [](SmallVector<int64_t> a, SmallVector<int64_t> b) {
+  auto areAligned = [](SmallVector<int64_t, 2> a, SmallVector<int64_t, 2> b) {
     for (auto [aVal, bVal] : llvm::zip_equal(a, b)) {
       if (aVal % bVal != 0) {
         return false;
@@ -223,6 +226,7 @@ static FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(
 
 static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
                                         const GPUMatmulShapeType &intrinsic,
+                                        int64_t preferredSubgroupSize,
                                         bool canUpcastAcc, bool mustBeAligned) {
   assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 &&
          intrinsic.kSizes.size() == 1 &&
@@ -240,12 +244,17 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
     }
   }
 
-  if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
-                        problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
-                        problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
-    return failure(); // Cannot use this intrinsic for misaligned cases.
+  if (mustBeAligned) {
+    if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
+         problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
+         problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
+      return failure();
+    }
+    return success();
   }
 
+  // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction
+  // pipeline, similar to matvec.
   // TODO: Figure out what the precise cutoff is, this may be machine dependent.
   // In situation when alignment isn't required, we disallow intrinsics to be
   // picked if the tile size is too small. For example, this will force a matmul
@@ -255,10 +264,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
   // established after we sweep the different tile sizes for a problem config.
   // Once a precise threshold is established, replace 4 with the threshold and
   // remove this todo.
-  if (!mustBeAligned &&
-      (problem.mSizes.back() < 4 || problem.nSizes.back() < 4 ||
-       problem.kSizes.back() < 4)) {
-    return failure();
+  if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(),
+                       problem.kSizes.size(), size_t{1}}) &&
+      problem.batchSizes.empty()) {
+    int64_t mSize = problem.mSizes.back();
+    int64_t nSize = problem.nSizes.back();
+    if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) ||
+        (nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) {
+      return failure();
+    }
   }
   return success();
 }
@@ -279,8 +293,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
   //   16x16x16 intrinsic, then:
   //   - mTotalTileCounts would be 4 * (16/16) = 4
   //   - nTotalTileCounts would be 2 * (32/16) = 4
-  SmallVector<int64_t> mTotalTileCounts = problem.mSizes;
-  SmallVector<int64_t> nTotalTileCounts = problem.nSizes;
+  SmallVector<int64_t, 2> mTotalTileCounts = problem.mSizes;
+  SmallVector<int64_t, 2> nTotalTileCounts = problem.nSizes;
   mTotalTileCounts.back() =
       llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]);
   nTotalTileCounts.back() =
@@ -361,7 +375,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
   // For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a
   //   16x16x16 intrinsic, then:
   //   - kTotalTileCounts would be 3 * (128/16) = 24
-  SmallVector<int64_t> kTotalTileCounts = problem.kSizes;
+  SmallVector<int64_t, 2> kTotalTileCounts = problem.kSizes;
   kTotalTileCounts.back() =
       llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]);
   // Compute the ideal number of intrinsics along K per subgroup based on the
@@ -395,8 +409,8 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
     int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
     bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) {
   for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
-    if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
@@ -450,13 +464,13 @@ FailureOr<GPUMMASchedule> deduceAttentionSchedule(
          qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 &&
          "unimplemented: multi M/N/K attention schedule");
   for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
-    if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
-    if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
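To make the new bail-out easier to follow outside the diff, here is a self-contained restatement of the predicate; the helper name and the `main` driver are illustrative and not part of the patch (in the patch the check additionally requires a single static M, N, and K dimension and no batch dimensions):

```cpp
#include <cstdint>
#include <cstdlib>

// Matches the patch's threshold: a dimension of size <= 4 is 'very skinny'.
constexpr int64_t kVerySkinnyDimThreshold = 4;

// The intrinsic is rejected (sending the matmul to the vector reduction
// pipeline) when one of M/N is very skinny while the other exceeds the
// preferred subgroup size, mirroring the {2-4}xNxK and Mx{2-4}xK cases.
static bool isVerySkinnyMatmul(int64_t mSize, int64_t nSize,
                               int64_t preferredSubgroupSize) {
  return (mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
         (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize);
}

int main() {
  // The 2x577 shape from the test below is skinny for a subgroup size of 64,
  // while a 4x4 shape is not.
  bool ok = isVerySkinnyMatmul(2, 577, 64) && !isVerySkinnyMatmul(4, 4, 64);
  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
```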
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
index a35e2b464632..6542d11ebf18 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -10,18 +10,22 @@ namespace mlir::iree_compiler {
 
 /// Struct containing information about a matmul's shape and type.
 struct GPUMatmulShapeType {
-  SmallVector<int64_t> mSizes;
-  SmallVector<int64_t> nSizes;
-  SmallVector<int64_t> kSizes;
+  SmallVector<int64_t, 2> mSizes;
+  SmallVector<int64_t, 2> nSizes;
+  SmallVector<int64_t, 2> kSizes;
+  SmallVector<int64_t, 2> batchSizes;
   Type aType;
   Type bType;
   Type cType;
 
   GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c)
-      : mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {}
-  GPUMatmulShapeType(SmallVector<int64_t> m, SmallVector<int64_t> n,
-                     SmallVector<int64_t> k, Type a, Type b, Type c)
-      : mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {}
+      : mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a),
+        bType(b), cType(c) {}
+  GPUMatmulShapeType(ArrayRef<int64_t> m, ArrayRef<int64_t> n,
+                     ArrayRef<int64_t> k, ArrayRef<int64_t> batch, Type a,
+                     Type b, Type c)
+      : mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b),
+        cType(c) {}
 };
 
 /// Struct containing seed tile sizes for GPU MMA heuristics deduction logic.
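For reference, a hypothetical call site for the new batch-aware constructor (assuming `GPUHeuristics.h` is included; the shapes mirror the 12x2x577x577 `batch_matmul` test added at the end of this patch):

```cpp
#include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h"

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

// Builds a problem description with one batch dim (B=12), M=2, N=577, K=577,
// all f32. The braced lists bind to the new ArrayRef<int64_t> parameters and
// are copied into the struct's SmallVector members.
static mlir::iree_compiler::GPUMatmulShapeType
makeExampleProblem(mlir::MLIRContext &ctx) {
  mlir::Type f32 = mlir::Float32Type::get(&ctx);
  return mlir::iree_compiler::GPUMatmulShapeType(
      /*m=*/{2}, /*n=*/{577}, /*k=*/{577}, /*batch=*/{12},
      /*a=*/f32, /*b=*/f32, /*c=*/f32);
}
```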
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 7b62d6955b10..48bfcc9a7c2a 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -202,23 +202,29 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   // Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic
   // dimensions will be tiled to 1 in workgroup tiling, so they are ignored when
   // computing an MMA schedule.
-  SmallVector<int64_t> mDims, nDims, kDims;
-  for (auto mDim : contractionDims.m) {
+  SmallVector<int64_t> mDims, nDims, kDims, batchDims;
+  for (int64_t mDim : contractionDims.m) {
     if (!ShapedType::isDynamic(bounds[mDim])) {
       mDims.push_back(mDim);
     }
   }
-  for (auto nDim : contractionDims.n) {
+  for (int64_t nDim : contractionDims.n) {
     if (!ShapedType::isDynamic(bounds[nDim])) {
       nDims.push_back(nDim);
     }
   }
-  for (auto kDim : contractionDims.k) {
+  for (int64_t kDim : contractionDims.k) {
     if (!ShapedType::isDynamic(bounds[kDim])) {
       kDims.push_back(kDim);
     }
   }
+  for (int64_t batchDim : contractionDims.batch) {
+    if (!ShapedType::isDynamic(bounds[batchDim])) {
+      batchDims.push_back(batchDim);
+    }
+  }
+
   auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
     return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
   };
@@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   Type initElemType = getElementTypeOrSelf(init);
 
   GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims),
-                             getDimBounds(kDims), lhsElemType,
-                             rhsElemType,         initElemType};
+                             getDimBounds(kDims), getDimBounds(batchDims),
+                             lhsElemType,         rhsElemType,
+                             initElemType};
 
   // Infer if lhs or rhs is transposed to help generate better schedule.
   // TODO: Drop this. This is only a consideration for other pipelines.
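The dimension gathering above repeats one pattern per dimension kind; restated as a standalone helper under the assumption that `bounds` holds the static-or-dynamic loop bounds indexed by dimension (the helper name is made up):

```cpp
#include <cstdint>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/BuiltinTypes.h"

// Keeps only the dimensions whose bound is static; dynamic dims are tiled to
// 1 during workgroup tiling, so the MMA schedule never sees them.
static llvm::SmallVector<int64_t>
gatherStaticDims(llvm::ArrayRef<int64_t> dims, llvm::ArrayRef<int64_t> bounds) {
  llvm::SmallVector<int64_t> staticDims;
  for (int64_t dim : dims) {
    if (!mlir::ShapedType::isDynamic(bounds[dim])) {
      staticDims.push_back(dim);
    }
  }
  return staticDims;
}
```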
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 2ee7e8241c17..8833b4203156 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
     rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]);
   }
 
+  SmallVector<int64_t> batchDims;
+  for (int64_t batchDim : contractionDims->batch) {
+    if (!ShapedType::isDynamic(bounds[batchDim])) {
+      batchDims.push_back(batchDim);
+    }
+  }
+  auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
+    return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
+  };
+
   // TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules
   // once the pipeline is able to support it. After adding multiple dimensions,
   // all instances of schedule->m/nSubgroupCounts[0] and
   // schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of
   // just the first element.
-  GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim],
-                             lhsElemType, rhsElemType, initElemType};
+  GPUMatmulShapeType problem{
+      {bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims),
+      lhsElemType,    rhsElemType,    initElemType};
 
   // Helper fn to store mma information.
   auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 910eca3c1768..ec6038f47dee 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -282,12 +282,12 @@ module {
 // -----
 
 module {
-func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> {
+func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> {
   %c0 = arith.constant 0.0 : f32
-  %empty = tensor.empty() : tensor<12x577x577xf32>
-  %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
-  %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
-  return %mm : tensor<12x577x577xf32>
+  %empty = tensor.empty() : tensor<12x2x577xf32>
+  %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
+  %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
+  return %mm : tensor<12x2x577xf32>
 }
 }