diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
index f8e30f31a961..669d1d1eb539 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp
@@ -21,6 +21,9 @@ using llvm::APIntOps::GreatestCommonDivisor;
 
 namespace mlir::iree_compiler {
 
+// Threshold used to determine whether a matmul dimension is 'very skinny'.
+constexpr int64_t kVerySkinnyDimThreshold = 4;
+
 template <typename T>
 static llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
                                      const llvm::SmallVectorImpl<T> &vector) {
@@ -77,17 +80,17 @@ calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
 static bool isScheduleAligned(const GPUMatmulShapeType &problem,
                               const GPUMMASchedule &schedule,
                               bool mustBeAligned) {
-  SmallVector<int64_t> alignedMSizes(problem.mSizes);
+  SmallVector<int64_t, 2> alignedMSizes(problem.mSizes);
   alignedMSizes.back() =
       mustBeAligned ? problem.mSizes.back()
                     : llvm::divideCeil(problem.mSizes.back(), schedule.mSize) *
                           schedule.mSize;
-  SmallVector<int64_t> alignedNSizes(problem.nSizes);
+  SmallVector<int64_t, 2> alignedNSizes(problem.nSizes);
   alignedNSizes.back() =
       mustBeAligned ? problem.nSizes.back()
                     : llvm::divideCeil(problem.nSizes.back(), schedule.nSize) *
                           schedule.nSize;
-  SmallVector<int64_t> alignedKSizes(problem.kSizes);
+  SmallVector<int64_t, 2> alignedKSizes(problem.kSizes);
   alignedKSizes.back() =
       mustBeAligned ? problem.kSizes.back()
                     : llvm::divideCeil(problem.kSizes.back(), schedule.kSize) *
@@ -106,7 +109,7 @@ static bool isScheduleAligned(const GPUMatmulShapeType &problem,
   };
   // Checks whether the elements of `a` are evenly divisible by the
   // corresponding elements of `b`.
-  auto areAligned = [](SmallVector<int64_t> a, SmallVector<int64_t> b) {
+  auto areAligned = [](SmallVector<int64_t, 2> a, SmallVector<int64_t, 2> b) {
     for (auto [aVal, bVal] : llvm::zip_equal(a, b)) {
       if (aVal % bVal != 0) {
         return false;
       }
     }
@@ -223,6 +226,7 @@ static FailureOr<GPUMMASchedule> fitScheduleInSharedMemory(
 
 static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
                                         const GPUMatmulShapeType &intrinsic,
+                                        int64_t preferredSubgroupSize,
                                         bool canUpcastAcc, bool mustBeAligned) {
   assert(intrinsic.mSizes.size() == 1 && intrinsic.nSizes.size() == 1 &&
          intrinsic.kSizes.size() == 1 &&
@@ -240,12 +244,17 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
     }
   }
 
-  if (mustBeAligned && (problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
-                        problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
-                        problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
-    return failure(); // Cannot use this intrinsic for misaligned cases.
+  if (mustBeAligned) {
+    if ((problem.mSizes.back() % intrinsic.mSizes[0] != 0 ||
+         problem.nSizes.back() % intrinsic.nSizes[0] != 0 ||
+         problem.kSizes.back() % intrinsic.kSizes[0] != 0)) {
+      return failure();
+    }
+    return success();
   }
 
+  // Send very skinny, {2-4}xNxK and Mx{2-4}xK, matmuls to the vector reduction
+  // pipeline, similar to matvec.
   // TODO: Figure out what the precise cutoff is, this may be machine dependent.
   // In situation when alignment isn't required, we disallow intrinsics to be
   // picked if the tile size is too small. For example, this will force a matmul
@@ -255,10 +264,15 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
   // established after we sweep the different tile sizes for a problem config.
   // Once a precise threshold is established, replace 4 with the threshold and
   // remove this todo.
-  if (!mustBeAligned &&
-      (problem.mSizes.back() < 4 || problem.nSizes.back() < 4 ||
-       problem.kSizes.back() < 4)) {
-    return failure();
+  if (llvm::all_equal({problem.mSizes.size(), problem.nSizes.size(),
+                       problem.kSizes.size(), size_t{1}}) &&
+      problem.batchSizes.empty()) {
+    int64_t mSize = problem.mSizes.back();
+    int64_t nSize = problem.nSizes.back();
+    if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) ||
+        (nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) {
+      return failure();
+    }
   }
   return success();
 }
@@ -279,8 +293,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
   // 16x16x16 intrinsic, then:
   //  - mTotalTileCounts would be 4 * (16/16) = 4
   //  - nTotalTileCounts would be 2 * (32/16) = 4
-  SmallVector<int64_t> mTotalTileCounts = problem.mSizes;
-  SmallVector<int64_t> nTotalTileCounts = problem.nSizes;
+  SmallVector<int64_t, 2> mTotalTileCounts = problem.mSizes;
+  SmallVector<int64_t, 2> nTotalTileCounts = problem.nSizes;
   mTotalTileCounts.back() =
       llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]);
   nTotalTileCounts.back() =
@@ -361,7 +375,7 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
   // For the problem described above {M:[4, 16], N:[2, 32], K[3, 128]} with a
   // 16x16x16 intrinsic, then:
   //  - kTotalTileCounts would be 3 * (128/16) = 24
-  SmallVector<int64_t> kTotalTileCounts = problem.kSizes;
+  SmallVector<int64_t, 2> kTotalTileCounts = problem.kSizes;
   kTotalTileCounts.back() =
       llvm::divideCeil(problem.kSizes.back(), intrinsic.kSizes[0]);
   // Compute the ideal number of intrinsics along K per subgroup based on the
@@ -395,8 +409,8 @@ FailureOr<GPUMMASchedule> deduceMMASchedule(
     int64_t subgroupSize, bool transposedLhs, bool transposedRhs,
     bool canUpcastAcc, bool mustBeAligned, bool doCPromotion) {
   for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
-    if (failed(canTargetIntrinsic(problem, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(problem, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
@@ -450,13 +464,13 @@ FailureOr<GPUMMASchedule> deduceAttentionSchedule(
          qkMatmul.nSizes.size() == 1 && qkMatmul.kSizes.size() == 1 &&
          "unimplemented: multi M/N/K attention schedule");
   for (auto [index, intrinsic] : llvm::enumerate(intrinsics)) {
-    if (failed(canTargetIntrinsic(qkMatmul, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(qkMatmul, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
-    if (failed(canTargetIntrinsic(pvMatmul, intrinsic, canUpcastAcc,
-                                  mustBeAligned))) {
+    if (failed(canTargetIntrinsic(pvMatmul, intrinsic, subgroupSize,
+                                  canUpcastAcc, mustBeAligned))) {
       continue;
     }
 
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
index a35e2b464632..6542d11ebf18 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.h
@@ -10,18 +10,22 @@ namespace mlir::iree_compiler {
 
 /// Struct containing information about a matmul's shape and type.
 struct GPUMatmulShapeType {
-  SmallVector<int64_t> mSizes;
-  SmallVector<int64_t> nSizes;
-  SmallVector<int64_t> kSizes;
+  SmallVector<int64_t, 2> mSizes;
+  SmallVector<int64_t, 2> nSizes;
+  SmallVector<int64_t, 2> kSizes;
+  SmallVector<int64_t, 2> batchSizes;
   Type aType;
   Type bType;
   Type cType;
 
   GPUMatmulShapeType(int64_t m, int64_t n, int64_t k, Type a, Type b, Type c)
-      : mSizes({m}), nSizes({n}), kSizes({k}), aType(a), bType(b), cType(c) {}
-  GPUMatmulShapeType(SmallVector<int64_t> m, SmallVector<int64_t> n,
-                     SmallVector<int64_t> k, Type a, Type b, Type c)
-      : mSizes(m), nSizes(n), kSizes(k), aType(a), bType(b), cType(c) {}
+      : mSizes({m}), nSizes({n}), kSizes({k}), batchSizes({}), aType(a),
+        bType(b), cType(c) {}
+  GPUMatmulShapeType(ArrayRef<int64_t> m, ArrayRef<int64_t> n,
+                     ArrayRef<int64_t> k, ArrayRef<int64_t> batch, Type a,
+                     Type b, Type c)
+      : mSizes(m), nSizes(n), kSizes(k), batchSizes(batch), aType(a), bType(b),
+        cType(c) {}
 };
 
 /// Struct containing seed tile sizes for GPU MMA heuristics deduction logic.
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 7b62d6955b10..48bfcc9a7c2a 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -202,23 +202,29 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   // Gather all static M, N, and K dimensions to deduce the MMASchedule. Dynamic
   // dimensions will be tiled to 1 in workgroup tiling, so they are ignored when
   // computing an MMA schedule.
-  SmallVector<int64_t> mDims, nDims, kDims;
-  for (auto mDim : contractionDims.m) {
+  SmallVector<int64_t> mDims, nDims, kDims, batchDims;
+  for (int64_t mDim : contractionDims.m) {
     if (!ShapedType::isDynamic(bounds[mDim])) {
       mDims.push_back(mDim);
     }
   }
-  for (auto nDim : contractionDims.n) {
+  for (int64_t nDim : contractionDims.n) {
     if (!ShapedType::isDynamic(bounds[nDim])) {
       nDims.push_back(nDim);
     }
   }
-  for (auto kDim : contractionDims.k) {
+  for (int64_t kDim : contractionDims.k) {
     if (!ShapedType::isDynamic(bounds[kDim])) {
       kDims.push_back(kDim);
     }
   }
+  for (int64_t batchDim : contractionDims.batch) {
+    if (!ShapedType::isDynamic(bounds[batchDim])) {
+      batchDims.push_back(batchDim);
+    }
+  }
+
   auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
     return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
   };
 
@@ -233,8 +239,9 @@ getMatmulLoweringConfigAndWorkgroupSize(SmallVector<int64_t> bounds,
   Type initElemType = getElementTypeOrSelf(init);
 
   GPUMatmulShapeType problem{getDimBounds(mDims), getDimBounds(nDims),
-                             getDimBounds(kDims), lhsElemType,
-                             rhsElemType, initElemType};
+                             getDimBounds(kDims), getDimBounds(batchDims),
+                             lhsElemType, rhsElemType,
+                             initElemType};
 
   // Infer if lhs or rhs is transposed to help generate better schedule.
   // TODO: Drop this. This is only a consideration for other pipelines.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 2ee7e8241c17..8833b4203156 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -536,13 +536,24 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
     rhsElemType = getElementTypeOrSelf(rhsOp.getDpsInputs()[0]);
   }
 
+  SmallVector<int64_t> batchDims;
+  for (int64_t batchDim : contractionDims->batch) {
+    if (!ShapedType::isDynamic(bounds[batchDim])) {
+      batchDims.push_back(batchDim);
+    }
+  }
+  auto getDimBounds = [&](SmallVector<int64_t> dims) -> SmallVector<int64_t> {
+    return llvm::map_to_vector(dims, [&](int64_t dim) { return bounds[dim]; });
+  };
+
   // TODO(Max191): Support multiple M/N/K dimension problems for MMASchedules
   // once the pipeline is able to support it. After adding multiple dimensions,
   // all instances of schedule->m/nSubgroupCounts[0] and
   // schedule->m/n/kTileSizes[0] need to use the full list of sizes instead of
   // just the first element.
-  GPUMatmulShapeType problem{bounds[mDim], bounds[nDim], bounds[kDim],
-                             lhsElemType, rhsElemType, initElemType};
+  GPUMatmulShapeType problem{
+      {bounds[mDim]}, {bounds[nDim]}, {bounds[kDim]}, getDimBounds(batchDims),
+      lhsElemType,    rhsElemType,    initElemType};
 
   // Helper fn to store mma information.
   auto storeMmaInfo = [](IREE::GPU::MmaInterfaceAttr mma,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 910eca3c1768..ec6038f47dee 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -282,12 +282,12 @@ module {
 
 // -----
 
 module {
-func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x577x577xf32> {
+func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> {
  %c0 = arith.constant 0.0 : f32
-  %empty = tensor.empty() : tensor<12x577x577xf32>
-  %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
-  %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x577x577xf32>) -> tensor<12x577x577xf32>
-  return %mm : tensor<12x577x577xf32>
+  %empty = tensor.empty() : tensor<12x2x577xf32>
+  %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
+  %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
+  return %mm : tensor<12x2x577xf32>
 }
 }
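The standalone sketch below is not part of the patch; it only illustrates the "very skinny" gate the change adds to canTargetIntrinsic. For an unaligned problem with a single M, N, and K dimension and no batch dimensions, the MMA intrinsic is rejected when one of M/N is at most kVerySkinnyDimThreshold while the other exceeds the preferred subgroup size, so the matmul falls through to the vector reduction pipeline, similar to a matvec. The helper name and the plain-integer interface are assumptions made for illustration; the real code operates on GPUMatmulShapeType and returns LogicalResult.

#include <cstdint>
#include <iostream>

// Same threshold the patch introduces in GPUHeuristics.cpp.
constexpr std::int64_t kVerySkinnyDimThreshold = 4;

// Hypothetical helper (illustration only): returns false when the
// unaligned-case gate decides the MMA intrinsic should not be used, mirroring
// the check added to canTargetIntrinsic. A problem where one of M/N is very
// skinny (<= 4) while the other side is wider than the preferred subgroup
// size is better served by the vector reduction pipeline.
bool canUseIntrinsicForUnalignedMatmul(std::int64_t mSize, std::int64_t nSize,
                                       std::int64_t preferredSubgroupSize) {
  if ((mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
      (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize)) {
    return false;
  }
  return true;
}

int main() {
  // M=2, N=577 (the shape used in the updated batched matmul test) with a
  // 64-wide subgroup: very skinny, so the intrinsic is rejected -> prints 0.
  std::cout << canUseIntrinsicForUnalignedMatmul(2, 577, 64) << "\n";
  // M=N=128: both dimensions are wide enough, the intrinsic stays eligible
  // -> prints 1.
  std::cout << canUseIntrinsicForUnalignedMatmul(128, 128, 64) << "\n";
  return 0;
}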