Merge OpenAI Triton commit f47cc3e (#3319)
This PR changes the Triton base from
9a49104 to
f47cc3e (Jan 29).
Pass rate: 98.19%

Please do not squash and merge this PR.
anmyachev authored Jan 31, 2025
2 parents b0ddc4b + 6ce6d5b commit ccf97fd
Showing 28 changed files with 518 additions and 297 deletions.
10 changes: 8 additions & 2 deletions README.md
@@ -232,8 +232,14 @@ For detailed instructions on how to debug Triton's frontend, please refer to this
- `MLIR_ENABLE_TIMING` dumps the timing information for each MLIR pass.
- `LLVM_ENABLE_TIMING` dumps the timing information for each LLVM pass.
- `TRITON_DEFAULT_FP_FUSION` overrides the default behavior of allowing fp fusion (mul+add->fma).
- `MLIR_ENABLE_DIAGNOSTICS` enables dumping the stack trace and the related IR operation of diagnostics (e.g., errors and warnings).
- `MLIR_ENABLE_REMARK` enables the performance warnings that are emitted as remarks.
- `MLIR_ENABLE_DIAGNOSTICS=<comma-separated>` controls diagnostic emission in MLIR.
Options are: `warnings`, `remarks`, `stacktraces`, `operations`.
Use comma-separated values to customize output. For example,
`MLIR_ENABLE_DIAGNOSTICS=remarks,operations` enables remarks and IR operations,
while `MLIR_ENABLE_DIAGNOSTICS=warnings,stacktraces` enables warnings with
stacktraces. By default, only errors are shown. Setting `warnings` includes
errors and warnings; `remarks` includes errors, warnings, and remarks.
- `MLIR_ENABLE_REMARK` is deprecated. Please use `MLIR_ENABLE_DIAGNOSTICS=remarks`.
- `TRITON_KERNEL_DUMP` enables the dumping of the IR from each compilation stage and the final ptx/amdgcn.
- `TRITON_DUMP_DIR` specifies the directory to save the dumped IR and ptx/amdgcn when `TRITON_KERNEL_DUMP` is set to 1.
- `TRITON_KERNEL_OVERRIDE` enables the override of the compiled kernel with a user-specified IR/ptx/amdgcn at the beginning of each compilation stage.
35 changes: 35 additions & 0 deletions include/triton/Conversion/TritonGPUToLLVM/FMADotUtility.h
@@ -0,0 +1,35 @@
#ifndef TRITON_CONVERSION_FMA_DOT_UTILITY_H
#define TRITON_CONVERSION_FMA_DOT_UTILITY_H

#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

namespace mlir::triton::gpu {

/// Abstract interface for scalar multiplication of Value vectors.
///
/// Enables generation of hardware-specific code in different backends.
class FMAVectorMultiplier {
public:
/// \returns scalar product of two arrays, plus c: a·b + c
virtual Value multiplyVectors(ArrayRef<Value> a, ArrayRef<Value> b,
Value c) = 0;

virtual ~FMAVectorMultiplier() = default;
};

/// Implements a framework for FMA dot conversion to LLVM.
///
/// This function implements the architecture-independent part of the FMA dot
/// conversion and calls a "multiplier" object, which is defined by the caller
/// and implements the architecture-dependent part of the conversion.
LogicalResult parametricConvertFMADot(DotOp op, DotOp::Adaptor adaptor,
const LLVMTypeConverter *typeConverter,
ConversionPatternRewriter &rewriter,
FMAVectorMultiplier &multiplier);

} // namespace mlir::triton::gpu

#endif // TRITON_CONVERSION_FMA_DOT_UTILITY_H
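
To make the contract concrete, here is a hedged sketch of a backend-specific implementation; `MulAddVectorMultiplier` is hypothetical and not part of this PR. It emits unfused multiply/add pairs instead of `llvm.fmuladd`, while all tiling and iteration logic stays in `parametricConvertFMADot`:

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "triton/Conversion/TritonGPUToLLVM/FMADotUtility.h"
#include "llvm/ADT/STLExtras.h"

#include <cassert>

namespace {
using namespace mlir;

// Hypothetical backend multiplier: emits a·b + c as unfused fmul/fadd
// pairs instead of llvm.fmuladd.
class MulAddVectorMultiplier : public triton::gpu::FMAVectorMultiplier {
  OpBuilder &builder;
  Location loc;

public:
  MulAddVectorMultiplier(OpBuilder &builder, Location loc)
      : builder(builder), loc(loc) {}

  Value multiplyVectors(ArrayRef<Value> a, ArrayRef<Value> b,
                        Value c) override {
    assert(a.size() == b.size());
    Value accum = c;
    for (auto [aElem, bElem] : llvm::zip(a, b)) {
      Value prod = builder.create<LLVM::FMulOp>(loc, aElem, bElem);
      accum = builder.create<LLVM::FAddOp>(loc, accum, prod);
    }
    return accum;
  }
};
} // namespace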
3 changes: 2 additions & 1 deletion include/triton/Dialect/Triton/IR/Traits.h
@@ -3,6 +3,7 @@

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Support/LogicalResult.h"
#include "triton/Dialect/Triton/IR/Types.h"

@@ -27,7 +28,7 @@ LogicalResult verifyTensorLayouts(Operation *op);

LogicalResult verifySameOperandsEncoding(Operation *op,
bool allowTensorPointerType = false);

LogicalResult verifyEquivalentType(Type typeA, Type typeB);
LogicalResult
verifySameOperandsAndResultEncoding(Operation *op,
bool allowTensorPointerType = false);
14 changes: 14 additions & 0 deletions include/triton/Dialect/Triton/IR/TritonInterfaces.td
@@ -2,6 +2,7 @@
#define TRITON_INTERFACES

include "mlir/IR/OpBase.td"
include "mlir/Interfaces/InferTypeOpInterface.td"

def TensorSizeTrait : NativeOpTrait<"TensorSizeTrait">;
def VerifyTensorLayoutsTrait : NativeOpTrait<"VerifyTensorLayoutsTrait">;
@@ -13,4 +14,17 @@ def SameLoadStoreOperandsAndResultShape : NativeOpTrait<"SameLoadStoreOperandsAndResultShape">;
def SameLoadStoreOperandsEncoding : NativeOpTrait<"SameLoadStoreOperandsEncoding">;
def SameLoadStoreOperandsAndResultEncoding : NativeOpTrait<"SameLoadStoreOperandsAndResultEncoding">;

// A trait equivalent to InferTypeOpAdaptor, but that checks for structural
// equivalence of the layouts of the result rather than just layout equality.
def InferTypeOpWithLayoutEquivalence : InferTypeOpAdaptorBase<[{
static bool isCompatibleReturnTypes(TypeRange lhs, TypeRange rhs) {
if (lhs.size() != rhs.size())
return false;
return llvm::all_of(llvm::zip(lhs, rhs), [](auto tup) {
auto [lhs, rhs] = tup;
return succeeded(OpTrait::impl::verifyEquivalentType(lhs, rhs));
});
}
}]>;

#endif // TRITON_INTERFACES
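
In other words, the hook accepts result types whose encoding attributes differ but describe the same layout. A rough sketch of the idea, assuming a helper in the spirit of Triton's toLinearLayout (the signature below is simplified, not the exact API):

// Illustrative only: encodings are structurally equivalent when they induce
// the same linear layout for the same shape, e.g. a blocked encoding and an
// equivalent linear encoding. toLinearLayout here is a simplified stand-in
// for Triton's real API.
bool structurallyEquivalent(RankedTensorType a, RankedTensorType b) {
  if (a.getShape() != b.getShape() || a.getElementType() != b.getElementType())
    return false;
  return toLinearLayout(a.getShape(), a.getEncoding()) ==
         toLinearLayout(b.getShape(), b.getEncoding());
}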
2 changes: 1 addition & 1 deletion include/triton/Dialect/Triton/IR/TritonOps.td
@@ -539,7 +539,7 @@ def TT_SplitOp : TT_Op<"split", [

def TT_TransOp : TT_Op<"trans", [Pure,
TransposeOpInterface,
InferTypeOpAdaptorWithIsCompatible,
InferTypeOpWithLayoutEquivalence,
SameOperandsAndResultElementType]> {

let summary = "rearrange the dimensions of a tensor";
6 changes: 6 additions & 0 deletions lib/Analysis/Allocation.cpp
@@ -123,6 +123,12 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,

std::tie(scratchConfig.inVec, scratchConfig.outVec) =
getScratchCvtInOutVecLengths(srcTy, dstTy);
// We can't write a longer vector than the shape of shared memory.
// This shape might be smaller than the tensor shape in case we decided to
// do the conversion in multiple iterations.
unsigned contiguousShapeDim = scratchConfig.repShape[scratchConfig.order[0]];
scratchConfig.inVec = std::min(scratchConfig.inVec, contiguousShapeDim);
scratchConfig.outVec = std::min(scratchConfig.outVec, contiguousShapeDim);

// No padding is required if the tensor is 1-D, or if all dimensions except
// the first accessed dimension have a size of 1.
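
For intuition, a self-contained sketch of the clamp added above; the numbers are illustrative, not from this PR:

#include <algorithm>
#include <array>
#include <cassert>

int main() {
  // Shared-memory repetition shape and dimension order, fastest-varying
  // dimension first; values are made up for illustration.
  std::array<unsigned, 2> repShape = {128, 8};
  std::array<unsigned, 2> order = {1, 0};
  unsigned inVec = 16, outVec = 16;

  // The contiguous shared-memory dimension holds repShape[order[0]] = 8
  // elements, so the 16-wide vectors must be clamped to 8.
  unsigned contiguousShapeDim = repShape[order[0]];
  inVec = std::min(inVec, contiguousShapeDim);
  outVec = std::min(outVec, contiguousShapeDim);

  assert(inVec == 8 && outVec == 8);
  return 0;
}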
1 change: 1 addition & 0 deletions lib/Conversion/TritonGPUToLLVM/CMakeLists.txt
@@ -1,6 +1,7 @@
add_triton_library(TritonGPUToLLVM
ConvertLayoutOpToLLVM/SharedToDotOperandFMA.cpp
DotOpToLLVM/FMA.cpp
DotOpToLLVM/FMADotUtility.cpp
AllocateSharedMemory.cpp
AssertOpToLLVM.cpp
ControlFlowOpToLLVM.cpp
152 changes: 23 additions & 129 deletions lib/Conversion/TritonGPUToLLVM/DotOpToLLVM/FMA.cpp
@@ -1,144 +1,38 @@
#include "mlir/Support/LLVM.h"
#include "triton/Conversion/TritonGPUToLLVM/FMADotUtility.h"
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"

using namespace mlir;
using namespace mlir::triton;
using namespace ::mlir::triton::gpu;

using ::mlir::LLVM::linearize;
using ::mlir::triton::gpu::expandMatrixOrderWithBatch;
using ::mlir::triton::gpu::expandMatrixShapeWithBatch;
using ::mlir::triton::gpu::getShapePerCTA;
using ::mlir::triton::gpu::getSizePerThread;

/// \brief spatial position of repetition and register of a given value
struct OperandValueKey {
unsigned bRepIdx, nonKRepIdx;
unsigned bIdx, nonKIdx, kIdx;

bool operator==(const OperandValueKey &other) const {
return (bRepIdx == other.bRepIdx && nonKRepIdx == other.nonKRepIdx &&
bIdx == other.bIdx && nonKIdx == other.nonKIdx &&
kIdx == other.kIdx);
}
};

template <> struct std::hash<OperandValueKey> {
std::size_t operator()(const OperandValueKey &k) const {
return llvm::hash_combine(k.bRepIdx, k.nonKRepIdx, k.bIdx, k.nonKIdx,
k.kIdx);
namespace {
class GenericFMAVectorMultiplier : public FMAVectorMultiplier {
OpBuilder &builder;
Location loc;

public:
GenericFMAVectorMultiplier(OpBuilder &builder, Location loc)
: builder(builder), loc(loc) {}

Value multiplyVectors(ArrayRef<Value> a, ArrayRef<Value> b,
Value c) override {
auto K = a.size();
assert(b.size() == K);
Value accum = c;
for (auto [aElem, bElem] : llvm::zip(a, b))
accum = builder.create<LLVM::FMulAddOp>(loc, aElem, bElem, accum);
return accum;
}
};

using ValueTableFMA = std::unordered_map<OperandValueKey, Value>;

static ValueTableFMA getValueTableFromStructFMA(
Value val, ArrayRef<unsigned> perRepShape, ArrayRef<unsigned> repetitions,
unsigned kDim, unsigned nonKDim, ConversionPatternRewriter &rewriter,
Location loc, ArrayRef<unsigned> inRepOrder, ArrayRef<unsigned> repOrder) {
ValueTableFMA res;
auto elems = unpackLLElements(loc, val, rewriter);
assert(perRepShape.size() == 3);
auto numElemsRep = product(perRepShape);
assert(elems.size() == numElemsRep * product(repetitions));
assert(kDim == 1 || kDim == 2);
assert(nonKDim == 1 || nonKDim == 2);
const unsigned bDim = 0;
} // namespace

for (unsigned idx = 0; idx < elems.size(); ++idx) {
auto inRepLinearIdx = idx % numElemsRep;
auto repLinearIdx = idx / numElemsRep;
auto inRepSpatialIdx =
mlir::LLVM::delinearize(inRepLinearIdx, perRepShape, inRepOrder);
auto repSpatialIdx =
mlir::LLVM::delinearize(repLinearIdx, repetitions, repOrder);
OperandValueKey key{repSpatialIdx[0], repSpatialIdx[nonKDim],
inRepSpatialIdx[0], inRepSpatialIdx[nonKDim],
inRepSpatialIdx[kDim]};
res[key] = elems[idx];
}
return res;
}

LogicalResult convertFMADot(triton::DotOp op, triton::DotOp::Adaptor adaptor,
LogicalResult convertFMADot(DotOp op, DotOp::Adaptor adaptor,
const LLVMTypeConverter *typeConverter,
ConversionPatternRewriter &rewriter) {
auto *ctx = rewriter.getContext();
auto loc = op.getLoc();

auto A = op.getA();
auto D = op.getResult();

auto aTensorTy = cast<RankedTensorType>(A.getType());
auto dTensorTy = cast<RankedTensorType>(D.getType());

SmallVector<int64_t> aShapePerCTA =
expandMatrixShapeWithBatch(ArrayRef(getShapePerCTA(aTensorTy)));
auto dShapePerCTA =
expandMatrixShapeWithBatch(ArrayRef(getShapePerCTA(dTensorTy)));

BlockedEncodingAttr dLayout =
cast<BlockedEncodingAttr>(dTensorTy.getEncoding());
// TODO process A and B operand separately
auto inRepOrder = expandMatrixOrderWithBatch(dLayout.getOrder());
auto repOrder = expandMatrixOrderWithBatch(dLayout.getRepOrder());
auto cc = unpackLLElements(loc, adaptor.getC(), rewriter);

Value llA = adaptor.getA();
Value llB = adaptor.getB();

auto sizePerThread =
expandMatrixShapeWithBatch(ArrayRef(getSizePerThread(dLayout)));
auto numElemsPerThread = product(sizePerThread);
auto shapePerCTATile =
expandMatrixShapeWithBatch(ArrayRef(getShapePerCTATile(dLayout)));

unsigned K = aShapePerCTA[2];

unsigned threadTileShape[3];
unsigned repetitions[3];
for (int i = 0; i < 3; ++i) {
repetitions[i] =
ceil(dShapePerCTA[i], static_cast<int64_t>(shapePerCTATile[i]));
}

auto has = getValueTableFromStructFMA(
llA, {sizePerThread[0], sizePerThread[1], K},
{repetitions[0], repetitions[1], 1},
/*kDim*/ 2, /*nonKDim*/ 1, rewriter, loc, inRepOrder, repOrder);
auto hbs = getValueTableFromStructFMA(
llB, {sizePerThread[0], K, sizePerThread[2]},
{repetitions[0], 1, repetitions[2]},
/*kDim*/ 1, /*nonKDim*/ 2, rewriter, loc, inRepOrder, repOrder);

SmallVector<Value> acc = cc;

for (unsigned bRep = 0; bRep < repetitions[0]; ++bRep)
for (unsigned mRep = 0; mRep < repetitions[1]; ++mRep)
for (unsigned nRep = 0; nRep < repetitions[2]; ++nRep)
for (unsigned b = 0; b < sizePerThread[0]; ++b)
for (unsigned m = 0; m < sizePerThread[1]; ++m)
for (unsigned n = 0; n < sizePerThread[2]; ++n) {
SmallVector<unsigned> multiDimAccumIdx = {b, m, n};
unsigned linearInRepIdx =
linearize(multiDimAccumIdx, sizePerThread, inRepOrder);
SmallVector<unsigned> multiDimRepIdx = {bRep, mRep, nRep};
unsigned linearRepIdx =
linearize(multiDimRepIdx, repetitions, repOrder);
unsigned linearAccumIdx =
linearInRepIdx + linearRepIdx * numElemsPerThread;
for (unsigned k = 0; k < K; ++k) {
auto aOp = has[{bRep, mRep, b, m, k}];
auto bOp = hbs[{bRep, nRep, b, n, k}];
acc[linearAccumIdx] = rewriter.create<LLVM::FMulAddOp>(
loc, aOp, bOp, acc[linearAccumIdx]);
}
}

auto res = packLLElements(loc, typeConverter, acc, rewriter, dTensorTy);
rewriter.replaceOp(op, res);

return success();
GenericFMAVectorMultiplier multiplier(rewriter, loc);
return parametricConvertFMADot(op, adaptor, typeConverter, rewriter,
multiplier);
}
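
With this refactor, a backend-specific entry point reduces to constructing its multiplier and delegating. A hedged sketch reusing the hypothetical MulAddVectorMultiplier from the earlier example (convertMyBackendFMADot is likewise illustrative):

using namespace mlir;

LogicalResult convertMyBackendFMADot(triton::DotOp op,
                                     triton::DotOp::Adaptor adaptor,
                                     const LLVMTypeConverter *typeConverter,
                                     ConversionPatternRewriter &rewriter) {
  // The backend supplies only the scalar a*b + c step; the iteration over
  // repetitions and per-thread elements lives in parametricConvertFMADot.
  MulAddVectorMultiplier multiplier(rewriter, op.getLoc());
  return triton::gpu::parametricConvertFMADot(op, adaptor, typeConverter,
                                              rewriter, multiplier);
}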
