[PROTON-DEV] Restructure files #5846

Closed
wants to merge 5 commits
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yml
@@ -288,7 +288,7 @@ jobs:
runner: ${{fromJson(needs.Runner-Preparation.outputs.matrix-HIP)}}
name: Integration-Tests (${{matrix.runner[1] == 'gfx90a' && 'mi210' || 'mi300x'}})
container:
image: rocmshared/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
steps:
- name: Checkout
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yml.in
@@ -334,7 +334,7 @@ jobs:
name: Integration-Tests (${{matrix.runner[1] == 'gfx90a' && 'mi210' || 'mi300x'}})

container:
image: rocmshared/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root

steps:
2 changes: 1 addition & 1 deletion .github/workflows/wheels_v2.yml
@@ -22,7 +22,7 @@ jobs:
docker container prune -f

- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4

# The LATEST_DATE here should be kept in sync with the one in Patch setup.py
- id: check-version
18 changes: 18 additions & 0 deletions README.md
@@ -284,3 +284,21 @@ Supported Hardware:
- NVIDIA GPUs (Compute Capability 8.0+)
- AMD GPUs (ROCm 6.2+)
- Under development: CPUs

# Development Container (Dev Container)

**Dev Containers** for the Triton project are available from
the [triton-dev-containers repository](https://github.com/redhat-et/triton-dev-containers)

### Key Benefits:
- **Consistency**: All developers can work with the same development
environment, ensuring uniform behavior across different systems.
- **Isolation**: The container prevents potential conflicts with software
installed on your local machine.
- **Portability**: Easily share the development environment with team members,
minimizing onboarding time and setup issues.

### How to Use the Dev Container:

For detailed instructions on how to use the dev containers, please see
the [dev container user guide](https://github.com/redhat-et/triton-dev-containers/blob/main/.devcontainer/devcontainer.md).
2 changes: 1 addition & 1 deletion cmake/llvm-hash.txt
@@ -1 +1 @@
c118864223c6309378cd704f3406533474c2759f
ffe3129e9bdc146ee4d91e849173d1c64b1ae974
@@ -28,20 +28,6 @@ constexpr int patternBenefitClampOptimizedPattern = 20;
constexpr int patternBenefitConvertLayoutOptimizedPattern = 20;
constexpr int patternBenefitNvidiaTensorCoreSubviewPattern = 20;

struct BackendCallbacks {
/**
* A backend-specific callback for appending auxiliary data during
* `LocalStoreOp` conversion.
*
* @param[in] op The reference to the re-written `LocalStoreOp`.
* @param[in] count The number of issued LLVM instructions.
* @param[in] type The input type of issued LLVM instructions.
*/
std::function<void(triton::gpu::LocalStoreOp op, size_t llvmOpCount,
Type llvmOpType)>
localStoreOpConversion = nullptr;
};

void populateElementwiseOpToLLVMPatterns(
LLVMTypeConverter &typeConverter, RewritePatternSet &patterns,
ModuleAxisInfoAnalysis &axisInfoAnalysis, const TargetInfoBase &targetInfo,
@@ -51,10 +37,10 @@ void populateElementwiseOpToLLVMPatterns(
// callback receives 1) the current source op, 2) the number of issued LLVM
// instructions and 3) their input types. Each MLIR backend can provide a
// callback and, thus, handle backend-specific behaviors.
void populateMemoryOpToLLVMPatterns(
LLVMTypeConverter &typeConverter, const TargetInfoBase &targetInfo,
RewritePatternSet &patterns, PatternBenefit benefit,
std::optional<BackendCallbacks> backendCallbacks = std::nullopt);
void populateMemoryOpToLLVMPatterns(LLVMTypeConverter &typeConverter,
const TargetInfoBase &targetInfo,
RewritePatternSet &patterns,
PatternBenefit benefit);

void populateAssertOpToLLVMPattern(LLVMTypeConverter &typeConverter,
RewritePatternSet &patterns,
5 changes: 5 additions & 0 deletions include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h
@@ -91,6 +91,11 @@ class TargetInfoBase {

virtual bool supportVectorizedAtomics() const = 0;

// Helper used by targets to annotate store operations during lowering to
// llvm.
virtual void storeOpAnnotation(triton::gpu::LocalStoreOp op,
size_t localStoreOpCount, Type type) const {}

virtual ~TargetInfoBase() {}
};
} // namespace mlir::triton
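
The `storeOpAnnotation` hook above takes over the role of the removed `BackendCallbacks::localStoreOpConversion` callback: instead of threading an optional callback through `populateMemoryOpToLLVMPatterns`, a backend overrides the virtual method on its `TargetInfoBase` subclass. A minimal sketch of how a target might use it, assuming a hypothetical `MyTargetInfo` class and attribute name (neither is part of this PR):

```cpp
#include "mlir/IR/Builders.h"
#include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h"

namespace mlir::triton {

// Hypothetical backend TargetInfo; only the storeOpAnnotation override is
// shown, and the remaining pure-virtual methods are omitted for brevity.
class MyTargetInfo : public TargetInfoBase {
public:
  void storeOpAnnotation(triton::gpu::LocalStoreOp op,
                         size_t localStoreOpCount, Type type) const override {
    // Record how many LLVM instructions the LocalStoreOp lowering emitted,
    // e.g. as an attribute that later backend-specific passes can consume.
    Builder b(op->getContext());
    op->setAttr("my_target.llvm_op_count",
                b.getI64IntegerAttr(localStoreOpCount));
  }
};

} // namespace mlir::triton
```

Because the base declaration has an empty default body, targets that do not need per-store bookkeeping are left unchanged.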
2 changes: 2 additions & 0 deletions include/triton/Dialect/Triton/IR/OpInterfaces.h
@@ -11,6 +11,8 @@ namespace impl {

LogicalResult verifyTransposeOpInterface(Operation *op);

LogicalResult verifyDotOpInterface(Operation *op);

} // namespace impl

} // namespace triton
47 changes: 0 additions & 47 deletions include/triton/Dialect/Triton/IR/Traits.h
@@ -58,53 +58,6 @@ class VerifyTensorLayoutsTrait
}
};

// Verify if the op is a dot-like operation.
// A dot-like operation should have three operands.
// The first two operands should share a common dimension, and the result
// should have the dimensions of the two operands that are not shared.
// A dot-like operation can be either 2d or 3d.
// In the 3d case, the first dimension of operands is the batch dimension.
template <class ConcreteType>
class DotLike : public TraitBase<ConcreteType, DotLike> {
public:
static LogicalResult verifyTrait(Operation *op) {
if (op->getNumOperands() < 3)
return op->emitOpError("expected at least 3 operands");
auto aTy = cast<ShapedType>(op->getOperand(0).getType());
auto bTy = cast<ShapedType>(op->getOperand(1).getType());
auto cTy = cast<ShapedType>(op->getOperand(2).getType());
auto aShape = aTy.getShape();
auto bShape = bTy.getShape();
auto cShape = cTy.getShape();
// Check if all 3d or all 2d
if (aShape.size() != 2 && aShape.size() != 3)
return op->emitOpError("expected operands to be 2d or 3d");
if (aShape.size() != bShape.size() || aShape.size() != cShape.size())
return op->emitOpError("expected all operands to have the same rank");
// Check if the first two operands share a common dimension
// TODO: enable back with an interface to support scaled dot.
// if (aShape[aShape.size() - 1] != bShape[aShape.size() - 2])
// return op->emitOpError("expected the last dimension of the first
// operand "
// "to be equal to the second-to-last dimension of
// " "the second operand");
// Check the batch dimension
if (aShape.size() == 3 &&
(aShape[0] != cShape[0] || bShape[0] != cShape[0]))
return op->emitOpError("expected the first dimension of the first "
"operand to be equal to the first dimension of "
"the result");
// Check the output shape
if (cShape[cShape.size() - 2] != aShape[aShape.size() - 2] ||
cShape[cShape.size() - 1] != bShape[aShape.size() - 1])
return op->emitOpError(
"expected the output shape to be the concatenation of the last "
"dimension of the first operand and the last dimension of the "
"second ");
return success();
}
};

template <typename ConcreteType>
class SameOperandsAndResultEncoding
: public TraitBase<ConcreteType, SameOperandsAndResultEncoding> {
1 change: 0 additions & 1 deletion include/triton/Dialect/Triton/IR/TritonInterfaces.td
@@ -6,7 +6,6 @@ include "mlir/Interfaces/InferTypeOpInterface.td"

def TensorSizeTrait : NativeOpTrait<"TensorSizeTrait">;
def VerifyTensorLayoutsTrait : NativeOpTrait<"VerifyTensorLayoutsTrait">;
def DotLike : NativeOpTrait<"DotLike">;
def SameOperandsEncoding : NativeOpTrait<"SameOperandsEncoding">;
def SameOperandsAndResultEncoding : NativeOpTrait<"SameOperandsAndResultEncoding">;
def SameLoadStoreOperandsShape : NativeOpTrait<"SameLoadStoreOperandsShape">;
22 changes: 21 additions & 1 deletion include/triton/Dialect/Triton/IR/TritonOpInterfaces.td
@@ -29,7 +29,27 @@ def TransposeOpInterface : OpInterface<"TransposeOpInterface"> {
/*args=*/(ins)>
];

let verify = [{ return ::mlir::triton::impl::verifyTransposeOpInterface($_op); }];
let verify = [{ return ::mlir::triton::impl::verifyTransposeOpInterface($_op); }];
}

def DotOpInterface : OpInterface<"DotOpInterface"> {
let description = [{
This interface is implemented by operations that perform a dot product.
}];

let cppNamespace = "::mlir::triton";

let methods = [
InterfaceMethod<
/*desc=*/[{
Verifies the dimensions of the A and B DotOp operands.
}],
/*retType=*/"bool",
/*methodName=*/"verifyDims",
/*args=*/(ins)>
];

let verify = [{ return ::mlir::triton::impl::verifyDotOpInterface($_op); }];
}
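
For reference, a rough sketch of what a `verifyDims` implementation could look like on an op that adopts the interface via `DeclareOpInterfaceMethods<DotOpInterface>`; the body below is an illustrative assumption, not the implementation from this PR:

```cpp
// Hypothetical verifyDims for a plain A(M x K) * B(K x N) dot op.
bool DotOp::verifyDims() {
  auto aShape = cast<ShapedType>(getA().getType()).getShape();
  auto bShape = cast<ShapedType>(getB().getType()).getShape();
  // The contraction (K) dimension of A must match that of B; for 3-D
  // (batched) operands these are still the last dim of A and the
  // second-to-last dim of B.
  return aShape[aShape.size() - 1] == bShape[bShape.size() - 2];
}
```

The `verify` hook routes through `impl::verifyDotOpInterface`, so the shared structural checks formerly handled by the `DotLike` trait can live there, while each op answers only the operand-shape question via `verifyDims`.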


4 changes: 2 additions & 2 deletions include/triton/Dialect/Triton/IR/TritonOps.td
@@ -631,7 +631,7 @@ def TT_GetNumProgramsOp : TT_Op<"get_num_programs", [Pure]> {
//
def TT_DotOp : TT_Op<"dot", [Pure,
DeclareOpInterfaceMethods<InferTypeOpInterface>,
DotLike,
DeclareOpInterfaceMethods<DotOpInterface>,
TypesMatchWith<"result's type matches accumulator's type",
"d", "c", "$_self">]> {
let summary = "dot";
@@ -671,7 +671,7 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
//
def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
AttrSizedOperandSegments,
DotLike,
DeclareOpInterfaceMethods<DotOpInterface>,
TypesMatchWith<"result's type matches accumulator's type",
"d", "c", "$_self">]> {
let summary = "dot_scaled";
6 changes: 4 additions & 2 deletions include/triton/Dialect/TritonGPU/IR/Dialect.h
@@ -70,10 +70,12 @@ struct SharedMemory : public SideEffects::Resource::Base<SharedMemory> {
StringRef getName() final { return "<SharedMemory>"; }
};

// Convert a distributed layout to a linear encoding
LinearEncodingAttr toLinearEncoding(Attribute layout, ArrayRef<int64_t> shape);

unsigned getTotalElemsPerThread(Type type);

unsigned getTotalElemsPerThread(Attribute layout, ArrayRef<int64_t> shape,
Type eltTy);
unsigned getTotalElemsPerThread(Attribute layout, ArrayRef<int64_t> shape);

SmallVector<unsigned> getElemsPerThread(Type type);

21 changes: 13 additions & 8 deletions include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -501,13 +501,17 @@ We call each individual tile "rep".
InterfaceMethod<"Return total element size per thread.",
"unsigned",
"getTotalElemsPerThread",
(ins "ArrayRef<int64_t>":$tensorShape,
"Type":$eltTy)>,
(ins "ArrayRef<int64_t>":$shape),
/*defaultImplementation=*/[{
return toLinearEncoding($_self, shape).getTotalElemsPerThread(shape);
}]>,
InterfaceMethod<"Return element size per thread in each dimension.",
"SmallVector<unsigned>",
"getElemsPerThread",
(ins "ArrayRef<int64_t>":$tensorShape,
"Type":$eltTy)>,
(ins "ArrayRef<int64_t>":$shape),
/*defaultImplementation=*/[{
return toLinearEncoding($_self, shape).getElemsPerThread(shape);
}]>,
// Interface for the meta information about the multiple thread hierarchy.
InterfaceMethod<"Get the shape of the warps per CTA.",
"SmallVector<unsigned>",
@@ -577,8 +581,7 @@ L(T) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
}];

code extraDistributedDeclaration = extraBaseClassDeclaration # [{
unsigned getTotalElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const;
SmallVector<unsigned> getElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const;
// Implemented in subclasses
SmallVector<unsigned> getRepOrder() const;
SmallVector<unsigned> getCTAsPerCGA() const;
SmallVector<unsigned> getCTAOrder() const;
@@ -613,6 +616,10 @@ def LinearEncodingAttr : DistributedEncoding<"LinearEncoding", "linear_encoding"
let parameters = (ins LinearLayoutParam:$linearLayout);

let extraClassDeclaration = extraDistributedDeclaration # [{
// Generic distributed encoding methods
unsigned getTotalElemsPerThread(ArrayRef<int64_t> shape) const;
SmallVector<unsigned> getElemsPerThread(ArrayRef<int64_t> shape) const;

SmallVector<unsigned> getContigPerThread() const;
SmallVector<unsigned> getOrder() const;

@@ -965,7 +972,6 @@ V [ 0,4,8...60 1,5...61 2,6...62 3,7...63 ] [ 128,132...188 129,
return true;
}
SmallVector<unsigned> getSizePerThreadForOperand(int kWidth, int opIdx) const;
unsigned getTotalElemsPerThreadForOperand(ArrayRef<int64_t> shape, Type eltTy, int kWidth, int opIdx) const;
SmallVector<int64_t> getInstrShapeForOperand(int kWidth, int opIdx) const;
SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape, int kWidth, int opIdx) const;
SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
@@ -1095,7 +1101,6 @@ Row |
return true;
}
SmallVector<unsigned> getSizePerThreadForOperand(int kWidth, int opIdx) const;
unsigned getTotalElemsPerThreadForOperand(ArrayRef<int64_t> shape, Type eltTy, int kWidth, int opIdx) const;
SmallVector<int64_t> getElemsPerInstrForOperands() const;
SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape,
Type elemType, int kWidth, int opIdx) const;
@@ -10,6 +10,8 @@ namespace mlir {
namespace triton {

static const char *kNumStagesAttrName = "tt.num_stages";
static const char *kDisallowAccMultiBufferAttrName =
"tt.disallow_acc_multi_buffer";
static const char *kLoopStageAttrName = "loop.stage";
static const char *kLoopClusterAttrName = "loop.cluster";

@@ -37,6 +39,10 @@ void addOps(scf::ForOp forOp, int stage,
void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
Value val);

// Return true if the given ForOp has the attribute
// `tt.disallow_acc_multi_buffer` set to true.
bool getDisallowAccMultiBuffer(scf::ForOp forOp);

// Return the minClusterId and maxClusterId for the given ForOp.
std::pair<int, int> getMinMaxCluster(scf::ForOp &forOp);
std::pair<int, int> getStageCluster(Operation *op);
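
One way the new `getDisallowAccMultiBuffer` helper might be implemented, assuming the attribute is stored as a `BoolAttr` on the `scf.for` op (a sketch only; the PR's actual definition lives in the corresponding .cpp file, which is not shown here):

```cpp
#include "mlir/Dialect/SCF/IR/SCF.h"

namespace mlir::triton {

// Returns true when the loop carries tt.disallow_acc_multi_buffer = true.
bool getDisallowAccMultiBuffer(scf::ForOp forOp) {
  auto attr = forOp->getAttrOfType<BoolAttr>(kDisallowAccMultiBufferAttrName);
  return attr && attr.getValue();
}

} // namespace mlir::triton
```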
8 changes: 0 additions & 8 deletions include/triton/Dialect/TritonGPU/Transforms/Utility.h
@@ -200,14 +200,6 @@ StringRef getAMDArch(Operation *module);
std::optional<mlir::triton::gpu::SwizzledSharedEncodingAttr>
getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible);

enum class MMALoadType {
SharedV3,
Registers, // may be v2 or v3
DoNotPipeline, // could be a valid shared/registers MMA operand, but skip
// pipelining
};
MMALoadType getMMALoadType(Operation *loadOp);

// Convert \param op operands and results to layout \param encoding.
void convertOpEncoding(Attribute encoding, Operation *op);

5 changes: 5 additions & 0 deletions include/triton/Dialect/TritonNvidiaGPU/IR/CMakeLists.txt
@@ -15,3 +15,8 @@ mlir_tablegen(TritonNvidiaGPUAttrDefs.cpp.inc -gen-attrdef-defs)
mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
add_public_tablegen_target(TritonNvidiaGPUAttrDefsIncGen)

set(LLVM_TARGET_DEFINITIONS TritonNvidiaGPUOpInterfaces.td)
mlir_tablegen(TritonNvidiaGPUOpInterfaces.h.inc -gen-op-interface-decls)
mlir_tablegen(TritonNvidiaGPUOpInterfaces.cpp.inc -gen-op-interface-defs)
add_public_tablegen_target(TritonNvidiaGPUOpInterfacesIncGen)
2 changes: 2 additions & 0 deletions include/triton/Dialect/TritonNvidiaGPU/IR/Dialect.h
@@ -37,6 +37,8 @@
#define GET_ATTRDEF_CLASSES
#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUAttrDefs.h.inc"

#include "triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.h.inc"

#define GET_OP_CLASSES
#include "triton/Dialect/TritonNvidiaGPU/IR/Ops.h.inc"

@@ -0,0 +1,42 @@
#ifndef TRITON_NVIDIAGPU_OP_INTERFACES
#define TRITON_NVIDIAGPU_OP_INTERFACES

include "mlir/IR/OpBase.td"

def MMAv5OpInterface : OpInterface<"MMAv5OpInterface"> {
let description = [{
This interface is implemented by MMAv5 dot and dot scaled ops.
}];

let cppNamespace = "::mlir::triton::nvidia_gpu";

// We can add more methods as needed.
let methods = [
InterfaceMethod<"Return the accumulator init flag.",
"::mlir::Value",
"useAccumulator">,
InterfaceMethod<"Set the accumulator init flag.",
"void",
"setUseAccumulator",
(ins "::mlir::Value":$flag)>,
InterfaceMethod<"Associate a new barrier to this MMAv5 op.",
"void",
"setBarrier",
(ins "::mlir::Value":$barrier)>,
InterfaceMethod<"Return the accumulator.",
"::mlir::Value",
"getAccumulator">,
InterfaceMethod<"Set the accumulator.",
"void",
"setAccumulator",
(ins "::mlir::Value":$accum)>,
InterfaceMethod<"Return the predicate of this op.",
"::mlir::Value",
"getPredicate">,
InterfaceMethod<"Set the predicate of this op.",
"void",
"setPredicate",
(ins "::mlir::Value":$pred)>,
];
}
#endif // TRITON_NVIDIAGPU_OP_INTERFACES
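
As a usage note, the value of the interface is that transforms can manipulate MMAv5 `dot` and `dot_scaled` ops uniformly instead of switching on concrete op types. A hedged sketch of such a call site (the helper name and surrounding code are assumptions, not part of this PR):

```cpp
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"

// Illustrative helper: disable accumulator initialization on any MMAv5 op.
static void disableAccUse(mlir::Operation *op, mlir::Value falseVal) {
  if (auto mma =
          llvm::dyn_cast<mlir::triton::nvidia_gpu::MMAv5OpInterface>(op)) {
    // The same call works for both the dot and dot_scaled MMAv5 ops.
    mma.setUseAccumulator(falseVal);
  }
}
```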