Merge OpenAI Triton commit 2778526 #3332

Merged · 6 commits · Feb 3, 2025
bin/triton-tensor-layout.cpp (4 changes: 2 additions & 2 deletions)

@@ -80,11 +80,11 @@ static cl::opt<std::string> TensorStr(
//===--------------------------------------------------------------------===//

LogicalResult layoutPrint(RankedTensorType tensorType, raw_ostream &os) {
- // DistributedEncodingTrait and SharedEncodingAttr implements the
+ // DistributedEncodingTrait and SharedEncodingTrait implements the
// toLinearLayout interface.
mlir::Attribute layout = tensorType.getEncoding();
if (isa<mlir::triton::gpu::DistributedEncodingTrait,
- mlir::triton::gpu::SharedEncodingAttr>(layout)) {
+ mlir::triton::gpu::SharedEncodingTrait>(layout)) {
os << triton::gpu::getLayoutStr(tensorType, UseHWPointOfView);
return success();
}
include/triton/Conversion/TritonGPUToLLVM/Utility.h (7 changes: 5 additions & 2 deletions)

@@ -1288,9 +1288,12 @@ inline Value packLLVector(Location loc, ValueRange vals,
inline bool
isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
ArrayRef<int64_t> allocShape,
- triton::gpu::SharedEncodingAttr sharedEnc) {
+ triton::gpu::SharedEncodingTrait sharedEnc) {
auto rank = shape.size();
- return /*no swizzling*/ sharedEnc.getMaxPhase() == 1 ||
+ auto swizzledLayout =
+     dyn_cast<triton::gpu::SwizzledSharedEncodingAttr>(sharedEnc);
+ bool noSwizzling = swizzledLayout && swizzledLayout.getMaxPhase() == 1;
+ return /*no swizzling*/ noSwizzling ||
/*swizzling but same shape*/ shape == allocShape ||
/*swizzling and rank-reduced and rank >= 2*/
(shape == allocShape.take_back(rank) && rank >= 2);
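Note that the rewritten predicate only treats an access as unswizzled when the encoding actually is a `SwizzledSharedEncodingAttr` with `maxPhase == 1`. A standalone re-statement of the three cases, testable outside MLIR (names and types here are illustrative, not part of the diff):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Mirrors the predicate above: `maxPhase` stands in for the value queried
// from a SwizzledSharedEncodingAttr (maxPhase == 1 means "no swizzling").
bool isSimpleAccess(const std::vector<int64_t> &shape,
                    const std::vector<int64_t> &allocShape,
                    unsigned maxPhase) {
  const size_t rank = shape.size();
  if (maxPhase == 1)
    return true; // no swizzling
  if (shape == allocShape)
    return true; // swizzling, but the view covers the whole allocation
  // swizzling with a rank-reduced view of the trailing dimensions
  return rank >= 2 && allocShape.size() >= rank &&
         std::equal(shape.begin(), shape.end(), allocShape.end() - rank);
}
```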
include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h (12 changes: 7 additions & 5 deletions)

@@ -9,7 +9,8 @@
#include "triton/Tools/LinearLayout.h"

namespace mlir::triton::gpu {
- class SharedEncodingAttr;
+ class SwizzledSharedEncodingAttr;
+ class NVMMASharedEncodingAttr;

// - BlockedEncodingAttrs have the following input dimensions.
//
@@ -18,7 +19,8 @@ class SharedEncodingAttr;
// "warp": warps in a block/CTA
// "block": blocks in a cluster
//
- // - An n-dimensional SharedEncodingAttr has the following input dimensions.
+ // - An n-dimensional SwizzledSharedEncodingAttr has the following input
+ // dimensions.
//
// "offset": the n'th element in the allocation, within a particular thread
// block (i.e. within a CTA). The offset is measured in elements, not
@@ -36,19 +38,19 @@
//
// elemBitWidth is the bit width of one element in the layout. This is required
// to compute the linear layout for MMAv3 (i.e. Hopper) shared layouts (i.e.
- // shared layouts with hasLeadingOffset == true) but is otherwise unused.
+ // shared layouts with nvmma_shared layout) but is otherwise unused.
//
// Returns std::nullopt if the given layout can't be converted to an LL.
LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
std::optional<int32_t> elemBitWidth = std::nullopt);

- // Convert the shared encoding of a tensor with `hasLeadingOffset=true` to a
+ // Convert the shared encoding of a tensor with `nvmma_shared` layout to a
// LinearLayout that maps from a linear shared memory offset to tensor index.
//
// If `disableSwizzle` is set, then the resulting layout does not include
// swizzling.
LinearLayout sharedToLinearLayoutLeadingOffset(ArrayRef<int64_t> shape,
- SharedEncodingAttr shared,
+ NVMMASharedEncodingAttr shared,
int32_t elemBitWidth,
bool disableSwizzle = false);

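For callers, the practical consequence of the split is that only the NVMMA flavor needs `elemBitWidth`. A minimal sketch of a call site under the new declarations (the wrapper function is hypothetical, not part of the diff):

```cpp
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

#include <optional>

using namespace mlir;
using namespace mlir::triton;

// Hypothetical helper: dispatch on the encoding kind declared above.
LinearLayout encodingToLinearLayout(RankedTensorType ty) {
  Attribute enc = ty.getEncoding();
  std::optional<int32_t> elemBitWidth;
  // Only MMAv3 (Hopper) shared layouts need the element bit width.
  if (isa<gpu::NVMMASharedEncodingAttr>(enc))
    elemBitWidth = ty.getElementTypeBitWidth();
  return gpu::toLinearLayout(ty.getShape(), enc, elemBitWidth);
}
```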
include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td (156 changes: 89 additions & 67 deletions)

@@ -12,17 +12,6 @@ def TritonGPU_AttrTrait : AttrInterface<"TritonGPU_AttrTrait"> {
let cppNamespace = "::mlir::triton::gpu";

let methods = [
InterfaceMethod<"Return total element size per thread.",
"unsigned",
"getTotalElemsPerThread",
(ins "ArrayRef<int64_t>":$tensorShape,
"Type":$eltTy)>,

InterfaceMethod<"Return element size per thread in each dimension.",
"SmallVector<unsigned>",
"getElemsPerThread",
(ins "ArrayRef<int64_t>":$tensorShape,
"Type":$eltTy)>,
];
}

@@ -54,8 +43,6 @@ Right now, Triton implements two main classes of layouts: shared, and distributed
let attrName = "triton.gpu." # attrMnemonic;

code extraBaseClassDeclaration = [{
- unsigned getTotalElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const;
- SmallVector<unsigned> getElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const;
}];
}

@@ -124,15 +111,6 @@ addition, if there's only one CTA per CGA, then Triton canonicalizes CTAOrder to
];

let extraClassDeclaration = [{
- SmallVector<unsigned> getElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const {
- llvm::report_fatal_error(
- "Unsupported getElemsPerThread in CTALayoutAttr.");
- }
- unsigned getTotalElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const {
- llvm::report_fatal_error(
- "Unsupported getTotalElemsPerThread in CTALayoutAttr.");
- }
-
static CTALayoutAttr getDefault(MLIRContext *context, int rank) {
SmallVector<unsigned> CTAsPerCGA(rank, 1);
SmallVector<unsigned> CTASplitNum(rank, 1);
@@ -146,12 +124,46 @@ addition, if there's only one CTA per CGA, then Triton canonicalizes CTAOrder to
let genVerifyDecl = 1;
let skipDefaultBuilders = 1;
}


+ def LayoutEncodingTrait : AttrInterface<"LayoutEncodingTrait"> {
+ let cppNamespace = "::mlir::triton::gpu";
+ let description = [{
+ Common trait for all TTGIR layouts.
+ }];
+ let methods = [
+ InterfaceMethod<"Get the shape of the CTAs per CGA.",
+ "SmallVector<unsigned>",
+ "getCTAsPerCGA">,
+ InterfaceMethod<"Get the order of the CTAs per CGA. The fastest-changing axis first",
+ "SmallVector<unsigned>",
+ "getCTAOrder">,
+ InterfaceMethod<"Each CTA processes 1/CTASplitNum of the tensor.",
+ "SmallVector<unsigned>",
+ "getCTASplitNum">,
+ ];
+ }

//===----------------------------------------------------------------------===//
// Shared Layout Encoding
//===----------------------------------------------------------------------===//

- def SharedEncodingAttr : TritonGPU_Attr<"SharedEncoding", "shared_encoding"> {
- let mnemonic = "shared";
+ def SharedEncodingTrait : AttrInterface<"SharedEncodingTrait"> {
+ let cppNamespace = "::mlir::triton::gpu";
+
+ let description = [{
+ Common trait describing shared memory.
+ }];
+ let methods = [
+ InterfaceMethod<"Return the default alignment for the layout.",
+ "int32_t",
+ "getAlignment">,
+ ];
+ }
+
+ def SwizzledSharedEncodingAttr :
+ TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> {
+ let mnemonic = "swizzled_shared";

let description = [{
An encoding for tensors whose elements may be simultaneously accessed by
@@ -226,13 +238,6 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
(r,c) has value

((c / 2) ^ r) * 2 + (c % 2).

- For MMAv3 eg Hopper GMMA, hasLeadingOffset should be true. In this case,
- when the matrix is stored in shared memory, there will be an offset not
- only in the stride dimension, but also in the leading dimension. For example,
- a matrix of size 16x128 and data type I8 is stored in the shared memory with
- 64B-swizzle mode. The offset of the element with index (0, 64) will be 16*64,
- compared to 1*64 when the hasLeadingOffset is false.
}];

// swizzle info: vec, perPhase, maxPhase
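A worked example of the vec=2 formula above (added for clarity, not part of the diff) for an 8-column tile: each row XORs the pair index, leaving the element's position within its pair alone.

```cpp
// ((c / 2) ^ r) * 2 + (c % 2), with vec = 2
// r = 0: columns 0..7 map to 0 1 2 3 4 5 6 7  (identity)
// r = 1: columns 0..7 map to 2 3 0 1 6 7 4 5  (pair index ^ 1)
// r = 2: columns 0..7 map to 4 5 6 7 0 1 2 3  (pair index ^ 2)
// r = 3: columns 0..7 map to 6 7 4 5 2 3 0 1  (pair index ^ 3)
int swizzledCol(int r, int c) { return ((c / 2) ^ r) * 2 + (c % 2); }
```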
@@ -243,20 +248,10 @@
"unsigned":$perPhase,
"unsigned":$maxPhase,
ArrayRefParameter<"unsigned">:$order,
"CTALayoutAttr":$CTALayout,
"bool":$hasLeadingOffset
"CTALayoutAttr":$CTALayout
);

let builders = [
AttrBuilder<(ins "unsigned":$vec,
"unsigned":$perPhase,
"unsigned":$maxPhase,
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout), [{
bool hasLeadingOffset = false; // default value
return $_get(context, vec, perPhase, maxPhase, order, CTALayout, hasLeadingOffset);
}]>,

AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
"ArrayRef<int64_t>":$shape,
"ArrayRef<unsigned>":$order,
Expand All @@ -267,7 +262,7 @@ compared to 1*64 when the hasLeadingOffset is false.
}]>,

// TODO(jlebar): This should not be an overload of
- // SharedEncodingAttr::get(). It's misleading, because it does a bunch of
+ // SwizzledSharedEncodingAttr::get(). It's misleading, because it does a bunch of
// nontrivial work based on the given dotOpEnc.
AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
"ArrayRef<int64_t>":$shape,
@@ -402,38 +397,66 @@
unsigned bitwidth = eltTy.getIntOrFloatBitWidth();
return get(context, dotOpEnc, shape, order, CTALayout, bitwidth, needTrans);
}]>,
];

let extraClassDeclaration = extraBaseClassDeclaration # [{
int32_t getAlignment() const;
SmallVector<unsigned> getCTAsPerCGA() const;
SmallVector<unsigned> getCTAOrder() const;
SmallVector<unsigned> getCTASplitNum() const;
}];
let hasCustomAssemblyFormat = 1;
}

+ def NVMMASharedEncodingAttr :
+ TritonGPU_Attr<"NVMMASharedEncoding", "nvmma_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> {
+ let mnemonic = "nvmma_shared";
+
+ let description = [{
+ Represent blocked shared memory matching MMAv3/MMAv5 shared memory input.
+ This is meant to represent 2d tiled blocked layout.
+ The full layout representation is described here:
+ https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-shared-memory-layout
+ }];
+
+ let parameters = (
+ ins
+ "unsigned":$swizzlingByteWidth,
+ "bool":$transposed,
+ "CTALayoutAttr":$CTALayout
+ );
+
+ let builders = [
AttrBuilder<(ins "ArrayRef<int64_t>":$shape,
"ArrayRef<unsigned>":$order,
"CTALayoutAttr":$CTALayout,
"Type":$eltTy), [{
auto shapePerCTA = getShapePerCTA(CTALayout.getCTASplitNum(), shape);

+ int32_t swizzlingByteWidth = 0;
int32_t eleBitWidth = eltTy.getIntOrFloatBitWidth();
int32_t vec = 128 / eleBitWidth, perPhase = 1, maxPhase = 1;

// get proper shared memory swizzling mode from the contiguous dimension
// size of the origin blocked layout.
auto contigDimSizeInByte = shapePerCTA[order[0]] * eleBitWidth / 8;
if (contigDimSizeInByte >= 128 && contigDimSizeInByte % 128 == 0) {
perPhase = 1;
maxPhase = 8;
+ swizzlingByteWidth = 128;
} else if (contigDimSizeInByte >= 64 && contigDimSizeInByte % 64 == 0) {
perPhase = 2;
maxPhase = 4;
+ swizzlingByteWidth = 64;
} else if (contigDimSizeInByte >= 32 && contigDimSizeInByte % 32 == 0) {
perPhase = 4;
maxPhase = 2;
+ swizzlingByteWidth = 32;
} else {
llvm_unreachable("unsupported shared memory layout for MMAv3");
}

- return $_get(context, vec, perPhase, maxPhase, order, CTALayout, true);
+ bool transposed = order[0] == 0;
+ return $_get(context, swizzlingByteWidth, transposed, CTALayout);
}]>
];

let extraClassDeclaration = extraBaseClassDeclaration # [{
int32_t getAlignment() const;
SmallVector<unsigned> getCTAsPerCGA() const;
SmallVector<unsigned> getCTAOrder() const;
SmallVector<unsigned> getCTASplitNum() const;
}];
let hasCustomAssemblyFormat = 1;
}
@@ -468,16 +491,17 @@ We call each individual tile "rep".
InterfaceMethod<"Get the order of reps (tiles of this layout that tile the whole tensor). The fastest-changing axis first",
"SmallVector<unsigned>",
"getRepOrder">,

- // Interface for the meta information about the multiple thread hierarchy.
- InterfaceMethod<"Get the shape of the CTAs per CGA.",
- "SmallVector<unsigned>",
- "getCTAsPerCGA">,
-
- InterfaceMethod<"Get the order of the CTAs per CGA. The fastest-changing axis first",
+ InterfaceMethod<"Return total element size per thread.",
+ "unsigned",
+ "getTotalElemsPerThread",
+ (ins "ArrayRef<int64_t>":$tensorShape,
+ "Type":$eltTy)>,
+ InterfaceMethod<"Return element size per thread in each dimension.",
"SmallVector<unsigned>",
- "getCTAOrder">,
-
+ "getElemsPerThread",
+ (ins "ArrayRef<int64_t>":$tensorShape,
+ "Type":$eltTy)>,
+ // Interface for the meta information about the multiple thread hierarchy.
InterfaceMethod<"Get the shape of the warps per CTA.",
"SmallVector<unsigned>",
"getWarpsPerCTA">,
@@ -498,10 +522,6 @@
"SmallVector<unsigned>",
"getSizePerThread">,

InterfaceMethod<"Each CTA processes 1/CTASplitNum of the tensor.",
"SmallVector<unsigned>",
"getCTASplitNum">,

InterfaceMethod<"Gets the number of contiguous elements per thread.",
"SmallVector<unsigned>",
"getContigPerThread">,
@@ -514,7 +534,7 @@

class DistributedEncoding<string name, string attrMnemonic, list<Trait> traits = [],
Dialect dialect = TritonGPU_Dialect>
- : TritonGPU_Attr<name, attrMnemonic, !listconcat([DistributedEncodingTrait], traits), dialect> {
+ : TritonGPU_Attr<name, attrMnemonic, !listconcat([DistributedEncodingTrait, LayoutEncodingTrait], traits), dialect> {

let description = [{
Distributed encodings have a layout function L that is entirely characterized
@@ -550,6 +570,8 @@ L(T) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
}];

code extraDistributedDeclaration = extraBaseClassDeclaration # [{
+ unsigned getTotalElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const;
+ SmallVector<unsigned> getElemsPerThread(ArrayRef<int64_t> shape, Type eltTy) const;
SmallVector<unsigned> getRepOrder() const;
SmallVector<unsigned> getCTAsPerCGA() const;
SmallVector<unsigned> getCTAOrder() const;
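The new `NVMMASharedEncodingAttr` builder derives the swizzling mode purely from the byte size of the contiguous dimension. A standalone sketch of that selection logic (function name and error handling are illustrative, not part of the diff):

```cpp
#include <cstdint>
#include <stdexcept>

// Returns the NVMMA swizzling byte width for a tile whose contiguous
// dimension holds `contigDimElems` elements of `eleBitWidth` bits each.
int32_t pickSwizzlingByteWidth(int64_t contigDimElems, int32_t eleBitWidth) {
  int64_t bytes = contigDimElems * eleBitWidth / 8;
  if (bytes >= 128 && bytes % 128 == 0)
    return 128; // perPhase = 1, maxPhase = 8
  if (bytes >= 64 && bytes % 64 == 0)
    return 64;  // perPhase = 2, maxPhase = 4
  if (bytes >= 32 && bytes % 32 == 0)
    return 32;  // perPhase = 4, maxPhase = 2
  throw std::runtime_error("unsupported shared memory layout for MMAv3");
}

// Example: a 16x128 i8 tile has 128 * 8 / 8 = 128 contiguous bytes,
// so it selects the 128-byte swizzle mode.
```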
include/triton/Dialect/TritonGPU/Transforms/Utility.h (4 changes: 2 additions & 2 deletions)

@@ -17,7 +17,7 @@ class LoadOp;
class StoreOp;
class FuncOp;
namespace gpu {
- class SharedEncodingAttr;
+ class SwizzledSharedEncodingAttr;
}
} // namespace triton

@@ -197,7 +197,7 @@ int getNVIDIAComputeCapability(Operation *module);
// Read the amd target from the module attributes
StringRef getAMDArch(Operation *module);

- std::optional<mlir::triton::gpu::SharedEncodingAttr>
+ std::optional<mlir::triton::gpu::SwizzledSharedEncodingAttr>
getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible);

enum class MMALoadType {
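A hedged sketch of how the renamed declaration is typically consumed (the call site and helper name are assumptions, not from this diff):

```cpp
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"

#include <optional>

// Hypothetical call site: probe whether every user of `val` wants a
// dot-operand layout, and if so back `val` with the implied encoding.
void maybeAllocateShared(mlir::Value val) {
  bool incompatible = false;
  std::optional<mlir::triton::gpu::SwizzledSharedEncodingAttr> sharedEnc =
      getSharedEncIfAllUsersAreDotEnc(val, incompatible);
  if (sharedEnc && !incompatible) {
    // Allocate `val`'s buffer in shared memory using *sharedEnc.
  }
}
```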
@@ -297,7 +297,7 @@ def TTNG_AsyncTMAScatterOp : TTNG_Op<"async_tma_scatter", [DeclareOpInterfaceMet
The `ttng.async_tma_scatter` operation scatters multiple separately-indexed
rows of data from local memory into global memory asynchronously. The
operation scatters a 2D tensor in shared memory, laid out by core tensor
- tiles (`hasLeadingOffset=true`) into separately indexed rows in global
+ tiles nvmma_shared layout into separately indexed rows in global
memory at a given `y` offset.
}];

lib/Analysis/Allocation.cpp (4 changes: 2 additions & 2 deletions)

@@ -165,8 +165,8 @@ unsigned defaultAllocationAnalysisScratchSizeFn(Operation *op) {
auto dstTy = cvtLayout.getType();
auto srcEncoding = srcTy.getEncoding();
auto dstEncoding = dstTy.getEncoding();
- if (isa<gpu::SharedEncodingAttr>(srcEncoding) ||
- isa<gpu::SharedEncodingAttr>(dstEncoding)) {
+ if (mlir::isa<gpu::SharedEncodingTrait>(srcEncoding) ||
+ mlir::isa<gpu::SharedEncodingTrait>(dstEncoding)) {
// Conversions from/to shared memory do not need scratch memory.
return 0;
}