Skip to content

Commit

Permalink
[AMD] Enable block pingpong for smaller tiles (#5820)
Browse files Browse the repository at this point in the history
Recent experiments found that block pingpong also helps a few more
configs, especially those with smaller tiles.
Enable one-cluster pingpong for tiles that are up to 4 times smaller.
  • Loading branch information
jungpark-mlir authored Feb 6, 2025
1 parent 8c25078 commit d56f4fe
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ void Pingponger::getDotPingponged() {
// software pipelining and dot rank=2. Also only accept the for-loop with
// supported combination of operations because this transformation is very
// tightly scheduling the latencies.
if (gLoadOps.size() != 2 || lLoadOps.size() != 2 || dotOps.size() != 1)
if (gLoadOps.size() < 2 || lLoadOps.size() < 2 || dotOps.size() != 1)
return;

// Pingpong scheduling tries to form two different types of the instruction
Expand Down Expand Up @@ -447,6 +447,7 @@ void Pingponger::getDotPingponged() {
auto elemWidth = aType.getElementTypeBitWidth();
int64_t tileSize = dotShape[0] * dotShape[1] * aShape[1] * elemWidth;

const int64_t minTile = 262144; // e.g. 32x128x64x16bit
const int64_t smallTile = 16777216; // e.g. 128x128x64x16bit
const int64_t mediumTile = 33554432; // smallTile x 2
const int64_t largeTile = 67108864; // e.g. 256x256x64x16bit
Expand All @@ -465,11 +466,13 @@ void Pingponger::getDotPingponged() {
// times for issuing the memory operations and issuing dot operations,
// smaller tile sizes are not likely to get any advantage from current dot
// centric pingpong scheduling.
if (tileSize == smallTile)
if (tileSize <= smallTile && tileSize >= minTile)
transformOnePPClusters(builder, loc);
// numWarps=4 doesn't need asymmetric sync, return.
return;
} else if (numWarps == 8) { // Pingpong between warps from the same block
if (gLoadOps.size() != 2 || lLoadOps.size() != 2)
return;
// Transform a loop where the tile size requires dots to be sliced
if (tileSize == mediumTile) {
if (transformTwoPPClusters(builder, dotOps[0]->getLoc()).failed())
Expand Down

0 comments on commit d56f4fe

Please sign in to comment.