From 1ed6350de54796b61ecc03d6fb1ba1c2bf107abb Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Mon, 3 Feb 2025 16:02:41 -0800
Subject: [PATCH] [Codegen] Use affine.delinearize_index in workgroup
 distribution (#19839)

In the interests of getting rid of needless subtractions from affine map
compositions, remove one of the last remaining manual calls to floorDiv().

The other ProcInfo generators have been converted to delinearize_index by
earlier PRs - this one finishes the job.

This should not impact the behavior of generated programs.

---------

Co-authored-by: Han-Chung Wang
---
 .../tile_and_distribute_to_workgroups.mlir    | 63 ++++++++++---------
 ...d_distribute_to_workgroups_func_scope.mlir |  6 +-
 .../src/iree/compiler/Codegen/Utils/Utils.cpp | 31 ++++-----
 3 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index 025a4bf19236..03d1e51f9398 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -303,13 +303,13 @@ hal.executable private @add_distribute4D {
 }
 // CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
 // CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1] -> ((s0 ceildiv 64) * (s1 ceildiv 2))>
-// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0, s1] -> ((s0 floordiv (s1 ceildiv 64)) * 2)>
-// CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0] -> ((s0 ceildiv 2) * 2)>
-// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
-// CHECK-DAG: #[[MAP5:.+]] = affine_map<()[s0, s1] -> ((s0 mod (s1 ceildiv 64)) * 64)>
-// CHECK-DAG: #[[MAP6:.+]] = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>
-// CHECK-DAG: #[[MAP7:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
-// CHECK-DAG: #[[MAP8:.+]] = affine_map<()[s0] -> (s0 * 64)>
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
+// CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0] -> (s0 * 2)>
+// CHECK-DAG: #[[MAP4:.+]] = affine_map<()[s0] -> ((s0 ceildiv 2) * 2)>
+// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
+// CHECK-DAG: #[[MAP6:.+]] = affine_map<()[s0] -> (s0 * 64)>
+// CHECK-DAG: #[[MAP7:.+]] = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>
+// CHECK-DAG: #[[MAP8:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
 // CHECK-DAG: #[[MAP9:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -337,31 +337,34 @@ hal.executable private @add_distribute4D {
 // CHECK: %[[WORKGROUP_ID_Y:.*]] = hal.interface.workgroup.id[1] : index
 // CHECK: %[[WORKGROUP_COUNT_Y:.*]] = hal.interface.workgroup.count[1] : index
 // CHECK: %[[WORKGROUP_ID_Z:.*]] = hal.interface.workgroup.id[2] : index
-// CHECK-DAG: %[[D7:.*]] = affine.apply #map2(){{\[}}%[[WORKGROUP_ID_Z]], %[[D1]]]
-// CHECK-DAG: %[[D8:.*]] = affine.apply #map3(){{\[}}%[[D0]]]
-// CHECK: scf.for %[[ARG0:.*]] = %[[D7]] to %[[D0]] step %[[D8]] {
-// CHECK-DAG: %[[D9:.*]] = affine.min #map4(%[[ARG0]]){{\[}}%[[D0]]]
-// CHECK-DAG: %[[D10:.*]] = affine.apply #map5(){{\[}}%[[WORKGROUP_ID_Z]], %[[D1]]]
-// CHECK-DAG: %[[D11:.*]] = affine.apply #map6(){{\[}}%[[D1]]]
-// CHECK: scf.for %[[ARG1:.*]] = %[[D10]] to %[[D1]] step %[[D11]] {
-// CHECK-DAG: %[[D12:.*]] = affine.min #map7(%[[ARG1]]){{\[}}%[[D1]]]
-// CHECK-DAG: %[[D13:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_ID_Y]]]
-// CHECK-DAG: %[[D14:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_COUNT_Y]]]
-// CHECK: scf.for %[[ARG2:.*]] = %[[D13]] to %[[D2]] step %[[D14]] {
-// CHECK-DAG: %[[D15:.*]] = affine.min #map7(%[[ARG2]]){{\[}}%[[D2]]]
-// CHECK-DAG: %[[D16:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_ID_X]]]
-// CHECK-DAG: %[[D17:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_COUNT_X]]]
-// CHECK: scf.for %[[ARG3:.*]] = %[[D16]] to %[[D3]] step %[[D17]] {
-// CHECK: %[[D18:.*]] = affine.min #map7(%[[ARG3]]){{\[}}%[[D3]]]
-// CHECK: %[[D19:.*]] = flow.dispatch.tensor.load %[[D4]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
-// CHECK: %[[D20:.*]] = flow.dispatch.tensor.load %[[D5]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
-// CHECK: %[[D21:.*]] = tensor.empty(%[[D9]], %[[D12]], %[[D15]], %[[D18]]) : tensor<?x?x?x?xf32>
-// CHECK: %[[D22:.*]] = linalg.generic {indexing_maps = [#map9, #map9, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[D19]], %[[D20]] : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%[[D21]] : tensor<?x?x?x?xf32>) attrs = {lowering_config = #config} {
+// CHECK-DAG: %[[D7:.*]] = affine.apply #map(){{\[}}%[[D1]]]
+// CHECK-DAG: %[[D8:.*]] = affine.apply #map2(){{\[}}%[[D0]]]
+// CHECK-DAG: %[[D9:.*]]:2 = affine.delinearize_index %[[WORKGROUP_ID_Z]] into (%[[D8]], %[[D7]])
+// CHECK-DAG: %[[D10:.*]] = affine.apply #map3(){{\[}}%[[D9]]#0]
+// CHECK-DAG: %[[D11:.*]] = affine.apply #map4(){{\[}}%[[D0]]]
+// CHECK: scf.for %[[ARG0:.*]] = %[[D10]] to %[[D0]] step %[[D11]] {
+// CHECK-DAG: %[[D12:.*]] = affine.min #map5(%[[ARG0]]){{\[}}%[[D0]]]
+// CHECK-DAG: %[[D13:.*]] = affine.apply #map6(){{\[}}%[[D9]]#1]
+// CHECK-DAG: %[[D14:.*]] = affine.apply #map7(){{\[}}%[[D1]]]
+// CHECK: scf.for %[[ARG1:.*]] = %[[D13]] to %[[D1]] step %[[D14]] {
+// CHECK-DAG: %[[D15:.*]] = affine.min #map8(%[[ARG1]]){{\[}}%[[D1]]]
+// CHECK-DAG: %[[D16:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_ID_Y]]]
+// CHECK-DAG: %[[D17:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_COUNT_Y]]]
+// CHECK: scf.for %[[ARG2:.*]] = %[[D16]] to %[[D2]] step %[[D17]] {
+// CHECK-DAG: %[[D18:.*]] = affine.min #map8(%[[ARG2]]){{\[}}%[[D2]]]
+// CHECK-DAG: %[[D19:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_ID_X]]]
+// CHECK-DAG: %[[D20:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_COUNT_X]]]
+// CHECK: scf.for %[[ARG3:.*]] = %[[D19]] to %[[D3]] step %[[D20]] {
+// CHECK: %[[D21:.*]] = affine.min #map8(%[[ARG3]]){{\[}}%[[D3]]]
+// CHECK: %[[D22:.*]] = flow.dispatch.tensor.load %[[D4]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
+// CHECK: %[[D23:.*]] = flow.dispatch.tensor.load %[[D5]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
+// CHECK: %[[D24:.*]] = tensor.empty(%[[D12]], %[[D15]], %[[D18]], %[[D21]]) : tensor<?x?x?x?xf32>
+// CHECK: %[[D25:.*]] = linalg.generic {indexing_maps = [#map9, #map9, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[D22]], %[[D23]] : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%[[D24]] : tensor<?x?x?x?xf32>) attrs = {lowering_config = #config} {
 // CHECK: ^bb0(%[[IN:.*]]: f32, %[[IN_0:.*]]: f32, %[[OUT:.*]]: f32):
-// CHECK: %[[D23:.*]] = arith.addf %[[IN]], %[[IN_0]] : f32
-// CHECK: linalg.yield %[[D23]] : f32
+// CHECK: %[[D26:.*]] = arith.addf %[[IN]], %[[IN_0]] : f32
+// CHECK: linalg.yield %[[D26]] : f32
 // CHECK: } -> tensor<?x?x?x?xf32>
-// CHECK: flow.dispatch.tensor.store %[[D22:.*]], %[[D6]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]}
+// CHECK: flow.dispatch.tensor.store %[[D25:.*]], %[[D6]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]}
 // -----

diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups_func_scope.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups_func_scope.mlir
index 0478f50b8d63..337165a3adae 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups_func_scope.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups_func_scope.mlir
@@ -32,14 +32,12 @@ func.func @multiple_dim_distribute(%s0 : index, %s1 : index, %s2 : index, %s3 :
 // CHECK-DAG: %[[WG_ID_X:.+]] = hal.interface.workgroup.id[0]
 // CHECK-DAG: %[[WG_ID_Y:.+]] = hal.interface.workgroup.id[1]
 // CHECK-DAG: %[[WG_ID_Z:.+]] = hal.interface.workgroup.id[2]
+// CHECK-DAG: %[[WG_IDS_Z:.+]]:3 = affine.delinearize_index %[[WG_ID_Z]] into (%[[S0]], %[[S1]], %[[S2]])
 // CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x1x3x1x4x1x1xf32>
 // CHECK-DAG: %[[IN_SLICE:.+]] = tensor.extract_slice %[[INPUT]][0, 0, 0, %[[WG_ID_X]]] [2, 3, 4, 1]
 // CHECK: %[[GENERIC:.+]] = linalg.generic
 // CHECK-SAME: ins(%[[IN_SLICE]] :
 // CHECK-SAME: outs(%[[EMPTY]] :
-// CHECK-DAG: %[[WG_ID_Z_0:.+]] = affine.apply affine_map<()[s0, s1, s2] -> ((s1 floordiv s2) floordiv s0)>()[%[[S1]], %[[WG_ID_Z]], %[[S2]]]
-// CHECK-DAG: %[[WG_ID_Z_1:.+]] = affine.apply affine_map<()[s0, s1, s2] -> ((s1 floordiv s2) mod s0)>()[%[[S1]], %[[WG_ID_Z]], %[[S2]]]
-// CHECK-DAG: %[[WG_ID_Z_2:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 mod s1)>()[%[[WG_ID_Z]], %[[S2]]]
 // CHECK: flow.dispatch.tensor.store %[[GENERIC]],
-// CHECK-SAME: offsets = [%[[WG_ID_Z_0]], 0, %[[WG_ID_Z_1]], 0, %[[WG_ID_Z_2]], 0, %[[WG_ID_Y]], %[[WG_ID_X]]]
+// CHECK-SAME: offsets = [%[[WG_IDS_Z]]#0, 0, %[[WG_IDS_Z]]#1, 0, %[[WG_IDS_Z]]#2, 0, %[[WG_ID_Y]], %[[WG_ID_X]]]
 // CHECK-SAME: sizes = [1, 2, 1, 3, 1, 4, 1, 1]

diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
index e23fe37e90e5..c02d730e5c33 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
@@ -1013,7 +1013,8 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
     auto numParallelDims = parallelLoopRanges.size();

     SmallVector<linalg::ProcInfo> procInfo(numParallelDims);
-    std::optional<OpFoldResult> splitDim;
+    std::optional<Value> splitDim;
+    SmallVector<OpFoldResult> splitNumTiles;
     for (size_t dim = 0; dim < numParallelDims; ++dim) {
       if (numParallelDims > maxWorkgroupParallelDims &&
           dim >= maxWorkgroupParallelDims - 1) {
@@ -1030,19 +1031,7 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
         bindSymbols(builder.getContext(), d0, d1, d2);
         OpFoldResult numTiles = affine::makeComposedFoldedAffineApply(
             builder, loc, (d1 - d0).ceilDiv(d2), {offset, size, step});
-        OpFoldResult dimValue;
-        if (dim == numParallelDims - 1)
-          dimValue = splitDim.value();
-        else {
-          dimValue = affine::makeComposedFoldedAffineApply(
-              builder, loc, (d0 % d1), {splitDim.value(), numTiles});
-          splitDim = affine::makeComposedFoldedAffineApply(
-              builder, loc, (d0).floorDiv(d1), {splitDim.value(), numTiles});
-        }
-        procInfo[numParallelDims - dim - 1] = {
-            getValueOrCreateConstantIndexOp(builder, loc, dimValue),
-            getValueOrCreateConstantIndexOp(builder, loc, numTiles),
-            distributionMethod};
+        splitNumTiles.push_back(numTiles);
         continue;
       }
       procInfo[numParallelDims - dim - 1] = {
@@ -1052,6 +1041,20 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
               dim),
           distributionMethod};
     }
+    if (splitDim) {
+      std::reverse(splitNumTiles.begin(), splitNumTiles.end());
+      auto delinearized = builder.create<affine::AffineDelinearizeIndexOp>(
+          loc, *splitDim, splitNumTiles, /*hasOuterBound=*/true);
+      for (auto [i, id, numTiles] :
+           llvm::enumerate(delinearized.getResults(), splitNumTiles)) {
+        // We iterate the delinearize results from slowest up to fastest, and
+        // we know that these are all the highest values of dimension. That is,
+        // `i = 0` corresponds to the `numParallelDims - 1`-th dimension.
+        procInfo[i] = {id,
+                       getValueOrCreateConstantIndexOp(builder, loc, numTiles),
+                       distributionMethod};
+      }
+    }
     return procInfo;
   }};
 }
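
Note (illustrative, not part of the applied patch): a minimal before/after
sketch of the IR produced for a split workgroup dimension. The SSA names
below (%id_z, %ntiles_a, %ntiles_b) are hypothetical and chosen for
readability; they do not correspond to the FileCheck captures above.

  Before, the extra tile counts were peeled off the flat z workgroup id with
  explicit floordiv/mod affine.apply ops:

    %outer = affine.apply affine_map<()[s0, s1] -> (s0 floordiv s1)>()[%id_z, %ntiles_b]
    %inner = affine.apply affine_map<()[s0, s1] -> (s0 mod s1)>()[%id_z, %ntiles_b]

  After, a single affine.delinearize_index expresses the same decomposition
  without introducing subtraction-heavy composed maps:

    %ids:2 = affine.delinearize_index %id_z into (%ntiles_a, %ntiles_b) : index, index
    // %ids#0 == %id_z floordiv %ntiles_b and %ids#1 == %id_z mod %ntiles_b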