From 1ed6350de54796b61ecc03d6fb1ba1c2bf107abb Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Mon, 3 Feb 2025 16:02:41 -0800
Subject: [PATCH] [Codegen] Use affine.delinearize_index in workgroup
 distribution (#19839)

In the interests of getting rid of needless subtractions from affine map
compositions, remove one of the last remaining manual calls to floorDiv().

The other ProcInfo generators have been converted to delinearize_index by
earlier PRs - this one finishes the job.

This should not impact the behavior of generated programs.

---------

Co-authored-by: Han-Chung Wang
---
 .../tile_and_distribute_to_workgroups.mlir    | 63 ++++++++++---------
 ...d_distribute_to_workgroups_func_scope.mlir |  6 +-
 .../src/iree/compiler/Codegen/Utils/Utils.cpp | 31 ++++-----
 3 files changed, 52 insertions(+), 48 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
index 025a4bf19236..03d1e51f9398 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups.mlir
@@ -303,13 +303,13 @@ hal.executable private @add_distribute4D {
 }
 // CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
 // CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1] -> ((s0 ceildiv 64) * (s1 ceildiv 2))>
-// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0, s1] -> ((s0 floordiv (s1 ceildiv 64)) * 2)>
-// CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0] -> ((s0 ceildiv 2) * 2)>
-// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
-// CHECK-DAG: #[[MAP5:.+]] = affine_map<()[s0, s1] -> ((s0 mod (s1 ceildiv 64)) * 64)>
-// CHECK-DAG: #[[MAP6:.+]] = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>
-// CHECK-DAG: #[[MAP7:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
-// CHECK-DAG: #[[MAP8:.+]] = affine_map<()[s0] -> (s0 * 64)>
+// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
+// CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0] -> (s0 * 2)>
+// CHECK-DAG: #[[MAP4:.+]] = affine_map<()[s0] -> ((s0 ceildiv 2) * 2)>
+// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
+// CHECK-DAG: #[[MAP6:.+]] = affine_map<()[s0] -> (s0 * 64)>
+// CHECK-DAG: #[[MAP7:.+]] = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>
+// CHECK-DAG: #[[MAP8:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
 // CHECK-DAG: #[[MAP9:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
@@ -337,31 +337,34 @@ hal.executable private @add_distribute4D {
 // CHECK: %[[WORKGROUP_ID_Y:.*]] = hal.interface.workgroup.id[1] : index
 // CHECK: %[[WORKGROUP_COUNT_Y:.*]] = hal.interface.workgroup.count[1] : index
 // CHECK: %[[WORKGROUP_ID_Z:.*]] = hal.interface.workgroup.id[2] : index
-// CHECK-DAG: %[[D7:.*]] = affine.apply #map2(){{\[}}%[[WORKGROUP_ID_Z]], %[[D1]]]
-// CHECK-DAG: %[[D8:.*]] = affine.apply #map3(){{\[}}%[[D0]]]
-// CHECK: scf.for %[[ARG0:.*]] = %[[D7]] to %[[D0]] step %[[D8]] {
-// CHECK-DAG: %[[D9:.*]] = affine.min #map4(%[[ARG0]]){{\[}}%[[D0]]]
-// CHECK-DAG: %[[D10:.*]] = affine.apply #map5(){{\[}}%[[WORKGROUP_ID_Z]], %[[D1]]]
-// CHECK-DAG: %[[D11:.*]] = affine.apply #map6(){{\[}}%[[D1]]]
-// CHECK: scf.for %[[ARG1:.*]] = %[[D10]] to %[[D1]] step %[[D11]] {
-// CHECK-DAG: %[[D12:.*]] = affine.min #map7(%[[ARG1]]){{\[}}%[[D1]]]
-// CHECK-DAG: %[[D13:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_ID_Y]]]
-// CHECK-DAG: %[[D14:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_COUNT_Y]]]
-// CHECK: scf.for %[[ARG2:.*]] = %[[D13]] to %[[D2]] step %[[D14]] {
-// CHECK-DAG: %[[D15:.*]] = affine.min #map7(%[[ARG2]]){{\[}}%[[D2]]]
-// CHECK-DAG: %[[D16:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_ID_X]]]
-// CHECK-DAG: %[[D17:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_COUNT_X]]]
-// CHECK: scf.for %[[ARG3:.*]] = %[[D16]] to %[[D3]] step %[[D17]] {
-// CHECK: %[[D18:.*]] = affine.min #map7(%[[ARG3]]){{\[}}%[[D3]]]
-// CHECK: %[[D19:.*]] = flow.dispatch.tensor.load %[[D4]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
-// CHECK: %[[D20:.*]] = flow.dispatch.tensor.load %[[D5]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
-// CHECK: %[[D21:.*]] = tensor.empty(%[[D9]], %[[D12]], %[[D15]], %[[D18]]) : tensor<?x?x?x?xf32>
-// CHECK: %[[D22:.*]] = linalg.generic {indexing_maps = [#map9, #map9, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[D19]], %[[D20]] : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%[[D21]] : tensor<?x?x?x?xf32>) attrs = {lowering_config = #config} {
+// CHECK-DAG: %[[D7:.*]] = affine.apply #map(){{\[}}%[[D1]]]
+// CHECK-DAG: %[[D8:.*]] = affine.apply #map2(){{\[}}%[[D0]]]
+// CHECK-DAG: %[[D9:.*]]:2 = affine.delinearize_index %[[WORKGROUP_ID_Z]] into (%[[D8]], %[[D7]])
+// CHECK-DAG: %[[D10:.*]] = affine.apply #map3(){{\[}}%[[D9]]#0]
+// CHECK-DAG: %[[D11:.*]] = affine.apply #map4(){{\[}}%[[D0]]]
+// CHECK: scf.for %[[ARG0:.*]] = %[[D10]] to %[[D0]] step %[[D11]] {
+// CHECK-DAG: %[[D12:.*]] = affine.min #map5(%[[ARG0]]){{\[}}%[[D0]]]
+// CHECK-DAG: %[[D13:.*]] = affine.apply #map6(){{\[}}%[[D9]]#1]
+// CHECK-DAG: %[[D14:.*]] = affine.apply #map7(){{\[}}%[[D1]]]
+// CHECK: scf.for %[[ARG1:.*]] = %[[D13]] to %[[D1]] step %[[D14]] {
+// CHECK-DAG: %[[D15:.*]] = affine.min #map8(%[[ARG1]]){{\[}}%[[D1]]]
+// CHECK-DAG: %[[D16:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_ID_Y]]]
+// CHECK-DAG: %[[D17:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_COUNT_Y]]]
+// CHECK: scf.for %[[ARG2:.*]] = %[[D16]] to %[[D2]] step %[[D17]] {
+// CHECK-DAG: %[[D18:.*]] = affine.min #map8(%[[ARG2]]){{\[}}%[[D2]]]
+// CHECK-DAG: %[[D19:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_ID_X]]]
+// CHECK-DAG: %[[D20:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_COUNT_X]]]
+// CHECK: scf.for %[[ARG3:.*]] = %[[D19]] to %[[D3]] step %[[D20]] {
+// CHECK: %[[D21:.*]] = affine.min #map8(%[[ARG3]]){{\[}}%[[D3]]]
+// CHECK: %[[D22:.*]] = flow.dispatch.tensor.load %[[D4]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
+// CHECK: %[[D23:.*]] = flow.dispatch.tensor.load %[[D5]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
+// CHECK: %[[D24:.*]] = tensor.empty(%[[D12]], %[[D15]], %[[D18]], %[[D21]]) : tensor<?x?x?x?xf32>
+// CHECK: %[[D25:.*]] = linalg.generic {indexing_maps = [#map9, #map9, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[D22]], %[[D23]] : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%[[D24]] : tensor<?x?x?x?xf32>) attrs = {lowering_config = #config} {
 // CHECK: ^bb0(%[[IN:.*]]: f32, %[[IN_0:.*]]: f32, %[[OUT:.*]]: f32):
-// CHECK: %[[D23:.*]] = arith.addf %[[IN]], %[[IN_0]] : f32
-// CHECK: linalg.yield %[[D23]] : f32
+// CHECK: %[[D26:.*]] = arith.addf %[[IN]], %[[IN_0]] : f32
+// CHECK: linalg.yield %[[D26]] : f32
 // CHECK: } -> tensor<?x?x?x?xf32>
-// CHECK: flow.dispatch.tensor.store %[[D22:.*]], %[[D6]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]}
+// CHECK: flow.dispatch.tensor.store %[[D25:.*]], %[[D6]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]}
 // -----

diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups_func_scope.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups_func_scope.mlir
index 0478f50b8d63..337165a3adae 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups_func_scope.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_to_workgroups_func_scope.mlir
@@ -32,14 +32,12 @@ func.func @multiple_dim_distribute(%s0 : index, %s1 : index, %s2 : index, %s3 :
 // CHECK-DAG: %[[WG_ID_X:.+]] = hal.interface.workgroup.id[0]
 // CHECK-DAG: %[[WG_ID_Y:.+]] = hal.interface.workgroup.id[1]
 // CHECK-DAG: %[[WG_ID_Z:.+]] = hal.interface.workgroup.id[2]
+// CHECK-DAG: %[[WG_IDS_Z:.+]]:3 = affine.delinearize_index %[[WG_ID_Z]] into (%[[S0]], %[[S1]], %[[S2]])
 // CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x1x3x1x4x1x1xf32>
 // CHECK-DAG: %[[IN_SLICE:.+]] = tensor.extract_slice %[[INPUT]][0, 0, 0, %[[WG_ID_X]]] [2, 3, 4, 1]
 // CHECK: %[[GENERIC:.+]] = linalg.generic
 // CHECK-SAME: ins(%[[IN_SLICE]] :
 // CHECK-SAME: outs(%[[EMPTY]] :
-// CHECK-DAG: %[[WG_ID_Z_0:.+]] = affine.apply affine_map<()[s0, s1, s2] -> ((s1 floordiv s2) floordiv s0)>()[%[[S1]], %[[WG_ID_Z]], %[[S2]]]
-// CHECK-DAG: %[[WG_ID_Z_1:.+]] = affine.apply affine_map<()[s0, s1, s2] -> ((s1 floordiv s2) mod s0)>()[%[[S1]], %[[WG_ID_Z]], %[[S2]]]
-// CHECK-DAG: %[[WG_ID_Z_2:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 mod s1)>()[%[[WG_ID_Z]], %[[S2]]]
 // CHECK: flow.dispatch.tensor.store %[[GENERIC]],
-// CHECK-SAME: offsets = [%[[WG_ID_Z_0]], 0, %[[WG_ID_Z_1]], 0, %[[WG_ID_Z_2]], 0, %[[WG_ID_Y]], %[[WG_ID_X]]]
+// CHECK-SAME: offsets = [%[[WG_IDS_Z]]#0, 0, %[[WG_IDS_Z]]#1, 0, %[[WG_IDS_Z]]#2, 0, %[[WG_ID_Y]], %[[WG_ID_X]]]
 // CHECK-SAME: sizes = [1, 2, 1, 3, 1, 4, 1, 1]

diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
index e23fe37e90e5..c02d730e5c33 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
@@ -1013,7 +1013,8 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
     auto numParallelDims = parallelLoopRanges.size();

     SmallVector<linalg::ProcInfo> procInfo(numParallelDims);
-    std::optional<OpFoldResult> splitDim;
+    std::optional<Value> splitDim;
+    SmallVector<OpFoldResult> splitNumTiles;
     for (size_t dim = 0; dim < numParallelDims; ++dim) {
       if (numParallelDims > maxWorkgroupParallelDims &&
           dim >= maxWorkgroupParallelDims - 1) {
@@ -1030,19 +1031,7 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
         bindSymbols(builder.getContext(), d0, d1, d2);
         OpFoldResult numTiles = affine::makeComposedFoldedAffineApply(
             builder, loc, (d1 - d0).ceilDiv(d2), {offset, size, step});
-        OpFoldResult dimValue;
-        if (dim == numParallelDims - 1)
-          dimValue = splitDim.value();
-        else {
-          dimValue = affine::makeComposedFoldedAffineApply(
-              builder, loc, (d0 % d1), {splitDim.value(), numTiles});
-          splitDim = affine::makeComposedFoldedAffineApply(
-              builder, loc, (d0).floorDiv(d1), {splitDim.value(), numTiles});
-        }
-        procInfo[numParallelDims - dim - 1] = {
-            getValueOrCreateConstantIndexOp(builder, loc, dimValue),
-            getValueOrCreateConstantIndexOp(builder, loc, numTiles),
-            distributionMethod};
+        splitNumTiles.push_back(numTiles);
         continue;
       }
       procInfo[numParallelDims - dim - 1] = {
@@ -1052,6 +1041,20 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
               dim),
           distributionMethod};
     }
+    if (splitDim) {
+      std::reverse(splitNumTiles.begin(), splitNumTiles.end());
+      auto delinearized = builder.create<affine::AffineDelinearizeIndexOp>(
+          loc, *splitDim, splitNumTiles, /*hasOuterBound=*/true);
+      for (auto [i, id, numTiles] :
+           llvm::enumerate(delinearized.getResults(), splitNumTiles)) {
+        // We iterate the delinearize results from slowest up to fastest, and
+        // we know that these are all the highest values of dimension. That is,
+        // `i = 0` corresponds to the `numParallelDims - 1`-th dimension.
+        procInfo[i] = {id,
+                       getValueOrCreateConstantIndexOp(builder, loc, numTiles),
+                       distributionMethod};
+      }
+    }
     return procInfo;
   }};
 }
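
Note (illustrative, not part of the applied patch): a minimal before/after
sketch of the IR produced for a split workgroup dimension. The SSA names
below (%id_z, %ntiles_a, %ntiles_b) are hypothetical and chosen for
readability; they do not correspond to the FileCheck captures above.

  Before, the extra tile counts were peeled off the flat z workgroup id with
  explicit floordiv/mod affine.apply ops:

    %outer = affine.apply affine_map<()[s0, s1] -> (s0 floordiv s1)>()[%id_z, %ntiles_b]
    %inner = affine.apply affine_map<()[s0, s1] -> (s0 mod s1)>()[%id_z, %ntiles_b]

  After, a single affine.delinearize_index expresses the same decomposition
  without introducing subtraction-heavy composed maps:

    %ids:2 = affine.delinearize_index %id_z into (%ntiles_a, %ntiles_b) : index, index
    // %ids#0 == %id_z floordiv %ntiles_b and %ids#1 == %id_z mod %ntiles_b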