Skip to content

Commit

Permalink
[Codegen] Use affine.delinearize_index in workgroup distribution (#19839
Browse files Browse the repository at this point in the history
)

In the interests of getting rid of needless subtractions from affine
composition, get rid of one of the last remaining manual calls to
floorDiv()

The other ProcInfo generators have been converted to delinearize_index
by earlier PRs - this one finishes the job.

This should not impact the behavior of generated programs.

---------

Co-authored-by: Han-Chung Wang <[email protected]>
  • Loading branch information
krzysz00 and hanhanW authored Feb 4, 2025
1 parent d661efa commit 1ed6350
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -303,13 +303,13 @@ hal.executable private @add_distribute4D {
}
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 64)>
// CHECK-DAG: #[[MAP1:.+]] = affine_map<()[s0, s1] -> ((s0 ceildiv 64) * (s1 ceildiv 2))>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0, s1] -> ((s0 floordiv (s1 ceildiv 64)) * 2)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0] -> ((s0 ceildiv 2) * 2)>
// CHECK-DAG: #[[MAP4:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-DAG: #[[MAP5:.+]] = affine_map<()[s0, s1] -> ((s0 mod (s1 ceildiv 64)) * 64)>
// CHECK-DAG: #[[MAP6:.+]] = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>
// CHECK-DAG: #[[MAP7:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
// CHECK-DAG: #[[MAP8:.+]] = affine_map<()[s0] -> (s0 * 64)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
// CHECK-DAG: #[[MAP3:.+]] = affine_map<()[s0] -> (s0 * 2)>
// CHECK-DAG: #[[MAP4:.+]] = affine_map<()[s0] -> ((s0 ceildiv 2) * 2)>
// CHECK-DAG: #[[MAP5:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
// CHECK-DAG: #[[MAP6:.+]] = affine_map<()[s0] -> (s0 * 64)>
// CHECK-DAG: #[[MAP7:.+]] = affine_map<()[s0] -> ((s0 ceildiv 64) * 64)>
// CHECK-DAG: #[[MAP8:.+]] = affine_map<(d0)[s0] -> (-d0 + s0, 64)>
// CHECK-DAG: #[[MAP9:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>

// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>
Expand Down Expand Up @@ -337,31 +337,34 @@ hal.executable private @add_distribute4D {
// CHECK: %[[WORKGROUP_ID_Y:.*]] = hal.interface.workgroup.id[1] : index
// CHECK: %[[WORKGROUP_COUNT_Y:.*]] = hal.interface.workgroup.count[1] : index
// CHECK: %[[WORKGROUP_ID_Z:.*]] = hal.interface.workgroup.id[2] : index
// CHECK-DAG: %[[D7:.*]] = affine.apply #map2(){{\[}}%[[WORKGROUP_ID_Z]], %[[D1]]]
// CHECK-DAG: %[[D8:.*]] = affine.apply #map3(){{\[}}%[[D0]]]
// CHECK: scf.for %[[ARG0:.*]] = %[[D7]] to %[[D0]] step %[[D8]] {
// CHECK-DAG: %[[D9:.*]] = affine.min #map4(%[[ARG0]]){{\[}}%[[D0]]]
// CHECK-DAG: %[[D10:.*]] = affine.apply #map5(){{\[}}%[[WORKGROUP_ID_Z]], %[[D1]]]
// CHECK-DAG: %[[D11:.*]] = affine.apply #map6(){{\[}}%[[D1]]]
// CHECK: scf.for %[[ARG1:.*]] = %[[D10]] to %[[D1]] step %[[D11]] {
// CHECK-DAG: %[[D12:.*]] = affine.min #map7(%[[ARG1]]){{\[}}%[[D1]]]
// CHECK-DAG: %[[D13:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_ID_Y]]]
// CHECK-DAG: %[[D14:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_COUNT_Y]]]
// CHECK: scf.for %[[ARG2:.*]] = %[[D13]] to %[[D2]] step %[[D14]] {
// CHECK-DAG: %[[D15:.*]] = affine.min #map7(%[[ARG2]]){{\[}}%[[D2]]]
// CHECK-DAG: %[[D16:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_ID_X]]]
// CHECK-DAG: %[[D17:.*]] = affine.apply #map8(){{\[}}%[[WORKGROUP_COUNT_X]]]
// CHECK: scf.for %[[ARG3:.*]] = %[[D16]] to %[[D3]] step %[[D17]] {
// CHECK: %[[D18:.*]] = affine.min #map7(%[[ARG3]]){{\[}}%[[D3]]]
// CHECK: %[[D19:.*]] = flow.dispatch.tensor.load %[[D4]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
// CHECK: %[[D20:.*]] = flow.dispatch.tensor.load %[[D5]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
// CHECK: %[[D21:.*]] = tensor.empty(%[[D9]], %[[D12]], %[[D15]], %[[D18]]) : tensor<?x?x?x?xf32>
// CHECK: %[[D22:.*]] = linalg.generic {indexing_maps = [#map9, #map9, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[D19]], %[[D20]] : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%[[D21]] : tensor<?x?x?x?xf32>) attrs = {lowering_config = #config} {
// CHECK-DAG: %[[D7:.*]] = affine.apply #map(){{\[}}%[[D1]]]
// CHECK-DAG: %[[D8:.*]] = affine.apply #map2(){{\[}}%[[D0]]]
// CHECK-DAG: %[[D9:.*]]:2 = affine.delinearize_index %[[WORKGROUP_ID_Z]] into (%[[D8]], %[[D7]])
// CHECK-DAG: %[[D10:.*]] = affine.apply #map3(){{\[}}%[[D9]]
// CHECK-DAG: %[[D11:.*]] = affine.apply #map4(){{\[}}%[[D0]]
// CHECK: scf.for %[[ARG0:.*]] = %[[D10]] to %[[D0]] step %[[D11]] {
// CHECK-DAG: %[[D12:.*]] = affine.min #map5(%[[ARG0]]){{\[}}%[[D0]]]
// CHECK-DAG: %[[D13:.*]] = affine.apply #map6(){{\[}}%[[D9]]#1]
// CHECK-DAG: %[[D14:.*]] = affine.apply #map7(){{\[}}%[[D1]]]
// CHECK: scf.for %[[ARG1:.*]] = %[[D13]] to %[[D1]] step %[[D14]] {
// CHECK-DAG: %[[D15:.*]] = affine.min #map8(%[[ARG1]]){{\[}}%[[D1]]]
// CHECK-DAG: %[[D16:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_ID_Y]]]
// CHECK-DAG: %[[D17:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_COUNT_Y]]]
// CHECK: scf.for %[[ARG2:.*]] = %[[D16]] to %[[D2]] step %[[D17]] {
// CHECK-DAG: %[[D18:.*]] = affine.min #map8(%[[ARG2]]){{\[}}%[[D2]]]
// CHECK-DAG: %[[D19:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_ID_X]]]
// CHECK-DAG: %[[D20:.*]] = affine.apply #map6(){{\[}}%[[WORKGROUP_COUNT_X]]]
// CHECK: scf.for %[[ARG3:.*]] = %[[D19]] to %[[D3]] step %[[D20]] {
// CHECK: %[[D21:.*]] = affine.min #map8(%[[ARG3]]){{\[}}%[[D3]]]
// CHECK: %[[D22:.*]] = flow.dispatch.tensor.load %[[D4]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
// CHECK: %[[D23:.*]] = flow.dispatch.tensor.load %[[D5]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]} -> tensor<?x?x?x?xf32>
// CHECK: %[[D24:.*]] = tensor.empty(%[[D12]], %[[D15]], %[[D18]], %[[D21]]) : tensor<?x?x?x?xf32>
// CHECK: %[[D25:.*]] = linalg.generic {indexing_maps = [#map9, #map9, #map9], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%[[D22]], %[[D23]] : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) outs(%[[D24]] : tensor<?x?x?x?xf32>) attrs = {lowering_config = #config} {
// CHECK: ^bb0(%[[IN:.*]]: f32, %[[IN_0:.*]]: f32, %[[OUT:.*]]: f32):
// CHECK: %[[D23:.*]] = arith.addf %[[IN]], %[[IN_0]] : f32
// CHECK: linalg.yield %[[D23]] : f32
// CHECK: %[[D26:.*]] = arith.addf %[[IN]], %[[IN_0]] : f32
// CHECK: linalg.yield %[[D26]] : f32
// CHECK: } -> tensor<?x?x?x?xf32>
// CHECK: flow.dispatch.tensor.store %[[D22:.*]], %[[D6]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D9]], %[[D12]], %[[D15]], %[[D18]]], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]}
// CHECK: flow.dispatch.tensor.store %[[D25:.*]], %[[D6]], offsets = {{\[}}%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]]], sizes = {{\[}}%[[D12]], %[[D15]], %[[D18]], %[[D21]]], strides = [1, 1, 1, 1] : tensor<?x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?x?x?xf32>>{%[[D0]], %[[D1]], %[[D2]], %[[D3]]}

// -----

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,12 @@ func.func @multiple_dim_distribute(%s0 : index, %s1 : index, %s2 : index, %s3 :
// CHECK-DAG: %[[WG_ID_X:.+]] = hal.interface.workgroup.id[0]
// CHECK-DAG: %[[WG_ID_Y:.+]] = hal.interface.workgroup.id[1]
// CHECK-DAG: %[[WG_ID_Z:.+]] = hal.interface.workgroup.id[2]
// CHECK-DAG: %[[WG_IDS_Z:.+]]:3 = affine.delinearize_index %[[WG_ID_Z]] into (%[[S0]], %[[S1]], %[[S2]])
// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty() : tensor<1x2x1x3x1x4x1x1xf32>
// CHECK-DAG: %[[IN_SLICE:.+]] = tensor.extract_slice %[[INPUT]][0, 0, 0, %[[WG_ID_X]]] [2, 3, 4, 1]
// CHECK: %[[GENERIC:.+]] = linalg.generic
// CHECK-SAME: ins(%[[IN_SLICE]] :
// CHECK-SAME: outs(%[[EMPTY]] :
// CHECK-DAG: %[[WG_ID_Z_0:.+]] = affine.apply affine_map<()[s0, s1, s2] -> ((s1 floordiv s2) floordiv s0)>()[%[[S1]], %[[WG_ID_Z]], %[[S2]]]
// CHECK-DAG: %[[WG_ID_Z_1:.+]] = affine.apply affine_map<()[s0, s1, s2] -> ((s1 floordiv s2) mod s0)>()[%[[S1]], %[[WG_ID_Z]], %[[S2]]]
// CHECK-DAG: %[[WG_ID_Z_2:.+]] = affine.apply affine_map<()[s0, s1] -> (s0 mod s1)>()[%[[WG_ID_Z]], %[[S2]]]
// CHECK: flow.dispatch.tensor.store %[[GENERIC]],
// CHECK-SAME: offsets = [%[[WG_ID_Z_0]], 0, %[[WG_ID_Z_1]], 0, %[[WG_ID_Z_2]], 0, %[[WG_ID_Y]], %[[WG_ID_X]]]
// CHECK-SAME: offsets = [%[[WG_IDS_Z]]#0, 0, %[[WG_IDS_Z]]#1, 0, %[[WG_IDS_Z]]#2, 0, %[[WG_ID_Y]], %[[WG_ID_X]]]
// CHECK-SAME: sizes = [1, 2, 1, 3, 1, 4, 1, 1]
31 changes: 17 additions & 14 deletions compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,8 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
auto numParallelDims = parallelLoopRanges.size();

SmallVector<linalg::ProcInfo, 3> procInfo(numParallelDims);
std::optional<OpFoldResult> splitDim;
std::optional<Value> splitDim;
SmallVector<OpFoldResult> splitNumTiles;
for (size_t dim = 0; dim < numParallelDims; ++dim) {
if (numParallelDims > maxWorkgroupParallelDims &&
dim >= maxWorkgroupParallelDims - 1) {
Expand All @@ -1030,19 +1031,7 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
bindSymbols(builder.getContext(), d0, d1, d2);
OpFoldResult numTiles = affine::makeComposedFoldedAffineApply(
builder, loc, (d1 - d0).ceilDiv(d2), {offset, size, step});
OpFoldResult dimValue;
if (dim == numParallelDims - 1)
dimValue = splitDim.value();
else {
dimValue = affine::makeComposedFoldedAffineApply(
builder, loc, (d0 % d1), {splitDim.value(), numTiles});
splitDim = affine::makeComposedFoldedAffineApply(
builder, loc, (d0).floorDiv(d1), {splitDim.value(), numTiles});
}
procInfo[numParallelDims - dim - 1] = {
getValueOrCreateConstantIndexOp(builder, loc, dimValue),
getValueOrCreateConstantIndexOp(builder, loc, numTiles),
distributionMethod};
splitNumTiles.push_back(numTiles);
continue;
}
procInfo[numParallelDims - dim - 1] = {
Expand All @@ -1052,6 +1041,20 @@ linalg::LinalgLoopDistributionOptions getIREELinalgLoopDistributionOptions(
dim),
distributionMethod};
}
if (splitDim) {
std::reverse(splitNumTiles.begin(), splitNumTiles.end());
auto delinearized = builder.create<affine::AffineDelinearizeIndexOp>(
loc, *splitDim, splitNumTiles, /*hasOuterBound=*/true);
for (auto [i, id, numTiles] :
llvm::enumerate(delinearized.getResults(), splitNumTiles)) {
// We iterate the delinearize results from slowest up to fastest, and
// we know that these are all the highest values of dimension. That is,
// `i = 0` corresponds to the `numParallelDims - 1`-th dimension.
procInfo[i] = {id,
getValueOrCreateConstantIndexOp(builder, loc, numTiles),
distributionMethod};
}
}
return procInfo;
}};
}
Expand Down

0 comments on commit 1ed6350

Please sign in to comment.