diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp index df56f0d76e67..f4c289aec413 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp @@ -1828,9 +1828,9 @@ LogicalResult TensorSplatOp::verify() { LogicalResult TensorCloneOp::verify() { if (failed(verifyOpDynamicDims(getOperation(), {getOperand()}, - getArgumentDims())) || + getOperandDims())) || failed(verifyOpDynamicDims(getOperation(), {getResult()}, - getArgumentDims()))) { + getOperandDims()))) { return failure(); } return success(); @@ -1840,7 +1840,30 @@ LogicalResult TensorCloneOp::verify() { // flow.tensor.barrier //===----------------------------------------------------------------------===// -LogicalResult TensorBarrierOp::verify() { return success(); } +LogicalResult TensorBarrierOp::verify() { + if (failed(verifyOpDynamicDims(getOperation(), {getOperand()}, + getOperandDims()))) { + return failure(); + } + return success(); +} + +Value TensorBarrierOp::getTiedResult(unsigned resultIndex) { + return IREE::Util::TiedOpInterface::findTiedBaseValue(getOperand()); +} + +Value TensorBarrierOp::getTiedResultOperand(Value result) { + return getOperand(); +} + +::std::optional +TensorBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) { + return {0}; // operand +} + +SmallVector TensorBarrierOp::getTiedResultOperandIndices() { + return {0}; // operand +} //===----------------------------------------------------------------------===// // flow.tensor.transfer @@ -1848,9 +1871,9 @@ LogicalResult TensorBarrierOp::verify() { return success(); } LogicalResult TensorTransferOp::verify() { if (failed(verifyOpDynamicDims(getOperation(), {getOperand()}, - getArgumentDims())) || + getOperandDims())) || failed(verifyOpDynamicDims(getOperation(), {getResult()}, - getArgumentDims()))) { + getOperandDims()))) { return failure(); } return success(); diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td index a10982b7fba0..0241a843b906 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td +++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td @@ -1469,14 +1469,14 @@ def FLOW_TensorCloneOp : FLOW_PureOp<"tensor.clone", [ let arguments = (ins FLOW_Tensor:$operand, - FLOW_ShapeDynamicDims:$argument_dims + FLOW_ShapeDynamicDims:$operand_dims ); let results = (outs FLOW_Tensor:$result ); let assemblyFormat = [{ - $operand `:` type($result) (`{` $argument_dims^ `}`)? + $operand `:` type($result) (`{` $operand_dims^ `}`)? attr-dict-with-keyword }]; @@ -1493,8 +1493,8 @@ def FLOW_TensorCloneOp : FLOW_PureOp<"tensor.clone", [ let extraClassDeclaration = [{ bool isHoistableLeafOp() { return false; } - ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); } - ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); } + ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); } + ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); } }]; let hasVerifier = 1; @@ -1506,14 +1506,24 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [ AllTypesMatch<["operand", "result"]>, DeclareOpInterfaceMethods, Util_ShapeAwareOp, + DeclareOpInterfaceMethods, ]> { - let summary = [{}]; + let summary = [{indicates a value that must have a specific affinity}]; let description = [{ + Prevents fusion and scheduling of a value across an affinity boundary. + May introduce copy-on-write behavior if the operand value is used as well as + the result and users should try to keep the operand to a single use by this + op. }]; let arguments = (ins FLOW_Tensor:$operand, - FLOW_ShapeDynamicDims:$argument_dims, + FLOW_ShapeDynamicDims:$operand_dims, AnyAttr:$target ); let results = (outs @@ -1521,7 +1531,7 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [ ); let assemblyFormat = [{ - $operand `:` type($result) (`{` $argument_dims^ `}`)? + $operand `:` type($result) (`{` $operand_dims^ `}`)? `on` $target attr-dict-with-keyword }]; @@ -1540,8 +1550,8 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [ let extraClassDeclaration = [{ bool isHoistableLeafOp() { return false; } - ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); } - ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); } + ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); } + ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); } }]; let hasVerifier = 1; @@ -1564,7 +1574,7 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [ let arguments = (ins FLOW_Tensor:$operand, - FLOW_ShapeDynamicDims:$argument_dims, + FLOW_ShapeDynamicDims:$operand_dims, AnyAttr:$target ); let results = (outs @@ -1572,7 +1582,7 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [ ); let assemblyFormat = [{ - $operand `:` type($result) (`{` $argument_dims^ `}`)? + $operand `:` type($result) (`{` $operand_dims^ `}`)? `to` $target attr-dict-with-keyword }]; @@ -1591,8 +1601,8 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [ let extraClassDeclaration = [{ bool isHoistableLeafOp() { return false; } - ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); } - ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); } + ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); } + ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); } }]; let hasVerifier = 1; diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp index d60e6b19c447..00288cc640d6 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp @@ -229,8 +229,8 @@ struct ConvertTensorCloneOp auto unknownType = rewriter.getType(); auto cloneOp = rewriter.create( op.getLoc(), unknownType, operand.resource, op.getOperand().getType(), - op.getArgumentDims(), operand.resourceSize, op.getResult().getType(), - flattenValues(adaptor.getArgumentDims()), operand.resourceSize, + op.getOperandDims(), operand.resourceSize, op.getResult().getType(), + flattenValues(adaptor.getOperandDims()), operand.resourceSize, executionAffinityAttr); rewriter.replaceOpWithMultiple(op, {{cloneOp, operand.resourceSize}}); return success(); @@ -249,7 +249,7 @@ struct ConvertTensorBarrierOp auto barrierOp = rewriter.create( op.getLoc(), operand.resource.getType(), operand.resource, operand.resourceSize, - /*affinity=*/operand.affinity); + /*affinity=*/executionAffinityAttr); rewriter.replaceOpWithMultiple(op, {{barrierOp, operand.resourceSize}}); return success(); } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir index 4f61917ed439..640da7802f1a 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir @@ -139,14 +139,14 @@ util.func public @tensorSplat(%value: i8, %dim0: index) -> tensor { util.global private @device : !hal.device // CHECK-LABEL: @tensorBarrierDispatch -// CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index) +// CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index) util.func public @tensorBarrierDispatch(%input: tensor, %dim0: index) -> tensor { - // CHECK: %[[BARRIER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[DIM0]]} -> !stream.resource<*> + // CHECK: %[[BARRIER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} %barrier = flow.tensor.barrier %input : tensor{%dim0} on #hal.device.affinity<@device> - // CHECK: %[[SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device>) tensor{%arg2} : index - // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch on(#hal.device.affinity<@device>) @ex::@entry(%[[BARRIER]]) + // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor{%[[DIM0]]} : index + // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch @ex::@entry(%[[BARRIER]]) %0 = flow.dispatch @ex::@entry(%barrier) : (tensor{%dim0}) -> tensor{%dim0} - // CHECK: util.return %[[RESULT]], %[[SIZE]] + // CHECK: util.return %[[RESULT]], %[[RESULT_SIZE]] util.return %0 : tensor } @@ -170,7 +170,7 @@ util.global private @device : !hal.device // CHECK-LABEL: @tensorBarrier // CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index) util.func public @tensorBarrier(%input: tensor, %dim0: index) -> tensor { - // CHECK: %[[TRANSFER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> !stream.resource<*> + // CHECK: %[[TRANSFER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} %transfer = flow.tensor.barrier %input : tensor{%dim0} on #hal.device.affinity<@device> // CHECK: util.return %[[TRANSFER]], %[[INPUT_SIZE]] util.return %transfer : tensor diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp index 4623a7bd6c64..13988a999b2f 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp @@ -2469,6 +2469,19 @@ bool AsyncBarrierOp::isMetadata() { return true; } LogicalResult AsyncBarrierOp::verify() { return success(); } +Value AsyncBarrierOp::getTiedResult(unsigned resultIndex) { + return IREE::Util::TiedOpInterface::findTiedBaseValue(getSource()); +} + +::std::optional +AsyncBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) { + return {0}; // source +} + +SmallVector AsyncBarrierOp::getTiedResultOperandIndices() { + return {0}; // source +} + //===----------------------------------------------------------------------===// // stream.async.transfer //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td index 8ed4bca948fa..62a44d5bac66 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td +++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td @@ -2291,15 +2291,25 @@ def Stream_AsyncCollectiveOp : Stream_Op<"async.collective", [ } def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [ + AllTypesMatch<["source", "result"]>, Stream_AffinityOp, Stream_AsyncPhaseOp, DeclareOpInterfaceMethods, Util_SizeAwareOp, + DeclareOpInterfaceMethods, ]> { - let summary = [{ }]; + let summary = [{indicates a value that must have a specific affinity}]; let description = [{ + Prevents fusion and scheduling of a value across an affinity boundary. + May introduce copy-on-write behavior if the operand value is used as well as + the result and users should try to keep the operand to a single use by this + op. }]; let arguments = (ins @@ -2318,11 +2328,9 @@ def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [ ); let assemblyFormat = [{ + (`on` `(` $affinity^ `)`)? $source `:` type($source) `` `{` $size `}` - (`from` `(` $affinity^ `)`)? - `->` - type($result) attr-dict-with-keyword }]; diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp index 6cb8a2a1ce42..b7ce5dfece90 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp @@ -694,10 +694,14 @@ static LogicalResult applyAsyncTransferOp(IREE::Stream::AsyncTransferOp asyncOp, }; auto currentAffinityAttr = IREE::Stream::AffinityAttr::lookupOrDefault(asyncOp); - bool transferIn = asyncOp.getSourceAffinityAttr() != currentAffinityAttr || - isStaging(asyncOp.getSource()); - bool transferOut = asyncOp.getResultAffinityAttr() != currentAffinityAttr || - isStaging(asyncOp.getResult()); + auto sourceAffinityAttr = asyncOp.getSourceAffinityAttr(); + auto resultAffinityAttr = asyncOp.getResultAffinityAttr(); + bool transferIn = + (sourceAffinityAttr && sourceAffinityAttr != currentAffinityAttr) || + isStaging(asyncOp.getSource()); + bool transferOut = + (resultAffinityAttr && resultAffinityAttr != currentAffinityAttr) || + isStaging(asyncOp.getResult()); auto sourceRange = scope.lookupResourceRange(asyncOp.getSource()); auto targetRange = scope.lookupResourceRange(asyncOp.getResult()); @@ -1274,35 +1278,47 @@ struct ResultReservationSet { }; struct ResultAllocation { + // Affinity for the allocations. + IREE::Stream::AffinityAttr affinityAttr; // Reservations bucketed by lifetime. SmallVector reservationSets; }; +// A map of allocation placement affinities to the alloc reservations requested. +using ResultAllocationMap = + llvm::MapVector>; + // Produces parameters for one or more result allocations composed of an ordered -// set of |reservations| with matching lifetimes. -static ResultAllocation -reserveResultAllocation(ArrayRef reservations) { - // We want deterministic ordering of the allocations for each lifetime type - // so we build them all here and then just nuke the ones we don't end up - // using. - SmallVector sets( - IREE::Stream::getMaxEnumValForLifetime() + 1); - for (auto &reservation : reservations) { - auto &set = - sets[static_cast(reservation.resultType.getLifetime())]; - set.reservationLocs.push_back(reservation.loc); - set.reservationTypes.push_back(reservation.resultType); - set.reservationSizes.push_back(reservation.resultSize); - set.reservations.push_back(std::move(reservation)); - } +// set of |reservations| with matching lifetimes. Allocations will be bucketed +// both by their allocation affinity (where they should be placed) and their +// lifetime (how long they're expected to live). +static std::vector +reserveResultAllocations(ResultAllocationMap &reservationMap) { + std::vector result; + for (auto &[affinityAttr, reservations] : reservationMap) { + // We want deterministic ordering of the allocations for each lifetime type + // so we build them all here and then just nuke the ones we don't end up + // using. + SmallVector sets( + IREE::Stream::getMaxEnumValForLifetime() + 1); + for (auto &reservation : reservations) { + auto &set = + sets[static_cast(reservation.resultType.getLifetime())]; + set.reservationLocs.push_back(reservation.loc); + set.reservationTypes.push_back(reservation.resultType); + set.reservationSizes.push_back(reservation.resultSize); + set.reservations.push_back(std::move(reservation)); + } - // Remove unused sets. This does a bunch of moves and is really bad but eh. - for (int i = sets.size() - 1; i >= 0; --i) { - if (sets[i].reservations.empty()) { - sets.erase(sets.begin() + i); + // Remove unused sets. This does a bunch of moves and is really bad but eh. + for (int i = sets.size() - 1; i >= 0; --i) { + if (sets[i].reservations.empty()) { + sets.erase(sets.begin() + i); + } } + result.push_back(ResultAllocation{affinityAttr, sets}); } - return ResultAllocation{sets}; + return result; } //===----------------------------------------------------------------------===// @@ -1333,6 +1349,49 @@ static Value findTiedYieldResult(Value seedValue) { return {}; } +// Walks up the use-def chain to find an affinity the given local value is +// pinned to. May return nullptr if there's no assigned affinity and the +// enclosing execution region affinity should be used. +// +// TODO(benvanik): change this to use an affinity analysis on the escaping +// value instead. The local value may not have a transfer associated with it. +static IREE::Stream::AffinityAttr findLocalValueAffinity(Value value) { + while (value) { + auto definingOp = value.getDefiningOp(); + if (!definingOp) { + // Block argument or something we don't track locally. + return {}; + } else if (auto transferOp = + dyn_cast(definingOp)) { + return transferOp.getResultAffinityAttr(); + } else if (auto regionOp = dyn_cast(definingOp)) { + // A region op with a yielded value (like stream.async.concurrent). + // Note that we always want to check for tied ops first as that will let + // us skip over the region entirely. + if (auto tiedOp = dyn_cast(definingOp)) { + if (auto tiedValue = tiedOp.getTiedResultOperand(value)) { + value = tiedValue; + continue; + } + } + unsigned resultIndex = cast(value).getResultNumber(); + auto &block = regionOp.getOperation()->getRegion(0).front(); + auto terminatorOp = + cast(block.getTerminator()); + value = terminatorOp.getSuccessorOperands( + RegionBranchPoint::parent())[resultIndex]; + } else if (auto tiedOp = + dyn_cast(definingOp)) { + // If the producer is tied then try to get the operand. + value = tiedOp.getTiedResultOperand(value); + } else { + // Analysis blocked. + break; + } + } + return {}; +} + // Returns a reversed list of subrange operations that lead from an initial // resource down a sequence to |derivedValue|. The first element in the list // will be the last subview of |derivedValue| and the last element will be the @@ -1541,7 +1600,7 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) { auto resourceRange = ResourceRange(arg, operandSize); scope.mapResourceRange(arg, resourceRange, asmState.get()); } - SmallVector resultReservations; + ResultAllocationMap resultReservations; for (auto [result, resultSize] : llvm::zip_equal(executeOp.getResults(), executeOp.getResultSizes())) { auto resultType = llvm::cast(result.getType()); @@ -1623,6 +1682,13 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) { continue; } + // Find a pinned affinity for the value or inherit the execution region + // affinity. + auto allocationAffinity = findLocalValueAffinity(yieldValue); + if (!allocationAffinity) { + allocationAffinity = executeOp.getAffinityAttr(); + } + // Queue up the allocation for packing. ResultReservation resultReservation = { definingOp->getLoc(), result, resultType, resultSize, yieldValue, @@ -1633,54 +1699,56 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) { resultReservation.result.printAsOperand(llvm::dbgs(), asmState); llvm::dbgs() << "\n"; }); - resultReservations.push_back(resultReservation); + resultReservations[allocationAffinity].push_back(resultReservation); } - auto resultAllocation = reserveResultAllocation(resultReservations); - for (auto &reservationSet : resultAllocation.reservationSets) { - // Allocate and tie an operand to the result. - auto timepointType = externalBuilder.getType(); - auto [allocaOp, suballocations] = - IREE::Stream::ResourceAllocaOp::createSuballocations( - timepointType, reservationSet.reservationTypes.front(), - reservationSet.reservationLocs, reservationSet.reservationSizes, - executeOp.getAwaitTimepoint(), executeOp.getAffinityAttr(), - externalBuilder); - newAwaitTimepoints.push_back(allocaOp.getResultTimepoint()); - - auto asmState = getRootAsmState(executeOp->getParentOp()); - LLVM_DEBUG({ - llvm::dbgs() << " + alloc for result reservation set: "; - allocaOp.print(llvm::dbgs(), *asmState); - llvm::dbgs() << ":\n"; - }); - - for (auto [reservation, suballocation] : - llvm::zip_equal(reservationSet.reservations, suballocations)) { - newOperands.push_back(suballocation); - newOperandSizes.push_back(reservation.resultSize); - resultReplacements.push_back( - std::make_pair(reservation.result, suballocation)); - - // Insert entry arg for the new operand tied all the way to the yield. - auto arg = - entryBlock.addArgument(reservation.resultType, reservation.loc); + for (auto &resultAllocation : reserveResultAllocations(resultReservations)) { + for (auto &reservationSet : resultAllocation.reservationSets) { + // Allocate and tie an operand to the result. + auto timepointType = + externalBuilder.getType(); + auto [allocaOp, suballocations] = + IREE::Stream::ResourceAllocaOp::createSuballocations( + timepointType, reservationSet.reservationTypes.front(), + reservationSet.reservationLocs, reservationSet.reservationSizes, + executeOp.getAwaitTimepoint(), resultAllocation.affinityAttr, + externalBuilder); + newAwaitTimepoints.push_back(allocaOp.getResultTimepoint()); + auto asmState = getRootAsmState(executeOp->getParentOp()); LLVM_DEBUG({ - llvm::dbgs() << " + adding entry arg for reservation "; - reservation.result.printAsOperand(llvm::dbgs(), *asmState); - llvm::dbgs() << "{"; - reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState); - llvm::dbgs() << "} from "; - reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState); - llvm::dbgs() << " as "; - arg.printAsOperand(llvm::dbgs(), *asmState); - llvm::dbgs() << "\n"; + llvm::dbgs() << " + alloc for result reservation set: "; + allocaOp.print(llvm::dbgs(), *asmState); + llvm::dbgs() << ":\n"; }); - // Map into scope, updating all aliases. - auto resourceRange = ResourceRange(arg, reservation.resultSize); - scope.mapResourceRange(reservation.yieldValue, resourceRange, - asmState.get()); + for (auto [reservation, suballocation] : + llvm::zip_equal(reservationSet.reservations, suballocations)) { + newOperands.push_back(suballocation); + newOperandSizes.push_back(reservation.resultSize); + resultReplacements.push_back( + std::make_pair(reservation.result, suballocation)); + + // Insert entry arg for the new operand tied all the way to the yield. + auto arg = + entryBlock.addArgument(reservation.resultType, reservation.loc); + + LLVM_DEBUG({ + llvm::dbgs() << " + adding entry arg for reservation "; + reservation.result.printAsOperand(llvm::dbgs(), *asmState); + llvm::dbgs() << "{"; + reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState); + llvm::dbgs() << "} from "; + reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState); + llvm::dbgs() << " as "; + arg.printAsOperand(llvm::dbgs(), *asmState); + llvm::dbgs() << "\n"; + }); + + // Map into scope, updating all aliases. + auto resourceRange = ResourceRange(arg, reservation.resultSize); + scope.mapResourceRange(reservation.yieldValue, resourceRange, + asmState.get()); + } } } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp index a793ad4aaee8..3a26bb9a95bb 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp @@ -150,8 +150,15 @@ struct ExecutePartitionBuilder { // If the op has the same affinity as the partition region we can strip it. // Note that some ops may have affinities that are more specific and we // want to preserve those as long as possible. - if (auto affinityOp = - dyn_cast(clonedOp)) { + if (auto transferOp = dyn_cast(clonedOp)) { + if (transferOp.getSourceAffinityAttr() == partition->affinity) { + transferOp.setSourceAffinityAttr(nullptr); + } + if (transferOp.getResultAffinityAttr() == partition->affinity) { + transferOp.setResultAffinityAttr(nullptr); + } + } else if (auto affinityOp = + dyn_cast(clonedOp)) { if (affinityOp.getAffinityAttr() == partition->affinity) { affinityOp.setAffinityAttr(nullptr); } diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir index 8c2d35fa73a0..79e28ac9ac27 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir @@ -530,6 +530,64 @@ util.func public @applyAsyncTransferOp(%operand: !stream.resource, %s // ----- +// CHECK-LABEL: @applyAsyncTransferMultiScopeOp +// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource, %[[SIZE:.+]]: index) +util.func public @applyAsyncTransferMultiScopeOp(%operand: !stream.resource, %size: index) { + // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device>) : !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINT]]) + // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}, + // CHECK-SAME: %[[ALLOCA]] as %[[ALLOCA_CAPTURE:.+]]: !stream.resource{%[[SIZE]]}) + %result, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource{%size}) -> !stream.resource{%size} { + // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_CAPTURE]][%c0], %[[SIZE]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device>) %[[ALLOCA_CAPTURE]][%c0 for %[[SIZE]]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} + %0 = stream.async.transfer %capture : !stream.resource{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device>) !stream.resource{%size} + stream.yield %0 : !stream.resource{%size} + } => !stream.timepoint + // CHECK: util.optimization_barrier %[[ALLOCA]] + util.optimization_barrier %result : !stream.resource + util.return +} + +// ----- + +// CHECK-LABEL: @applyAsyncConcurrentTransferMultiScopeOp +// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource, %[[SIZE:.+]]: index) +util.func public @applyAsyncConcurrentTransferMultiScopeOp(%operand: !stream.resource, %size: index) { + // CHECK-DAG: %[[ALLOCA_A:.+]], %[[ALLOCA_A_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device_a>) : !stream.resource{%[[SIZE]]} + // CHECK-DAG: %[[ALLOCA_B:.+]], %[[ALLOCA_B_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device_b>) : !stream.resource{%[[SIZE]]} + // CHECK-DAG: %[[ALLOCA_TIMEPOINTS:.+]] = stream.timepoint.join max(%[[ALLOCA_A_TIMEPOINT]], %[[ALLOCA_B_TIMEPOINT]]) + // CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINTS]]) + // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:[a-z0-9_]+]]: !stream.resource{%[[SIZE]]}, + // CHECK-SAME: %[[ALLOCA_A]] as %[[ALLOCA_A_CAPTURE:[a-z0-9_]+]]: !stream.resource{%[[SIZE]]}, + // CHECK-SAME: %[[ALLOCA_B]] as %[[ALLOCA_B_CAPTURE:[a-z0-9_]+]]: !stream.resource{%[[SIZE]]}) + %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource{%size}) -> (!stream.resource{%size}, !stream.resource{%size}) { + // CHECK: stream.cmd.concurrent + %concurrent:2 = stream.async.concurrent with(%capture as %concurrent_capture: !stream.resource{%size}) -> (!stream.resource{%size}, !stream.resource{%size}) { + // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_A_CAPTURE]][%c0], %[[SIZE]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device_a>) %[[ALLOCA_A_CAPTURE]][%c0 for %[[SIZE]]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} + %transfer_a = stream.async.transfer %concurrent_capture : !stream.resource{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device_a>) !stream.resource{%size} + // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_B_CAPTURE]][%c0], %[[SIZE]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} -> !stream.resource{%[[SIZE]]} + // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device_b>) %[[ALLOCA_B_CAPTURE]][%c0 for %[[SIZE]]] + // CHECK-SAME: : !stream.resource{%[[SIZE]]} + %transfer_b = stream.async.transfer %concurrent_capture : !stream.resource{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device_b>) !stream.resource{%size} + stream.yield %transfer_a, %transfer_b : !stream.resource{%size}, !stream.resource{%size} + } + stream.yield %concurrent#0, %concurrent#1 : !stream.resource{%size}, !stream.resource{%size} + } => !stream.timepoint + // CHECK: util.optimization_barrier %[[ALLOCA_A]] + util.optimization_barrier %results#0 : !stream.resource + // CHECK: util.optimization_barrier %[[ALLOCA_B]] + util.optimization_barrier %results#1 : !stream.resource + util.return +} + +// ----- + // CHECK-LABEL: @applyAsyncDispatchOp // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource, %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[END:.+]]: index, %[[LENGTH:.+]]: index) util.func public @applyAsyncDispatchOp(%operand: !stream.resource, %size: index, %offset: index, %end: index, %length: index) { diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir index 7d3c2284aa86..cd6ccca95a20 100644 --- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir +++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir @@ -34,8 +34,8 @@ util.func public @partitioning(%arg0: !stream.resource, %arg1: !stream // ----- -// Tests partitioning multi device execution with barriers and transfers. -// It validates that multi stream commands are created and run in parallel: +// Tests partitioning multi-device execution with barriers and transfers. +// It validates that multi-stream commands are created and run in parallel. // CHECK-LABEL: util.func public @deviceMultiDeviceSync util.func public @deviceMultiDeviceSync(%arg0: i1) -> !stream.resource { @@ -43,37 +43,38 @@ util.func public @deviceMultiDeviceSync(%arg0: i1) -> !stream.resource !stream.resource{%c128} - %1 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch0[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource{%c128}) -> !stream.resource{%c128} - %3 = stream.async.barrier %1 : !stream.resource{%c128} -> !stream.resource - %4 = stream.async.transfer %1 : !stream.resource{%c128} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_1>) !stream.resource{%c128} - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>) + + // CHECK: stream.async.execute on(#hal.device.affinity<@device0>) // CHECK: stream.async.splat // CHECK: stream.async.dispatch // CHECK: stream.async.transfer + %0 = stream.async.splat %c255_i32 : i32 -> !stream.resource{%c128} + %1 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch0[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource{%c128}) -> !stream.resource{%c128} + %3 = stream.async.barrier %1 : !stream.resource{%c128} + %4 = stream.async.transfer %1 : !stream.resource{%c128} from(#hal.device.affinity<@device0>) -> to(#hal.device.affinity<@device1>) !stream.resource{%c128} - %2 = stream.async.dispatch on(#hal.device.affinity<@__device_1>) @ex::@dispatch1[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource{%c128}) -> !stream.resource{%c128} - %5 = stream.async.barrier %2 : !stream.resource{%c128} -> !stream.resource - %6 = stream.async.transfer %2 : !stream.resource{%c128} from(#hal.device.affinity<@__device_1>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%c128} - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_1>) + // CHECK: stream.async.execute on(#hal.device.affinity<@device1>) // CHECK: stream.async.splat // CHECK: stream.async.dispatch // CHECK: stream.async.transfer + %2 = stream.async.dispatch on(#hal.device.affinity<@device1>) @ex::@dispatch1[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource{%c128}) -> !stream.resource{%c128} + %5 = stream.async.barrier %2 : !stream.resource{%c128} + %6 = stream.async.transfer %2 : !stream.resource{%c128} from(#hal.device.affinity<@device1>) -> to(#hal.device.affinity<@device0>) !stream.resource{%c128} - %7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch2[%c1, %c1, %c1](%3[%c0 to %c128 for %c128], %6[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} - %9 = stream.async.barrier %7 : !stream.resource{%c128} -> !stream.resource - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>) + // CHECK: stream.async.execute on(#hal.device.affinity<@device0>) // CHECK: stream.async.dispatch + %7 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch2[%c1, %c1, %c1](%3[%c0 to %c128 for %c128], %6[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} + %8 = stream.async.barrier %7 : !stream.resource{%c128} - %8 = stream.async.dispatch on(#hal.device.affinity<@__device_1>) @ex::@dispatch2[%c1, %c1, %c1](%4[%c0 to %c128 for %c128], %5[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} - %10 = stream.async.transfer %8 : !stream.resource{%c128} from(#hal.device.affinity<@__device_1>) -> to(#hal.device.affinity<@__device_0>) !stream.resource{%c128} - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_1>) + // CHECK: stream.async.execute on(#hal.device.affinity<@device1>) // CHECK: stream.async.dispatch // CHECK: stream.async.transfer + %9 = stream.async.dispatch on(#hal.device.affinity<@device1>) @ex::@dispatch2[%c1, %c1, %c1](%4[%c0 to %c128 for %c128], %5[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} + %10 = stream.async.transfer %9 : !stream.resource{%c128} from(#hal.device.affinity<@device1>) -> to(#hal.device.affinity<@device0>) !stream.resource{%c128} - %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch2[%c1, %c1, %c1](%9[%c0 to %c128 for %c128], %10[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} - // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>) + // CHECK: stream.async.execute on(#hal.device.affinity<@device0>) // CHECK: stream.async.dispatch + %11 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch2[%c1, %c1, %c1](%8[%c0 to %c128 for %c128], %10[%c0 to %c128 for %c128]) : (!stream.resource{%c128}, !stream.resource{%c128}) -> !stream.resource{%c128} util.return %11 : !stream.resource }