diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
index df56f0d76e67..f4c289aec413 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.cpp
@@ -1828,9 +1828,9 @@ LogicalResult TensorSplatOp::verify() {
 
 LogicalResult TensorCloneOp::verify() {
   if (failed(verifyOpDynamicDims(getOperation(), {getOperand()},
-                                 getArgumentDims())) ||
+                                 getOperandDims())) ||
       failed(verifyOpDynamicDims(getOperation(), {getResult()},
-                                 getArgumentDims()))) {
+                                 getOperandDims()))) {
     return failure();
   }
   return success();
@@ -1840,7 +1840,30 @@ LogicalResult TensorCloneOp::verify() {
 // flow.tensor.barrier
 //===----------------------------------------------------------------------===//
 
-LogicalResult TensorBarrierOp::verify() { return success(); }
+// Checks the operand against its dynamic dims via verifyOpDynamicDims.
+// The result shares the operand's type (AllTypesMatch) and reuses the same
+// dims (getResultDynamicDims returns getOperandDims), so no separate result
+// check is performed here.
+LogicalResult TensorBarrierOp::verify() {
+  return verifyOpDynamicDims(getOperation(), {getOperand()}, getOperandDims());
+}
+
+Value TensorBarrierOp::getTiedResult(unsigned resultIndex) {
+  return IREE::Util::TiedOpInterface::findTiedBaseValue(getOperand());
+}
+
+Value TensorBarrierOp::getTiedResultOperand(Value result) {
+  return getOperand();
+}
+
+::std::optional<unsigned>
+TensorBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) {
+  return {0}; // $operand is operand 0; |resultIndex| is not consulted
+}
+
+SmallVector<int64_t> TensorBarrierOp::getTiedResultOperandIndices() {
+  return {0}; // single entry: operand index 0 ($operand)
+}
 
 //===----------------------------------------------------------------------===//
 // flow.tensor.transfer
@@ -1848,9 +1871,9 @@ LogicalResult TensorBarrierOp::verify() { return success(); }
 
 LogicalResult TensorTransferOp::verify() {
   if (failed(verifyOpDynamicDims(getOperation(), {getOperand()},
-                                 getArgumentDims())) ||
+                                 getOperandDims())) ||
       failed(verifyOpDynamicDims(getOperation(), {getResult()},
-                                 getArgumentDims()))) {
+                                 getOperandDims()))) {
     return failure();
   }
   return success();
diff --git a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
index a10982b7fba0..0241a843b906 100644
--- a/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
+++ b/compiler/src/iree/compiler/Dialect/Flow/IR/FlowOps.td
@@ -1469,14 +1469,14 @@ def FLOW_TensorCloneOp : FLOW_PureOp<"tensor.clone", [
 
   let arguments = (ins
     FLOW_Tensor:$operand,
-    FLOW_ShapeDynamicDims:$argument_dims
+    FLOW_ShapeDynamicDims:$operand_dims
   );
   let results = (outs
     FLOW_Tensor:$result
   );
 
   let assemblyFormat = [{
-    $operand `:` type($result) (`{` $argument_dims^ `}`)?
+    $operand `:` type($result) (`{` $operand_dims^ `}`)?
     attr-dict-with-keyword
   }];
 
@@ -1493,8 +1493,8 @@ def FLOW_TensorCloneOp : FLOW_PureOp<"tensor.clone", [
   let extraClassDeclaration = [{
     bool isHoistableLeafOp() { return false; }
 
-    ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); }
-    ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); }
+    ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); }
+    ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); }
   }];
 
   let hasVerifier = 1;
@@ -1506,14 +1506,24 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [
   AllTypesMatch<["operand", "result"]>,
   DeclareOpInterfaceMethods<Util_HoistableOpInterface>,
   Util_ShapeAwareOp,
+  DeclareOpInterfaceMethods<Util_TiedOpInterface, [
+      "getTiedResult",
+      "getTiedResultOperand",
+      "getTiedResultOperandIndex",
+      "getTiedResultOperandIndices",
+  ]>,
 ]> {
-  let summary = [{}];
+  let summary = [{indicates a value that must have a specific affinity}];
   let description = [{
+    Prevents fusion and scheduling of a value across an affinity boundary.
+    May introduce copy-on-write behavior if the operand value is used as well as
+    the result and users should try to keep the operand to a single use by this
+    op.
   }];
 
   let arguments = (ins
     FLOW_Tensor:$operand,
-    FLOW_ShapeDynamicDims:$argument_dims,
+    FLOW_ShapeDynamicDims:$operand_dims,
     AnyAttr:$target
   );
   let results = (outs
@@ -1521,7 +1531,7 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [
   );
 
   let assemblyFormat = [{
-    $operand `:` type($result) (`{` $argument_dims^ `}`)?
+    $operand `:` type($result) (`{` $operand_dims^ `}`)?
     `on` $target
     attr-dict-with-keyword
   }];
@@ -1540,8 +1550,8 @@ def FLOW_TensorBarrierOp : FLOW_PureOp<"tensor.barrier", [
   let extraClassDeclaration = [{
     bool isHoistableLeafOp() { return false; }
 
-    ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); }
-    ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); }
+    ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); }
+    ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); }
   }];
 
   let hasVerifier = 1;
@@ -1564,7 +1574,7 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [
 
   let arguments = (ins
     FLOW_Tensor:$operand,
-    FLOW_ShapeDynamicDims:$argument_dims,
+    FLOW_ShapeDynamicDims:$operand_dims,
     AnyAttr:$target
   );
   let results = (outs
@@ -1572,7 +1582,7 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [
   );
 
   let assemblyFormat = [{
-    $operand `:` type($result) (`{` $argument_dims^ `}`)?
+    $operand `:` type($result) (`{` $operand_dims^ `}`)?
     `to` $target
     attr-dict-with-keyword
   }];
@@ -1591,8 +1601,8 @@ def FLOW_TensorTransferOp : FLOW_PureOp<"tensor.transfer", [
   let extraClassDeclaration = [{
     bool isHoistableLeafOp() { return false; }
 
-    ValueRange getOperandDynamicDims(unsigned idx) { return getArgumentDims(); }
-    ValueRange getResultDynamicDims(unsigned idx) { return getArgumentDims(); }
+    ValueRange getOperandDynamicDims(unsigned idx) { return getOperandDims(); }
+    ValueRange getResultDynamicDims(unsigned idx) { return getOperandDims(); }
   }];
 
   let hasVerifier = 1;
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
index d60e6b19c447..00288cc640d6 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/Patterns.cpp
@@ -229,8 +229,8 @@ struct ConvertTensorCloneOp
     auto unknownType = rewriter.getType<IREE::Stream::ResourceType>();
     auto cloneOp = rewriter.create<IREE::Stream::TensorCloneOp>(
         op.getLoc(), unknownType, operand.resource, op.getOperand().getType(),
-        op.getArgumentDims(), operand.resourceSize, op.getResult().getType(),
-        flattenValues(adaptor.getArgumentDims()), operand.resourceSize,
+        op.getOperandDims(), operand.resourceSize, op.getResult().getType(),
+        flattenValues(adaptor.getOperandDims()), operand.resourceSize,
         executionAffinityAttr);
     rewriter.replaceOpWithMultiple(op, {{cloneOp, operand.resourceSize}});
     return success();
@@ -249,7 +249,7 @@ struct ConvertTensorBarrierOp
     auto barrierOp = rewriter.create<IREE::Stream::AsyncBarrierOp>(
         op.getLoc(), operand.resource.getType(), operand.resource,
         operand.resourceSize,
-        /*affinity=*/operand.affinity);
+        /*affinity=*/executionAffinityAttr);
     rewriter.replaceOpWithMultiple(op, {{barrierOp, operand.resourceSize}});
     return success();
   }
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
index 4f61917ed439..640da7802f1a 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Conversion/FlowToStream/test/tensor_ops.mlir
@@ -139,14 +139,14 @@ util.func public @tensorSplat(%value: i8, %dim0: index) -> tensor<?x128xi8> {
 util.global private @device : !hal.device
 
 // CHECK-LABEL: @tensorBarrierDispatch
-//  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[DIM0:.+]]: index, %[[DIM1:.+]]: index)
+//  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index)
 util.func public @tensorBarrierDispatch(%input: tensor<?x128xi8>, %dim0: index) -> tensor<?x128xi8> {
-  // CHECK: %[[BARRIER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[DIM0]]} -> !stream.resource<*>
+  // CHECK: %[[BARRIER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]}
   %barrier = flow.tensor.barrier %input : tensor<?x128xi8>{%dim0} on #hal.device.affinity<@device>
-  // CHECK: %[[SIZE:.+]] = stream.tensor.sizeof on(#hal.device.affinity<@device>) tensor<?x128xi8>{%arg2} : index
-  // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch on(#hal.device.affinity<@device>) @ex::@entry(%[[BARRIER]])
+  // CHECK: %[[RESULT_SIZE:.+]] = stream.tensor.sizeof tensor<?x128xi8>{%[[DIM0]]} : index
+  // CHECK: %[[RESULT:.+]] = stream.tensor.dispatch @ex::@entry(%[[BARRIER]])
   %0 = flow.dispatch @ex::@entry(%barrier) : (tensor<?x128xi8>{%dim0}) -> tensor<?x128xi8>{%dim0}
-  // CHECK: util.return %[[RESULT]], %[[SIZE]]
+  // CHECK: util.return %[[RESULT]], %[[RESULT_SIZE]]
   util.return %0 : tensor<?x128xi8>
 }
 
@@ -170,7 +170,7 @@ util.global private @device : !hal.device
 // CHECK-LABEL: @tensorBarrier
 //  CHECK-SAME: (%[[INPUT:.+]]: !stream.resource<*>, %[[INPUT_SIZE:.+]]: index, %[[DIM0:.+]]: index)
 util.func public @tensorBarrier(%input: tensor<?x128xi8>, %dim0: index) -> tensor<?x128xi8> {
-  // CHECK: %[[TRANSFER:.+]] = stream.async.barrier %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]} -> !stream.resource<*>
+  // CHECK: %[[TRANSFER:.+]] = stream.async.barrier on(#hal.device.affinity<@device>) %[[INPUT]] : !stream.resource<*>{%[[INPUT_SIZE]]}
   %transfer = flow.tensor.barrier %input : tensor<?x128xi8>{%dim0} on #hal.device.affinity<@device>
   // CHECK: util.return %[[TRANSFER]], %[[INPUT_SIZE]]
   util.return %transfer : tensor<?x128xi8>
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
index 4623a7bd6c64..13988a999b2f 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.cpp
@@ -2469,6 +2469,19 @@ bool AsyncBarrierOp::isMetadata() { return true; }
 
 LogicalResult AsyncBarrierOp::verify() { return success(); }
 
+Value AsyncBarrierOp::getTiedResult(unsigned resultIndex) {
+  return IREE::Util::TiedOpInterface::findTiedBaseValue(getSource());
+}
+
+::std::optional<unsigned>
+AsyncBarrierOp::getTiedResultOperandIndex(unsigned resultIndex) {
+  return {0}; // source
+}
+
+SmallVector<int64_t> AsyncBarrierOp::getTiedResultOperandIndices() {
+  return {0}; // source
+}
+
 //===----------------------------------------------------------------------===//
 // stream.async.transfer
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
index 8ed4bca948fa..62a44d5bac66 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
+++ b/compiler/src/iree/compiler/Dialect/Stream/IR/StreamOps.td
@@ -2291,15 +2291,25 @@ def Stream_AsyncCollectiveOp : Stream_Op<"async.collective", [
 }
 
 def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [
+  AllTypesMatch<["source", "result"]>,
   Stream_AffinityOp,
   Stream_AsyncPhaseOp,
   DeclareOpInterfaceMethods<Stream_StreamableOp, [
     "isMetadata",
   ]>,
   Util_SizeAwareOp,
+  DeclareOpInterfaceMethods<Util_TiedOpInterface, [
+    "getTiedResult",
+    "getTiedResultOperandIndex",
+    "getTiedResultOperandIndices",
+  ]>,
 ]> {
-  let summary = [{ }];
+  let summary = [{indicates a value that must have a specific affinity}];
   let description = [{
+    Prevents fusion and scheduling of a value across an affinity boundary.
+    May introduce copy-on-write behavior if the operand value is used as well as
+    the result and users should try to keep the operand to a single use by this
+    op.
   }];
 
   let arguments = (ins
@@ -2318,11 +2328,9 @@ def Stream_AsyncBarrierOp : Stream_Op<"async.barrier", [
   );
 
   let assemblyFormat = [{
+    (`on` `(` $affinity^ `)`)?
     $source `:` type($source)
     `` `{` $size `}`
-    (`from` `(` $affinity^ `)`)?
-    `->`
-    type($result)
     attr-dict-with-keyword
   }];
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp
index 6cb8a2a1ce42..b7ce5dfece90 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleAllocation.cpp
@@ -694,10 +694,14 @@ static LogicalResult applyAsyncTransferOp(IREE::Stream::AsyncTransferOp asyncOp,
   };
   auto currentAffinityAttr =
       IREE::Stream::AffinityAttr::lookupOrDefault(asyncOp);
-  bool transferIn = asyncOp.getSourceAffinityAttr() != currentAffinityAttr ||
-                    isStaging(asyncOp.getSource());
-  bool transferOut = asyncOp.getResultAffinityAttr() != currentAffinityAttr ||
-                     isStaging(asyncOp.getResult());
+  auto sourceAffinityAttr = asyncOp.getSourceAffinityAttr();
+  auto resultAffinityAttr = asyncOp.getResultAffinityAttr();
+  bool transferIn =
+      (sourceAffinityAttr && sourceAffinityAttr != currentAffinityAttr) ||
+      isStaging(asyncOp.getSource());
+  bool transferOut =
+      (resultAffinityAttr && resultAffinityAttr != currentAffinityAttr) ||
+      isStaging(asyncOp.getResult());
 
   auto sourceRange = scope.lookupResourceRange(asyncOp.getSource());
   auto targetRange = scope.lookupResourceRange(asyncOp.getResult());
@@ -1274,35 +1278,47 @@ struct ResultReservationSet {
 };
 
 struct ResultAllocation {
+  // Affinity for the allocations.
+  IREE::Stream::AffinityAttr affinityAttr;
   // Reservations bucketed by lifetime.
   SmallVector<ResultReservationSet> reservationSets;
 };
 
+// A map of allocation placement affinities to the alloc reservations requested.
+using ResultAllocationMap =
+    llvm::MapVector<IREE::Stream::AffinityAttr, SmallVector<ResultReservation>>;
+
 // Produces parameters for one or more result allocations composed of an ordered
-// set of |reservations| with matching lifetimes.
-static ResultAllocation
-reserveResultAllocation(ArrayRef<ResultReservation> reservations) {
-  // We want deterministic ordering of the allocations for each lifetime type
-  // so we build them all here and then just nuke the ones we don't end up
-  // using.
-  SmallVector<ResultReservationSet> sets(
-      IREE::Stream::getMaxEnumValForLifetime() + 1);
-  for (auto &reservation : reservations) {
-    auto &set =
-        sets[static_cast<unsigned>(reservation.resultType.getLifetime())];
-    set.reservationLocs.push_back(reservation.loc);
-    set.reservationTypes.push_back(reservation.resultType);
-    set.reservationSizes.push_back(reservation.resultSize);
-    set.reservations.push_back(std::move(reservation));
-  }
+// set of reservations taken from |reservationMap|. Allocations are bucketed
+// both by their allocation affinity (where they should be placed) and their
+// lifetime (how long they're expected to live).
+static std::vector<ResultAllocation>
+reserveResultAllocations(ResultAllocationMap &reservationMap) {
+  std::vector<ResultAllocation> result;
+  for (auto &[affinityAttr, reservations] : reservationMap) {
+    // We want deterministic ordering of the allocations for each lifetime type
+    // so we build them all here and then just nuke the ones we don't end up
+    // using.
+    SmallVector<ResultReservationSet> sets(
+        IREE::Stream::getMaxEnumValForLifetime() + 1);
+    for (auto &reservation : reservations) {
+      auto &set =
+          sets[static_cast<unsigned>(reservation.resultType.getLifetime())];
+      set.reservationLocs.push_back(reservation.loc);
+      set.reservationTypes.push_back(reservation.resultType);
+      set.reservationSizes.push_back(reservation.resultSize);
+      set.reservations.push_back(std::move(reservation));
+    }
 
-  // Remove unused sets. This does a bunch of moves and is really bad but eh.
-  for (int i = sets.size() - 1; i >= 0; --i) {
-    if (sets[i].reservations.empty()) {
-      sets.erase(sets.begin() + i);
+    // Remove unused sets. This does a bunch of moves and is really bad but eh.
+    for (int i = sets.size() - 1; i >= 0; --i) {
+      if (sets[i].reservations.empty()) {
+        sets.erase(sets.begin() + i);
+      }
     }
+    result.push_back(ResultAllocation{affinityAttr, std::move(sets)});
   }
-  return ResultAllocation{sets};
+  return result;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1333,6 +1349,49 @@ static Value findTiedYieldResult(Value seedValue) {
   return {};
 }
 
+// Walks up the use-def chain of |value| looking for the nearest
+// stream.async.transfer and returns its result affinity. Returns nullptr when
+// no transfer is reachable (or it has no result affinity); callers should
+// then fall back to the affinity of the enclosing execution region.
+// TODO(benvanik): change this to use an affinity analysis on the escaping
+// value instead. The local value may not have a transfer associated with it.
+static IREE::Stream::AffinityAttr findLocalValueAffinity(Value value) {
+  while (value) {
+    auto definingOp = value.getDefiningOp();
+    if (!definingOp) {
+      // Block argument or something we don't track locally.
+      return {};
+    } else if (auto transferOp =
+                   dyn_cast<IREE::Stream::AsyncTransferOp>(definingOp)) {
+      return transferOp.getResultAffinityAttr(); // May be null.
+    } else if (auto regionOp = dyn_cast<RegionBranchOpInterface>(definingOp)) {
+      // A region op with a yielded value (like stream.async.concurrent).
+      // Tied results are followed to their operand first, skipping the region
+      // entirely; otherwise resume from the value yielded for this result.
+      if (auto tiedOp = dyn_cast<IREE::Util::TiedOpInterface>(definingOp)) {
+        if (auto tiedValue = tiedOp.getTiedResultOperand(value)) {
+          value = tiedValue;
+          continue;
+        }
+      }
+      unsigned resultIndex = cast<OpResult>(value).getResultNumber();
+      auto &block = regionOp.getOperation()->getRegion(0).front();
+      auto terminatorOp =
+          cast<RegionBranchTerminatorOpInterface>(block.getTerminator());
+      value = terminatorOp.getSuccessorOperands(
+          RegionBranchPoint::parent())[resultIndex];
+    } else if (auto tiedOp =
+                   dyn_cast<IREE::Util::TiedOpInterface>(definingOp)) {
+      // Follow the tied operand; a null (untied) result ends the walk.
+      value = tiedOp.getTiedResultOperand(value);
+    } else {
+      // Unknown producer: analysis blocked.
+      break;
+    }
+  }
+  return {};
+}
+
 // Returns a reversed list of subrange operations that lead from an initial
 // resource down a sequence to |derivedValue|. The first element in the list
 // will be the last subview of |derivedValue| and the last element will be the
@@ -1541,7 +1600,7 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
     auto resourceRange = ResourceRange(arg, operandSize);
     scope.mapResourceRange(arg, resourceRange, asmState.get());
   }
-  SmallVector<ResultReservation> resultReservations;
+  ResultAllocationMap resultReservations;
   for (auto [result, resultSize] :
        llvm::zip_equal(executeOp.getResults(), executeOp.getResultSizes())) {
     auto resultType = llvm::cast<IREE::Stream::ResourceType>(result.getType());
@@ -1623,6 +1682,13 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
       continue;
     }
 
+    // Find a pinned affinity for the value or inherit the execution region
+    // affinity.
+    auto allocationAffinity = findLocalValueAffinity(yieldValue);
+    if (!allocationAffinity) {
+      allocationAffinity = executeOp.getAffinityAttr();
+    }
+
     // Queue up the allocation for packing.
     ResultReservation resultReservation = {
         definingOp->getLoc(), result, resultType, resultSize, yieldValue,
@@ -1633,54 +1699,56 @@ allocateExecutionRegion(IREE::Stream::AsyncExecuteOp executeOp) {
       resultReservation.result.printAsOperand(llvm::dbgs(), asmState);
       llvm::dbgs() << "\n";
     });
-    resultReservations.push_back(resultReservation);
+    resultReservations[allocationAffinity].push_back(resultReservation);
   }
-  auto resultAllocation = reserveResultAllocation(resultReservations);
-  for (auto &reservationSet : resultAllocation.reservationSets) {
-    // Allocate and tie an operand to the result.
-    auto timepointType = externalBuilder.getType<IREE::Stream::TimepointType>();
-    auto [allocaOp, suballocations] =
-        IREE::Stream::ResourceAllocaOp::createSuballocations(
-            timepointType, reservationSet.reservationTypes.front(),
-            reservationSet.reservationLocs, reservationSet.reservationSizes,
-            executeOp.getAwaitTimepoint(), executeOp.getAffinityAttr(),
-            externalBuilder);
-    newAwaitTimepoints.push_back(allocaOp.getResultTimepoint());
-
-    auto asmState = getRootAsmState(executeOp->getParentOp());
-    LLVM_DEBUG({
-      llvm::dbgs() << "  + alloc for result reservation set: ";
-      allocaOp.print(llvm::dbgs(), *asmState);
-      llvm::dbgs() << ":\n";
-    });
-
-    for (auto [reservation, suballocation] :
-         llvm::zip_equal(reservationSet.reservations, suballocations)) {
-      newOperands.push_back(suballocation);
-      newOperandSizes.push_back(reservation.resultSize);
-      resultReplacements.push_back(
-          std::make_pair(reservation.result, suballocation));
-
-      // Insert entry arg for the new operand tied all the way to the yield.
-      auto arg =
-          entryBlock.addArgument(reservation.resultType, reservation.loc);
+  for (auto &resultAllocation : reserveResultAllocations(resultReservations)) {
+    for (auto &reservationSet : resultAllocation.reservationSets) {
+      // Allocate and tie an operand to the result.
+      auto timepointType =
+          externalBuilder.getType<IREE::Stream::TimepointType>();
+      auto [allocaOp, suballocations] =
+          IREE::Stream::ResourceAllocaOp::createSuballocations(
+              timepointType, reservationSet.reservationTypes.front(),
+              reservationSet.reservationLocs, reservationSet.reservationSizes,
+              executeOp.getAwaitTimepoint(), resultAllocation.affinityAttr,
+              externalBuilder);
+      newAwaitTimepoints.push_back(allocaOp.getResultTimepoint());
 
+      auto asmState = getRootAsmState(executeOp->getParentOp());
       LLVM_DEBUG({
-        llvm::dbgs() << "    + adding entry arg for reservation ";
-        reservation.result.printAsOperand(llvm::dbgs(), *asmState);
-        llvm::dbgs() << "{";
-        reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState);
-        llvm::dbgs() << "} from ";
-        reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState);
-        llvm::dbgs() << " as ";
-        arg.printAsOperand(llvm::dbgs(), *asmState);
-        llvm::dbgs() << "\n";
+        llvm::dbgs() << "  + alloc for result reservation set: ";
+        allocaOp.print(llvm::dbgs(), *asmState);
+        llvm::dbgs() << ":\n";
       });
 
-      // Map into scope, updating all aliases.
-      auto resourceRange = ResourceRange(arg, reservation.resultSize);
-      scope.mapResourceRange(reservation.yieldValue, resourceRange,
-                             asmState.get());
+      for (auto [reservation, suballocation] :
+           llvm::zip_equal(reservationSet.reservations, suballocations)) {
+        newOperands.push_back(suballocation);
+        newOperandSizes.push_back(reservation.resultSize);
+        resultReplacements.push_back(
+            std::make_pair(reservation.result, suballocation));
+
+        // Insert entry arg for the new operand tied all the way to the yield.
+        auto arg =
+            entryBlock.addArgument(reservation.resultType, reservation.loc);
+
+        LLVM_DEBUG({
+          llvm::dbgs() << "    + adding entry arg for reservation ";
+          reservation.result.printAsOperand(llvm::dbgs(), *asmState);
+          llvm::dbgs() << "{";
+          reservation.resultSize.printAsOperand(llvm::dbgs(), *asmState);
+          llvm::dbgs() << "} from ";
+          reservation.yieldValue.printAsOperand(llvm::dbgs(), *asmState);
+          llvm::dbgs() << " as ";
+          arg.printAsOperand(llvm::dbgs(), *asmState);
+          llvm::dbgs() << "\n";
+        });
+
+        // Map into scope, updating all aliases.
+        auto resourceRange = ResourceRange(arg, reservation.resultSize);
+        scope.mapResourceRange(reservation.yieldValue, resourceRange,
+                               asmState.get());
+      }
     }
   }
 
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp
index a793ad4aaee8..3a26bb9a95bb 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/ScheduleExecution.cpp
@@ -150,8 +150,15 @@ struct ExecutePartitionBuilder {
     // If the op has the same affinity as the partition region we can strip it.
     // Note that some ops may have affinities that are more specific and we
     // want to preserve those as long as possible.
-    if (auto affinityOp =
-            dyn_cast<IREE::Stream::AffinityOpInterface>(clonedOp)) {
+    if (auto transferOp = dyn_cast<IREE::Stream::AsyncTransferOp>(clonedOp)) {
+      if (transferOp.getSourceAffinityAttr() == partition->affinity) {
+        transferOp.setSourceAffinityAttr(nullptr);
+      }
+      if (transferOp.getResultAffinityAttr() == partition->affinity) {
+        transferOp.setResultAffinityAttr(nullptr);
+      }
+    } else if (auto affinityOp =
+                   dyn_cast<IREE::Stream::AffinityOpInterface>(clonedOp)) {
       if (affinityOp.getAffinityAttr() == partition->affinity) {
         affinityOp.setAffinityAttr(nullptr);
       }
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir
index 8c2d35fa73a0..79e28ac9ac27 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_allocation.mlir
@@ -530,6 +530,64 @@ util.func public @applyAsyncTransferOp(%operand: !stream.resource<transient>, %s
 
 // -----
 
+// CHECK-LABEL: @applyAsyncTransferMultiScopeOp
+// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
+util.func public @applyAsyncTransferMultiScopeOp(%operand: !stream.resource<transient>, %size: index) {
+  // CHECK: %[[ALLOCA:.+]], %[[ALLOCA_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device>) : !stream.resource<transient>{%[[SIZE]]}
+  // CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINT]])
+  // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]},
+  // CHECK-SAME:      %[[ALLOCA]] as %[[ALLOCA_CAPTURE:.+]]: !stream.resource<transient>{%[[SIZE]]})
+  %result, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource<transient>{%size}) -> !stream.resource<transient>{%size} {
+    // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_CAPTURE]][%c0], %[[SIZE]]
+    // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]} -> !stream.resource<transient>{%[[SIZE]]}
+    // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device>) %[[ALLOCA_CAPTURE]][%c0 for %[[SIZE]]]
+    // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]}
+    %0 = stream.async.transfer %capture : !stream.resource<transient>{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device>) !stream.resource<transient>{%size}
+    stream.yield %0 : !stream.resource<transient>{%size}
+  } => !stream.timepoint
+  // CHECK: util.optimization_barrier %[[ALLOCA]]
+  util.optimization_barrier %result : !stream.resource<transient>
+  util.return
+}
+
+// -----
+
+// CHECK-LABEL: @applyAsyncConcurrentTransferMultiScopeOp
+// CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index)
+util.func public @applyAsyncConcurrentTransferMultiScopeOp(%operand: !stream.resource<transient>, %size: index) {
+  // CHECK-DAG: %[[ALLOCA_A:.+]], %[[ALLOCA_A_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device_a>) : !stream.resource<transient>{%[[SIZE]]}
+  // CHECK-DAG: %[[ALLOCA_B:.+]], %[[ALLOCA_B_TIMEPOINT:.+]] = stream.resource.alloca uninitialized on(#hal.device.affinity<@result_device_b>) : !stream.resource<transient>{%[[SIZE]]}
+  // CHECK-DAG: %[[ALLOCA_TIMEPOINTS:.+]] = stream.timepoint.join max(%[[ALLOCA_A_TIMEPOINT]], %[[ALLOCA_B_TIMEPOINT]])
+  // CHECK: stream.cmd.execute on(#hal.device.affinity<@execution_device>) await(%[[ALLOCA_TIMEPOINTS]])
+  // CHECK-SAME: with(%[[OPERAND]] as %[[OPERAND_CAPTURE:[a-z0-9_]+]]: !stream.resource<transient>{%[[SIZE]]},
+  // CHECK-SAME:      %[[ALLOCA_A]] as %[[ALLOCA_A_CAPTURE:[a-z0-9_]+]]: !stream.resource<transient>{%[[SIZE]]},
+  // CHECK-SAME:      %[[ALLOCA_B]] as %[[ALLOCA_B_CAPTURE:[a-z0-9_]+]]: !stream.resource<transient>{%[[SIZE]]})
+  %results:2, %result_timepoint = stream.async.execute on(#hal.device.affinity<@execution_device>) with(%operand as %capture: !stream.resource<transient>{%size}) -> (!stream.resource<transient>{%size}, !stream.resource<transient>{%size}) {
+    // CHECK: stream.cmd.concurrent
+    %concurrent:2 = stream.async.concurrent with(%capture as %concurrent_capture: !stream.resource<transient>{%size}) -> (!stream.resource<transient>{%size}, !stream.resource<transient>{%size}) {
+      // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_A_CAPTURE]][%c0], %[[SIZE]]
+      // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]} -> !stream.resource<transient>{%[[SIZE]]}
+      // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device_a>) %[[ALLOCA_A_CAPTURE]][%c0 for %[[SIZE]]]
+      // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]}
+      %transfer_a = stream.async.transfer %concurrent_capture : !stream.resource<transient>{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device_a>) !stream.resource<transient>{%size}
+      // CHECK: stream.cmd.copy %[[OPERAND_CAPTURE]][%c0], %[[ALLOCA_B_CAPTURE]][%c0], %[[SIZE]]
+      // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]} -> !stream.resource<transient>{%[[SIZE]]}
+      // CHECK: stream.cmd.flush to(#hal.device.affinity<@result_device_b>) %[[ALLOCA_B_CAPTURE]][%c0 for %[[SIZE]]]
+      // CHECK-SAME: : !stream.resource<transient>{%[[SIZE]]}
+      %transfer_b = stream.async.transfer %concurrent_capture : !stream.resource<transient>{%size} from(#hal.device.affinity<@execution_device>) -> to(#hal.device.affinity<@result_device_b>) !stream.resource<transient>{%size}
+      stream.yield %transfer_a, %transfer_b : !stream.resource<transient>{%size}, !stream.resource<transient>{%size}
+    }
+    stream.yield %concurrent#0, %concurrent#1 : !stream.resource<transient>{%size}, !stream.resource<transient>{%size}
+  } => !stream.timepoint
+  // CHECK: util.optimization_barrier %[[ALLOCA_A]]
+  util.optimization_barrier %results#0 : !stream.resource<transient>
+  // CHECK: util.optimization_barrier %[[ALLOCA_B]]
+  util.optimization_barrier %results#1 : !stream.resource<transient>
+  util.return
+}
+
+// -----
+
 // CHECK-LABEL: @applyAsyncDispatchOp
 // CHECK-SAME: (%[[OPERAND:.+]]: !stream.resource<transient>, %[[SIZE:.+]]: index, %[[OFFSET:.+]]: index, %[[END:.+]]: index, %[[LENGTH:.+]]: index)
 util.func public @applyAsyncDispatchOp(%operand: !stream.resource<transient>, %size: index, %offset: index, %end: index, %length: index) {
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir
index 7d3c2284aa86..cd6ccca95a20 100644
--- a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir
+++ b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/schedule_execution.mlir
@@ -34,8 +34,8 @@ util.func public @partitioning(%arg0: !stream.resource<external>, %arg1: !stream
 
 // -----
 
-// Tests partitioning multi device execution with barriers and transfers.
-// It validates that multi stream commands are created and run in parallel:
+// Tests partitioning multi-device execution with barriers and transfers.
+// Validates that per-device execution regions are created to run in parallel.
 
 // CHECK-LABEL: util.func public @deviceMultiDeviceSync
 util.func public @deviceMultiDeviceSync(%arg0: i1) -> !stream.resource<transient> {
@@ -43,37 +43,38 @@ util.func public @deviceMultiDeviceSync(%arg0: i1) -> !stream.resource<transient
   %c1 = arith.constant 1 : index
   %c128 = arith.constant 128 : index
   %c255_i32 = arith.constant 255 : i32
-  %0 = stream.async.splat %c255_i32 : i32 -> !stream.resource<transient>{%c128}
-  %1 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch0[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  %3 = stream.async.barrier %1 : !stream.resource<transient>{%c128} -> !stream.resource<transient>
-  %4 = stream.async.transfer %1 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_1>) !stream.resource<transient>{%c128}
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>)
+
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device0>)
   // CHECK: stream.async.splat
   // CHECK: stream.async.dispatch
   // CHECK: stream.async.transfer
+  %0 = stream.async.splat %c255_i32 : i32 -> !stream.resource<transient>{%c128}
+  %1 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch0[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
+  %3 = stream.async.barrier %1 : !stream.resource<transient>{%c128}
+  %4 = stream.async.transfer %1 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@device0>) -> to(#hal.device.affinity<@device1>) !stream.resource<transient>{%c128}
 
-  %2 = stream.async.dispatch on(#hal.device.affinity<@__device_1>) @ex::@dispatch1[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  %5 = stream.async.barrier %2 : !stream.resource<transient>{%c128} -> !stream.resource<transient>
-  %6 = stream.async.transfer %2 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@__device_1>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<transient>{%c128}
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_1>)
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device1>)
   // CHECK: stream.async.splat
   // CHECK: stream.async.dispatch
   // CHECK: stream.async.transfer
+  %2 = stream.async.dispatch on(#hal.device.affinity<@device1>) @ex::@dispatch1[%c1, %c1, %c1](%0[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
+  %5 = stream.async.barrier %2 : !stream.resource<transient>{%c128}
+  %6 = stream.async.transfer %2 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@device1>) -> to(#hal.device.affinity<@device0>) !stream.resource<transient>{%c128}
 
-  %7 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch2[%c1, %c1, %c1](%3[%c0 to %c128 for %c128], %6[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  %9 = stream.async.barrier %7 : !stream.resource<transient>{%c128} -> !stream.resource<transient>
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>)
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device0>)
   // CHECK: stream.async.dispatch
+  %7 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch2[%c1, %c1, %c1](%3[%c0 to %c128 for %c128], %6[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
+  %8 = stream.async.barrier %7 : !stream.resource<transient>{%c128}
 
-  %8 = stream.async.dispatch on(#hal.device.affinity<@__device_1>) @ex::@dispatch2[%c1, %c1, %c1](%4[%c0 to %c128 for %c128], %5[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  %10 = stream.async.transfer %8 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@__device_1>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<transient>{%c128}
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_1>)
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device1>)
   // CHECK: stream.async.dispatch
   // CHECK: stream.async.transfer
+  %9 = stream.async.dispatch on(#hal.device.affinity<@device1>) @ex::@dispatch2[%c1, %c1, %c1](%4[%c0 to %c128 for %c128], %5[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
+  %10 = stream.async.transfer %9 : !stream.resource<transient>{%c128} from(#hal.device.affinity<@device1>) -> to(#hal.device.affinity<@device0>) !stream.resource<transient>{%c128}
 
-  %11 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @ex::@dispatch2[%c1, %c1, %c1](%9[%c0 to %c128 for %c128], %10[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
-  // CHECK: stream.async.execute on(#hal.device.affinity<@__device_0>)
+  // CHECK: stream.async.execute on(#hal.device.affinity<@device0>)
   // CHECK: stream.async.dispatch
+  %11 = stream.async.dispatch on(#hal.device.affinity<@device0>) @ex::@dispatch2[%c1, %c1, %c1](%8[%c0 to %c128 for %c128], %10[%c0 to %c128 for %c128]) : (!stream.resource<transient>{%c128}, !stream.resource<transient>{%c128}) -> !stream.resource<transient>{%c128}
 
   util.return %11 : !stream.resource<transient>
 }