[GPU] Add barriers when resolving GPUMappedForall to fix race condition (#19635)

The barriers added here may be pessimistic, and we can look into
optimizing them later if needed. However, without them we end up with a
race. In local testing on an MI300 GPU, I did not find any significant
performance impact from the barriers. For example, an unaligned
matmul + elementwise took 47us with the barriers and 48us without them
on the TileAndFuse path with padding support, while the corresponding
default path takes 68us. The prefill stage of ToyLLAMA took 325us with
the barriers and 324us without, while the default path takes 461us.
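To make the race concrete: resolving a thread-mapped scf.forall produces an
scf.for over a flattened thread id, and the loop body may read or write
shared memory under a thread-to-data layout that differs from the
surrounding code. This patch brackets the resolved loop with gpu.barrier
ops. A sketch of the resolved form, assembled from the test expectations
below (the %c* constants, %v, and %out are assumed defined elsewhere;
bounds are taken from the first test case):

    %tx = gpu.thread_id x
    %ty = gpu.thread_id y
    // Flatten the 2x64 workgroup into a single linear thread id.
    %tid = affine.linearize_index disjoint [%ty, %tx] by (2, 64) : index
    // Barrier before the loop: wait for prior accesses made under the
    // old layout to finish.
    gpu.barrier
    scf.for %i = %c0 to %c1024 step %c128 {
      %idx = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%i)[%tid]
      memref.store %v, %out[%idx] : memref<?xi32>
    }
    // Barrier after the loop: make the loop's writes visible before any
    // later code touches the same memory under a different layout.
    gpu.barrier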

Signed-off-by: Nirvedh Meshram <[email protected]>
nirvedhmeshram authored Jan 8, 2025
1 parent 9b4906e commit c484058
Showing 3 changed files with 22 additions and 3 deletions.
@@ -127,7 +127,12 @@ LogicalResult resolveGPUMappedForallOp(RewriterBase &rewriter,
Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, totalLoopTripCount);
Value step =
rewriter.create<arith::ConstantIndexOp>(loc, flatTotalNumWorkers);
+// We need to add barriers before and after the distributed loop because the
+// loop might have reads/writes to shared memory that can have a different
+// layout compared to the rest of the program.
+rewriter.create<gpu::BarrierOp>(loc);
auto forLoop = rewriter.create<scf::ForOp>(loc, lb, ub, step, ValueRange{});
+rewriter.create<gpu::BarrierOp>(loc);
Block *loopBody = forLoop.getBody();

// Get the replacement IDs for the forall iterator ids.
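For readers unfamiliar with the rewriter API: ops are created in order at
the builder's insertion point, so emitting a barrier, then the loop, then
another barrier yields exactly the guarded structure above. A minimal
standalone sketch of that pattern (the helper name and framing are
hypothetical, not IREE's actual code):

    #include "mlir/Dialect/GPU/IR/GPUDialect.h"
    #include "mlir/Dialect/SCF/IR/SCF.h"
    #include "mlir/IR/Builders.h"

    // Create an scf.for bracketed by gpu.barrier ops at the builder's
    // current insertion point.
    static mlir::scf::ForOp createGuardedLoop(mlir::OpBuilder &b,
                                              mlir::Location loc,
                                              mlir::Value lb, mlir::Value ub,
                                              mlir::Value step) {
      b.create<mlir::gpu::BarrierOp>(loc);  // synchronize before the loop
      auto loop =
          b.create<mlir::scf::ForOp>(loc, lb, ub, step, mlir::ValueRange{});
      b.create<mlir::gpu::BarrierOp>(loc);  // synchronize after the loop
      return loop;
    }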
@@ -16,9 +16,11 @@ func.func @distribute_thread_forall(%out : memref<?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %c0 to %c1024 step %c128 {
// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]]
// CHECK: memref.store {{.*}}[%[[LINID]]]
+// CHECK: gpu.barrier

// -----

@@ -38,9 +40,11 @@ func.func @distribute_warp_forall(%out : memref<?xi32>)
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
// CHECK: %[[WARPSPLIT:.+]]:2 = affine.delinearize_index %[[TFLAT]] into (4, 32)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %c0 to %c32 step %c4 {
// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[WARPSPLIT]]#0]
// CHECK: memref.store {{.*}}[%[[LINID]]]
+// CHECK: gpu.barrier

// -----

@@ -76,7 +80,9 @@ func.func @distribute_thread_forall_drop_for_loop(%out : memref<?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[LINID:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: memref.store {{.*}}[%[[LINID]]]
+// CHECK: gpu.barrier

// -----

@@ -96,8 +102,10 @@ func.func @distribute_thread_forall_single_thread(%out : memref<?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %[[TFLAT]] to %c1 step %c128 {
// CHECK: memref.store {{.*}}[%[[I]]]
+// CHECK: gpu.barrier

// -----

@@ -117,8 +125,10 @@ func.func @distribute_thread_forall_overhang(%out : memref<?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %[[TFLAT]] to %[[C513]] step %c128 {
// CHECK: memref.store {{.*}}[%[[I]]]
+// CHECK: gpu.barrier

// -----

@@ -137,11 +147,12 @@ func.func @distribute_thread_forall_multi_dim(%out : memref<?x?x?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %c0 to %c512 step %c128 {
// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]]
// CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (16, 8, 4) : index
// CHECK: memref.store {{.*}}[%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2]
+// CHECK: gpu.barrier

// -----

@@ -157,5 +168,7 @@ func.func @distribute_thread_forall_small_workgroup(%out : memref<?xi32>)
}

// CHECK-LABEL: func @distribute_thread_forall_small_workgroup
-// CHECK: %[[TX:.+]] = gpu.thread_id x
-// CHECK: memref.store {{.*}}[%[[TX]]]
+// CHECK: %[[TX:.+]] = gpu.thread_id x
+// CHECK: gpu.barrier
+// CHECK: memref.store {{.*}}[%[[TX]]]
+// CHECK: gpu.barrier
@@ -908,6 +908,7 @@ hal.executable public @main {
// for loop.
// CHECK: vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type<storage_buffer>>
// CHECK-NEXT: }
+// CHECK-NEXT: gpu.barrier
// CHECK-NEXT: } {mapping = [#iree_codegen.workgroup_mapping<x>]}
// CHECK-NEXT: return

