[GPU] Add barriers when resolving GPUMappedForall to fix race condition (#19635)

The barriers added here may be pessimistic, and we can look into
optimizing them later if needed. However, without them we end up with a
race. In local testing on an MI300 GPU, I did not find any significant
performance impact from the barriers. For example, an unaligned
matmul + elementwise took 47us with the barriers and 48us without them
on the TileAndFuse path with padding support, while the corresponding
default path takes 68us. The prefill stage of ToyLLAMA took 325us with
the barriers and 324us without, while the default path takes 461us.
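To make the race concrete: resolving a thread-mapped scf.forall produces an
scf.for over a flattened thread id, and the loop body may read or write
shared memory under a thread-to-data layout that differs from the
surrounding code. This patch brackets the resolved loop with gpu.barrier
ops. A sketch of the resolved form, assembled from the test expectations
below (the %c* constants, %v, and %out are assumed defined elsewhere;
bounds are taken from the first test case):

    %tx = gpu.thread_id x
    %ty = gpu.thread_id y
    // Flatten the 2x64 workgroup into a single linear thread id.
    %tid = affine.linearize_index disjoint [%ty, %tx] by (2, 64) : index
    // Barrier before the loop: wait for prior accesses made under the
    // old layout to finish.
    gpu.barrier
    scf.for %i = %c0 to %c1024 step %c128 {
      %idx = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%i)[%tid]
      memref.store %v, %out[%idx] : memref<?xi32>
    }
    // Barrier after the loop: make the loop's writes visible before any
    // later code touches the same memory under a different layout.
    gpu.barrier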

Signed-off-by: Nirvedh Meshram <[email protected]>
nirvedhmeshram authored Jan 8, 2025
1 parent 9b4906e commit c484058
Showing 3 changed files with 22 additions and 3 deletions.
@@ -127,7 +127,12 @@ LogicalResult resolveGPUMappedForallOp(RewriterBase &rewriter,
Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, totalLoopTripCount);
Value step =
rewriter.create<arith::ConstantIndexOp>(loc, flatTotalNumWorkers);
+// We need to add barriers before and after the distributed loop because the
+// loop might have reads/writes to shared memory that can have a different
+// layout compared to the rest of the program.
+rewriter.create<gpu::BarrierOp>(loc);
auto forLoop = rewriter.create<scf::ForOp>(loc, lb, ub, step, ValueRange{});
+rewriter.create<gpu::BarrierOp>(loc);
Block *loopBody = forLoop.getBody();

// Get the replacement IDs for the forall iterator ids.
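For readers unfamiliar with the rewriter API: ops are created in order at
the builder's insertion point, so emitting a barrier, then the loop, then
another barrier yields exactly the guarded structure above. A minimal
standalone sketch of that pattern (the helper name and framing are
hypothetical, not IREE's actual code):

    #include "mlir/Dialect/GPU/IR/GPUDialect.h"
    #include "mlir/Dialect/SCF/IR/SCF.h"
    #include "mlir/IR/Builders.h"

    // Create an scf.for bracketed by gpu.barrier ops at the builder's
    // current insertion point.
    static mlir::scf::ForOp createGuardedLoop(mlir::OpBuilder &b,
                                              mlir::Location loc,
                                              mlir::Value lb, mlir::Value ub,
                                              mlir::Value step) {
      b.create<mlir::gpu::BarrierOp>(loc);  // synchronize before the loop
      auto loop =
          b.create<mlir::scf::ForOp>(loc, lb, ub, step, mlir::ValueRange{});
      b.create<mlir::gpu::BarrierOp>(loc);  // synchronize after the loop
      return loop;
    }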
@@ -16,9 +16,11 @@ func.func @distribute_thread_forall(%out : memref<?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %c0 to %c1024 step %c128 {
// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]]
// CHECK: memref.store {{.*}}[%[[LINID]]]
+// CHECK: gpu.barrier

// -----

@@ -38,9 +40,11 @@ func.func @distribute_warp_forall(%out : memref<?xi32>)
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
// CHECK: %[[WARPSPLIT:.+]]:2 = affine.delinearize_index %[[TFLAT]] into (4, 32)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %c0 to %c32 step %c4 {
// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[WARPSPLIT]]#0]
// CHECK: memref.store {{.*}}[%[[LINID]]]
+// CHECK: gpu.barrier

// -----

@@ -76,7 +80,9 @@ func.func @distribute_thread_forall_drop_for_loop(%out : memref<?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[LINID:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: memref.store {{.*}}[%[[LINID]]]
+// CHECK: gpu.barrier

// -----

@@ -96,8 +102,10 @@ func.func @distribute_thread_forall_single_thread(%out : memref<?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %[[TFLAT]] to %c1 step %c128 {
// CHECK: memref.store {{.*}}[%[[I]]]
+// CHECK: gpu.barrier

// -----

@@ -117,8 +125,10 @@ func.func @distribute_thread_forall_overhang(%out : memref<?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %[[TFLAT]] to %[[C513]] step %c128 {
// CHECK: memref.store {{.*}}[%[[I]]]
+// CHECK: gpu.barrier

// -----

@@ -137,11 +147,12 @@ func.func @distribute_thread_forall_multi_dim(%out : memref<?x?x?xi32>)
// CHECK-DAG: %[[TX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[TY:.+]] = gpu.thread_id y
// CHECK: %[[TFLAT:.+]] = affine.linearize_index disjoint [%[[TY]], %[[TX]]] by (2, 64)
+// CHECK: gpu.barrier
// CHECK: scf.for %[[I:.+]] = %c0 to %c512 step %c128 {
// CHECK: %[[LINID:.+]] = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%[[I]])[%[[TFLAT]]]
// CHECK: %[[DELIN:.+]]:3 = affine.delinearize_index %[[LINID]] into (16, 8, 4) : index
// CHECK: memref.store {{.*}}[%[[DELIN]]#0, %[[DELIN]]#1, %[[DELIN]]#2]
+// CHECK: gpu.barrier

// -----

@@ -157,5 +168,7 @@ func.func @distribute_thread_forall_small_workgroup(%out : memref<?xi32>)
}

// CHECK-LABEL: func @distribute_thread_forall_small_workgroup
-// CHECK: %[[TX:.+]] = gpu.thread_id x
-// CHECK: memref.store {{.*}}[%[[TX]]]
+// CHECK: %[[TX:.+]] = gpu.thread_id x
+// CHECK: gpu.barrier
+// CHECK: memref.store {{.*}}[%[[TX]]]
+// CHECK: gpu.barrier
@@ -908,6 +908,7 @@ hal.executable public @main {
// for loop.
// CHECK: vector.transfer_write %{{.*}}, %[[B2]]{{.*}} memref<10x1xf32, #hal.descriptor_type<storage_buffer>>
// CHECK-NEXT: }
+// CHECK-NEXT: gpu.barrier
// CHECK-NEXT: } {mapping = [#iree_codegen.workgroup_mapping<x>]}
// CHECK-NEXT: return

