From 9aa114a3c5a578c0d1a5b239fa309af60e6957e9 Mon Sep 17 00:00:00 2001 From: ravil-mobile Date: Fri, 15 Nov 2024 21:58:42 +0100 Subject: [PATCH] [AMD] Refactor instruction scheduling hints (#5144) - Renamed instruction scheduling variants - Enabled `buffer-ops` for `local-prefetch` - Added documentation regarding current variants --------- Co-authored-by: Lei Zhang --- test/TritonGPU/amd/amd-instruction-sched.mlir | 10 ++-- third_party/amd/backend/compiler.py | 22 +++++++- .../TritonAMDGPUToLLVM/SchedInstructions.cpp | 55 ++++++++----------- 3 files changed, 50 insertions(+), 37 deletions(-) diff --git a/test/TritonGPU/amd/amd-instruction-sched.mlir b/test/TritonGPU/amd/amd-instruction-sched.mlir index 400c219b6790..011116e1b201 100644 --- a/test/TritonGPU/amd/amd-instruction-sched.mlir +++ b/test/TritonGPU/amd/amd-instruction-sched.mlir @@ -1,8 +1,8 @@ -// RUN: triton-opt %s -split-input-file -triton-amdgpu-insert-instruction-sched-hints -triton-amdgpu-lower-insert-instruction-sched-hints='variant=iglp0' -verify-diagnostics | FileCheck %s -check-prefix=INSERT_IGLP0 -// RUN: triton-opt %s -split-input-file -triton-amdgpu-insert-instruction-sched-hints -triton-amdgpu-lower-insert-instruction-sched-hints='variant=iglp1' -verify-diagnostics | FileCheck %s -check-prefix=INSERT_IGLP1 +// RUN: triton-opt %s -split-input-file -triton-amdgpu-insert-instruction-sched-hints -triton-amdgpu-lower-insert-instruction-sched-hints='variant=llvm-iglp-0' -verify-diagnostics | FileCheck %s -check-prefix=INSERT_IGLP0 +// RUN: triton-opt %s -split-input-file -triton-amdgpu-insert-instruction-sched-hints -triton-amdgpu-lower-insert-instruction-sched-hints='variant=llvm-iglp-1' -verify-diagnostics | FileCheck %s -check-prefix=INSERT_IGLP1 // RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=1' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS1 // RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=2' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -verify-diagnostics | FileCheck %s -check-prefix=INSTR_COUNT_NS2 -// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=2' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -triton-amdgpu-lower-insert-instruction-sched-hints='variant=ck_v3' -debug-only='lower-insert-instruction-sched-hints' -verify-diagnostics 2>&1 | FileCheck %s -check-prefix=USE_CKV3_GLOBAL_LOAD +// RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritonamdgpu-accelerate-matmul='arch-generation-name=gfx942 matrix-instruction-size=32 kPack=1' -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=2' -triton-amdgpu-insert-instruction-sched-hints -decompose-unsupported-amd-conversions -optimize-amd-lds-usage='target-arch=gfx942' -convert-scf-to-cf -convert-index-to-llvm -allocate-shared-memory -convert-triton-amdgpu-to-llvm='arch=gfx942' -triton-amdgpu-lower-insert-instruction-sched-hints='variant=local-prefetch' -debug-only='lower-insert-instruction-sched-hints' -verify-diagnostics 2>&1 | FileCheck %s -check-prefix=USE_LOCAL_PREFETCH_GLOBAL_LOAD // RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=1' | FileCheck %s -check-prefix=LABELING_PS_1 // RUN: triton-opt %s -split-input-file -convert-triton-to-tritongpu='target=hip:gfx942 num-ctas=1 num-warps=4 threads-per-warp=64' -tritongpu-coalesce -tritongpu-remove-layout-conversions -tritonamdgpu-stream-pipeline-v2='num_stages=2' | FileCheck %s -check-prefix=LABELING_PS_2 @@ -68,8 +68,8 @@ module { // INSTR_COUNT_NS2-SAME: numGlobalLoadsB = #amdgpu.InstCounter<4, vector<4xf16>> // INSTR_COUNT_NS2-SAME: numMMAs = #amdgpu.InstCounter<16, tensor<32x32x8xf16>> - // USE_CKV3_GLOBAL_LOAD: [lower-insert-instruction-sched-hints] - // USE_CKV3_GLOBAL_LOAD-SAME: Skipping instruction scheduling because `ck_v3` scheduling can be used only with `buffer_load` instructions. + // USE_LOCAL_PREFETCH_GLOBAL_LOAD: [lower-insert-instruction-sched-hints] + // USE_LOCAL_PREFETCH_GLOBAL_LOAD-SAME: skipping `local-prefetch` scheduling given it needs `buffer_load` instructions // LABELING_PS_1: scf.for // LABELING_PS_1: %[[REG0_OP0:.+]] = tt.load {{.*}} {OpIdx = #amdgpu.OpIdx<0>} diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index e6231a389acb..0433f2458f32 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -52,6 +52,18 @@ class HIPOptions: # instruction scheduling of the AMDGPU backend which aims at maximizing occupancy. # The option is experimental and may change at any time regarding its semantics and/or may # be gone entirely anytime. + # + # Current experimental scheduling variants: + # + # llvm-iglp-0: injects `llvm.amdgcn.iglp_opt` intrinsic call with value `0` to the GEMM's + # k-loop; i.e., "interleave DS and MFMA instructions for small GEMM kernels". + # llvm-iglp-1: injects `llvm.amdgcn.iglp_opt` intrinsic call with value `1` to the GEMM's + # k-loop; i.e., "interleave DS and MFMA instructions for single wave small + # GEMM kernels.". + # local-prefetch: implements instruction scheduling similar to the one from the ROCm Composable + # Kernel library. Note, this variant requires the use of buffer load/store ops + # and a special software pipelining style - i.e., 1x LDS and 1x register + # prefetch buffers for each GEMM tile. instruction_sched_variant: str = 'none' def __post_init__(self): @@ -215,6 +227,7 @@ def make_ttgir(mod, metadata, options): passes.ttgpuir.add_remove_layout_conversions(pm) amd.passes.ttgpuir.add_optimize_epilogue(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) + if amd.has_matrix_core_feature(options.arch): assert options.num_stages != 0, ("Triton AMD backend pipeliner has been updated. " "We used to trigger software pipelining with " @@ -229,7 +242,14 @@ def make_ttgir(mod, metadata, options): passes.ttgpuir.add_reduce_data_duplication(pm) if amd.has_matrix_core_feature(options.arch): amd.passes.ttgpuir.add_reorder_instructions(pm) - if os.environ.get("AMDGCN_USE_BUFFER_OPS", "0") == "1": + + use_buffer_ops = os.environ.get("AMDGCN_USE_BUFFER_OPS", "0") == "1" + + # The `local-prefetch` scheduling variant requires turning on buffer ops. + if options.instruction_sched_variant == "local-prefetch": + use_buffer_ops = True + + if use_buffer_ops: amd.passes.ttgpuir.add_canonicalize_pointers(pm) passes.common.add_canonicalizer(pm) amd.passes.ttgpuir.add_convert_to_buffer_ops(pm) diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp index 64500176a61a..d93f2ca6c6ec 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/SchedInstructions.cpp @@ -6,7 +6,6 @@ #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Pass/Pass.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" -#include "llvm/TargetParser/TargetParser.h" namespace mlir::triton { #define GEN_PASS_DEF_TRITONAMDGPUINSERTINSTRUCTIONSCHEDHINTS @@ -221,12 +220,13 @@ struct InstructionSchedHintsRewriter std::transform(variant.begin(), variant.end(), variant.begin(), [](unsigned char c) { return std::tolower(c); }); - this->schedulingType = llvm::StringSwitch(variant) - .Case("none", SchedulingType::NONE) - .Case("iglp0", SchedulingType::IGLP0) - .Case("iglp1", SchedulingType::IGLP1) - .Case("ck_v3", SchedulingType::CK_V3) - .Default(SchedulingType::UNKNOWN); + this->schedulingType = + llvm::StringSwitch(variant) + .Case("none", SchedulingType::NONE) + .Case("llvm-iglp-0", SchedulingType::LLVM_IGLP_0) + .Case("llvm-iglp-1", SchedulingType::LLVM_IGLP_1) + .Case("local-prefetch", SchedulingType::LOCAL_PREFETCH) + .Default(SchedulingType::UNKNOWN); if (this->numStages < 2) { this->schedulingType = SchedulingType::NONE; @@ -237,26 +237,24 @@ struct InstructionSchedHintsRewriter enum class SchedulingType : uint32_t { NONE = 0, - IGLP0, - IGLP1, - CK_V3, + LLVM_IGLP_0, + LLVM_IGLP_1, + LOCAL_PREFETCH, UNKNOWN }; - // This is the implementation of the CK's V3 pipelining (see - // see ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp). + // The following is inspired by ROCm Composable Kernel library's V3 pipelining + // (see ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.hpp). // This scheduling requires 1x register and 1x LDS buffers combined with the // local (LDS to registers) and global (HBM to registers) data prefetching. - // see: - // include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v3.h - void - createCKV3Schedule(PatternRewriter &rewriter, Location loc, - triton::amdgpu::InstructionSchedHint schedHint) const { + void createLocalPrefetchSchedule( + PatternRewriter &rewriter, Location loc, + triton::amdgpu::InstructionSchedHint schedHint) const { if (!(schedHint.getIsBufferLoadsAEnabled() && schedHint.getIsBufferLoadsBEnabled())) { - LDBG("Skipping instruction scheduling because `ck_v3` " - "scheduling can be used only with `buffer_load` instructions."); + LDBG("skipping `local-prefetch` scheduling given it needs `buffer_load` " + "instructions"); return; } @@ -435,8 +433,8 @@ struct InstructionSchedHintsRewriter // backend documentation. const bool limitSchedulingRange = !(schedulingType == SchedulingType::NONE || - schedulingType == SchedulingType::IGLP0 || - schedulingType == SchedulingType::IGLP1); + schedulingType == SchedulingType::LLVM_IGLP_0 || + schedulingType == SchedulingType::LLVM_IGLP_1); Location loc = instructionSchedHint->getLoc(); Block *block = instructionSchedHint->getBlock(); if (limitSchedulingRange) { @@ -448,22 +446,17 @@ struct InstructionSchedHintsRewriter rewriter.setInsertionPoint(block, std::prev(block->end())); switch (schedulingType) { - case SchedulingType::IGLP0: - [[fallthrough]]; - case SchedulingType::IGLP1: { + case SchedulingType::LLVM_IGLP_0: + case SchedulingType::LLVM_IGLP_1: createIglpOpt(rewriter, loc, static_cast(schedulingType) - 1); break; - } - case SchedulingType::CK_V3: { - createCKV3Schedule(rewriter, loc, instructionSchedHint); + case SchedulingType::LOCAL_PREFETCH: + createLocalPrefetchSchedule(rewriter, loc, instructionSchedHint); break; - } case SchedulingType::NONE: - [[fallthrough]]; - default: { + default: break; } - } if (limitSchedulingRange) createSchedBarrier(rewriter, loc,