Triton benchmarks #1356
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow display name, and the per-run title taken from the `run_name`
# dispatch input.
name: Triton benchmarks
run-name: ${{ inputs.run_name }}
# Triggers: manual dispatch (with the inputs below), a daily cron schedule,
# and pull requests against `main` that touch this workflow file or anything
# under `benchmarks/`.
on:
  workflow_dispatch:
    inputs:
      runner_label:
        description: Runner label, keep empty for default
        type: string
        default: ""
      tag:
        description: Tag for benchmark results
        type: string
        default: "test"
      benchmarking_method:
        description: The method used to obtain performance numbers
        type: choice
        options:
          - ELAPSED_TIME
          - UPSTREAM_PYTORCH_PROFILER
        default: UPSTREAM_PYTORCH_PROFILER
      verify:
        description: Verify the benchmark results
        type: boolean
        default: true
      run_name:
        description: Run name
        type: string
        default: "Triton benchmarks"
      skip_benchmarks:
        # Parsed with fromJson() in each benchmark step's `if:` condition.
        description: JSON list of benchmarks to skip
        type: string
        default: "[]"
      use_pyenv_python:
        description: Use Python built with pyenv
        type: boolean
        default: false
  schedule:
    # Every day at 23:05 (GitHub evaluates cron expressions in UTC).
    - cron: "5 23 * * *"
  pull_request:
    branches:
      - main
    paths:
      - .github/workflows/triton-benchmarks.yml
      - benchmarks/**
# Grant the workflow token read access to all scopes and write access to none.
permissions: read-all
# Workflow-wide environment. `inputs.*` is only populated on manual dispatch,
# so each expression carries its own fallback for the other triggers.
env:
  PYTHON_VERSION: "3.10"
  # Dispatch input if set, otherwise UPSTREAM_PYTORCH_PROFILER.
  BENCHMARKING_METHOD: ${{ inputs.benchmarking_method || 'UPSTREAM_PYTORCH_PROFILER' }}
  # '1' on pull_request and schedule events, or when the `verify` input is
  # true; '0' otherwise.
  VERIFY: ${{ (github.event_name == 'pull_request' || github.event_name == 'schedule' || inputs.verify) && '1' || '0' }}
  # First non-empty of: the `tag` input, 'pr-<number>' on pull requests,
  # 'ci' on scheduled runs, and finally 'test'.
  TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
# Single job: set up Python, PyTorch and Triton, install the benchmark
# package, run each benchmark, and upload the generated reports.
#
# Every benchmark step is gated on the same `if:` pattern: the "Install
# benchmarks" step succeeded, the run was not cancelled, and the step's key is
# not listed in the `skip_benchmarks` input. Because the condition names
# `!cancelled()` instead of relying on the implicit `success()`, a failing
# benchmark does not prevent the following ones from running.
jobs:
  build:
    name: Triton benchmarks
    runs-on:
      - ${{ inputs.runner_label || 'max1550' }}
    timeout-minutes: 720
    defaults:
      run:
        # Every `run:` script is executed after sourcing the oneAPI
        # environment script (its output is discarded).
        shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}"
    steps:
      - name: Print inputs
        env:
          # Handed over through the environment instead of being interpolated
          # into the script text, so input values are never parsed as shell
          # code and are printed verbatim.
          INPUTS_JSON: ${{ toJSON(inputs) }}
        run: |
          printf '%s\n' "$INPUTS_JSON"

      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install Python
        if: ${{ !(inputs.use_pyenv_python || false) }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      # The name shows the version that is actually requested below; the
      # workflow declares no `python_version` input.
      - name: Install Python (from pyenv) ${{ env.PYTHON_VERSION }}
        if: ${{ inputs.use_pyenv_python }}
        uses: ./.github/actions/setup-pyenv-python
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Identify Python version
        run: |
          # Export the major.minor version of the interpreter on PATH to the
          # following steps.
          PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info[0]}.{ sys.version_info[1]}")')"
          echo "PYTHON_VERSION=$PYTHON_VERSION" | tee -a "$GITHUB_ENV"

      - name: Install Python build dependencies
        run: |
          pip install wheel cmake

      - name: Setup PyTorch
        uses: ./.github/actions/setup-pytorch

      - name: Build Triton wheels
        uses: ./.github/actions/setup-triton
        with:
          command: DEBUG=1 python setup.py bdist_wheel

      - name: Install Triton
        run: |
          pip install python/dist/*.whl

      - name: Install benchmark dependencies
        run: |
          pip install matplotlib pandas tabulate

      - name: Create reports dir
        run: |
          # -p keeps the step from failing if the directory already exists.
          mkdir -p reports
          echo "REPORTS=$PWD/reports" >> "$GITHUB_ENV"

      - name: Install benchmarks
        id: install
        run: |
          cd benchmarks
          pip install .

      - name: Run Triton Softmax kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python fused_softmax.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/softmax-performance.csv" "$REPORTS/softmax-triton-report.csv" --benchmark softmax --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py "$REPORTS/softmax-performance.csv" "$REPORTS/softmax-xetla-report.csv" --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Triton GEMM kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_benchmark.py --reports "$REPORTS"
          # Rename so later GEMM variants, which write the same file name, do
          # not overwrite this result.
          mv "$REPORTS/matmul-performance.csv" "$REPORTS/matmul-performance-base.csv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-base.csv" "$REPORTS/gemm-triton-report.csv" --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-base.csv" "$REPORTS/gemm-xetla-report.csv" --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Triton GEMM kernel benchmark - advanced path
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          # Advanced path:
          TRITON_INTEL_ADVANCED_PATH=1 \
          IGC_VISAOptions=" -enableBCR -nolocalra" \
          IGC_DisableLoopUnroll=1 \
          python gemm_benchmark.py --reports "$REPORTS"
          mv "$REPORTS/matmul-performance.csv" "$REPORTS/matmul-performance-adv-path.csv"
          source ../../scripts/capture-hw-details.sh
          # Results of this variant are reported under "<tag>-adv".
          TAG="${TAG}-adv"
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-adv-path.csv" "$REPORTS/gemm-triton-advanced-report.csv" --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton GEMM (A@B^t) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRANSPOSE_B=1 python gemm_benchmark.py --reports "$REPORTS"
          mv "$REPORTS/matmul-performance.csv" "$REPORTS/matmul-performance-bt.csv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-bt.csv" "$REPORTS/gemm-bt-triton-report.csv" --benchmark gemm-bt --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-bt.csv" "$REPORTS/gemm-bt-onednn-report.csv" --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag "$TAG"

      - name: Run Triton GEMM (A^t@B) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRANSPOSE_A=1 python gemm_benchmark.py --reports "$REPORTS"
          mv "$REPORTS/matmul-performance.csv" "$REPORTS/matmul-performance-at.csv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-at.csv" "$REPORTS/gemm-at-triton-report.csv" --benchmark gemm-at --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-at.csv" "$REPORTS/gemm-at-onednn-report.csv" --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag "$TAG"

      - name: Run Triton GEMM (stream-k) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_streamk_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-streamk-performance.csv" "$REPORTS/gemm-streamk-triton-report.csv" --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py "$REPORTS/matmul-streamk-performance.csv" "$REPORTS/gemm-streamk-xetla-report.csv" --benchmark gemm-streamk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Triton GEMM (split-k) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_splitk_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-splitk-performance.csv" "$REPORTS/gemm-splitk-triton-report.csv" --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py "$REPORTS/matmul-splitk-performance.csv" "$REPORTS/gemm-splitk-xetla-report.csv" --benchmark gemm-splitk --compiler xetla --param_cols "M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Triton GEMM + PreOp (exp) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_preop_exp_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-preop-exp.csv" "$REPORTS/gemm-preop-exp-triton-report.csv" --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_postop_gelu_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-postop-gelu.csv" "$REPORTS/gemm-postop-gelu-triton-report.csv" --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      # The bfloat16 and int8 add-matrix steps share one skip key, so listing
      # 'gemm_postop_addmatrix_benchmark.py' skips both.
      - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark bfloat16
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python gemm_postop_addmatrix_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-postop-addmatrix-bfloat16.csv" "$REPORTS/gemm-postop-addmatrix-bfloat16-triton-report.csv" --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark int8
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          INT8_ONLY=1 python gemm_postop_addmatrix_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/matmul-performance-postop-addmatrix-int8.csv" "$REPORTS/gemm-postop-addmatrix-int8-triton-report.csv" --benchmark gemm-postop-addmatrix-int8 --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton FA fwd kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python flash_attention_benchmark.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/attn-performance.csv" "$REPORTS/attn-triton-report.csv" --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL,MODE" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py "$REPORTS/attn-performance.csv" "$REPORTS/attn-xetla-report.csv" --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL,MODE" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Triton FA fwd kernel benchmark - advanced path
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py_advanced') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          TRITON_INTEL_ADVANCED_PATH=1 \
          IGC_VISAOptions=" -enableBCR" \
          python flash_attention_benchmark.py --reports "$REPORTS"
          # Results of this variant are reported under "<tag>-adv".
          TAG="${TAG}-adv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/attn-performance.csv" "$REPORTS/attn-triton-advanced-report.csv" --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL,MODE" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run Triton FA bwd kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_benchmark.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          FA_KERNEL_MODE="bwd" \
          python flash_attention_benchmark.py --reports "$REPORTS"
          mv "$REPORTS/attn-performance.csv" "$REPORTS/attn-bwd-performance.csv"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/attn-bwd-performance.csv" "$REPORTS/attn-bwd-triton-report.csv" --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL,MODE" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"
          python ../../scripts/build_report.py "$REPORTS/attn-bwd-performance.csv" "$REPORTS/attn-bwd-xetla-report.csv" --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL,MODE" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag "$TAG"

      - name: Run Prefix Sums kernel benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }}
        run: |
          cd benchmarks/triton_kernels_benchmark
          python prefix_sums.py --reports "$REPORTS"
          source ../../scripts/capture-hw-details.sh
          python ../../scripts/build_report.py "$REPORTS/prefix-sums.csv" "$REPORTS/prefix_sums-triton-report.csv" --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag "$TAG"

      - name: Run micro benchmark
        if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }}
        run: |
          cd benchmarks/micro_benchmarks
          python run_benchmarks.py --reports "$REPORTS"

      # Uploaded whenever the install step succeeded and the run was not
      # cancelled, even if individual benchmark steps failed.
      - name: Upload benchmark reports
        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-reports
          path: reports