triton-lang · peymanbr · Jan 17, 2025 · Jokeren · Jan 27, 2025 · Jokeren
@@ -1,4 +1,4 @@
-default_stages: [pre-commit, pre-push, manual]
+default_stages: [pre-commit, commit-msg, manual, pre-merge-commit, post-checkout, post-commit, post-merge, post-rewrite, prepare-commit-msg]
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v5.0.0
@@ -18,7 +18,7 @@ repos:
       - id: debug-statements
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.9.1
+    rev: v0.9.3
     hooks:
       - id: ruff
         files: '(^python|^third_party/proton|^third_party/amd|^third_party/nvidia|^test)/.*'
@@ -35,7 +35,7 @@ repos:
         args: ["-p", "-i"]
 
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v19.1.6
+    rev: v19.1.7
     hooks:
       - id: clang-format
 

@@ -147,6 +147,8 @@ Scan/Sort Ops
     histogram
     sort
     gather
+    topk
+
 
 Atomic Ops
 ----------

@@ -114,3 +114,48 @@ def swizzle2d_kernel(output, size_i, size_j, size_g):
     expected_order = torch.tensor([[0, 3, 6, 9, 12, 15, 18], [1, 4, 7, 10, 13, 16, 19], [2, 5, 8, 11, 14, 17, 20],
                                    [21, 23, 25, 27, 29, 31, 33], [22, 24, 26, 28, 30, 32, 34]]).to(device)
     assert (output == expected_order).all(), (output, expected_order)
+
+
+# ---------------
+# test topk op
+# ---------------
+
+
+@pytest.mark.interpreter
+@pytest.mark.parametrize("M, N", [[1, 512], [8, 64], [256, 16], [512, 8]])
+@pytest.mark.parametrize("k", [1, 4, 10, 100, 512])
+@pytest.mark.parametrize("descending", [False, True])
+@pytest.mark.parametrize("dtype_str", ['int32', 'float16', 'float32', 'bfloat16'])
+def test_topk(M, N, k, descending, dtype_str, device):
+    k = min(k, M)
+
+    @triton.jit
+    def topk_kernel(X, expected_values, expected_indices, N: tl.constexpr, M: tl.constexpr, k: tl.constexpr,
+                    descending: tl.constexpr):
+        offx = tl.arange(0, M)
+        offy = tl.arange(0, N) * M
+        off2d = offx[None, :] + offy[:, None]
+        x = tl.load(X + off2d)
+        actual_values, actual_indices = tl.topk(x, k=k, descending=descending)
+        tl.store(expected_values + off2d, actual_values)
+        tl.store(expected_indices + off2d, actual_indices)
+
+    x = numpy_random((N, M), dtype_str=dtype_str)
+    x = torch.from_numpy(x).to(device)
+
+    if descending:
+        torch_values, torch_indices = torch.topk(x, k, dim=1)
+    else:
+        torch_values, torch_indices = torch.topk(x, k, dim=1, largest=False)
+
+    # Allocate output tensors.
+    expected_values = torch.zeros_like(x)
+    expected_indices = torch.zeros_like(x, dtype=torch.int64)
+
+    expected_values[:, :k] = torch_values
+    expected_indices[:, :k] = torch_indices
+
+    topk_kernel[(1, )](x, expected_values, expected_indices, N, M, k, descending, num_warps=8)
+
+    assert torch.allclose(expected_values, expected_values), (expected_values, expected_values)
+    assert torch.allclose(expected_indices, expected_indices), (expected_indices, expected_indices)
@@ -19,6 +19,7 @@
     sort,
     sum,
     swizzle2d,
+    topk,
     xor_sum,
     zeros,
     zeros_like,
@@ -247,6 +248,7 @@
     "sum",
     "swizzle2d",
     "tensor",
+    "topk",
     "trans",
     "tuple",
     "uint16",

@@ -265,6 +265,7 @@ def __call__(self, *args, **kwds):
 
 
 CONSTEXPR_0 = constexpr(0)
+CONSTEXPR_1 = constexpr(1)
 
 
 def _unwrap_if_constexpr(o):
@@ -1161,6 +1162,9 @@ def sort(self, dim: constexpr = None, descending: constexpr = CONSTEXPR_0) -> te
     def flip(self, dim=None) -> tensor:
         ...
 
+    def topk(self, k: constexpr, descending: constexpr = CONSTEXPR_1) -> tuple[tensor, tensor]:
+        ...
+
 
 class tuple:
 

@@ -3,6 +3,7 @@
 from ..runtime.jit import jit
 from . import core
 from . import math
+import triton.language as tl
 
 # constexpr utilities
 
@@ -452,3 +453,48 @@ def interleave(a, b):
         # understand that if we take the `if` above we definitely don't run this
         # `else`.
         return core.reshape(c, c.shape[:-2] + [2 * c.shape[-2]])
+
+
+# topk
+
+
+@core._tensor_member_fn
+@jit
+def topk(x, k: core.constexpr, descending: core.constexpr = core.CONSTEXPR_1):
+    """
+    Returns the top-k elements and their indices along the last dimension.
+    :param x: The input tensor.
+    :type x: Tensor
+    :param k: The number of top elements to return.
+    :type k: int
+    :param descending: If True (default), returns the largest elements. If False, returns the smallest.
+    :type descending: bool
+    :return: A tuple of (top-k elements, top-k indices) tensors.
+    :rtype: Tuple[Tensor, Tensor]
+    """
+    core.static_assert(k > 0, "k must be greater than 0.")
+    core.static_assert(k <= x.shape[-1], "k must not exceed the size of the last dimension.")
+    # Verify that the shape of the input tensor satisfies Triton's requirements.
+    core.static_assert(_is_power_of_two(x.shape[-1]), "Last dimension must be a power of 2.")
+
+    # Sort the tensor along the last dimension.
+    sorted_elements = tl.sort(x, dim=len(x.shape) - 1, descending=descending)
+
+    # Create a range tensor to map the top-k elements back to their original indices.
+    # n_outer represents the number of elements outside the last dimension being processed.
+    n_outer: core.constexpr = x.numel // x.shape[-1]
+    last_dim_size: core.constexpr = x.shape[-1]
+
+    # Create a tensor for the original indices.
+    original_indices = core.arange(0, last_dim_size)
+    original_indices = core.reshape(original_indices, [1, last_dim_size])
+    original_indices = core.broadcast_to(original_indices, [n_outer, last_dim_size])
+
+    # Create a mask to keep only the first k elements and their indices.
+    mask = original_indices < k
+
+    # Apply the mask to the sorted elements and indices.
+    topk_elements = sorted_elements * mask.to(sorted_elements.dtype)
+    topk_indices = original_indices * mask.to(original_indices.dtype)
+
+    return (topk_elements, topk_indices)
-Original file line number
+Diff line change
@@ Expand Up / @@ -147,6 +147,8 @@ Scan/Sort Ops @@
         histogram
         sort
         gather
+        topk
     Atomic Ops
     ----------
@@ Expand Down @@