Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EgorBot for hamarb123 in #99596 #276

Open
EgorBot opened this issue Jan 29, 2025 · 2 comments
Open

EgorBot for hamarb123 in #99596 #276

EgorBot opened this issue Jan 29, 2025 · 2 comments

Comments

@EgorBot
Copy link
Owner

EgorBot commented Jan 29, 2025

Processing dotnet/runtime#99596 (comment) command:

Command

-intel -amd

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

/// <summary>
/// BenchmarkDotNet benchmarks that run <c>Vector256.Shuffle</c> over a byte buffer with a
/// constant index vector. <see cref="TestWork"/> uses indices that are all in range (0..31);
/// <see cref="TestWorkWithZeroing"/> uses the same indices except that element 0 is 99,
/// which is out of range.
/// </summary>
public class Benchmarks
{
    // Input buffer; allocated and filled in Setup() for the current value of N.
    private byte[] source = null!;

    // Size of the input buffer in bytes; BenchmarkDotNet runs every benchmark once per listed value.
    [Params(64, 128, 256, 1024, 4096, 16384, 65536, 1 << 20)]
    public int N;

    /// <summary>
    /// Allocates <see cref="N"/> bytes and fills them from a <c>Random</c> seeded with 42,
    /// so the seed is the same on every run.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        source = new byte[N];
        new Random(42).NextBytes(source);
    }

    /// <summary>Runs <see cref="Work"/> over the whole <c>source</c> array.</summary>
    [Benchmark]
    public byte TestWork() => Work(ref MemoryMarshal.GetArrayDataReference(source), (uint)source.Length);

    /// <summary>Runs <see cref="WorkWithZeroing"/> over the whole <c>source</c> array.</summary>
    [Benchmark]
    public byte TestWorkWithZeroing() => WorkWithZeroing(ref MemoryMarshal.GetArrayDataReference(source), (uint)source.Length);

    // Test to see whether the new codegen that uses shuf rather than perm is meaningfully faster.
    // This should be more pipelineable than the other test I ran.

    /// <summary>
    /// Slides a 256-bit window over the buffer, shuffles every loaded vector with a constant
    /// in-range index vector, and folds the shuffled vectors into an accumulator.
    /// </summary>
    /// <param name="b">Reference to the first byte to read. The loads are unchecked
    /// (<c>LoadUnsafe</c>), so the caller must make sure <paramref name="size"/> bytes are readable.</param>
    /// <param name="size">Number of bytes available starting at <paramref name="b"/>.</param>
    /// <returns><c>Vector256.Sum</c> of the accumulator, as a <see cref="byte"/>.</returns>
    /// <remarks>
    /// The first 16 shuffle indices are all in 0..15 and the last 16 are all in 16..31, so no
    /// element moves between the two 128-bit halves of the vector.
    /// NOTE(review): presumably this is what makes the "shuf over perm" codegen applicable — confirm against the PR.
    /// </remarks>
    public static byte Work(ref byte b, nuint size)
    {
        Vector256<byte> result = Vector256<byte>.Zero;
        // Main loop: eight overlapping loads at byte offsets 0..7. The load at offset 7 reads
        // Count bytes, hence the "Count + 7" requirement on the remaining size.
        while (size >= (uint)Vector256<byte>.Count + 7u)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);
            // The same constant index vector is spelled out at every call site.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // Fold all eight shuffled vectors into the accumulator, so every shuffle result is consumed.
            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));
            // Advance by 8 bytes (not by a whole vector): consecutive iterations read overlapping data.
            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }
        // Tail loop: one load and one shuffle per iteration, advancing a single byte at a time
        // for as long as a full vector is still readable.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same loop structure as <see cref="Work"/>, but the first shuffle index is 99 instead of 15.
    /// </summary>
    /// <param name="b">Reference to the first byte to read. The loads are unchecked
    /// (<c>LoadUnsafe</c>), so the caller must make sure <paramref name="size"/> bytes are readable.</param>
    /// <param name="size">Number of bytes available starting at <paramref name="b"/>.</param>
    /// <returns><c>Vector256.Sum</c> of the accumulator, as a <see cref="byte"/>.</returns>
    /// <remarks>
    /// 99 lies outside the 0..31 element range of a <c>Vector256&lt;byte&gt;</c>.
    /// NOTE(review): going by the method name, the out-of-range index is expected to produce a
    /// zero in element 0 of each shuffled vector — confirm against the Vector256.Shuffle documentation.
    /// </remarks>
    public static byte WorkWithZeroing(ref byte b, nuint size)
    {
        Vector256<byte> result = Vector256<byte>.Zero;
        // Main loop: eight overlapping loads at byte offsets 0..7. The load at offset 7 reads
        // Count bytes, hence the "Count + 7" requirement on the remaining size.
        while (size >= (uint)Vector256<byte>.Count + 7u)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);
            // The same constant index vector (with the out-of-range 99 first) is spelled out at every call site.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // Fold all eight shuffled vectors into the accumulator, so every shuffle result is consumed.
            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));
            // Advance by 8 bytes (not by a whole vector): consecutive iterations read overlapping data.
            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }
        // Tail loop: one load and one shuffle per iteration, advancing a single byte at a time
        // for as long as a full vector is still readable.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }
}

(EgorBot will reply in this issue)

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 29, 2025

Benchmark results on linux_azure_genoa

BenchmarkDotNet v0.14.0, Ubuntu 24.04.1 LTS (Noble Numbat)
AMD EPYC 9V74, 1 CPU, 8 logical and 4 physical cores
  Job-JRMVGK : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-BNRGWM : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method Toolchain N Mean Error Ratio
TestWork Main 64 8.060 ns 0.0717 ns 1.00
TestWork PR 64 8.344 ns 0.0940 ns 1.04
TestWorkWithZeroing Main 64 9.771 ns 0.0351 ns 1.00
TestWorkWithZeroing PR 64 8.341 ns 0.1607 ns 0.85
TestWork Main 128 21.339 ns 0.3073 ns 1.00
TestWork PR 128 23.009 ns 0.2659 ns 1.08
TestWorkWithZeroing Main 128 27.356 ns 0.5715 ns 1.00
TestWorkWithZeroing PR 128 23.892 ns 0.5034 ns 0.87
TestWork Main 256 49.933 ns 0.9419 ns 1.00
TestWork PR 256 50.601 ns 1.0426 ns 1.01
TestWorkWithZeroing Main 256 58.178 ns 1.1913 ns 1.00
TestWorkWithZeroing PR 256 51.124 ns 1.0582 ns 0.88
TestWork Main 1024 221.675 ns 3.4949 ns 1.00
TestWork PR 1024 219.388 ns 3.9195 ns 0.99
TestWorkWithZeroing Main 1024 252.837 ns 2.5360 ns 1.00
TestWorkWithZeroing PR 1024 217.202 ns 3.7242 ns 0.86
TestWork Main 4096 920.566 ns 14.9785 ns 1.00
TestWork PR 4096 890.564 ns 11.4375 ns 0.97
TestWorkWithZeroing Main 4096 1,038.222 ns 5.8478 ns 1.00
TestWorkWithZeroing PR 4096 895.899 ns 9.8948 ns 0.86
TestWork Main 16384 3,695.416 ns 8.3498 ns 1.00
TestWork PR 16384 3,572.049 ns 5.1600 ns 0.97
TestWorkWithZeroing Main 16384 4,174.213 ns 5.9289 ns 1.00
TestWorkWithZeroing PR 16384 3,578.050 ns 6.2754 ns 0.86
TestWork Main 65536 14,818.742 ns 15.8500 ns 1.00
TestWork PR 65536 14,359.367 ns 16.1748 ns 0.97
TestWorkWithZeroing Main 65536 16,721.886 ns 21.0476 ns 1.00
TestWorkWithZeroing PR 65536 14,335.655 ns 10.1123 ns 0.86
TestWork Main 1048576 236,854.538 ns 262.3565 ns 1.00
TestWork PR 1048576 229,811.150 ns 249.4602 ns 0.97
TestWorkWithZeroing Main 1048576 267,728.676 ns 140.5791 ns 1.00
TestWorkWithZeroing PR 1048576 229,378.676 ns 220.4589 ns 0.86

BDN_Artifacts.zip

cc @hamarb123 (agent_logs.txt). EgorBot manual: link.

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 29, 2025

Benchmark results on linux_azure_cascadelake

BenchmarkDotNet v0.14.0, Ubuntu 24.04.1 LTS (Noble Numbat)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
  Job-POPUCL : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-WUZLBP : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method Toolchain N Mean Error Ratio
TestWork Main 64 10.477 ns 0.0021 ns 1.00
TestWork PR 64 10.255 ns 0.0049 ns 0.98
TestWorkWithZeroing Main 64 12.716 ns 0.0047 ns 1.00
TestWorkWithZeroing PR 64 9.887 ns 0.0300 ns 0.78
TestWork Main 128 28.710 ns 0.0111 ns 1.00
TestWork PR 128 25.435 ns 0.1354 ns 0.89
TestWorkWithZeroing Main 128 33.987 ns 0.0108 ns 1.00
TestWorkWithZeroing PR 128 26.736 ns 0.0253 ns 0.79
TestWork Main 256 64.977 ns 0.0135 ns 1.00
TestWork PR 256 56.324 ns 0.0955 ns 0.87
TestWorkWithZeroing Main 256 76.767 ns 0.0187 ns 1.00
TestWorkWithZeroing PR 256 57.097 ns 0.1499 ns 0.74
TestWork Main 1024 285.519 ns 0.0380 ns 1.00
TestWork PR 1024 238.199 ns 0.5961 ns 0.83
TestWorkWithZeroing Main 1024 332.434 ns 0.0453 ns 1.00
TestWorkWithZeroing PR 1024 241.675 ns 2.0290 ns 0.73
TestWork Main 4096 1,187.669 ns 0.0767 ns 1.00
TestWork PR 4096 1,026.536 ns 20.0993 ns 0.86
TestWorkWithZeroing Main 4096 1,376.965 ns 0.5258 ns 1.00
TestWorkWithZeroing PR 4096 1,024.971 ns 20.4914 ns 0.74
TestWork Main 16384 4,758.117 ns 0.5854 ns 1.00
TestWork PR 16384 4,179.029 ns 2.6244 ns 0.88
TestWorkWithZeroing Main 16384 5,519.795 ns 1.7770 ns 1.00
TestWorkWithZeroing PR 16384 4,177.967 ns 2.0834 ns 0.76
TestWork Main 65536 19,054.506 ns 1.2671 ns 1.00
TestWork PR 65536 16,550.536 ns 13.1765 ns 0.87
TestWorkWithZeroing Main 65536 22,109.951 ns 9.0549 ns 1.00
TestWorkWithZeroing PR 65536 16,448.069 ns 15.1416 ns 0.74
TestWork Main 1048576 305,015.070 ns 61.3536 ns 1.00
TestWork PR 1048576 252,464.712 ns 75.2583 ns 0.83
TestWorkWithZeroing Main 1048576 354,147.083 ns 175.3066 ns 1.00
TestWorkWithZeroing PR 1048576 252,433.582 ns 124.9834 ns 0.71

BDN_Artifacts.zip

cc @hamarb123 (agent_logs.txt). EgorBot manual: link.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant