Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EgorBot for hamarb123 in #99596 #277

Open
EgorBot opened this issue Jan 29, 2025 · 2 comments
Open

EgorBot for hamarb123 in #99596 #277

EgorBot opened this issue Jan 29, 2025 · 2 comments

Comments

@EgorBot
Copy link
Owner

EgorBot commented Jan 29, 2025

Processing dotnet/runtime#99596 (comment) command:

Command

-intel -amd

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

/// <summary>
/// BenchmarkDotNet benchmarks that run a serial chain of <c>Vector256.Shuffle</c> calls with
/// constant index vectors: once with every index in range (<see cref="Work"/>) and once with a
/// single out-of-range index (<see cref="WorkWithZeroing"/>).
/// </summary>
public class Benchmarks
{
    // Input buffer; allocated and filled by Setup.
    private byte[] source = null!;

    // Length of the input buffer in bytes; one benchmark case per listed value.
    [Params(32, 64, 128, 256, 1024, 4096, 16384, 65536)]
    public int N;

    /// <summary>
    /// Allocates <see cref="N"/> bytes and fills them from a fixed-seed generator (seed 42),
    /// so the same seed is used for every run.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        source = new byte[N];
        new Random(42).NextBytes(source);
    }

    /// <summary>Runs <see cref="Work"/> over the whole buffer, passing a ref to its first element and its length.</summary>
    [Benchmark]
    public float TestWork() => Work(ref MemoryMarshal.GetArrayDataReference(source), (uint)source.Length);

    /// <summary>Runs <see cref="WorkWithZeroing"/> over the whole buffer, passing a ref to its first element and its length.</summary>
    [Benchmark]
    public float TestWorkWithZeroing() => WorkWithZeroing(ref MemoryMarshal.GetArrayDataReference(source), (uint)source.Length);

    //this test should be very latency bound as it has a long dependency chain (and it will even compile first try this time!)
    //every shuffle and add inside the loop body consumes the previous value of v0, so the steps cannot overlap within one iteration

    /// <summary>
    /// Slides a 32-byte window over the input one byte at a time. Each window goes through
    /// eight shuffles interleaved with seven float additions of 1.0f, and the result is
    /// accumulated lane-wise as eight floats.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input. The loop reads 32 bytes starting at the current position, so the caller must supply at least <paramref name="size"/> readable bytes.</param>
    /// <param name="size">Number of bytes remaining from <paramref name="b"/>. Nothing is processed when it is below 32.</param>
    /// <returns>
    /// The sum of the eight accumulated float lanes. The input bytes are reinterpreted as floats,
    /// so the value has no numeric meaning of its own; presumably it is returned only so the
    /// computation is observable to the benchmark harness.
    /// </returns>
    public static float Work(ref byte b, nuint size)
    {
        Vector256<float> result = Vector256<float>.Zero;
        // Continue while a full 32-byte vector can still be loaded from the current position.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            // The index vector below is repeated verbatim for every shuffle. Its first 16 entries are a
            // permutation of 0..15 and its last 16 a permutation of 16..31, so every index is in range
            // and no byte moves between the lower and upper 16-byte halves.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // Reinterpret the 32 bytes as 8 floats, add 1.0f to each lane, and reinterpret back as bytes.
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // The chain ends on a shuffle (no trailing add); its output is accumulated as floats.
            result += v0.AsSingle();
            // Advance by one byte, not by a whole vector: consecutive windows overlap, and the loop
            // body runs (size - 31) times for any initial size of at least 32.
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same loop as <see cref="Work"/>, except that the first entry of every shuffle index vector
    /// is 99 instead of 15. 99 is outside the valid range 0..31 for a 32-element vector;
    /// <c>Vector256.Shuffle</c> is documented to produce zero for an element selected by an
    /// out-of-range index, so every shuffle here must also zero its first output byte.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input. The loop reads 32 bytes starting at the current position, so the caller must supply at least <paramref name="size"/> readable bytes.</param>
    /// <param name="size">Number of bytes remaining from <paramref name="b"/>. Nothing is processed when it is below 32.</param>
    /// <returns>
    /// The sum of the eight accumulated float lanes. As in <see cref="Work"/>, the value has no
    /// numeric meaning of its own.
    /// </returns>
    public static float WorkWithZeroing(ref byte b, nuint size)
    {
        Vector256<float> result = Vector256<float>.Zero;
        // Continue while a full 32-byte vector can still be loaded from the current position.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            // Index vector repeated verbatim for every shuffle: entry 0 is the out-of-range value 99,
            // the remaining 31 entries are in range and stay within their own 16-byte half.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // Reinterpret the 32 bytes as 8 floats, add 1.0f to each lane, and reinterpret back as bytes.
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // The chain ends on a shuffle (no trailing add); its output is accumulated as floats.
            result += v0.AsSingle();
            // Advance by one byte, not by a whole vector: consecutive windows overlap, and the loop
            // body runs (size - 31) times for any initial size of at least 32.
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }
}

(EgorBot will reply in this issue)

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 29, 2025

Benchmark results on linux_azure_genoa

BenchmarkDotNet v0.14.0, Ubuntu 24.04.1 LTS (Noble Numbat)
AMD EPYC 9V74, 1 CPU, 8 logical and 4 physical cores
  Job-FVPUPO : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-JPALZN : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Toolchain | N | Mean | Error | Ratio |
|---|---|---:|---:|---:|---:|
| TestWork | Main | 32 | 4.028 ns | 0.0005 ns | 1.00 |
| TestWork | PR | 32 | 3.250 ns | 0.0007 ns | 0.81 |
| TestWorkWithZeroing | Main | 32 | 5.412 ns | 0.0012 ns | 1.00 |
| TestWorkWithZeroing | PR | 32 | 3.248 ns | 0.0009 ns | 0.60 |
| TestWork | Main | 64 | 83.408 ns | 0.0115 ns | 1.00 |
| TestWork | PR | 64 | 75.637 ns | 0.0293 ns | 0.91 |
| TestWorkWithZeroing | Main | 64 | 129.789 ns | 0.1251 ns | 1.00 |
| TestWorkWithZeroing | PR | 64 | 75.297 ns | 0.0071 ns | 0.58 |
| TestWork | Main | 128 | 242.562 ns | 0.0254 ns | 1.00 |
| TestWork | PR | 128 | 219.447 ns | 0.0186 ns | 0.90 |
| TestWorkWithZeroing | Main | 128 | 379.330 ns | 0.4133 ns | 1.00 |
| TestWorkWithZeroing | PR | 128 | 219.431 ns | 0.0263 ns | 0.58 |
| TestWork | Main | 256 | 561.700 ns | 0.0269 ns | 1.00 |
| TestWork | PR | 256 | 508.631 ns | 0.0425 ns | 0.91 |
| TestWorkWithZeroing | Main | 256 | 878.289 ns | 0.7308 ns | 1.00 |
| TestWorkWithZeroing | PR | 256 | 508.509 ns | 0.0843 ns | 0.58 |
| TestWork | Main | 1024 | 2,476.962 ns | 0.3858 ns | 1.00 |
| TestWork | PR | 1024 | 2,243.843 ns | 0.1472 ns | 0.91 |
| TestWorkWithZeroing | Main | 1024 | 3,875.836 ns | 3.8726 ns | 1.00 |
| TestWorkWithZeroing | PR | 1024 | 2,243.851 ns | 0.3199 ns | 0.58 |
| TestWork | Main | 4096 | 10,137.186 ns | 0.8249 ns | 1.00 |
| TestWork | PR | 4096 | 9,181.582 ns | 0.8262 ns | 0.91 |
| TestWorkWithZeroing | Main | 4096 | 15,882.453 ns | 20.3580 ns | 1.00 |
| TestWorkWithZeroing | PR | 4096 | 9,183.172 ns | 2.6816 ns | 0.58 |
| TestWork | Main | 16384 | 40,775.922 ns | 2.0389 ns | 1.00 |
| TestWork | PR | 16384 | 36,933.674 ns | 2.0524 ns | 0.91 |
| TestWorkWithZeroing | Main | 16384 | 63,865.811 ns | 79.2406 ns | 1.00 |
| TestWorkWithZeroing | PR | 16384 | 36,932.880 ns | 1.8486 ns | 0.58 |
| TestWork | Main | 65536 | 163,333.184 ns | 11.3526 ns | 1.00 |
| TestWork | PR | 65536 | 147,965.859 ns | 13.2357 ns | 0.91 |
| TestWorkWithZeroing | Main | 65536 | 255,547.157 ns | 252.9114 ns | 1.00 |
| TestWorkWithZeroing | PR | 65536 | 147,958.248 ns | 55.0237 ns | 0.58 |

BDN_Artifacts.zip

cc @hamarb123 (agent_logs.txt). EgorBot manual: link.

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 29, 2025

Benchmark results on linux_azure_cascadelake

BenchmarkDotNet v0.14.0, Ubuntu 24.04.1 LTS (Noble Numbat)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
  Job-HYSEPG : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-ULQOPH : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
| Method | Toolchain | N | Mean | Error | Ratio |
|---|---|---:|---:|---:|---:|
| TestWork | Main | 32 | 4.997 ns | 0.0009 ns | 1.00 |
| TestWork | PR | 32 | 4.069 ns | 0.0012 ns | 0.81 |
| TestWorkWithZeroing | Main | 32 | 6.538 ns | 0.0021 ns | 1.00 |
| TestWorkWithZeroing | PR | 32 | 3.678 ns | 0.0008 ns | 0.56 |
| TestWork | Main | 64 | 82.628 ns | 0.0268 ns | 1.00 |
| TestWork | PR | 64 | 72.185 ns | 0.0143 ns | 0.87 |
| TestWorkWithZeroing | Main | 64 | 139.382 ns | 0.0256 ns | 1.00 |
| TestWorkWithZeroing | PR | 64 | 71.550 ns | 0.0089 ns | 0.51 |
| TestWork | Main | 128 | 243.990 ns | 0.0723 ns | 1.00 |
| TestWork | PR | 128 | 210.388 ns | 0.0247 ns | 0.86 |
| TestWorkWithZeroing | Main | 128 | 413.672 ns | 0.1493 ns | 1.00 |
| TestWorkWithZeroing | PR | 128 | 211.923 ns | 0.1518 ns | 0.51 |
| TestWork | Main | 256 | 569.155 ns | 0.0916 ns | 1.00 |
| TestWork | PR | 256 | 492.100 ns | 0.2155 ns | 0.86 |
| TestWorkWithZeroing | Main | 256 | 959.317 ns | 0.1196 ns | 1.00 |
| TestWorkWithZeroing | PR | 256 | 493.338 ns | 0.1968 ns | 0.51 |
| TestWork | Main | 1024 | 2,504.693 ns | 1.2798 ns | 1.00 |
| TestWork | PR | 1024 | 2,156.762 ns | 0.3129 ns | 0.86 |
| TestWorkWithZeroing | Main | 1024 | 4,220.248 ns | 0.3756 ns | 1.00 |
| TestWorkWithZeroing | PR | 1024 | 2,157.426 ns | 0.2552 ns | 0.51 |
| TestWork | Main | 4096 | 10,244.097 ns | 3.1170 ns | 1.00 |
| TestWork | PR | 4096 | 8,817.136 ns | 1.8494 ns | 0.86 |
| TestWorkWithZeroing | Main | 4096 | 17,293.932 ns | 9.5948 ns | 1.00 |
| TestWorkWithZeroing | PR | 4096 | 8,821.353 ns | 0.8262 ns | 0.51 |
| TestWork | Main | 16384 | 41,268.643 ns | 18.7114 ns | 1.00 |
| TestWork | PR | 16384 | 35,501.194 ns | 16.2738 ns | 0.86 |
| TestWorkWithZeroing | Main | 16384 | 69,479.384 ns | 12.5593 ns | 1.00 |
| TestWorkWithZeroing | PR | 16384 | 35,452.610 ns | 5.6369 ns | 0.51 |
| TestWork | Main | 65536 | 165,604.372 ns | 234.1340 ns | 1.00 |
| TestWork | PR | 65536 | 142,501.684 ns | 323.1169 ns | 0.86 |
| TestWorkWithZeroing | Main | 65536 | 278,419.241 ns | 69.8628 ns | 1.00 |
| TestWorkWithZeroing | PR | 65536 | 142,335.196 ns | 103.7682 ns | 0.51 |

BDN_Artifacts.zip

cc @hamarb123 (agent_logs.txt). EgorBot manual: link.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant