Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EgorBot for hamarb123 in #99596 #279

Open
EgorBot opened this issue Jan 30, 2025 · 2 comments
Open

EgorBot for hamarb123 in #99596 #279

EgorBot opened this issue Jan 30, 2025 · 2 comments

Comments

@EgorBot
Copy link
Owner

EgorBot commented Jan 30, 2025

Processing dotnet/runtime#99596 (comment) command:

Command

-intel -amd

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

/// <summary>
/// BenchmarkDotNet micro-benchmarks for <c>Vector256.Shuffle</c> on byte vectors with
/// constant index vectors. Two workload shapes are measured — one with many independent
/// shuffles per iteration ("pipelineable") and one where every operation feeds the next
/// ("dependency chain") — each with an all-in-range index vector and with a variant whose
/// first index is 99 ("WithZeroing").
/// </summary>
/// <remarks>
/// Every index vector used here keeps its first 16 indices in 0..15 and its last 16 in
/// 16..31, so no element moves between the two 16-byte halves of the vector.
/// Index 99 is outside 0..31; per the documented behavior of <c>Vector256.Shuffle</c> an
/// out-of-range index produces a zero element, which is what the "WithZeroing" name refers to.
/// </remarks>
public unsafe class Benchmarks
{
    // Native input buffer of N bytes; allocated in Setup and released in Teardown.
    private byte* source = null;

    /// <summary>Size of the input buffer in bytes (also passed to the workers as the length).</summary>
    // NOTE: with N = 32 the 8-way unrolled loop of the pipelineable workers never executes
    // (it needs at least Vector256<byte>.Count + 7 bytes), so only the tail loop is measured.
    [Params(32, 64, 128, 256, 1024, 4096, 16384, 65536, 1 << 20)]
    public int N;

    /// <summary>
    /// Allocates <see cref="N"/> bytes of native memory with 4096-byte alignment and fills
    /// them with pseudo-random bytes from a <c>Random</c> seeded with 42, so the input is
    /// the same on every run.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        source = (byte*)NativeMemory.AlignedAlloc((uint)N, 4096);
        new Random(42).NextBytes(new Span<byte>(source, N));
    }

    /// <summary>Frees the native buffer allocated by <see cref="Setup"/> and clears the pointer.</summary>
    [GlobalCleanup]
    public void Teardown()
    {
        NativeMemory.AlignedFree(source);
        source = null;
    }

    /// <summary>Runs <see cref="WorkPipelineable"/> over the whole input buffer.</summary>
    [Benchmark]
    public byte TestWorkPipelineable() => WorkPipelineable(ref *source, (uint)N);

    /// <summary>Runs <see cref="WorkDependencyChain"/> over the whole input buffer.</summary>
    [Benchmark]
    public float TestWorkDependencyChain() => WorkDependencyChain(ref *source, (uint)N);

    /// <summary>Runs <see cref="WorkPipelineableWithZeroing"/> over the whole input buffer.</summary>
    [Benchmark]
    public byte TestWorkPipelineableWithZeroing() => WorkPipelineableWithZeroing(ref *source, (uint)N);

    /// <summary>Runs <see cref="WorkDependencyChainWithZeroing"/> over the whole input buffer.</summary>
    [Benchmark]
    public float TestWorkDependencyChainWithZeroing() => WorkDependencyChainWithZeroing(ref *source, (uint)N);

    // Purpose of the pipelineable tests: check whether the new codegen, which uses "shuf"
    // rather than "perm" for these shuffles, is meaningfully faster.
    // This workload is intended to be more pipelineable than the author's earlier test.

    /// <summary>
    /// Throughput-oriented workload: each main-loop iteration performs eight shuffles that
    /// do not depend on one another, then combines them into a running byte-vector total.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all elements of the accumulated vector, as a <c>byte</c>.</returns>
    public static byte WorkPipelineable(ref byte b, nuint size)
    {
        Vector256<byte> result = Vector256<byte>.Zero;
        // Main loop: the loads below start at byte offsets 0..7, so the furthest one reads
        // up to offset Count + 7; the loop condition guarantees that many bytes remain.
        while (size >= (uint)Vector256<byte>.Count + 7u)
        {
            // Eight overlapping loads, each shifted by one byte from the previous.
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);
            // Same constant index vector for every shuffle; the eight shuffles are independent.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // Fold all eight shuffled vectors into the accumulator so every shuffle result is used.
            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));
            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }
        // Tail loop: one shuffle per iteration, advancing a single byte at a time, for as
        // long as a full vector can still be loaded.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same workload as <see cref="WorkPipelineable"/>, except that the first element of
    /// every index vector is 99 instead of 15 (an out-of-range index).
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all elements of the accumulated vector, as a <c>byte</c>.</returns>
    public static byte WorkPipelineableWithZeroing(ref byte b, nuint size)
    {
        Vector256<byte> result = Vector256<byte>.Zero;
        // Main loop: requires Count + 7 readable bytes because the loads start at offsets 0..7.
        while (size >= (uint)Vector256<byte>.Count + 7u)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);
            // Index 99 in the first position is the only difference from WorkPipelineable.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // Fold all eight shuffled vectors into the accumulator so every shuffle result is used.
            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));
            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }
        // Tail loop: one shuffle per iteration, advancing a single byte at a time.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }

    // This test should be very latency bound, as it has a long dependency chain.

    /// <summary>
    /// Latency-oriented workload: each iteration loads one vector and pushes it through
    /// eight shuffles separated by seven float additions, where every step consumes the
    /// result of the step before it.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all elements of the accumulated float vector.</returns>
    public static float WorkDependencyChain(ref byte b, nuint size)
    {
        Vector256<float> result = Vector256<float>.Zero;
        // Advances one byte per iteration while a full vector can still be loaded.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            // Alternate shuffle / "+ 1.0f" on the same value: the bytes are reinterpreted as
            // floats for the addition and back to bytes for the next shuffle, so each
            // statement depends on the one directly above it.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // Accumulate the end of the chain so the whole chain contributes to the return value.
            result += v0.AsSingle();
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same workload as <see cref="WorkDependencyChain"/>, except that the first element of
    /// every index vector is 99 instead of 15 (an out-of-range index).
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all elements of the accumulated float vector.</returns>
    public static float WorkDependencyChainWithZeroing(ref byte b, nuint size)
    {
        Vector256<float> result = Vector256<float>.Zero;
        // Advances one byte per iteration while a full vector can still be loaded.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            // Serial chain of eight shuffles and seven float additions; index 99 in the first
            // position is the only difference from WorkDependencyChain.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            // Accumulate the end of the chain so the whole chain contributes to the return value.
            result += v0.AsSingle();
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }
}

(EgorBot will reply in this issue)

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 30, 2025

Benchmark results on linux_azure_genoa

BenchmarkDotNet v0.14.0, Ubuntu 24.04.1 LTS (Noble Numbat)
AMD EPYC 9V74, 1 CPU, 8 logical and 4 physical cores
  Job-QBCGOJ : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-FLOZDB : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method Toolchain N Mean Error Ratio
TestWorkPipelineable Main 32 1.621 ns 0.0318 ns 1.00
TestWorkPipelineable PR 32 1.299 ns 0.0241 ns 0.80
TestWorkDependencyChain Main 32 4.734 ns 0.0452 ns 1.00
TestWorkDependencyChain PR 32 3.813 ns 0.0274 ns 0.81
TestWorkPipelineableWithZeroing Main 32 1.231 ns 0.0274 ns 1.00
TestWorkPipelineableWithZeroing PR 32 1.315 ns 0.0213 ns 1.07
TestWorkDependencyChainWithZeroing Main 32 6.359 ns 0.0398 ns 1.00
TestWorkDependencyChainWithZeroing PR 32 3.858 ns 0.0425 ns 0.61
TestWorkPipelineable Main 64 9.140 ns 0.1290 ns 1.00
TestWorkPipelineable PR 64 9.433 ns 0.0861 ns 1.03
TestWorkDependencyChain Main 64 99.247 ns 0.6407 ns 1.00
TestWorkDependencyChain PR 64 89.515 ns 0.5800 ns 0.90
TestWorkPipelineableWithZeroing Main 64 11.335 ns 0.0852 ns 1.00
TestWorkPipelineableWithZeroing PR 64 9.618 ns 0.0724 ns 0.85
TestWorkDependencyChainWithZeroing Main 64 153.947 ns 1.3082 ns 1.00
TestWorkDependencyChainWithZeroing PR 64 90.125 ns 0.5444 ns 0.59
TestWorkPipelineable Main 128 26.055 ns 0.3297 ns 1.00
TestWorkPipelineable PR 128 27.572 ns 0.1980 ns 1.06
TestWorkDependencyChain Main 128 289.060 ns 1.8196 ns 1.00
TestWorkDependencyChain PR 128 259.310 ns 1.7688 ns 0.90
TestWorkPipelineableWithZeroing Main 128 33.047 ns 0.2265 ns 1.00
TestWorkPipelineableWithZeroing PR 128 27.831 ns 0.2607 ns 0.84
TestWorkDependencyChainWithZeroing Main 128 451.250 ns 2.7357 ns 1.00
TestWorkDependencyChainWithZeroing PR 128 260.039 ns 1.4276 ns 0.58
TestWorkPipelineable Main 256 60.098 ns 0.5447 ns 1.00
TestWorkPipelineable PR 256 60.034 ns 0.8126 ns 1.00
TestWorkDependencyChain Main 256 670.485 ns 5.3941 ns 1.00
TestWorkDependencyChain PR 256 605.254 ns 3.8565 ns 0.90
TestWorkPipelineableWithZeroing Main 256 72.381 ns 0.8625 ns 1.00
TestWorkPipelineableWithZeroing PR 256 60.506 ns 0.6167 ns 0.84
TestWorkDependencyChainWithZeroing Main 256 1,036.934 ns 4.4639 ns 1.00
TestWorkDependencyChainWithZeroing PR 256 610.374 ns 3.3020 ns 0.59
TestWorkPipelineable Main 1024 268.519 ns 5.1901 ns 1.00
TestWorkPipelineable PR 1024 260.932 ns 5.2617 ns 0.97
TestWorkDependencyChain Main 1024 2,948.048 ns 27.4073 ns 1.00
TestWorkDependencyChain PR 1024 2,652.525 ns 9.7212 ns 0.90
TestWorkPipelineableWithZeroing Main 1024 305.643 ns 3.1595 ns 1.00
TestWorkPipelineableWithZeroing PR 1024 261.943 ns 4.3686 ns 0.86
TestWorkDependencyChainWithZeroing Main 1024 4,587.634 ns 10.1879 ns 1.00
TestWorkDependencyChainWithZeroing PR 1024 2,678.700 ns 22.9431 ns 0.58
TestWorkPipelineable Main 4096 1,101.384 ns 19.5415 ns 1.00
TestWorkPipelineable PR 4096 1,078.131 ns 10.5645 ns 0.98
TestWorkDependencyChain Main 4096 12,134.039 ns 51.5571 ns 1.00
TestWorkDependencyChain PR 4096 10,982.100 ns 50.7779 ns 0.91
TestWorkPipelineableWithZeroing Main 4096 1,241.756 ns 5.0880 ns 1.00
TestWorkPipelineableWithZeroing PR 4096 1,070.295 ns 6.2527 ns 0.86
TestWorkDependencyChainWithZeroing Main 4096 19,118.332 ns 208.6301 ns 1.00
TestWorkDependencyChainWithZeroing PR 4096 10,948.115 ns 95.9672 ns 0.57
TestWorkPipelineable Main 16384 4,429.837 ns 48.8727 ns 1.00
TestWorkPipelineable PR 16384 4,232.977 ns 26.0471 ns 0.96
TestWorkDependencyChain Main 16384 48,351.725 ns 248.0566 ns 1.00
TestWorkDependencyChain PR 16384 44,271.369 ns 260.4794 ns 0.92
TestWorkPipelineableWithZeroing Main 16384 5,017.464 ns 43.0286 ns 1.00
TestWorkPipelineableWithZeroing PR 16384 4,290.491 ns 45.4591 ns 0.86
TestWorkDependencyChainWithZeroing Main 16384 76,137.381 ns 561.5843 ns 1.00
TestWorkDependencyChainWithZeroing PR 16384 43,992.386 ns 355.6288 ns 0.58
TestWorkPipelineable Main 65536 17,712.365 ns 115.5184 ns 1.00
TestWorkPipelineable PR 65536 17,300.111 ns 76.8237 ns 0.98
TestWorkDependencyChain Main 65536 194,939.787 ns 555.3184 ns 1.00
TestWorkDependencyChain PR 65536 177,258.114 ns 995.2090 ns 0.91
TestWorkPipelineableWithZeroing Main 65536 19,923.054 ns 159.5374 ns 1.00
TestWorkPipelineableWithZeroing PR 65536 17,355.804 ns 107.9378 ns 0.87
TestWorkDependencyChainWithZeroing Main 65536 303,664.000 ns 1,977.6564 ns 1.00
TestWorkDependencyChainWithZeroing PR 65536 176,106.015 ns 760.8893 ns 0.58
TestWorkPipelineable Main 1048576 285,622.234 ns 1,619.0143 ns 1.00
TestWorkPipelineable PR 1048576 276,786.718 ns 2,967.4042 ns 0.97
TestWorkDependencyChain Main 1048576 3,157,656.744 ns 15,055.0844 ns 1.00
TestWorkDependencyChain PR 1048576 2,839,843.859 ns 23,074.2656 ns 0.90
TestWorkPipelineableWithZeroing Main 1048576 322,385.050 ns 2,905.5243 ns 1.00
TestWorkPipelineableWithZeroing PR 1048576 274,143.051 ns 824.2073 ns 0.85
TestWorkDependencyChainWithZeroing Main 1048576 4,907,468.092 ns 33,081.6507 ns 1.00
TestWorkDependencyChainWithZeroing PR 1048576 2,806,245.509 ns 14,689.7401 ns 0.57

BDN_Artifacts.zip

cc @hamarb123 (agent_logs.txt). EgorBot manual: link.

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 30, 2025

Benchmark results on linux_azure_cascadelake

BenchmarkDotNet v0.14.0, Ubuntu 24.04.1 LTS (Noble Numbat)
Intel Xeon Platinum 8370C CPU 2.80GHz, 1 CPU, 8 logical and 4 physical cores
  Job-IYBOSE : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
  Job-HAUWTU : .NET 10.0.0 (42.42.42.42424), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI
Method Toolchain N Mean Error Ratio
TestWorkPipelineable Main 32 2.015 ns 0.0008 ns 1.00
TestWorkPipelineable PR 32 1.591 ns 0.0565 ns 0.79
TestWorkDependencyChain Main 32 4.129 ns 0.0013 ns 1.00
TestWorkDependencyChain PR 32 3.308 ns 0.0007 ns 0.80
TestWorkPipelineableWithZeroing Main 32 1.438 ns 0.0004 ns 1.00
TestWorkPipelineableWithZeroing PR 32 1.546 ns 0.0069 ns 1.07
TestWorkDependencyChainWithZeroing Main 32 6.221 ns 0.0010 ns 1.00
TestWorkDependencyChainWithZeroing PR 32 3.309 ns 0.0005 ns 0.53
TestWorkPipelineable Main 64 9.347 ns 0.0017 ns 1.00
TestWorkPipelineable PR 64 9.299 ns 0.0062 ns 0.99
TestWorkDependencyChain Main 64 81.480 ns 0.0096 ns 1.00
TestWorkDependencyChain PR 64 71.044 ns 0.0065 ns 0.87
TestWorkPipelineableWithZeroing Main 64 12.317 ns 0.0018 ns 1.00
TestWorkPipelineableWithZeroing PR 64 9.306 ns 0.0140 ns 0.76
TestWorkDependencyChainWithZeroing Main 64 137.336 ns 0.0807 ns 1.00
TestWorkDependencyChainWithZeroing PR 64 69.753 ns 0.0124 ns 0.51
TestWorkPipelineable Main 128 27.244 ns 0.0051 ns 1.00
TestWorkPipelineable PR 128 25.090 ns 0.0063 ns 0.92
TestWorkDependencyChain Main 128 242.621 ns 0.0292 ns 1.00
TestWorkDependencyChain PR 128 208.483 ns 0.0272 ns 0.86
TestWorkPipelineableWithZeroing Main 128 33.718 ns 0.0251 ns 1.00
TestWorkPipelineableWithZeroing PR 128 25.081 ns 0.0061 ns 0.74
TestWorkDependencyChainWithZeroing Main 128 409.063 ns 0.0501 ns 1.00
TestWorkDependencyChainWithZeroing PR 128 208.404 ns 0.0326 ns 0.51
TestWorkPipelineable Main 256 63.669 ns 0.0034 ns 1.00
TestWorkPipelineable PR 256 57.261 ns 0.0100 ns 0.90
TestWorkDependencyChain Main 256 568.083 ns 0.0742 ns 1.00
TestWorkDependencyChain PR 256 489.799 ns 0.0676 ns 0.86
TestWorkPipelineableWithZeroing Main 256 76.111 ns 0.0287 ns 1.00
TestWorkPipelineableWithZeroing PR 256 57.279 ns 0.0084 ns 0.75
TestWorkDependencyChainWithZeroing Main 256 954.488 ns 0.0915 ns 1.00
TestWorkDependencyChainWithZeroing PR 256 489.585 ns 0.0868 ns 0.51
TestWorkPipelineable Main 1024 284.233 ns 0.0333 ns 1.00
TestWorkPipelineable PR 1024 249.602 ns 0.0688 ns 0.88
TestWorkDependencyChain Main 1024 2,502.301 ns 0.2558 ns 1.00
TestWorkDependencyChain PR 1024 2,154.098 ns 0.1747 ns 0.86
TestWorkPipelineableWithZeroing Main 1024 332.632 ns 0.2007 ns 1.00
TestWorkPipelineableWithZeroing PR 1024 249.624 ns 0.0426 ns 0.75
TestWorkDependencyChainWithZeroing Main 1024 4,214.613 ns 0.3169 ns 1.00
TestWorkDependencyChainWithZeroing PR 1024 2,154.157 ns 0.1785 ns 0.51
TestWorkPipelineable Main 4096 1,173.618 ns 0.2041 ns 1.00
TestWorkPipelineable PR 4096 1,027.609 ns 0.6465 ns 0.88
TestWorkDependencyChain Main 4096 10,231.266 ns 1.5051 ns 1.00
TestWorkDependencyChain PR 4096 8,823.374 ns 2.7114 ns 0.86
TestWorkPipelineableWithZeroing Main 4096 1,362.735 ns 0.6094 ns 1.00
TestWorkPipelineableWithZeroing PR 4096 1,027.476 ns 0.6673 ns 0.75
TestWorkDependencyChainWithZeroing Main 4096 17,255.498 ns 1.5601 ns 1.00
TestWorkDependencyChainWithZeroing PR 4096 8,818.680 ns 3.4113 ns 0.51
TestWorkPipelineable Main 16384 4,747.632 ns 0.5336 ns 1.00
TestWorkPipelineable PR 16384 3,993.880 ns 56.8574 ns 0.84
TestWorkDependencyChain Main 16384 41,216.636 ns 17.4360 ns 1.00
TestWorkDependencyChain PR 16384 35,451.041 ns 5.9434 ns 0.86
TestWorkPipelineableWithZeroing Main 16384 5,505.710 ns 0.6226 ns 1.00
TestWorkPipelineableWithZeroing PR 16384 3,944.768 ns 6.8011 ns 0.72
TestWorkDependencyChainWithZeroing Main 16384 69,444.435 ns 9.2554 ns 1.00
TestWorkDependencyChainWithZeroing PR 16384 35,446.611 ns 4.0584 ns 0.51
TestWorkPipelineable Main 65536 19,051.137 ns 2.6371 ns 1.00
TestWorkPipelineable PR 65536 15,802.064 ns 42.4753 ns 0.83
TestWorkDependencyChain Main 65536 165,448.616 ns 145.9864 ns 1.00
TestWorkDependencyChain PR 65536 142,138.702 ns 112.8088 ns 0.86
TestWorkPipelineableWithZeroing Main 65536 22,090.972 ns 1.8095 ns 1.00
TestWorkPipelineableWithZeroing PR 65536 15,786.253 ns 32.6605 ns 0.71
TestWorkDependencyChainWithZeroing Main 65536 278,195.406 ns 29.7947 ns 1.00
TestWorkDependencyChainWithZeroing PR 65536 142,244.863 ns 202.4602 ns 0.51
TestWorkPipelineable Main 1048576 305,169.314 ns 37.8255 ns 1.00
TestWorkPipelineable PR 1048576 252,216.886 ns 86.1144 ns 0.83
TestWorkDependencyChain Main 1048576 2,694,406.521 ns 24,977.6341 ns 1.00
TestWorkDependencyChain PR 1048576 2,330,393.989 ns 22,686.3393 ns 0.86
TestWorkPipelineableWithZeroing Main 1048576 353,888.669 ns 28.2779 ns 1.00
TestWorkPipelineableWithZeroing PR 1048576 252,309.066 ns 57.2610 ns 0.71
TestWorkDependencyChainWithZeroing Main 1048576 4,454,224.889 ns 1,140.0706 ns 1.00
TestWorkDependencyChainWithZeroing PR 1048576 2,309,306.266 ns 5,442.9339 ns 0.52

BDN_Artifacts.zip

cc @hamarb123 (agent_logs.txt). EgorBot manual: link.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant