Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EgorBot for hamarb123 in #99596 #278

Open
EgorBot opened this issue Jan 30, 2025 · 2 comments
Open

EgorBot for hamarb123 in #99596 #278

EgorBot opened this issue Jan 30, 2025 · 2 comments

Comments

@EgorBot
Copy link
Owner

EgorBot commented Jan 30, 2025

Processing dotnet/runtime#99596 (comment) command:

Command

-intel -amd

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

/// <summary>
/// Micro-benchmarks for <c>Vector256.Shuffle</c> over bytes with constant index vectors.
/// Two workload shapes are measured (independent loads vs. one long dependency chain),
/// each with an all-in-range index vector and with a variant whose first index (99) is
/// out of range for a 32-element vector.
/// </summary>
/// <remarks>
/// The shuffle index vectors are intentionally written as inline literals at every call
/// site rather than hoisted into a shared field or local, so the code generated for each
/// shuffle is exactly what the benchmark was written to measure.
/// In every index vector the first 16 indices select from elements 0..15 and the last 16
/// select from elements 16..31 (apart from the out-of-range 99 in the zeroing variants).
/// </remarks>
public unsafe class Benchmarks
{
    // Native buffer of N random bytes; allocated in Setup and released in Teardown.
    private byte* source = null;

    /// <summary>Size of the input buffer in bytes.</summary>
    [Params(32, 64, 128, 256, 1024, 4096, 16384, 65536, 1 << 20)]
    public int N;

    /// <summary>
    /// Allocates a 4096-byte-aligned native buffer of <see cref="N"/> bytes and fills it
    /// with pseudo-random data from a fixed seed so every run sees the same input.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        source = (byte*)NativeMemory.AlignedAlloc((uint)N, 4096);
        new Random(42).NextBytes(new Span<byte>(source, N));
    }

    /// <summary>Frees the native buffer allocated by <see cref="Setup"/>.</summary>
    [GlobalCleanup]
    public void Teardown()
    {
        NativeMemory.AlignedFree(source);
        source = null;
    }

    /// <summary>Benchmark entry point for <see cref="WorkPipelineable"/>.</summary>
    [Benchmark]
    public byte TestWorkPipelineable() => WorkPipelineable(ref *source, (uint)N);

    /// <summary>Benchmark entry point for <see cref="WorkDependencyChain"/>.</summary>
    // Returns float to match WorkDependencyChain: C# has no implicit float -> byte
    // conversion, so declaring this as byte does not compile.
    [Benchmark]
    public float TestWorkDependencyChain() => WorkDependencyChain(ref *source, (uint)N);

    /// <summary>Benchmark entry point for <see cref="WorkPipelineableWithZeroing"/>.</summary>
    [Benchmark]
    public byte TestWorkPipelineableWithZeroing() => WorkPipelineableWithZeroing(ref *source, (uint)N);

    /// <summary>Benchmark entry point for <see cref="WorkDependencyChainWithZeroing"/>.</summary>
    // Returns float for the same reason as TestWorkDependencyChain.
    [Benchmark]
    public float TestWorkDependencyChainWithZeroing() => WorkDependencyChainWithZeroing(ref *source, (uint)N);

    // Test to see whether the new codegen using shuf over perm is meaningfully faster.
    // This should be more pipelineable than the dependency-chain test below.

    /// <summary>
    /// Shuffles overlapping 32-byte windows of the input and folds them into a running
    /// byte-vector sum. The eight shuffles per main-loop iteration do not depend on each other.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all lanes of the accumulated vector (wrapping byte arithmetic).</returns>
    public static byte WorkPipelineable(ref byte b, nuint size)
    {
        Vector256<byte> result = Vector256<byte>.Zero;

        // Main loop: eight loads at byte offsets 0..7 touch bytes [0, Count + 7),
        // hence the "+ 7" in the bound. Each iteration advances by 8 bytes.
        while (size >= (uint)Vector256<byte>.Count + 7u)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));

            // Combine all eight shuffled vectors so none of the shuffles can be dropped as dead code.
            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));
            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }

        // Tail: one full vector per iteration, advancing a single byte at a time.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same workload as <see cref="WorkPipelineable"/>, but the first shuffle index is 99,
    /// which is out of range for a 32-element vector, so the shuffle also has to produce a
    /// zeroed element.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all lanes of the accumulated vector (wrapping byte arithmetic).</returns>
    public static byte WorkPipelineableWithZeroing(ref byte b, nuint size)
    {
        Vector256<byte> result = Vector256<byte>.Zero;

        // Main loop: eight loads at byte offsets 0..7 touch bytes [0, Count + 7),
        // hence the "+ 7" in the bound. Each iteration advances by 8 bytes.
        while (size >= (uint)Vector256<byte>.Count + 7u)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));

            // Combine all eight shuffled vectors so none of the shuffles can be dropped as dead code.
            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));
            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }

        // Tail: one full vector per iteration, advancing a single byte at a time.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }

    // This test should be very latency bound as it has a long dependency chain.

    /// <summary>
    /// For each 32-byte window (advancing one byte per iteration), applies eight shuffles
    /// separated by seven float additions, where every step consumes the previous step's
    /// result, then accumulates the final vector reinterpreted as floats.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all float lanes of the accumulated vector.</returns>
    public static float WorkDependencyChain(ref byte b, nuint size)
    {
        Vector256<float> result = Vector256<float>.Zero;
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);

            // Each shuffle feeds a float add whose result feeds the next shuffle,
            // so the steps below cannot overlap within one iteration.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0.AsSingle();
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same workload as <see cref="WorkDependencyChain"/>, but the first shuffle index is 99,
    /// which is out of range for a 32-element vector, so each shuffle also has to produce a
    /// zeroed element.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all float lanes of the accumulated vector.</returns>
    public static float WorkDependencyChainWithZeroing(ref byte b, nuint size)
    {
        Vector256<float> result = Vector256<float>.Zero;
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);

            // Each shuffle feeds a float add whose result feeds the next shuffle,
            // so the steps below cannot overlap within one iteration.
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0.AsSingle();
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }
        return Vector256.Sum(result);
    }
}

(EgorBot will reply in this issue)

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 30, 2025

❌ Failed on AzureGenoa: Job failed, see logs.

cc @hamarb123 (logs). EgorBot manual: link.

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 30, 2025

❌ Failed on AzureCascadeLake: Job failed, see logs.

cc @hamarb123 (logs). EgorBot manual: link.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant