Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EgorBot for hamarb123 in #99596 #274

Open
EgorBot opened this issue Jan 29, 2025 · 2 comments
Open

EgorBot for hamarb123 in #99596 #274

EgorBot opened this issue Jan 29, 2025 · 2 comments

Comments

@EgorBot
Copy link
Owner

EgorBot commented Jan 29, 2025

Processing dotnet/runtime#99596 (comment) command:

Command

-intel -amd

using BenchmarkDotNet.Attributes;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

/// <summary>
/// Micro-benchmarks for <see cref="Vector256.Shuffle(Vector256{byte}, Vector256{byte})"/> with
/// constant index vectors, run over buffers of <see cref="N"/> pseudo-random bytes.
/// </summary>
/// <remarks>
/// Test to see if the new codegen using shuf over perm is meaningfully faster.
/// This should be more pipelineable than the other test that was run.
/// The index vectors are deliberately spelled out inline at every call site (rather than
/// hoisted into a field or local) so that each shuffle is given a literal constant operand.
/// </remarks>
public class Benchmarks
{
    private byte[] _source = null!;

    /// <summary>Length, in bytes, of the buffer processed by each benchmark invocation.</summary>
    [Params(64, 128, 256, 1024, 4096, 16384, 65536, 1 << 20)]
    public int N;

    /// <summary>Allocates the input buffer and fills it from a fixed seed so runs are repeatable.</summary>
    [GlobalSetup]
    public void Setup()
    {
        _source = new byte[N];
        new Random(42).NextBytes(_source);
    }

    /// <summary>Benchmarks <see cref="Work"/> over the whole input buffer.</summary>
    [Benchmark]
    public byte TestWork() => Work(ref MemoryMarshal.GetArrayDataReference(_source), (uint)_source.Length);

    /// <summary>Benchmarks <see cref="WorkWithZeroing"/> over the whole input buffer.</summary>
    [Benchmark]
    public byte TestWorkWithZeroing() => WorkWithZeroing(ref MemoryMarshal.GetArrayDataReference(_source), (uint)_source.Length);

    /// <summary>
    /// Slides a 32-byte window over the buffer, shuffles each window with a constant
    /// all-in-range index vector, and folds the results into a single byte.
    /// </summary>
    /// <param name="b">Reference to the first byte of the buffer.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>
    /// The wrapping byte sum of the accumulated vector; zero when <paramref name="size"/>
    /// is smaller than one vector.
    /// </returns>
    public static byte Work(ref byte b, nuint size)
    {
        // Vector256<byte>.Count is a non-constant int; convert once so the comparisons
        // against the nuint size are well-typed.
        nuint vectorSize = (nuint)Vector256<byte>.Count;

        Vector256<byte> result = Vector256<byte>.Zero;

        // Main loop: eight overlapping windows at byte offsets 0..7, so the last load
        // touches bytes [7, 7 + vectorSize) and needs vectorSize + 7 readable bytes.
        while (size >= vectorSize + 7)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));
            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }

        // Tail loop: one window per iteration, advancing a single byte each time.
        while (size >= vectorSize)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }

        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same computation as <see cref="Work"/>, except the first shuffle index is 99, which is
    /// outside the 0..31 range of a 32-element vector. Per the documented behavior of
    /// <c>Vector256.Shuffle</c>, an out-of-range index produces a zero element.
    /// </summary>
    /// <param name="b">Reference to the first byte of the buffer.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>
    /// The wrapping byte sum of the accumulated vector; zero when <paramref name="size"/>
    /// is smaller than one vector.
    /// </returns>
    public static byte WorkWithZeroing(ref byte b, nuint size)
    {
        // Vector256<byte>.Count is a non-constant int; convert once so the comparisons
        // against the nuint size are well-typed.
        nuint vectorSize = (nuint)Vector256<byte>.Count;

        Vector256<byte> result = Vector256<byte>.Zero;

        // Main loop: eight overlapping windows at byte offsets 0..7, so the last load
        // touches bytes [7, 7 + vectorSize) and needs vectorSize + 7 readable bytes.
        while (size >= vectorSize + 7)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));
            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }

        // Tail loop: one window per iteration, advancing a single byte each time.
        while (size >= vectorSize)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;
            b = ref Unsafe.Add(ref b, 1);
            size--;
        }

        return Vector256.Sum(result);
    }
}

(EgorBot will reply in this issue)

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 29, 2025

❌ Failed on AzureCascadeLake: Job failed, see logs.

cc @hamarb123 (logs). EgorBot manual: link.

@EgorBot
Copy link
Owner Author

EgorBot commented Jan 29, 2025

❌ Failed on AzureGenoa: Job failed, see logs.

cc @hamarb123 (logs). EgorBot manual: link.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant