You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using BenchmarkDotNet.Attributes;

/// <summary>
/// Micro-benchmarks for <c>Vector256.Shuffle</c> on bytes with a constant index vector.
/// Two workload shapes are measured (many independent shuffles per iteration vs. one long
/// shuffle/add dependency chain), each with an all-in-range index vector and with a variant
/// whose first index (99) is out of range.
/// </summary>
/// <remarks>
/// The index vectors are intentionally written out as constants at every call site and the
/// chains are intentionally unrolled by hand, exactly as in the original benchmark; do not
/// hoist or loop them, since that could change the code the JIT emits for the operation
/// being measured (presumably the constant-index form is what selects the instruction
/// sequence under test).
/// </remarks>
public unsafe class Benchmarks
{
    // Native buffer of N random bytes, allocated in Setup and released in Teardown.
    private byte* source = null;

    /// <summary>Size of the input buffer in bytes.</summary>
    [Params(32, 64, 128, 256, 1024, 4096, 16384, 65536, 1 << 20)]
    public int N;

    /// <summary>
    /// Allocates an N-byte buffer with 4096-byte alignment and fills it with bytes from a
    /// fixed-seed <see cref="Random"/> so every run sees the same data.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        source = (byte*)NativeMemory.AlignedAlloc((uint)N, 4096);
        new Random(42).NextBytes(new Span<byte>(source, N));
    }

    /// <summary>Frees the buffer allocated by <see cref="Setup"/>.</summary>
    [GlobalCleanup]
    public void Teardown()
    {
        NativeMemory.AlignedFree(source);
        source = null;
    }

    [Benchmark]
    public byte TestWorkPipelineable() => WorkPipelineable(ref *source, (uint)N);

    // Returns float: WorkDependencyChain produces a float and there is no implicit
    // float -> byte conversion, so the previous `byte` return type did not compile.
    [Benchmark]
    public float TestWorkDependencyChain() => WorkDependencyChain(ref *source, (uint)N);

    [Benchmark]
    public byte TestWorkPipelineableWithZeroing() => WorkPipelineableWithZeroing(ref *source, (uint)N);

    // Returns float for the same reason as TestWorkDependencyChain.
    [Benchmark]
    public float TestWorkDependencyChainWithZeroing() => WorkDependencyChainWithZeroing(ref *source, (uint)N);

    // test to see if the new codegen using shuf over perm is meaningfully faster
    // this should be more pipelineable than the other test I ran

    /// <summary>
    /// Shuffles eight overlapping 32-byte windows per iteration (byte offsets 0..7), combines
    /// them with independent integer operations and accumulates into a byte vector.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The wrapping byte sum of all lanes of the accumulator.</returns>
    public static byte WorkPipelineable(ref byte b, nuint size)
    {
        Vector256<byte> result = Vector256<byte>.Zero;

        // The window at offset 7 reads bytes [7, 7 + 32), hence the "+ 7" in the bound.
        while (size >= (uint)Vector256<byte>.Count + 7u)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);

            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));

            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));

            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }

        // Tail: one shuffle per step, sliding the window forward one byte at a time.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;

            b = ref Unsafe.Add(ref b, 1);
            size--;
        }

        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same workload as <see cref="WorkPipelineable"/>, except the first shuffle index is 99,
    /// which is outside the 32-element range (per the <c>Vector256.Shuffle</c> documentation
    /// an out-of-range index selects zero for that element).
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The wrapping byte sum of all lanes of the accumulator.</returns>
    public static byte WorkPipelineableWithZeroing(ref byte b, nuint size)
    {
        Vector256<byte> result = Vector256<byte>.Zero;

        // The window at offset 7 reads bytes [7, 7 + 32), hence the "+ 7" in the bound.
        while (size >= (uint)Vector256<byte>.Count + 7u)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            Vector256<byte> v1 = Vector256.LoadUnsafe(ref b, 1);
            Vector256<byte> v2 = Vector256.LoadUnsafe(ref b, 2);
            Vector256<byte> v3 = Vector256.LoadUnsafe(ref b, 3);
            Vector256<byte> v4 = Vector256.LoadUnsafe(ref b, 4);
            Vector256<byte> v5 = Vector256.LoadUnsafe(ref b, 5);
            Vector256<byte> v6 = Vector256.LoadUnsafe(ref b, 6);
            Vector256<byte> v7 = Vector256.LoadUnsafe(ref b, 7);

            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v1 = Vector256.Shuffle(v1, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v2 = Vector256.Shuffle(v2, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v3 = Vector256.Shuffle(v3, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v4 = Vector256.Shuffle(v4, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v5 = Vector256.Shuffle(v5, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v6 = Vector256.Shuffle(v6, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v7 = Vector256.Shuffle(v7, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));

            result += ((v0 ^ v1) - (v2 + v3)) ^ ((v4 + ~v5) + (v6 + v7));

            b = ref Unsafe.Add(ref b, 8);
            size -= 8;
        }

        // Tail: one shuffle per step, sliding the window forward one byte at a time.
        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            result += v0;

            b = ref Unsafe.Add(ref b, 1);
            size--;
        }

        return Vector256.Sum(result);
    }

    // this test should be very latency bound as it has a long dependency chain

    /// <summary>
    /// For every 32-byte window (sliding one byte per iteration) runs a serial chain of
    /// eight byte shuffles interleaved with seven float additions of 1.0f, where each step
    /// consumes the previous step's result, then accumulates the final value as floats.
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all float lanes of the accumulator.</returns>
    public static float WorkDependencyChain(ref byte b, nuint size)
    {
        Vector256<float> result = Vector256<float>.Zero;

        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);

            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)15, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));

            result += v0.AsSingle();

            b = ref Unsafe.Add(ref b, 1);
            size--;
        }

        return Vector256.Sum(result);
    }

    /// <summary>
    /// Same workload as <see cref="WorkDependencyChain"/>, except the first shuffle index is
    /// 99, which is outside the 32-element range (per the <c>Vector256.Shuffle</c>
    /// documentation an out-of-range index selects zero for that element).
    /// </summary>
    /// <param name="b">Reference to the first byte of the input.</param>
    /// <param name="size">Number of readable bytes starting at <paramref name="b"/>.</param>
    /// <returns>The sum of all float lanes of the accumulator.</returns>
    public static float WorkDependencyChainWithZeroing(ref byte b, nuint size)
    {
        Vector256<float> result = Vector256<float>.Zero;

        while (size >= (uint)Vector256<byte>.Count)
        {
            Vector256<byte> v0 = Vector256.LoadUnsafe(ref b);

            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));
            v0 = (v0.AsSingle() + Vector256.Create(1.0f)).AsByte();
            v0 = Vector256.Shuffle(v0, Vector256.Create((byte)99, 0, 14, 1, 13, 2, 12, 3, 11, 4, 10, 5, 9, 6, 8, 7, 31, 16, 30, 17, 29, 18, 28, 19, 27, 20, 26, 21, 25, 22, 24, 23));

            result += v0.AsSingle();

            b = ref Unsafe.Add(ref b, 1);
            size--;
        }

        return Vector256.Sum(result);
    }
}
(EgorBot will reply in this issue)
The text was updated successfully, but these errors were encountered:
Processing dotnet/runtime#99596 (comment) command:
Command
-intel -amd
(EgorBot will reply in this issue)
The text was updated successfully, but these errors were encountered: