forked from MihaZupan/runtime-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JitDiff X64] [hamarb123] Implement ShuffleNative
methods and optimise Shuffle
for non-cons ...
#959
Comments
Top method improvements-23 (-24.21 % of base) - System.Numerics.Vector4:Shuffle(System.Numerics.Vector4,ubyte,ubyte,ubyte,ubyte):System.Numerics.Vector4 ; Assembly listing for method System.Numerics.Vector4:Shuffle(System.Numerics.Vector4,ubyte,ubyte,ubyte,ubyte):System.Numerics.Vector4 (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
-; rbp based frame
+; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
-; V00 arg0 [V00,T04] ( 3, 3 ) simd16 -> mm0 multireg-arg single-def <System.Numerics.Vector4>
+; V00 arg0 [V00,T05] ( 3, 3 ) simd16 -> mm0 multireg-arg single-def <System.Numerics.Vector4>
; V01 arg1 [V01,T00] ( 3, 3 ) ubyte -> rdi single-def
; V02 arg2 [V02,T01] ( 3, 3 ) ubyte -> rsi single-def
; V03 arg3 [V03,T02] ( 3, 3 ) ubyte -> rdx single-def
; V04 arg4 [V04,T03] ( 3, 3 ) ubyte -> rcx single-def
-; V05 OutArgs [V05 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V06 tmp1 [V06,T05] ( 2, 4 ) simd16 -> mm0 multireg-ret "Return value temp for multireg return" <System.Numerics.Vector4>
-; V07 rat0 [V07,T06] ( 2, 4 ) simd16 -> [rbp-0x10] do-not-enreg[S] "return buffer for hwintrinsic" <System.Runtime.Intrinsics.Vector128`1[float]>
+;# V05 OutArgs [V05 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V06 tmp1 [V06,T06] ( 2, 4 ) simd16 -> mm0 multireg-ret "Return value temp for multireg return" <System.Numerics.Vector4>
+; V07 rat0 [V07,T04] ( 3, 6 ) simd16 -> mm1 "fgMakeTemp is creating a new local variable"
;
-; Lcl frame size = 48
+; Lcl frame size = 0
G_M40044_IG01:
- push rbp
- sub rsp, 48
- lea rbp, [rsp+0x30]
vshufpd xmm0, xmm1, 0
- ;; size=15 bbWeight=1 PerfScore 2.75
+ ;; size=5 bbWeight=1 PerfScore 1.00
G_M40044_IG02:
- vmovups xmmword ptr [rsp], xmm0
- movzx rdi, dil
- vmovd xmm0, edi
- movzx rdi, sil
- vpinsrd xmm0, xmm0, edi, 1
- movzx rdi, dl
- vpinsrd xmm0, xmm0, edi, 2
- movzx rdi, cl
- vpinsrd xmm0, xmm0, edi, 3
- vmovups xmmword ptr [rsp+0x10], xmm0
- lea rdi, [rbp-0x10]
- mov rax, 0xD1FFAB1E ; code for System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]
- call [rax]System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]
- vmovaps xmm0, xmmword ptr [rbp-0x10]
+ movzx rax, dil
+ vmovd xmm1, eax
+ movzx rax, sil
+ vpinsrd xmm1, xmm1, eax, 1
+ movzx rax, dl
+ vpinsrd xmm1, xmm1, eax, 2
+ movzx rax, cl
+ vpinsrd xmm1, xmm1, eax, 3
+ vpermilps xmm0, xmm0, xmm1
+ vpcmpud k1, xmm1, dword ptr [reloc @RWD00] {1to4}, 1
+ vpmovm2d xmm1, k1
+ vandps xmm0, xmm1, xmm0
vmovhlps xmm1, xmm1, xmm0
- ;; size=74 bbWeight=1 PerfScore 21.75
+ ;; size=66 bbWeight=1 PerfScore 20.33
G_M40044_IG03:
- add rsp, 48
- pop rbp
ret
- ;; size=6 bbWeight=1 PerfScore 1.75
+ ;; size=1 bbWeight=1 PerfScore 1.00
+RWD00 dd 00000004h
-; Total bytes of code 95, prolog size 10, PerfScore 26.25, instruction count 22, allocated bytes for code 95 (MethodHash=e5926393) for method System.Numerics.Vector4:Shuffle(System.Numerics.Vector4,ubyte,ubyte,ubyte,ubyte):System.Numerics.Vector4 (FullOpts)
+
+; Total bytes of code 72, prolog size 0, PerfScore 22.33, instruction count 15, allocated bytes for code 72 (MethodHash=e5926393) for method System.Numerics.Vector4:Shuffle(System.Numerics.Vector4,ubyte,ubyte,ubyte,ubyte):System.Numerics.Vector4 (FullOpts)
; ============================================================ -22 (-20.37 % of base) - System.Numerics.Vector3:Shuffle(System.Numerics.Vector3,ubyte,ubyte,ubyte):System.Numerics.Vector3 ; Assembly listing for method System.Numerics.Vector3:Shuffle(System.Numerics.Vector3,ubyte,ubyte,ubyte):System.Numerics.Vector3 (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
-; rbp based frame
+; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
-; V00 arg0 [V00,T03] ( 3, 3 ) simd12 -> mm0 multireg-arg single-def <System.Numerics.Vector3>
+; V00 arg0 [V00,T04] ( 3, 3 ) simd12 -> mm0 multireg-arg single-def <System.Numerics.Vector3>
; V01 arg1 [V01,T00] ( 3, 3 ) ubyte -> rdi single-def
; V02 arg2 [V02,T01] ( 3, 3 ) ubyte -> rsi single-def
; V03 arg3 [V03,T02] ( 3, 3 ) ubyte -> rdx single-def
-; V04 OutArgs [V04 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V05 tmp1 [V05,T04] ( 2, 4 ) simd12 -> mm0 multireg-ret "Return value temp for multireg return" <System.Numerics.Vector3>
-; V06 rat0 [V06,T05] ( 2, 4 ) simd16 -> [rbp-0x10] do-not-enreg[S] "return buffer for hwintrinsic" <System.Runtime.Intrinsics.Vector128`1[float]>
+;# V04 OutArgs [V04 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V05 tmp1 [V05,T05] ( 2, 4 ) simd12 -> mm0 multireg-ret "Return value temp for multireg return" <System.Numerics.Vector3>
+; V06 rat0 [V06,T03] ( 3, 6 ) simd16 -> mm1 "fgMakeTemp is creating a new local variable"
;
-; Lcl frame size = 48
+; Lcl frame size = 0
G_M61336_IG01:
- push rbp
- sub rsp, 48
- lea rbp, [rsp+0x30]
vinsertps xmm1, xmm1, xmm1, -8
vshufpd xmm0, xmm1, 0
- ;; size=21 bbWeight=1 PerfScore 3.75
+ ;; size=11 bbWeight=1 PerfScore 2.00
G_M61336_IG02:
vinsertps xmm0, xmm0, xmm0, 56
- vmovups xmmword ptr [rsp], xmm0
- movzx rdi, dil
- vmovd xmm0, edi
- movzx rdi, sil
- vpinsrd xmm0, xmm0, edi, 1
- movzx rdi, dl
- vpinsrd xmm0, xmm0, edi, 2
- mov edi, 3
- vpinsrd xmm0, xmm0, edi, 3
- vmovups xmmword ptr [rsp+0x10], xmm0
- lea rdi, [rbp-0x10]
- mov rax, 0xD1FFAB1E ; code for System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]
- call [rax]System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]
- vmovups xmm0, xmmword ptr [rbp-0x10]
+ movzx rax, dil
+ vmovd xmm1, eax
+ movzx rax, sil
+ vpinsrd xmm1, xmm1, eax, 1
+ movzx rax, dl
+ vpinsrd xmm1, xmm1, eax, 2
+ mov eax, 3
+ vpinsrd xmm1, xmm1, eax, 3
+ vpermilps xmm0, xmm0, xmm1
+ vpcmpud k1, xmm1, dword ptr [reloc @RWD00] {1to4}, 1
+ vpmovm2d xmm1, k1
+ vandps xmm0, xmm1, xmm0
vmovhlps xmm1, xmm1, xmm0
- ;; size=81 bbWeight=1 PerfScore 22.75
+ ;; size=74 bbWeight=1 PerfScore 21.33
G_M61336_IG03:
- add rsp, 48
- pop rbp
ret
- ;; size=6 bbWeight=1 PerfScore 1.75
+ ;; size=1 bbWeight=1 PerfScore 1.00
+RWD00 dd 00000004h
-; Total bytes of code 108, prolog size 10, PerfScore 28.25, instruction count 24, allocated bytes for code 108 (MethodHash=f7431067) for method System.Numerics.Vector3:Shuffle(System.Numerics.Vector3,ubyte,ubyte,ubyte):System.Numerics.Vector3 (FullOpts)
+
+; Total bytes of code 86, prolog size 0, PerfScore 24.33, instruction count 17, allocated bytes for code 86 (MethodHash=f7431067) for method System.Numerics.Vector3:Shuffle(System.Numerics.Vector3,ubyte,ubyte,ubyte):System.Numerics.Vector3 (FullOpts)
; ============================================================ -21 (-22.34 % of base) - System.Numerics.Vector2:Shuffle(System.Numerics.Vector2,ubyte,ubyte):System.Numerics.Vector2 ; Assembly listing for method System.Numerics.Vector2:Shuffle(System.Numerics.Vector2,ubyte,ubyte):System.Numerics.Vector2 (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
-; rbp based frame
+; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
-; V00 arg0 [V00,T02] ( 3, 3 ) simd8 -> mm0 single-def <System.Numerics.Vector2>
+; V00 arg0 [V00,T03] ( 3, 3 ) simd8 -> mm0 single-def <System.Numerics.Vector2>
; V01 arg1 [V01,T00] ( 3, 3 ) ubyte -> rdi single-def
; V02 arg2 [V02,T01] ( 3, 3 ) ubyte -> rsi single-def
-; V03 OutArgs [V03 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V04 rat0 [V04,T03] ( 2, 4 ) simd16 -> [rbp-0x10] do-not-enreg[S] "return buffer for hwintrinsic" <System.Runtime.Intrinsics.Vector128`1[float]>
+;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V04 rat0 [V04,T02] ( 3, 6 ) simd16 -> mm1 "fgMakeTemp is creating a new local variable"
;
-; Lcl frame size = 48
+; Lcl frame size = 0
G_M58090_IG01:
- push rbp
- sub rsp, 48
- lea rbp, [rsp+0x30]
- ;; size=10 bbWeight=1 PerfScore 1.75
+ ;; size=0 bbWeight=1 PerfScore 0.00
G_M58090_IG02:
vinsertps xmm0, xmm0, xmm0, 60
- vmovups xmmword ptr [rsp], xmm0
- movzx rdi, dil
- vmovd xmm0, edi
- movzx rdi, sil
- vpinsrd xmm0, xmm0, edi, 1
- mov edi, 2
- vpinsrd xmm0, xmm0, edi, 2
- mov edi, 3
- vpinsrd xmm0, xmm0, edi, 3
- vmovups xmmword ptr [rsp+0x10], xmm0
- lea rdi, [rbp-0x10]
- mov rax, 0xD1FFAB1E ; code for System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]
- call [rax]System.Runtime.Intrinsics.Vector128:Shuffle(System.Runtime.Intrinsics.Vector128`1[float],System.Runtime.Intrinsics.Vector128`1[int]):System.Runtime.Intrinsics.Vector128`1[float]
- vmovsd xmm0, qword ptr [rbp-0x10]
- ;; size=78 bbWeight=1 PerfScore 21.75
+ movzx rax, dil
+ vmovd xmm1, eax
+ movzx rax, sil
+ vpinsrd xmm1, xmm1, eax, 1
+ mov eax, 2
+ vpinsrd xmm1, xmm1, eax, 2
+ mov eax, 3
+ vpinsrd xmm1, xmm1, eax, 3
+ vpermilps xmm0, xmm0, xmm1
+ vpcmpud k1, xmm1, dword ptr [reloc @RWD00] {1to4}, 1
+ vpmovm2d xmm1, k1
+ vandps xmm0, xmm1, xmm0
+ ;; size=72 bbWeight=1 PerfScore 20.33
G_M58090_IG03:
- add rsp, 48
- pop rbp
ret
- ;; size=6 bbWeight=1 PerfScore 1.75
+ ;; size=1 bbWeight=1 PerfScore 1.00
+RWD00 dd 00000004h
-; Total bytes of code 94, prolog size 10, PerfScore 25.25, instruction count 21, allocated bytes for code 94 (MethodHash=33e81d15) for method System.Numerics.Vector2:Shuffle(System.Numerics.Vector2,ubyte,ubyte):System.Numerics.Vector2 (FullOpts)
+
+; Total bytes of code 73, prolog size 0, PerfScore 21.33, instruction count 14, allocated bytes for code 73 (MethodHash=33e81d15) for method System.Numerics.Vector2:Shuffle(System.Numerics.Vector2,ubyte,ubyte):System.Numerics.Vector2 (FullOpts)
; ============================================================ -8 (-3.51 % of base) - System.IO.Hashing.XxHashShared:Accumulate(ulong,ulong,ulong,int,ubyte,int) ; Assembly listing for method System.IO.Hashing.XxHashShared:Accumulate(ulong,ulong,ulong,int,ubyte,int) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 14 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T05] ( 6, 6 ) long -> rdi single-def
; V01 arg1 [V01,T00] ( 8, 98 ) long -> rsi
; V02 arg2 [V02,T01] ( 8, 71 ) long -> registers
; V03 arg3 [V03,T04] ( 4, 7 ) int -> rcx single-def
; V04 arg4 [V04,T09] ( 3, 2.25) ubyte -> r8 single-def
; V05 arg5 [V05,T07] ( 4, 3.25) int -> r9 single-def
; V06 loc0 [V06,T06] ( 3, 6 ) long -> rdx
; V07 loc1 [V07,T08] ( 3, 5 ) long -> rax
; V08 loc2 [V08,T18] ( 7, 40 ) simd32 -> mm0 <System.Runtime.Intrinsics.Vector256`1[ulong]>
; V09 loc3 [V09,T19] ( 7, 40 ) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V10 loc4 [V10,T12] ( 0, 0 ) int -> zero-ref
;* V11 loc5 [V11,T11] ( 0, 0 ) int -> zero-ref
-; V12 loc6 [V12,T15] ( 4, 64 ) simd32 -> mm4 <System.Runtime.Intrinsics.Vector256`1[uint]>
+; V12 loc6 [V12,T15] ( 4, 64 ) simd32 -> mm3 <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V13 loc7 [V13 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ulong]>
;* V14 loc8 [V14 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ulong]>
;* V15 loc9 [V15 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ulong]>
;* V16 loc10 [V16 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ulong]>
;* V17 loc11 [V17 ] ( 0, 0 ) int -> zero-ref
;* V18 loc12 [V18 ] ( 0, 0 ) int -> zero-ref
;* V19 loc13 [V19 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[uint]>
;* V20 loc14 [V20 ] ( 0, 0 ) int -> zero-ref
;* V21 loc15 [V21 ] ( 0, 0 ) int -> zero-ref
;# V22 OutArgs [V22 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V23 tmp1 [V23,T22] ( 2, 8 ) simd32 -> mm4 "spilled call-like call argument"
-; V24 tmp2 [V24,T23] ( 2, 8 ) simd32 -> mm4 "spilled call-like call argument"
+; V23 tmp1 [V23,T21] ( 2, 8 ) simd32 -> mm3 "spilled call-like call argument"
+; V24 tmp2 [V24,T22] ( 2, 8 ) simd32 -> mm3 "spilled call-like call argument"
;* V25 tmp3 [V25 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
-; V26 tmp4 [V26,T13] ( 3, 96 ) simd32 -> mm5 "dup spill"
-; V27 tmp5 [V27,T16] ( 3, 48 ) simd32 -> mm4 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
+; V26 tmp4 [V26,T13] ( 3, 96 ) simd32 -> mm4 "dup spill"
+; V27 tmp5 [V27,T16] ( 3, 48 ) simd32 -> mm3 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V28 tmp6 [V28 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V29 tmp7 [V29 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V30 tmp8 [V30 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V31 tmp9 [V31 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V32 tmp10 [V32 ] ( 0, 0 ) simd32 -> zero-ref
;* V33 tmp11 [V33 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
-; V34 tmp12 [V34,T14] ( 3, 96 ) simd32 -> mm5 "dup spill"
-; V35 tmp13 [V35,T17] ( 3, 48 ) simd32 -> mm4 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
+; V34 tmp12 [V34,T14] ( 3, 96 ) simd32 -> mm4 "dup spill"
+; V35 tmp13 [V35,T17] ( 3, 48 ) simd32 -> mm3 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V36 tmp14 [V36 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V37 tmp15 [V37 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V38 tmp16 [V38 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V39 tmp17 [V39 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V40 tmp18 [V40 ] ( 0, 0 ) simd32 -> zero-ref
;* V41 tmp19 [V41 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V42 tmp20 [V42 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V43 tmp21 [V43 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
;* V44 tmp22 [V44 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V45 tmp23 [V45 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
;* V46 tmp24 [V46 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
; V47 cse0 [V47,T20] ( 3, 32.25) simd32 -> mm2 hoist "CSE #01: aggressive"
-; V48 cse1 [V48,T21] ( 3, 32.25) simd32 -> mm3 hoist "CSE #02: aggressive"
-; V49 cse2 [V49,T10] ( 2, 4.25) int -> r8 hoist "CSE #03: moderate"
-; V50 cse3 [V50,T24] ( 3, 6 ) simd32 -> mm5 "CSE #04: moderate"
-; V51 rat0 [V51,T03] ( 4, 12.25) int -> r9 "Trip count IV"
-; V52 rat1 [V52,T02] ( 4, 49 ) int -> r11 "Trip count IV"
+; V48 cse1 [V48,T10] ( 2, 4.25) int -> r8 hoist "CSE #02: moderate"
+; V49 cse2 [V49,T23] ( 3, 6 ) simd32 -> mm4 "CSE #03: moderate"
+; V50 rat0 [V50,T03] ( 4, 12.25) int -> r9 "Trip count IV"
+; V51 rat1 [V51,T02] ( 4, 49 ) int -> r11 "Trip count IV"
;
; Lcl frame size = 0
G_M26233_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M26233_IG02:
lea rax, [rdx+0x80]
vmovups ymm0, ymmword ptr [rdi]
vmovups ymm1, ymmword ptr [rdi+0x20]
test r9d, r9d
jle G_M26233_IG10
;; size=25 bbWeight=1 PerfScore 11.75
G_M26233_IG03:
movzx r8, r8b
vmovups ymm2, ymmword ptr [reloc @RWD00]
- vmovups ymm3, ymmword ptr [reloc @RWD32]
- ;; size=20 bbWeight=0.25 PerfScore 2.06
+ ;; size=12 bbWeight=0.25 PerfScore 1.06
G_M26233_IG04:
mov r10, rdx
test ecx, ecx
jle SHORT G_M26233_IG07
;; size=7 bbWeight=4 PerfScore 6.00
G_M26233_IG05:
mov r11d, ecx
align [0 bytes for IG06]
;; size=3 bbWeight=1 PerfScore 0.25
G_M26233_IG06:
- vmovups ymm4, ymmword ptr [r10]
- vmovups ymm5, ymmword ptr [rsi]
- vpxor ymm4, ymm5, ymm4
- vpermd ymm6, ymm2, ymm4
- vpmuludq ymm4, ymm6, ymm4
- vpermd ymm5, ymm3, ymm5
- vpaddq ymm0, ymm5, ymm0
- vpaddq ymm0, ymm0, ymm4
+ vmovups ymm3, ymmword ptr [r10]
+ vmovups ymm4, ymmword ptr [rsi]
+ vpxor ymm3, ymm4, ymm3
+ vpermd ymm5, ymm2, ymm3
+ vpmuludq ymm3, ymm5, ymm3
+ vpshufd ymm4, ymm4, 78
+ vpaddq ymm0, ymm4, ymm0
+ vpaddq ymm0, ymm0, ymm3
add rsi, 32
- vmovups ymm4, ymmword ptr [r10+0x20]
- vmovups ymm5, ymmword ptr [rsi]
- vpxor ymm4, ymm5, ymm4
- vpermd ymm6, ymm2, ymm4
- vpmuludq ymm4, ymm6, ymm4
- vpermd ymm5, ymm3, ymm5
- vpaddq ymm1, ymm5, ymm1
- vpaddq ymm1, ymm1, ymm4
+ vmovups ymm3, ymmword ptr [r10+0x20]
+ vmovups ymm4, ymmword ptr [rsi]
+ vpxor ymm3, ymm4, ymm3
+ vpermd ymm5, ymm2, ymm3
+ vpmuludq ymm3, ymm5, ymm3
+ vpshufd ymm4, ymm4, 78
+ vpaddq ymm1, ymm4, ymm1
+ vpaddq ymm1, ymm1, ymm3
add rsi, 32
add r10, 8
dec r11d
jne SHORT G_M26233_IG06
- ;; size=88 bbWeight=16 PerfScore 640.00
+ ;; size=88 bbWeight=16 PerfScore 608.00
G_M26233_IG07:
test r8d, r8d
je SHORT G_M26233_IG09
;; size=5 bbWeight=4 PerfScore 5.00
G_M26233_IG08:
- vmovups ymm4, ymmword ptr [rax]
- vpsrlq ymm5, ymm0, 47
- vpternlogq ymm4, ymm5, ymm0, -106
- vmovups ymm5, ymmword ptr [reloc @RWD64]
- vpmullq ymm0, ymm4, ymm5
- vmovups ymm4, ymmword ptr [rax+0x20]
- vpsrlq ymm6, ymm1, 47
- vpternlogq ymm4, ymm6, ymm1, -106
- vpmullq ymm1, ymm4, ymm5
+ vmovups ymm3, ymmword ptr [rax]
+ vpsrlq ymm4, ymm0, 47
+ vpternlogq ymm3, ymm4, ymm0, -106
+ vmovups ymm4, ymmword ptr [reloc @RWD32]
+ vpmullq ymm0, ymm3, ymm4
+ vmovups ymm3, ymmword ptr [rax+0x20]
+ vpsrlq ymm5, ymm1, 47
+ vpternlogq ymm3, ymm5, ymm1, -106
+ vpmullq ymm1, ymm3, ymm4
;; size=53 bbWeight=2 PerfScore 98.00
G_M26233_IG09:
dec r9d
jne G_M26233_IG04
;; size=9 bbWeight=4 PerfScore 5.00
G_M26233_IG10:
vmovups ymmword ptr [rdi], ymm0
vmovups ymmword ptr [rdi+0x20], ymm1
;; size=9 bbWeight=1 PerfScore 4.00
G_M26233_IG11:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
-RWD32 dq 0000000300000002h, 0000000100000000h, 0000000700000006h, 0000000500000004h
-RWD64 dq 000000009E3779B1h, 000000009E3779B1h, 000000009E3779B1h, 000000009E3779B1h
+RWD32 dq 000000009E3779B1h, 000000009E3779B1h, 000000009E3779B1h, 000000009E3779B1h
-; Total bytes of code 228, prolog size 4, PerfScore 775.81, instruction count 54, allocated bytes for code 228 (MethodHash=eac09986) for method System.IO.Hashing.XxHashShared:Accumulate(ulong,ulong,ulong,int,ubyte,int) (FullOpts)
+; Total bytes of code 220, prolog size 4, PerfScore 742.81, instruction count 53, allocated bytes for code 220 (MethodHash=eac09986) for method System.IO.Hashing.XxHashShared:Accumulate(ulong,ulong,ulong,int,ubyte,int) (FullOpts)
; ============================================================ -8 (-12.50 % of base) - System.IO.Hashing.XxHashShared:Accumulate256(System.Runtime.Intrinsics.Vector256`1[ulong],ulong,System.Runtime.Intrinsics.Vector256`1[uint]):System.Runtime.Intrinsics.Vector256`1[ulong] ; Assembly listing for method System.IO.Hashing.XxHashShared:Accumulate256(System.Runtime.Intrinsics.Vector256`1[ulong],ulong,System.Runtime.Intrinsics.Vector256`1[uint]):System.Runtime.Intrinsics.Vector256`1[ulong] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 1 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T04] ( 1, 1 ) simd32 -> [rbp+0x10] single-def <System.Runtime.Intrinsics.Vector256`1[ulong]>
; V02 arg1 [V02,T01] ( 3, 3 ) long -> rsi single-def
; V03 arg2 [V03,T05] ( 1, 1 ) simd32 -> [rbp+0x30] single-def <System.Runtime.Intrinsics.Vector256`1[uint]>
; V04 loc0 [V04,T03] ( 3, 3 ) simd32 -> mm1 <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V05 loc1 [V05 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V06 loc2 [V06 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V07 loc3 [V07 ] ( 0, 0 ) simd32 -> zero-ref <System.Runtime.Intrinsics.Vector256`1[ulong]>
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V09 tmp1 [V09,T02] ( 3, 6 ) simd32 -> mm0 "dup spill"
;* V10 tmp2 [V10 ] ( 0, 0 ) simd32 -> zero-ref
;
; Lcl frame size = 0
G_M28995_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M28995_IG02:
vmovups ymm0, ymmword ptr [rsi]
vpxor ymm1, ymm0, ymmword ptr [rbp+0x30]
vmovups ymm2, ymmword ptr [reloc @RWD00]
vpermd ymm2, ymm2, ymm1
vpmuludq ymm1, ymm2, ymm1
- vmovups ymm2, ymmword ptr [reloc @RWD32]
- vpermd ymm0, ymm2, ymm0
+ vpshufd ymm0, ymm0, 78
vpaddq ymm0, ymm0, ymmword ptr [rbp+0x10]
vpaddq ymm0, ymm0, ymm1
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=55 bbWeight=1 PerfScore 27.58
+ ;; size=47 bbWeight=1 PerfScore 22.58
G_M28995_IG03:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
-RWD32 dq 0000000300000002h, 0000000100000000h, 0000000700000006h, 0000000500000004h
-; Total bytes of code 64, prolog size 4, PerfScore 31.33, instruction count 16, allocated bytes for code 64 (MethodHash=161e8ebc) for method System.IO.Hashing.XxHashShared:Accumulate256(System.Runtime.Intrinsics.Vector256`1[ulong],ulong,System.Runtime.Intrinsics.Vector256`1[uint]):System.Runtime.Intrinsics.Vector256`1[ulong] (FullOpts)
+; Total bytes of code 56, prolog size 4, PerfScore 26.33, instruction count 15, allocated bytes for code 56 (MethodHash=161e8ebc) for method System.IO.Hashing.XxHashShared:Accumulate256(System.Runtime.Intrinsics.Vector256`1[ulong],ulong,System.Runtime.Intrinsics.Vector256`1[uint]):System.Runtime.Intrinsics.Vector256`1[ulong] (FullOpts)
; ============================================================ -8 (-6.61 % of base) - System.IO.Hashing.XxHashShared:Accumulate512(ulong,ulong,ulong) ; Assembly listing for method System.IO.Hashing.XxHashShared:Accumulate512(ulong,ulong,ulong) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T03] ( 3, 3 ) long -> rdi single-def
; V01 arg1 [V01,T04] ( 3, 3 ) long -> rsi single-def
; V02 arg2 [V02,T05] ( 3, 3 ) long -> rdx single-def
;# V03 OutArgs [V03 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V04 tmp1 [V04,T06] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
; V05 tmp2 [V05,T00] ( 7, 14 ) long -> rdi "Inlining Arg"
; V06 tmp3 [V06,T01] ( 5, 10 ) long -> rsi "Inlining Arg"
; V07 tmp4 [V07,T02] ( 5, 10 ) long -> rdx "Inlining Arg"
; V08 tmp5 [V08,T08] ( 4, 8 ) simd32 -> mm0 "impAppendStmt"
-; V09 tmp6 [V09,T09] ( 4, 8 ) simd32 -> registers "spilled call-like call argument"
+; V09 tmp6 [V09,T09] ( 4, 8 ) simd32 -> mm1 "spilled call-like call argument"
;* V10 tmp7 [V10 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-; V11 tmp8 [V11,T07] ( 6, 12 ) simd32 -> registers "dup spill"
-; V12 tmp9 [V12,T10] ( 6, 6 ) simd32 -> registers "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
+; V11 tmp8 [V11,T07] ( 6, 12 ) simd32 -> mm2 "dup spill"
+; V12 tmp9 [V12,T10] ( 6, 6 ) simd32 -> mm3 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V13 tmp10 [V13 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V14 tmp11 [V14 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V15 tmp12 [V15 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V16 tmp13 [V16 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V17 tmp14 [V17 ] ( 0, 0 ) simd32 -> zero-ref
; V18 cse0 [V18,T11] ( 3, 3 ) simd32 -> mm4 "CSE #01: moderate"
-; V19 cse1 [V19,T12] ( 3, 3 ) simd32 -> mm1 "CSE #02: moderate"
;
; Lcl frame size = 0
G_M64892_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M64892_IG02:
vmovups ymm0, ymmword ptr [rdi]
vmovups ymm1, ymmword ptr [rdx]
vmovups ymm2, ymmword ptr [rsi]
vpxor ymm3, ymm2, ymm1
vmovups ymm4, ymmword ptr [reloc @RWD00]
vpermd ymm1, ymm4, ymm3
vpmuludq ymm3, ymm1, ymm3
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vpermd ymm2, ymm1, ymm2
+ vpshufd ymm2, ymm2, 78
vpaddq ymm0, ymm2, ymm0
vpaddq ymm0, ymm0, ymm3
vmovups ymmword ptr [rdi], ymm0
add rdi, 32
add rdx, 32
add rsi, 32
vmovups ymm0, ymmword ptr [rdi]
- vmovups ymm2, ymmword ptr [rdx]
- vmovups ymm3, ymmword ptr [rsi]
- vpxor ymm2, ymm3, ymm2
- vpermd ymm4, ymm4, ymm2
- vpmuludq ymm2, ymm4, ymm2
- vpermd ymm1, ymm1, ymm3
- vpaddq ymm0, ymm1, ymm0
- vpaddq ymm0, ymm0, ymm2
+ vmovups ymm1, ymmword ptr [rdx]
+ vmovups ymm2, ymmword ptr [rsi]
+ vpxor ymm3, ymm2, ymm1
+ vpermd ymm1, ymm4, ymm3
+ vpmuludq ymm1, ymm1, ymm3
+ vpshufd ymm2, ymm2, 78
+ vpaddq ymm0, ymm2, ymm0
+ vpaddq ymm0, ymm0, ymm1
vmovups ymmword ptr [rdi], ymm0
- ;; size=112 bbWeight=1 PerfScore 60.75
+ ;; size=104 bbWeight=1 PerfScore 54.75
G_M64892_IG03:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
-RWD32 dq 0000000300000002h, 0000000100000000h, 0000000700000006h, 0000000500000004h
-; Total bytes of code 121, prolog size 4, PerfScore 64.50, instruction count 30, allocated bytes for code 121 (MethodHash=af380283) for method System.IO.Hashing.XxHashShared:Accumulate512(ulong,ulong,ulong) (FullOpts)
+; Total bytes of code 113, prolog size 4, PerfScore 58.50, instruction count 29, allocated bytes for code 113 (MethodHash=af380283) for method System.IO.Hashing.XxHashShared:Accumulate512(ulong,ulong,ulong) (FullOpts)
; ============================================================ -8 (-6.61 % of base) - System.IO.Hashing.XxHashShared:Accumulate512Inlined(ulong,ulong,ulong) ; Assembly listing for method System.IO.Hashing.XxHashShared:Accumulate512Inlined(ulong,ulong,ulong) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 1 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 8, 8 ) long -> rdi
; V01 arg1 [V01,T01] ( 6, 6 ) long -> rsi
; V02 arg2 [V02,T02] ( 6, 6 ) long -> rdx
;* V03 loc0 [V03,T03] ( 0, 0 ) int -> zero-ref
;* V04 loc1 [V04 ] ( 0, 0 ) int -> zero-ref
;* V05 loc2 [V05 ] ( 0, 0 ) int -> zero-ref
;* V06 loc3 [V06 ] ( 0, 0 ) long -> zero-ref
;* V07 loc4 [V07 ] ( 0, 0 ) long -> zero-ref
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V09 tmp1 [V09,T05] ( 4, 8 ) simd32 -> mm0 "impAppendStmt"
-; V10 tmp2 [V10,T06] ( 4, 8 ) simd32 -> registers "spilled call-like call argument"
+; V10 tmp2 [V10,T06] ( 4, 8 ) simd32 -> mm1 "spilled call-like call argument"
;* V11 tmp3 [V11 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
-; V12 tmp4 [V12,T04] ( 6, 12 ) simd32 -> registers "dup spill"
-; V13 tmp5 [V13,T07] ( 6, 6 ) simd32 -> registers "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
+; V12 tmp4 [V12,T04] ( 6, 12 ) simd32 -> mm2 "dup spill"
+; V13 tmp5 [V13,T07] ( 6, 6 ) simd32 -> mm3 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V14 tmp6 [V14 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V15 tmp7 [V15 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[uint]>
;* V16 tmp8 [V16 ] ( 0, 0 ) simd32 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V17 tmp9 [V17 ] ( 0, 0 ) simd32 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector256`1[ulong]>
;* V18 tmp10 [V18 ] ( 0, 0 ) simd32 -> zero-ref
; V19 cse0 [V19,T08] ( 3, 3 ) simd32 -> mm4 "CSE #01: moderate"
-; V20 cse1 [V20,T09] ( 3, 3 ) simd32 -> mm1 "CSE #02: moderate"
;
; Lcl frame size = 0
G_M8369_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M8369_IG02:
vmovups ymm0, ymmword ptr [rdi]
vmovups ymm1, ymmword ptr [rdx]
vmovups ymm2, ymmword ptr [rsi]
vpxor ymm3, ymm2, ymm1
vmovups ymm4, ymmword ptr [reloc @RWD00]
vpermd ymm1, ymm4, ymm3
vpmuludq ymm3, ymm1, ymm3
- vmovups ymm1, ymmword ptr [reloc @RWD32]
- vpermd ymm2, ymm1, ymm2
+ vpshufd ymm2, ymm2, 78
vpaddq ymm0, ymm2, ymm0
vpaddq ymm0, ymm0, ymm3
vmovups ymmword ptr [rdi], ymm0
add rdi, 32
add rdx, 32
add rsi, 32
vmovups ymm0, ymmword ptr [rdi]
- vmovups ymm2, ymmword ptr [rdx]
- vmovups ymm3, ymmword ptr [rsi]
- vpxor ymm2, ymm3, ymm2
- vpermd ymm4, ymm4, ymm2
- vpmuludq ymm2, ymm4, ymm2
- vpermd ymm1, ymm1, ymm3
- vpaddq ymm0, ymm1, ymm0
- vpaddq ymm0, ymm0, ymm2
+ vmovups ymm1, ymmword ptr [rdx]
+ vmovups ymm2, ymmword ptr [rsi]
+ vpxor ymm3, ymm2, ymm1
+ vpermd ymm1, ymm4, ymm3
+ vpmuludq ymm1, ymm1, ymm3
+ vpshufd ymm2, ymm2, 78
+ vpaddq ymm0, ymm2, ymm0
+ vpaddq ymm0, ymm0, ymm1
vmovups ymmword ptr [rdi], ymm0
- ;; size=112 bbWeight=1 PerfScore 60.75
+ ;; size=104 bbWeight=1 PerfScore 54.75
G_M8369_IG03:
vzeroupper
pop rbp
ret
;; size=5 bbWeight=1 PerfScore 2.50
RWD00 dq 0000000000000001h, 0000000000000003h, 0000000000000005h, 0000000000000007h
-RWD32 dq 0000000300000002h, 0000000100000000h, 0000000700000006h, 0000000500000004h
-; Total bytes of code 121, prolog size 4, PerfScore 64.50, instruction count 30, allocated bytes for code 121 (MethodHash=6712df4e) for method System.IO.Hashing.XxHashShared:Accumulate512Inlined(ulong,ulong,ulong) (FullOpts)
+; Total bytes of code 113, prolog size 4, PerfScore 58.50, instruction count 29, allocated bytes for code 113 (MethodHash=6712df4e) for method System.IO.Hashing.XxHashShared:Accumulate512Inlined(ulong,ulong,ulong) (FullOpts)
; ============================================================ -4 (-13.33 % of base) - System.Buffers.Binary.BinaryPrimitives+Int32EndiannessReverser:Reverse(System.Runtime.Intrinsics.Vector256`1[int]):System.Runtime.Intrinsics.Vector256`1[int] ; Assembly listing for method System.Buffers.Binary.BinaryPrimitives+Int32EndiannessReverser:Reverse(System.Runtime.Intrinsics.Vector256`1[int]):System.Runtime.Intrinsics.Vector256`1[int] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Runtime.Intrinsics.Vector256`1[int]>
;# V02 OutArgs [V02 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M36120_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M36120_IG02:
- vmovups ymm0, ymmword ptr [reloc @RWD00]
- vpermb ymm0, ymm0, ymmword ptr [rsp+0x08]
+ vmovups ymm0, ymmword ptr [rsp+0x08]
+ vpshufb ymm0, ymm0, ymmword ptr [reloc @RWD00]
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=26 bbWeight=1 PerfScore 10.25
+ ;; size=22 bbWeight=1 PerfScore 8.25
G_M36120_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-RWD00 dq 0405060700010203h, 0C0D0E0F08090A0Bh, 1415161710111213h, 1C1D1E1F18191A1Bh
+RWD00 dq 0405060700010203h, 0C0D0E0F08090A0Bh, 0405060700010203h, 0C0D0E0F08090A0Bh
-; Total bytes of code 30, prolog size 0, PerfScore 12.25, instruction count 6, allocated bytes for code 30 (MethodHash=6ced72e7) for method System.Buffers.Binary.BinaryPrimitives+Int32EndiannessReverser:Reverse(System.Runtime.Intrinsics.Vector256`1[int]):System.Runtime.Intrinsics.Vector256`1[int] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 10.25, instruction count 6, allocated bytes for code 26 (MethodHash=6ced72e7) for method System.Buffers.Binary.BinaryPrimitives+Int32EndiannessReverser:Reverse(System.Runtime.Intrinsics.Vector256`1[int]):System.Runtime.Intrinsics.Vector256`1[int] (FullOpts)
; ============================================================ -4 (-13.33 % of base) - System.Buffers.Binary.BinaryPrimitives+Int64EndiannessReverser:Reverse(System.Runtime.Intrinsics.Vector256`1[long]):System.Runtime.Intrinsics.Vector256`1[long] ; Assembly listing for method System.Buffers.Binary.BinaryPrimitives+Int64EndiannessReverser:Reverse(System.Runtime.Intrinsics.Vector256`1[long]):System.Runtime.Intrinsics.Vector256`1[long] (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; Final local variable assignments
;
; V00 RetBuf [V00,T00] ( 4, 4 ) byref -> rdi single-def
; V01 arg0 [V01,T01] ( 1, 1 ) simd32 -> [rsp+0x08] single-def <System.Runtime.Intrinsics.Vector256`1[long]>
;# V02 OutArgs [V02 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0
G_M59963_IG01:
;; size=0 bbWeight=1 PerfScore 0.00
G_M59963_IG02:
- vmovups ymm0, ymmword ptr [reloc @RWD00]
- vpermb ymm0, ymm0, ymmword ptr [rsp+0x08]
+ vmovups ymm0, ymmword ptr [rsp+0x08]
+ vpshufb ymm0, ymm0, ymmword ptr [reloc @RWD00]
vmovups ymmword ptr [rdi], ymm0
mov rax, rdi
- ;; size=26 bbWeight=1 PerfScore 10.25
+ ;; size=22 bbWeight=1 PerfScore 8.25
G_M59963_IG03:
vzeroupper
ret
;; size=4 bbWeight=1 PerfScore 2.00
-RWD00 dq 0001020304050607h, 08090A0B0C0D0E0Fh, 1011121314151617h, 18191A1B1C1D1E1Fh
+RWD00 dq 0001020304050607h, 08090A0B0C0D0E0Fh, 0001020304050607h, 08090A0B0C0D0E0Fh
-; Total bytes of code 30, prolog size 0, PerfScore 12.25, instruction count 6, allocated bytes for code 30 (MethodHash=7eb215c4) for method System.Buffers.Binary.BinaryPrimitives+Int64EndiannessReverser:Reverse(System.Runtime.Intrinsics.Vector256`1[long]):System.Runtime.Intrinsics.Vector256`1[long] (FullOpts)
+; Total bytes of code 26, prolog size 0, PerfScore 10.25, instruction count 6, allocated bytes for code 26 (MethodHash=7eb215c4) for method System.Buffers.Binary.BinaryPrimitives+Int64EndiannessReverser:Reverse(System.Runtime.Intrinsics.Vector256`1[long]):System.Runtime.Intrinsics.Vector256`1[long] (FullOpts)
; ============================================================ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Job completed in 23 minutes 31 seconds (remote runner delay: 1 minute 19 seconds).
dotnet/runtime#99596
Diffs
Diffs
Artifacts:
The text was updated successfully, but these errors were encountered: