-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
x86:Set preferred CPU features and default NT threshold for Zhaoxin p…
…rocessors
- Loading branch information
1 parent
3865fb2
commit 5dce854
Showing
5 changed files
with
239 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,9 @@ | ||
glibc (2.38-6deepin12) unstable; urgency=medium | ||
|
||
* x86: Set preferred CPU features and default NT threshold for Zhaoxin processors. | ||
|
||
-- May Shao <[email protected]> Mon, 20 Jan 2025 15:20:32 +0800 | ||
|
||
glibc (2.38-6deepin11) unstable; urgency=medium | ||
|
||
* LoongArch: Force SHMLBA the same as kernel | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
102 changes: 102 additions & 0 deletions
102
debian/patches/zhaoxin/0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
From 898620f8afab01837bc7e04bd89912bcb6803a79 Mon Sep 17 00:00:00 2001 | ||
From: MayShao-oc <[email protected]> | ||
Date: Thu, 16 Jan 2025 14:13:17 +0800 | ||
Subject: [PATCH 1/3] x86: Set preferred CPU features on the KH-40000 and | ||
KX-7000 Zhaoxin processors | ||
|
||
Fix code formatting under the Zhaoxin branch and add comments for | ||
different Zhaoxin models. | ||
|
||
Unaligned AVX loads are slower on KH-40000 and KX-7000, so disable | ||
AVX_Fast_Unaligned_Load. | ||
|
||
Enable the Prefer_No_VZEROUPPER and Fast_Unaligned_Load features to | ||
use the sse2_unaligned versions of memset, strcpy and strcat. | ||
|
||
Signed-off-by: MayShao-oc <[email protected]> | ||
--- | ||
sysdeps/x86/cpu-features.c | 51 ++++++++++++++++++++++++++------------ | ||
1 file changed, 35 insertions(+), 16 deletions(-) | ||
|
||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c | ||
index badf0888..43b5f562 100644 | ||
--- a/sysdeps/x86/cpu-features.c | ||
+++ b/sysdeps/x86/cpu-features.c | ||
@@ -907,39 +907,58 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht | ||
|
||
model += extended_model; | ||
if (family == 0x6) | ||
- { | ||
- if (model == 0xf || model == 0x19) | ||
- { | ||
+ { | ||
+ /* Tuning for older Zhaoxin processors. */ | ||
+ if (model == 0xf || model == 0x19) | ||
+ { | ||
CPU_FEATURE_UNSET (cpu_features, AVX); | ||
CPU_FEATURE_UNSET (cpu_features, AVX2); | ||
|
||
- cpu_features->preferred[index_arch_Slow_SSE4_2] | ||
- |= bit_arch_Slow_SSE4_2; | ||
+ cpu_features->preferred[index_arch_Slow_SSE4_2] | ||
+ |= bit_arch_Slow_SSE4_2; | ||
|
||
+ /* Unaligned AVX loads are slower. */ | ||
cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] | ||
- &= ~bit_arch_AVX_Fast_Unaligned_Load; | ||
- } | ||
- } | ||
+ &= ~bit_arch_AVX_Fast_Unaligned_Load; | ||
+ } | ||
+ } | ||
else if (family == 0x7) | ||
- { | ||
- if (model == 0x1b) | ||
+ { | ||
+ switch (model) | ||
{ | ||
+ /* Wudaokou microarch tuning. */ | ||
+ case 0x1b: | ||
CPU_FEATURE_UNSET (cpu_features, AVX); | ||
CPU_FEATURE_UNSET (cpu_features, AVX2); | ||
|
||
cpu_features->preferred[index_arch_Slow_SSE4_2] | ||
- |= bit_arch_Slow_SSE4_2; | ||
+ |= bit_arch_Slow_SSE4_2; | ||
|
||
cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] | ||
- &= ~bit_arch_AVX_Fast_Unaligned_Load; | ||
- } | ||
- else if (model == 0x3b) | ||
- { | ||
+ &= ~bit_arch_AVX_Fast_Unaligned_Load; | ||
+ break; | ||
+ | ||
+ /* Lujiazui microarch tuning. */ | ||
+ case 0x3b: | ||
CPU_FEATURE_UNSET (cpu_features, AVX); | ||
CPU_FEATURE_UNSET (cpu_features, AVX2); | ||
|
||
cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] | ||
- &= ~bit_arch_AVX_Fast_Unaligned_Load; | ||
+ &= ~bit_arch_AVX_Fast_Unaligned_Load; | ||
+ break; | ||
+ | ||
+ /* Yongfeng and Shijidadao mircoarch tuning. */ | ||
+ case 0x5b: | ||
+ case 0x6b: | ||
+ cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] | ||
+ &= ~bit_arch_AVX_Fast_Unaligned_Load; | ||
+ | ||
+ /* To use sse2_unaligned versions of memset, strcpy and strcat. | ||
+ */ | ||
+ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] | ||
+ |= (bit_arch_Prefer_No_VZEROUPPER | ||
+ | bit_arch_Fast_Unaligned_Load); | ||
+ break; | ||
} | ||
} | ||
} | ||
-- | ||
2.27.0 | ||
|
77 changes: 77 additions & 0 deletions
77
debian/patches/zhaoxin/0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
From bd766a57df22a8935470fa8e5f339947c0a11653 Mon Sep 17 00:00:00 2001 | ||
From: MayShao-oc <[email protected]> | ||
Date: Thu, 16 Jan 2025 14:15:31 +0800 | ||
Subject: [PATCH 2/3] x86_64: Optimize large size copy in memmove-ssse3 | ||
|
||
This patch optimizes large-size copies by using normal stores when src > dst | ||
and the buffers overlap, matching the logic in memmove-vec-unaligned-erms.S. | ||
|
||
Currently, memmove-ssse3 uses '__x86_shared_cache_size_half' as the | ||
non-temporal threshold; this patch changes it to use | ||
'__x86_shared_non_temporal_threshold' instead. | ||
'__x86_shared_non_temporal_threshold' is CPU-specific: different CPUs | ||
get different values based on the related nt-benchmark results. | ||
memmove-ssse3, however, ignores that tuning and uses | ||
'__x86_shared_cache_size_half', which is unreasonable. | ||
|
||
Performance does not change drastically: the results show a modest overall | ||
improvement, with no major regressions or gains. | ||
|
||
Results on Zhaoxin KX-7000: | ||
|
||
bench-memcpy geometric_mean(N=20) New / Original: 0.999 | ||
bench-memcpy-random geometric_mean(N=20) New / Original: 0.999 | ||
bench-memcpy-large geometric_mean(N=20) New / Original: 0.978 | ||
bench-memmove geometric_mean(N=20) New / Original: 1.000 | ||
bench-memmove-large geometric_mean(N=20) New / Original: 0.962 | ||
|
||
Results on Intel Core i5-6600K: | ||
|
||
bench-memcpy geometric_mean(N=20) New / Original: 1.001 | ||
bench-memcpy-random geometric_mean(N=20) New / Original: 0.999 | ||
bench-memcpy-large geometric_mean(N=20) New / Original: 1.001 | ||
bench-memmove geometric_mean(N=20) New / Original: 0.995 | ||
bench-memmove-large geometric_mean(N=20) New / Original: 0.936 | ||
|
||
Signed-off-by: MayShao-oc <[email protected]> | ||
--- | ||
sysdeps/x86_64/multiarch/memmove-ssse3.S | 14 +++++++++----- | ||
1 file changed, 9 insertions(+), 5 deletions(-) | ||
|
||
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S | ||
index 460b0ec0..69561628 100644 | ||
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S | ||
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S | ||
@@ -151,13 +151,10 @@ L(more_2x_vec): | ||
loop. */ | ||
movups %xmm0, (%rdi) | ||
|
||
-# ifdef SHARED_CACHE_SIZE_HALF | ||
- cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP | ||
-# else | ||
- cmp __x86_shared_cache_size_half(%rip), %rdx | ||
-# endif | ||
+ cmp __x86_shared_non_temporal_threshold(%rip), %rdx | ||
ja L(large_memcpy) | ||
|
||
+L(loop_fwd): | ||
leaq -64(%rdi, %rdx), %r8 | ||
andq $-16, %rdi | ||
movl $48, %edx | ||
@@ -199,6 +196,13 @@ L(large_memcpy): | ||
movups -64(%r9, %rdx), %xmm10 | ||
movups -80(%r9, %rdx), %xmm11 | ||
|
||
+ /* Check if src and dst overlap. If they do use cacheable | ||
+ writes to potentially gain positive interference between | ||
+ the loads during the memmove. */ | ||
+ subq %rdi, %r9 | ||
+ cmpq %rdx, %r9 | ||
+ jb L(loop_fwd) | ||
+ | ||
sall $5, %ecx | ||
leal (%rcx, %rcx, 2), %r8d | ||
leaq -96(%rdi, %rdx), %rcx | ||
-- | ||
2.27.0 | ||
|
50 changes: 50 additions & 0 deletions
50
debian/patches/zhaoxin/0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
From e2792bd9be6e2df3b12e887b90f0daebad971d4a Mon Sep 17 00:00:00 2001 | ||
From: MayShao-oc <[email protected]> | ||
Date: Thu, 16 Jan 2025 14:22:44 +0800 | ||
Subject: [PATCH 3/3] x86: Set default non_temporal_threshold for Zhaoxin | ||
processors | ||
|
||
Currently, 'non_temporal_threshold' is set to | ||
'non_temporal_threshold_lowbound' on Zhaoxin processors without ERMS. | ||
The default 'non_temporal_threshold_lowbound' is too small for the | ||
KH-40000 and KX-7000 Zhaoxin processors, so this patch updates the value | ||
to 'shared / cachesize_non_temporal_divisor'. | ||
|
||
Signed-off-by: MayShao-oc <[email protected]> | ||
--- | ||
sysdeps/x86/cpu-features.c | 1 + | ||
sysdeps/x86/dl-cacheinfo.h | 6 ++++-- | ||
2 files changed, 5 insertions(+), 2 deletions(-) | ||
|
||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c | ||
index 43b5f562..f752ebd2 100644 | ||
--- a/sysdeps/x86/cpu-features.c | ||
+++ b/sysdeps/x86/cpu-features.c | ||
@@ -949,6 +949,7 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht | ||
|
||
/* Yongfeng and Shijidadao mircoarch tuning. */ | ||
case 0x5b: | ||
+ cpu_features->cachesize_non_temporal_divisor = 2; | ||
case 0x6b: | ||
cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load] | ||
&= ~bit_arch_AVX_Fast_Unaligned_Load; | ||
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h | ||
index cd4d0351..2c5b6d69 100644 | ||
--- a/sysdeps/x86/dl-cacheinfo.h | ||
+++ b/sysdeps/x86/dl-cacheinfo.h | ||
@@ -769,8 +769,10 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) | ||
/* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run | ||
a higher risk of actually thrashing the cache as they don't have a HW LRU | ||
hint. As well, their performance in highly parallel situations is | ||
- noticeably worse. */ | ||
- if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) | ||
+ noticeably worse. Zhaoxin processors are an exception, the lowbound is not | ||
+ suitable for them based on actual test data. */ | ||
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS) | ||
+ && cpu_features->basic.kind != arch_kind_zhaoxin) | ||
non_temporal_threshold = non_temporal_threshold_lowbound; | ||
/* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of | ||
'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best | ||
-- | ||
2.27.0 | ||
|