From 5dce854e94de21b289b36cc6453acfe12df0b937 Mon Sep 17 00:00:00 2001
From: MayShao-oc
Date: Thu, 23 Jan 2025 16:29:31 +0800
Subject: [PATCH] x86: Set preferred CPU features and default NT threshold
 for Zhaoxin processors

---
 debian/changelog | 6 ++
 debian/patches/series | 4 +
 ...d-CPU-features-on-the-KH-40000-and-K.patch | 102 ++++++++++++++++++
 ...ize-large-size-copy-in-memmove-ssse3.patch | 77 +++++++++++++
 ...non_temporal_threshold-for-Zhaoxin-p.patch | 50 +++++++++
 5 files changed, 239 insertions(+)
 create mode 100644 debian/patches/zhaoxin/0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
 create mode 100644 debian/patches/zhaoxin/0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
 create mode 100644 debian/patches/zhaoxin/0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch

diff --git a/debian/changelog b/debian/changelog
index 96689479..e8bc28c2 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+glibc (2.38-6deepin12) unstable; urgency=medium
+
+  * x86: Set preferred CPU features and default NT threshold for Zhaoxin processors.
+
+ -- May Shao  Mon, 20 Jan 2025 15:20:32 +0800
+
 glibc (2.38-6deepin11) unstable; urgency=medium
 
   * LoongArch: Force SHMLBA the same as kernel
diff --git a/debian/patches/series b/debian/patches/series
index 43c3e4ee..8fd214e9 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -159,3 +159,7 @@
 loong64/0048-nptl-fix-__builtin_thread_pointer-detection-on-Loong.patch
 loong64/0049-LoongArch-Force-SHMLBA-the-same-as-kernel.patch
 iconvdata/add_GB18030-2022_charmap_and_test_the_entire_GB18030_charmap.patch
+
+zhaoxin/0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
+zhaoxin/0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
+zhaoxin/0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
diff --git a/debian/patches/zhaoxin/0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch b/debian/patches/zhaoxin/0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
new file mode 100644
index 00000000..b52cba12
--- /dev/null
+++ b/debian/patches/zhaoxin/0001-x86-Set-preferred-CPU-features-on-the-KH-40000-and-K.patch
@@ -0,0 +1,102 @@
+From 898620f8afab01837bc7e04bd89912bcb6803a79 Mon Sep 17 00:00:00 2001
+From: MayShao-oc
+Date: Thu, 16 Jan 2025 14:13:17 +0800
+Subject: [PATCH 1/3] x86: Set preferred CPU features on the KH-40000 and
+ KX-7000 Zhaoxin processors
+
+Fix code formatting under the Zhaoxin branch and add comments for
+different Zhaoxin models.
+
+Unaligned AVX loads are slower on the KH-40000 and KX-7000, so disable
+AVX_Fast_Unaligned_Load.
+
+Enable the Prefer_No_VZEROUPPER and Fast_Unaligned_Load features to
+use the sse2_unaligned versions of memset, strcpy and strcat.
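+
+For context, an illustrative sketch of how an ifunc selector consumes
+these preferred bits (simplified, not the verbatim glibc source; the
+real selectors live in sysdeps/x86_64/multiarch/ifunc-*.h). Clearing
+AVX_Fast_Unaligned_Load is what steers these models away from the AVX
+paths:
+
+    /* Sketch only: pick an AVX memmove variant only when unaligned
+       AVX loads are known to be fast on this CPU.  */
+    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+      return OPTIMIZE (memmove_avx_unaligned_erms);
+    /* With the bit cleared above, these models fall back to a
+       non-AVX variant.  */
+    return OPTIMIZE (memmove_sse2_unaligned);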
+
+Signed-off-by: MayShao-oc
+---
+ sysdeps/x86/cpu-features.c | 51 ++++++++++++++++++++++++++------------
+ 1 file changed, 35 insertions(+), 16 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index badf0888..43b5f562 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -907,39 +907,58 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ 
+       model += extended_model;
+       if (family == 0x6)
+-        {
+-          if (model == 0xf || model == 0x19)
+-            {
++        {
++          /* Tuning for older Zhaoxin processors.  */
++          if (model == 0xf || model == 0x19)
++            {
+               CPU_FEATURE_UNSET (cpu_features, AVX);
+               CPU_FEATURE_UNSET (cpu_features, AVX2);
+ 
+-              cpu_features->preferred[index_arch_Slow_SSE4_2]
+-                |= bit_arch_Slow_SSE4_2;
++              cpu_features->preferred[index_arch_Slow_SSE4_2]
++                  |= bit_arch_Slow_SSE4_2;
+ 
++              /* Unaligned AVX loads are slower.  */
+               cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
+-                &= ~bit_arch_AVX_Fast_Unaligned_Load;
+-            }
+-        }
++                  &= ~bit_arch_AVX_Fast_Unaligned_Load;
++            }
++        }
+       else if (family == 0x7)
+-        {
+-          if (model == 0x1b)
++        {
++          switch (model)
+             {
++              /* Wudaokou microarch tuning.  */
++            case 0x1b:
+               CPU_FEATURE_UNSET (cpu_features, AVX);
+               CPU_FEATURE_UNSET (cpu_features, AVX2);
+ 
+               cpu_features->preferred[index_arch_Slow_SSE4_2]
+-                |= bit_arch_Slow_SSE4_2;
++                  |= bit_arch_Slow_SSE4_2;
+ 
+               cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
+-                &= ~bit_arch_AVX_Fast_Unaligned_Load;
+-            }
+-          else if (model == 0x3b)
+-            {
++                  &= ~bit_arch_AVX_Fast_Unaligned_Load;
++              break;
++
++              /* Lujiazui microarch tuning.  */
++            case 0x3b:
+               CPU_FEATURE_UNSET (cpu_features, AVX);
+               CPU_FEATURE_UNSET (cpu_features, AVX2);
+ 
+               cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
+-                &= ~bit_arch_AVX_Fast_Unaligned_Load;
++                  &= ~bit_arch_AVX_Fast_Unaligned_Load;
++              break;
++
++              /* Yongfeng and Shijidadao microarch tuning.  */
++            case 0x5b:
++            case 0x6b:
++              cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
++                  &= ~bit_arch_AVX_Fast_Unaligned_Load;
++
++              /* To use sse2_unaligned versions of memset, strcpy and strcat.
++               */
++              cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
++                  |= (bit_arch_Prefer_No_VZEROUPPER
++                      | bit_arch_Fast_Unaligned_Load);
++              break;
+             }
+         }
+     }
+-- 
+2.27.0
+
diff --git a/debian/patches/zhaoxin/0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch b/debian/patches/zhaoxin/0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
new file mode 100644
index 00000000..6253af83
--- /dev/null
+++ b/debian/patches/zhaoxin/0002-x86_64-Optimize-large-size-copy-in-memmove-ssse3.patch
@@ -0,0 +1,77 @@
+From bd766a57df22a8935470fa8e5f339947c0a11653 Mon Sep 17 00:00:00 2001
+From: MayShao-oc
+Date: Thu, 16 Jan 2025 14:15:31 +0800
+Subject: [PATCH 2/3] x86_64: Optimize large size copy in memmove-ssse3
+
+This patch optimizes large size copies by using normal (cacheable)
+stores when src > dst and the regions overlap, matching the logic in
+memmove-vec-unaligned-erms.S.
+
+memmove-ssse3 currently uses '__x86_shared_cache_size_half' as the
+non-temporal threshold; this patch updates it to
+'__x86_shared_non_temporal_threshold'. The latter is CPU-specific, and
+different CPUs get different values based on the related nt-benchmark
+results, so hard-coding half the shared cache size in memmove-ssse3 is
+unreasonable.
+
+Performance does not change drastically: the benchmarks show overall
+improvements without any major regressions or gains.
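+
+In C terms, the added overlap check amounts to the following sketch
+(illustrative only; in the .S file below, %r9 holds src, %rdi holds
+dst and %rdx holds the length):
+
+    /* Unsigned compare: (src - dst) < len holds exactly when
+       dst < src < dst + len, i.e. src > dst with overlap.  In that
+       case take the cacheable forward loop instead of the
+       non-temporal path.  */
+    if ((uintptr_t) src - (uintptr_t) dst < len)
+      goto loop_fwd;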
+
+Results on Zhaoxin KX-7000:
+
+bench-memcpy geometric_mean(N=20) New / Original: 0.999
+bench-memcpy-random geometric_mean(N=20) New / Original: 0.999
+bench-memcpy-large geometric_mean(N=20) New / Original: 0.978
+bench-memmove geometric_mean(N=20) New / Original: 1.000
+bench-memmove-large geometric_mean(N=20) New / Original: 0.962
+
+Results on Intel Core i5-6600K:
+
+bench-memcpy geometric_mean(N=20) New / Original: 1.001
+bench-memcpy-random geometric_mean(N=20) New / Original: 0.999
+bench-memcpy-large geometric_mean(N=20) New / Original: 1.001
+bench-memmove geometric_mean(N=20) New / Original: 0.995
+bench-memmove-large geometric_mean(N=20) New / Original: 0.936
+
+Signed-off-by: MayShao-oc
+---
+ sysdeps/x86_64/multiarch/memmove-ssse3.S | 14 +++++++++-----
+ 1 file changed, 9 insertions(+), 5 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
+index 460b0ec0..69561628 100644
+--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
++++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
+@@ -151,13 +151,10 @@ L(more_2x_vec):
+ 	   loop.  */
+ 	movups	%xmm0, (%rdi)
+ 
+-# ifdef SHARED_CACHE_SIZE_HALF
+-	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
+-# else
+-	cmp	__x86_shared_cache_size_half(%rip), %rdx
+-# endif
++	cmp	__x86_shared_non_temporal_threshold(%rip), %rdx
+ 	ja	L(large_memcpy)
+ 
++L(loop_fwd):
+ 	leaq	-64(%rdi, %rdx), %r8
+ 	andq	$-16, %rdi
+ 	movl	$48, %edx
+@@ -199,6 +196,13 @@ L(large_memcpy):
+ 	movups	-64(%r9, %rdx), %xmm10
+ 	movups	-80(%r9, %rdx), %xmm11
+ 
++	/* Check if src and dst overlap.  If they do, use cacheable
++	   writes to potentially gain positive interference between
++	   the loads during the memmove.  */
++	subq	%rdi, %r9
++	cmpq	%rdx, %r9
++	jb	L(loop_fwd)
++
+ 	sall	$5, %ecx
+ 	leal	(%rcx, %rcx, 2), %r8d
+ 	leaq	-96(%rdi, %rdx), %rcx
+-- 
+2.27.0
+
diff --git a/debian/patches/zhaoxin/0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch b/debian/patches/zhaoxin/0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
new file mode 100644
index 00000000..966e35aa
--- /dev/null
+++ b/debian/patches/zhaoxin/0003-x86-Set-default-non_temporal_threshold-for-Zhaoxin-p.patch
@@ -0,0 +1,50 @@
+From e2792bd9be6e2df3b12e887b90f0daebad971d4a Mon Sep 17 00:00:00 2001
+From: MayShao-oc
+Date: Thu, 16 Jan 2025 14:22:44 +0800
+Subject: [PATCH 3/3] x86: Set default non_temporal_threshold for Zhaoxin
+ processors
+
+Currently, 'non_temporal_threshold' is set to
+'non_temporal_threshold_lowbound' on Zhaoxin processors without ERMS.
+That default lowbound is too small for the KH-40000 and KX-7000 Zhaoxin
+processors, so this patch updates the value to
+'shared / cachesize_non_temporal_divisor'.
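+
+The resulting selection logic is roughly the following (a simplified
+sketch of dl_init_cacheinfo after this patch, not the verbatim
+source):
+
+    /* Default: scale the shared (L3) cache size by the per-CPU
+       divisor; the cpu-features.c hunk below sets the divisor to 2
+       for model 0x5b.  */
+    unsigned long int non_temporal_threshold
+        = shared / cachesize_non_temporal_divisor;
+
+    /* Without ERMS the lowbound is normally safer, but measured data
+       shows it is too small on these Zhaoxin processors, so they keep
+       the scaled value.  */
+    if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)
+        && cpu_features->basic.kind != arch_kind_zhaoxin)
+      non_temporal_threshold = non_temporal_threshold_lowbound;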
+
+Signed-off-by: MayShao-oc
+---
+ sysdeps/x86/cpu-features.c | 1 +
+ sysdeps/x86/dl-cacheinfo.h | 6 ++++--
+ 2 files changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 43b5f562..f752ebd2 100644
+--- a/sysdeps/x86/cpu-features.c
++++ b/sysdeps/x86/cpu-features.c
+@@ -949,6 +949,7 @@ https://www.intel.com/content/www/us/en/support/articles/000059422/processors.ht
+ 
+               /* Yongfeng and Shijidadao microarch tuning.  */
+             case 0x5b:
++              cpu_features->cachesize_non_temporal_divisor = 2;
+             case 0x6b:
+               cpu_features->preferred[index_arch_AVX_Fast_Unaligned_Load]
+                   &= ~bit_arch_AVX_Fast_Unaligned_Load;
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index cd4d0351..2c5b6d69 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
++++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -769,8 +769,10 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+   /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
+      a higher risk of actually thrashing the cache as they don't have a HW LRU
+      hint. As well, their performance in highly parallel situations is
+-     noticeably worse.  */
+-  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++     noticeably worse. Zhaoxin processors are an exception; the lowbound is
++     not suitable for them based on actual test data.  */
++  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)
++      && cpu_features->basic.kind != arch_kind_zhaoxin)
+     non_temporal_threshold = non_temporal_threshold_lowbound;
+   /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
+      'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
-- 
2.27.0
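
A worked example of the net effect of patch 0003 (numbers hypothetical,
for illustration only): on a part reporting a 32 MiB shared L3 with
cachesize_non_temporal_divisor == 2, dl_init_cacheinfo computes

    non_temporal_threshold = shared / cachesize_non_temporal_divisor;
    /* = 32 MiB / 2 = 16 MiB, rather than the much smaller
       non_temporal_threshold_lowbound.  */

so the copy routines switch to non-temporal stores only above 16 MiB.
The value can still be overridden at run time through the
glibc.cpu.x86_non_temporal_threshold tunable.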