author     Scott B  2021-11-23 12:43:25 -0800
committer  Scott B  2021-11-23 12:44:34 -0800
commit     fd405d8edf8cc076afc0ad614256aae305c8cd0e (patch)
tree       821d7da1246715ab1d08e476a2ad8b055d9367e4
parent     58692b1f2a4ed01712a05d0ab4db3b4adb9ff7ed (diff)
download   aur-fd405d8edf8cc076afc0ad614256aae305c8cd0e.tar.gz
patch: tcp csum optimization
-rw-r--r--  .SRCINFO                                        2
-rw-r--r--  PKGBUILD                                        4
-rw-r--r--  x86-csum-Rewrite-optimize-csum_partial.patch  269
3 files changed, 275 insertions, 0 deletions
diff --git a/.SRCINFO b/.SRCINFO
index d109ad2a6c86..c951d79451a9 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -31,6 +31,7 @@ pkgbase = linux-xanmod-rog
source = zstd-udpate-fixes.patch
source = x86-ACPI-State-Optimize-C3-entry-on-AMD-CPUs.patch
source = x86-change-default-to-spec_store_bypass_disable-prct.patch
+ source = x86-csum-Rewrite-optimize-csum_partial.patch
source = acpi-battery-Always-read-fresh-battery-state-on-update.patch
source = cfg80211-dont-WARN-if-a-self-managed-device.patch
source = HID-asus-Reduce-object-size-by-consolidating-calls.patch
@@ -64,6 +65,7 @@ pkgbase = linux-xanmod-rog
sha256sums = d636bd74a71b2d898b20246e3c013b853fd1a462ed622e7e90302d53b4157428
sha256sums = 923230ed8367e28adfdeed75d3cdba9eec6b781818c37f6f3d3eb64101d2e716
sha256sums = cc401107f1bf7b7d8e8a78ee594f9db4b6fa252b7239b6aa88f678aef84d935c
+ sha256sums = 261807a9bc838709bd04e65a83eba2fefd8554699e5dfc8da9a1ee8499807813
sha256sums = f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90
sha256sums = 3d8961438b5c8110588ff0b881d472fc71a4304d306808d78a4055a4150f351e
sha256sums = 544464bf0807b324120767d55867f03014a9fda4e1804768ca341be902d7ade4
diff --git a/PKGBUILD b/PKGBUILD
index 4b8b3167299a..124c4d874727 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -146,6 +146,9 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar
# 5.16 spectre defaults
"x86-change-default-to-spec_store_bypass_disable-prct.patch"
+ # 5.17 TCP csum optimization
+ "x86-csum-Rewrite-optimize-csum_partial.patch"
+
# -- patch from Chromium developers; more accurately report battery state changes
"acpi-battery-Always-read-fresh-battery-state-on-update.patch"
@@ -194,6 +197,7 @@ sha256sums=('57b2cf6991910e3b67a1b3490022e8a0674b6965c74c12da1e99d138d1991ee8'
'd636bd74a71b2d898b20246e3c013b853fd1a462ed622e7e90302d53b4157428'
'923230ed8367e28adfdeed75d3cdba9eec6b781818c37f6f3d3eb64101d2e716'
'cc401107f1bf7b7d8e8a78ee594f9db4b6fa252b7239b6aa88f678aef84d935c'
+ '261807a9bc838709bd04e65a83eba2fefd8554699e5dfc8da9a1ee8499807813'
'f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90'
'3d8961438b5c8110588ff0b881d472fc71a4304d306808d78a4055a4150f351e'
'544464bf0807b324120767d55867f03014a9fda4e1804768ca341be902d7ade4'
diff --git a/x86-csum-Rewrite-optimize-csum_partial.patch b/x86-csum-Rewrite-optimize-csum_partial.patch
new file mode 100644
index 000000000000..70a8faa9eec8
--- /dev/null
+++ b/x86-csum-Rewrite-optimize-csum_partial.patch
@@ -0,0 +1,269 @@
+From 81fd3a6fc35b5ac7669459569d918b6f8e29a97f Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 12 Nov 2021 08:19:50 -0800
+Subject: [PATCH] x86/csum: Rewrite/optimize csum_partial()
+
+With more NICs supporting CHECKSUM_COMPLETE, and IPv6 being widely
+used, csum_partial() is heavily used on small numbers of bytes, and
+is consuming many cycles.
+
+The IPv6 header size, for instance, is 40 bytes.
+
+Another thing to consider is that NET_IP_ALIGN is 0 on x86, meaning
+that network headers are not word-aligned, unless the driver forces
+this.
+
+This means that csum_partial() fetches one u16 to 'align the buffer',
+then performs three u64 additions with carry in a loop, then a
+remaining u32, then a remaining u16.
+
+With this new version, we perform a loop only for the 64-byte blocks,
+then the remainder is bisected.
+
+Tested on various CPUs, all of them show a big reduction in
+csum_partial() cost (by 50 to 80%).
+
+Before:
+ 4.16% [kernel] [k] csum_partial
+After:
+ 0.83% [kernel] [k] csum_partial
+
+If run in a loop 1,000,000 times:
+
+Before:
+ 26,922,913 cycles # 3846130.429 GHz
+ 80,302,961 instructions # 2.98 insn per cycle
+ 21,059,816 branches # 3008545142.857 M/sec
+ 2,896 branch-misses # 0.01% of all branches
+After:
+ 17,960,709 cycles # 3592141.800 GHz
+ 41,292,805 instructions # 2.30 insn per cycle
+ 11,058,119 branches # 2211623800.000 M/sec
+ 2,997 branch-misses # 0.03% of all branches
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
+Link: https://lore.kernel.org/r/20211112161950.528886-1-eric.dumazet@gmail.com
+---
+ arch/x86/lib/csum-partial_64.c | 184 ++++++++++++++-------------------
+ 1 file changed, 78 insertions(+), 106 deletions(-)
+
+diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
+index e7925d668b68..5ec35626945b 100644
+--- a/arch/x86/lib/csum-partial_64.c
++++ b/arch/x86/lib/csum-partial_64.c
+@@ -9,6 +9,7 @@
+ #include <linux/compiler.h>
+ #include <linux/export.h>
+ #include <asm/checksum.h>
++#include <asm/word-at-a-time.h>
+
+ static inline unsigned short from32to16(unsigned a)
+ {
+@@ -21,120 +22,92 @@ static inline unsigned short from32to16(unsigned a)
+ }
+
+ /*
+- * Do a 64-bit checksum on an arbitrary memory area.
++ * Do a checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+- *
+- * Things tried and found to not make it faster:
+- * Manual Prefetching
+- * Unrolling to an 128 bytes inner loop.
+- * Using interleaving with more registers to break the carry chains.
+- */
+-static unsigned do_csum(const unsigned char *buff, unsigned len)
+-{
+- unsigned odd, count;
+- unsigned long result = 0;
+-
+- if (unlikely(len == 0))
+- return result;
+- odd = 1 & (unsigned long) buff;
+- if (unlikely(odd)) {
+- result = *buff << 8;
+- len--;
+- buff++;
+- }
+- count = len >> 1; /* nr of 16-bit words.. */
+- if (count) {
+- if (2 & (unsigned long) buff) {
+- result += *(unsigned short *)buff;
+- count--;
+- len -= 2;
+- buff += 2;
+- }
+- count >>= 1; /* nr of 32-bit words.. */
+- if (count) {
+- unsigned long zero;
+- unsigned count64;
+- if (4 & (unsigned long) buff) {
+- result += *(unsigned int *) buff;
+- count--;
+- len -= 4;
+- buff += 4;
+- }
+- count >>= 1; /* nr of 64-bit words.. */
+-
+- /* main loop using 64byte blocks */
+- zero = 0;
+- count64 = count >> 3;
+- while (count64) {
+- asm("addq 0*8(%[src]),%[res]\n\t"
+- "adcq 1*8(%[src]),%[res]\n\t"
+- "adcq 2*8(%[src]),%[res]\n\t"
+- "adcq 3*8(%[src]),%[res]\n\t"
+- "adcq 4*8(%[src]),%[res]\n\t"
+- "adcq 5*8(%[src]),%[res]\n\t"
+- "adcq 6*8(%[src]),%[res]\n\t"
+- "adcq 7*8(%[src]),%[res]\n\t"
+- "adcq %[zero],%[res]"
+- : [res] "=r" (result)
+- : [src] "r" (buff), [zero] "r" (zero),
+- "[res]" (result));
+- buff += 64;
+- count64--;
+- }
+-
+- /* last up to 7 8byte blocks */
+- count %= 8;
+- while (count) {
+- asm("addq %1,%0\n\t"
+- "adcq %2,%0\n"
+- : "=r" (result)
+- : "m" (*(unsigned long *)buff),
+- "r" (zero), "0" (result));
+- --count;
+- buff += 8;
+- }
+- result = add32_with_carry(result>>32,
+- result&0xffffffff);
+-
+- if (len & 4) {
+- result += *(unsigned int *) buff;
+- buff += 4;
+- }
+- }
+- if (len & 2) {
+- result += *(unsigned short *) buff;
+- buff += 2;
+- }
+- }
+- if (len & 1)
+- result += *buff;
+- result = add32_with_carry(result>>32, result & 0xffffffff);
+- if (unlikely(odd)) {
+- result = from32to16(result);
+- result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+- }
+- return result;
+-}
+-
+-/*
+- * computes the checksum of a memory block at buff, length len,
+- * and adds in "sum" (32-bit)
+- *
+- * returns a 32-bit number suitable for feeding into itself
+- * or csum_tcpudp_magic
+- *
+- * this function must be called with even lengths, except
+- * for the last fragment, which may be odd
+ *
++ * Still, with CHECKSUM_COMPLETE this is called to compute
++ * checksums on IPv6 headers (40 bytes) and other small parts.
+ * it's best to have buff aligned on a 64-bit boundary
+ */
+ __wsum csum_partial(const void *buff, int len, __wsum sum)
+ {
+- return (__force __wsum)add32_with_carry(do_csum(buff, len),
+- (__force u32)sum);
++ u64 temp64 = (__force u64)sum;
++ unsigned odd, result;
++
++ odd = 1 & (unsigned long) buff;
++ if (unlikely(odd)) {
++ if (unlikely(len == 0))
++ return sum;
++ temp64 += (*(unsigned char *)buff << 8);
++ len--;
++ buff++;
++ }
++
++ while (unlikely(len >= 64)) {
++ asm("addq 0*8(%[src]),%[res]\n\t"
++ "adcq 1*8(%[src]),%[res]\n\t"
++ "adcq 2*8(%[src]),%[res]\n\t"
++ "adcq 3*8(%[src]),%[res]\n\t"
++ "adcq 4*8(%[src]),%[res]\n\t"
++ "adcq 5*8(%[src]),%[res]\n\t"
++ "adcq 6*8(%[src]),%[res]\n\t"
++ "adcq 7*8(%[src]),%[res]\n\t"
++ "adcq $0,%[res]"
++ : [res] "+r" (temp64)
++ : [src] "r" (buff)
++ : "memory");
++ buff += 64;
++ len -= 64;
++ }
++
++ if (len & 32) {
++ asm("addq 0*8(%[src]),%[res]\n\t"
++ "adcq 1*8(%[src]),%[res]\n\t"
++ "adcq 2*8(%[src]),%[res]\n\t"
++ "adcq 3*8(%[src]),%[res]\n\t"
++ "adcq $0,%[res]"
++ : [res] "+r" (temp64)
++ : [src] "r" (buff)
++ : "memory");
++ buff += 32;
++ }
++ if (len & 16) {
++ asm("addq 0*8(%[src]),%[res]\n\t"
++ "adcq 1*8(%[src]),%[res]\n\t"
++ "adcq $0,%[res]"
++ : [res] "+r" (temp64)
++ : [src] "r" (buff)
++ : "memory");
++ buff += 16;
++ }
++ if (len & 8) {
++ asm("addq 0*8(%[src]),%[res]\n\t"
++ "adcq $0,%[res]"
++ : [res] "+r" (temp64)
++ : [src] "r" (buff)
++ : "memory");
++ buff += 8;
++ }
++ if (len & 7) {
++ unsigned int shift = (8 - (len & 7)) * 8;
++ unsigned long trail;
++
++ trail = (load_unaligned_zeropad(buff) << shift) >> shift;
++
++ asm("addq %[trail],%[res]\n\t"
++ "adcq $0,%[res]"
++ : [res] "+r" (temp64)
++ : [trail] "r" (trail));
++ }
++ result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
++ if (unlikely(odd)) {
++ result = from32to16(result);
++ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
++ }
++ return (__force __wsum)result;
+ }
+ EXPORT_SYMBOL(csum_partial);
+
+@@ -147,4 +120,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
+ return csum_fold(csum_partial(buff,len,0));
+ }
+ EXPORT_SYMBOL(ip_compute_csum);
+-
+--
+2.34.0
+
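
The patch above replaces do_csum()'s nested word-size descent with one 64-byte main loop followed by a bisected remainder (32, 16, 8, then a masked 0-7 byte tail). For readers who want to experiment with that structure outside the kernel, here is a minimal portable-C sketch. It is not the kernel implementation: add-with-carry is modeled with a 128-bit accumulator instead of addq/adcq inline asm, memcpy() stands in for load_unaligned_zeropad(), and the odd-start byte-swap fixup is omitted. All names (csum_sketch, fold_to_32, csum_ref, fold_to_16) are made up for this sketch, which assumes a little-endian target and a GCC/Clang-style unsigned __int128.

/* Portable-C sketch of the patch's bisected checksum walk.
 * NOT the kernel code: carries are collected in the high half of a
 * 128-bit accumulator and folded back at the end, and memcpy()
 * replaces load_unaligned_zeropad(). Little-endian only. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef unsigned __int128 u128;

/* Fold with end-around carry until the sum fits in 32 bits,
 * mirroring add32_with_carry() in the kernel source. */
static uint32_t fold_to_32(u128 acc)
{
	while (acc >> 32)
		acc = (acc & 0xffffffff) + (acc >> 32);
	return (uint32_t)acc;
}

static uint32_t csum_sketch(const void *buff, size_t len, uint32_t sum)
{
	const uint8_t *p = buff;
	u128 acc = sum;
	uint64_t w;

	while (len >= 64) {			/* main loop: 64-byte blocks */
		uint64_t blk[8];
		memcpy(blk, p, 64);
		for (int i = 0; i < 8; i++)
			acc += blk[i];
		p += 64;
		len -= 64;
	}
	/* bisect the remainder: one test each for 32, 16 and 8 */
	for (size_t step = 32; step >= 8; step >>= 1) {
		if (len & step) {
			for (size_t i = 0; i < step; i += 8) {
				memcpy(&w, p + i, 8);
				acc += w;
			}
			p += step;
		}
	}
	/* 0-7 trailing bytes, zero-extended into one u64 like the
	 * shifted load_unaligned_zeropad() in the patch */
	if (len & 7) {
		w = 0;
		memcpy(&w, p, len & 7);
		acc += w;
	}
	return fold_to_32(acc);
}

/* Naive reference: 16-bit little-endian words, end-around carry. */
static uint32_t csum_ref(const uint8_t *p, size_t len)
{
	u128 acc = 0;

	for (; len >= 2; p += 2, len -= 2)
		acc += (uint32_t)p[0] | ((uint32_t)p[1] << 8);
	if (len)
		acc += p[0];
	return fold_to_32(acc);
}

/* Both groupings agree mod 2^16 - 1, i.e. after folding to 16 bits. */
static uint16_t fold_to_16(uint32_t s)
{
	while (s >> 16)
		s = (s & 0xffff) + (s >> 16);
	return (uint16_t)s;
}

int main(void)
{
	uint8_t buf[128];
	size_t lens[] = { 0, 1, 7, 8, 40, 45, 64, 100, 128 };

	for (size_t i = 0; i < sizeof(buf); i++)
		buf[i] = (uint8_t)(i * 7 + 3);
	for (size_t i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		assert(fold_to_16(csum_sketch(buf, lens[i], 0)) ==
		       fold_to_16(csum_ref(buf, lens[i])));
	puts("sketch matches 16-bit reference");
	return 0;
}

The 128-bit accumulator is the portable stand-in for the asm's carry chain: every carry out of the low 64 bits lands in the high half and is folded back in at the end, which is what the trailing "adcq $0" does after each block in the patch.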
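One detail worth isolating is the 0-7 byte tail. Rather than looping byte by byte, the patch loads a full u64 with load_unaligned_zeropad() (which tolerates reading past the mapping boundary) and clears the bytes past the end with a double shift: shift = (8 - (len & 7)) * 8, then (x << shift) >> shift. A tiny stand-alone demonstration of that masking arithmetic, using an ordinary in-bounds load in place of the kernel helper (little-endian assumed):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* eight in-bounds bytes; pretend only the first three are valid */
	uint8_t bytes[8] = { 0x11, 0x22, 0x33, 0xaa, 0xbb, 0xcc, 0xdd, 0xee };
	unsigned int tail  = 3;			/* len & 7 in the patch */
	unsigned int shift = (8 - tail) * 8;	/* bits to discard: 40 */
	uint64_t x;

	memcpy(&x, bytes, 8);		/* stand-in for load_unaligned_zeropad() */
	x = (x << shift) >> shift;	/* keep only the low 'tail' bytes */
	printf("0x%016llx\n", (unsigned long long)x);	/* 0x0000000000332211 */
	return 0;
}

On little-endian x86 the low-order bytes of the u64 are the first bytes in memory, so shifting left by the unused byte count and back right zeroes exactly the bytes beyond the buffer tail before they are added into the checksum.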