-rw-r--r--  .SRCINFO                                      |   2 +
-rw-r--r--  PKGBUILD                                      |   4 +
-rw-r--r--  x86-csum-Rewrite-optimize-csum_partial.patch  | 269 +++
3 files changed, 275 insertions(+), 0 deletions(-)
diff --git a/.SRCINFO b/.SRCINFO
@@ -31,6 +31,7 @@ pkgbase = linux-xanmod-rog
 	source = zstd-udpate-fixes.patch
 	source = x86-ACPI-State-Optimize-C3-entry-on-AMD-CPUs.patch
 	source = x86-change-default-to-spec_store_bypass_disable-prct.patch
+	source = x86-csum-Rewrite-optimize-csum_partial.patch
 	source = acpi-battery-Always-read-fresh-battery-state-on-update.patch
 	source = cfg80211-dont-WARN-if-a-self-managed-device.patch
 	source = HID-asus-Reduce-object-size-by-consolidating-calls.patch
@@ -64,6 +65,7 @@ pkgbase = linux-xanmod-rog
 	sha256sums = d636bd74a71b2d898b20246e3c013b853fd1a462ed622e7e90302d53b4157428
 	sha256sums = 923230ed8367e28adfdeed75d3cdba9eec6b781818c37f6f3d3eb64101d2e716
 	sha256sums = cc401107f1bf7b7d8e8a78ee594f9db4b6fa252b7239b6aa88f678aef84d935c
+	sha256sums = 261807a9bc838709bd04e65a83eba2fefd8554699e5dfc8da9a1ee8499807813
 	sha256sums = f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90
 	sha256sums = 3d8961438b5c8110588ff0b881d472fc71a4304d306808d78a4055a4150f351e
 	sha256sums = 544464bf0807b324120767d55867f03014a9fda4e1804768ca341be902d7ade4
diff --git a/PKGBUILD b/PKGBUILD
@@ -146,6 +146,9 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar
   # 5.16 spectre defaults
   "x86-change-default-to-spec_store_bypass_disable-prct.patch"
 
+  # 5.17 TCP csum optimization
+  "x86-csum-Rewrite-optimize-csum_partial.patch"
+
   # -- patch from Chromium developers; more accurately report battery state changes
   "acpi-battery-Always-read-fresh-battery-state-on-update.patch"
 
@@ -194,6 +197,7 @@ sha256sums=('57b2cf6991910e3b67a1b3490022e8a0674b6965c74c12da1e99d138d1991ee8'
             'd636bd74a71b2d898b20246e3c013b853fd1a462ed622e7e90302d53b4157428'
             '923230ed8367e28adfdeed75d3cdba9eec6b781818c37f6f3d3eb64101d2e716'
             'cc401107f1bf7b7d8e8a78ee594f9db4b6fa252b7239b6aa88f678aef84d935c'
+            '261807a9bc838709bd04e65a83eba2fefd8554699e5dfc8da9a1ee8499807813'
             'f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90'
             '3d8961438b5c8110588ff0b881d472fc71a4304d306808d78a4055a4150f351e'
             '544464bf0807b324120767d55867f03014a9fda4e1804768ca341be902d7ade4'
diff --git a/x86-csum-Rewrite-optimize-csum_partial.patch b/x86-csum-Rewrite-optimize-csum_partial.patch
new file mode 100644
index 000000000000..70a8faa9eec8
--- /dev/null
+++ b/x86-csum-Rewrite-optimize-csum_partial.patch
@@ -0,0 +1,269 @@
+From 81fd3a6fc35b5ac7669459569d918b6f8e29a97f Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 12 Nov 2021 08:19:50 -0800
+Subject: [PATCH] x86/csum: Rewrite/optimize csum_partial()
+
+With more NICs supporting CHECKSUM_COMPLETE and IPv6 being widely
+used, csum_partial() is heavily called on small numbers of bytes and
+consumes many cycles.
+
+The IPv6 header, for instance, is 40 bytes.
+
+Another thing to consider is that NET_IP_ALIGN is 0 on x86, meaning
+that network headers are not word-aligned, unless the driver forces
+this.
+
+This means that csum_partial() fetches one u16 to 'align the buffer',
+then performs three u64 additions with carry in a loop, then a
+remaining u32, then a remaining u16.
+
+With this new version, we loop only over the 64-byte blocks; the
+remainder is then bisected.
+
+Tested on various CPUs; all of them show a big reduction in
+csum_partial() cost (by 50 to 80%).
+
+Before:
+    4.16%  [kernel]  [k] csum_partial
+After:
+    0.83%  [kernel]  [k] csum_partial
+
+If run in a loop 1,000,000 times:
+
+Before:
+     26,922,913      cycles          # 3846130.429 GHz
+     80,302,961      instructions    # 2.98 insn per cycle
+     21,059,816      branches        # 3008545142.857 M/sec
+          2,896      branch-misses   # 0.01% of all branches
+After:
+     17,960,709      cycles          # 3592141.800 GHz
+     41,292,805      instructions    # 2.30 insn per cycle
+     11,058,119      branches        # 2211623800.000 M/sec
+          2,997      branch-misses   # 0.03% of all branches
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
+Link: https://lore.kernel.org/r/20211112161950.528886-1-eric.dumazet@gmail.com
+---
+ arch/x86/lib/csum-partial_64.c | 184 ++++++++++++++-------------------
+ 1 file changed, 78 insertions(+), 106 deletions(-)
+
+diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
+index e7925d668b68..5ec35626945b 100644
+--- a/arch/x86/lib/csum-partial_64.c
++++ b/arch/x86/lib/csum-partial_64.c
+@@ -9,6 +9,7 @@
+ #include <linux/compiler.h>
+ #include <linux/export.h>
+ #include <asm/checksum.h>
++#include <asm/word-at-a-time.h>
+ 
+ static inline unsigned short from32to16(unsigned a)
+ {
+@@ -21,120 +22,92 @@ static inline unsigned short from32to16(unsigned a)
+ }
+ 
+ /*
+- * Do a 64-bit checksum on an arbitrary memory area.
++ * Do a checksum on an arbitrary memory area.
+  * Returns a 32bit checksum.
+  *
+  * This isn't as time critical as it used to be because many NICs
+  * do hardware checksumming these days.
+- *
+- * Things tried and found to not make it faster:
+- * Manual Prefetching
+- * Unrolling to an 128 bytes inner loop.
+- * Using interleaving with more registers to break the carry chains.
+- */
+-static unsigned do_csum(const unsigned char *buff, unsigned len)
+-{
+-	unsigned odd, count;
+-	unsigned long result = 0;
+-
+-	if (unlikely(len == 0))
+-		return result;
+-	odd = 1 & (unsigned long) buff;
+-	if (unlikely(odd)) {
+-		result = *buff << 8;
+-		len--;
+-		buff++;
+-	}
+-	count = len >> 1;		/* nr of 16-bit words.. */
+-	if (count) {
+-		if (2 & (unsigned long) buff) {
+-			result += *(unsigned short *)buff;
+-			count--;
+-			len -= 2;
+-			buff += 2;
+-		}
+-		count >>= 1;		/* nr of 32-bit words.. */
+-		if (count) {
+-			unsigned long zero;
+-			unsigned count64;
+-			if (4 & (unsigned long) buff) {
+-				result += *(unsigned int *) buff;
+-				count--;
+-				len -= 4;
+-				buff += 4;
+-			}
+-			count >>= 1;	/* nr of 64-bit words.. */
+-
+-			/* main loop using 64byte blocks */
+-			zero = 0;
+-			count64 = count >> 3;
+-			while (count64) {
+-				asm("addq 0*8(%[src]),%[res]\n\t"
+-				    "adcq 1*8(%[src]),%[res]\n\t"
+-				    "adcq 2*8(%[src]),%[res]\n\t"
+-				    "adcq 3*8(%[src]),%[res]\n\t"
+-				    "adcq 4*8(%[src]),%[res]\n\t"
+-				    "adcq 5*8(%[src]),%[res]\n\t"
+-				    "adcq 6*8(%[src]),%[res]\n\t"
+-				    "adcq 7*8(%[src]),%[res]\n\t"
+-				    "adcq %[zero],%[res]"
+-				    : [res] "=r" (result)
+-				    : [src] "r" (buff), [zero] "r" (zero),
+-				    "[res]" (result));
+-				buff += 64;
+-				count64--;
+-			}
+-
+-			/* last up to 7 8byte blocks */
+-			count %= 8;
+-			while (count) {
+-				asm("addq %1,%0\n\t"
+-				    "adcq %2,%0\n"
+-				    : "=r" (result)
+-				    : "m" (*(unsigned long *)buff),
+-				    "r" (zero), "0" (result));
+-				--count;
+-				buff += 8;
+-			}
+-			result = add32_with_carry(result>>32,
+-						  result&0xffffffff);
+-
+-			if (len & 4) {
+-				result += *(unsigned int *) buff;
+-				buff += 4;
+-			}
+-		}
+-		if (len & 2) {
+-			result += *(unsigned short *) buff;
+-			buff += 2;
+-		}
+-	}
+-	if (len & 1)
+-		result += *buff;
+-	result = add32_with_carry(result>>32, result & 0xffffffff);
+-	if (unlikely(odd)) {
+-		result = from32to16(result);
+-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+-	}
+-	return result;
+-}
+-
+-/*
+- * computes the checksum of a memory block at buff, length len,
+- * and adds in "sum" (32-bit)
+- *
+- * returns a 32-bit number suitable for feeding into itself
+- * or csum_tcpudp_magic
+- *
+- * this function must be called with even lengths, except
+- * for the last fragment, which may be odd
+  *
++ * Still, with CHECKSUM_COMPLETE this is called to compute
++ * checksums on IPv6 headers (40 bytes) and other small parts.
+  * it's best to have buff aligned on a 64-bit boundary
+  */
+ __wsum csum_partial(const void *buff, int len, __wsum sum)
+ {
+-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
+-						(__force u32)sum);
++	u64 temp64 = (__force u64)sum;
++	unsigned odd, result;
++
++	odd = 1 & (unsigned long) buff;
++	if (unlikely(odd)) {
++		if (unlikely(len == 0))
++			return sum;
++		temp64 += (*(unsigned char *)buff << 8);
++		len--;
++		buff++;
++	}
++
++	while (unlikely(len >= 64)) {
++		asm("addq 0*8(%[src]),%[res]\n\t"
++		    "adcq 1*8(%[src]),%[res]\n\t"
++		    "adcq 2*8(%[src]),%[res]\n\t"
++		    "adcq 3*8(%[src]),%[res]\n\t"
++		    "adcq 4*8(%[src]),%[res]\n\t"
++		    "adcq 5*8(%[src]),%[res]\n\t"
++		    "adcq 6*8(%[src]),%[res]\n\t"
++		    "adcq 7*8(%[src]),%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [src] "r" (buff)
++		    : "memory");
++		buff += 64;
++		len -= 64;
++	}
++
++	if (len & 32) {
++		asm("addq 0*8(%[src]),%[res]\n\t"
++		    "adcq 1*8(%[src]),%[res]\n\t"
++		    "adcq 2*8(%[src]),%[res]\n\t"
++		    "adcq 3*8(%[src]),%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [src] "r" (buff)
++		    : "memory");
++		buff += 32;
++	}
++	if (len & 16) {
++		asm("addq 0*8(%[src]),%[res]\n\t"
++		    "adcq 1*8(%[src]),%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [src] "r" (buff)
++		    : "memory");
++		buff += 16;
++	}
++	if (len & 8) {
++		asm("addq 0*8(%[src]),%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [src] "r" (buff)
++		    : "memory");
++		buff += 8;
++	}
++	if (len & 7) {
++		unsigned int shift = (8 - (len & 7)) * 8;
++		unsigned long trail;
++
++		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
++
++		asm("addq %[trail],%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [trail] "r" (trail));
++	}
++	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
++	if (unlikely(odd)) {
++		result = from32to16(result);
++		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
++	}
++	return (__force __wsum)result;
+ }
+ EXPORT_SYMBOL(csum_partial);
+ 
+@@ -147,4 +120,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
+ 	return csum_fold(csum_partial(buff,len,0));
+ }
+ EXPORT_SYMBOL(ip_compute_csum);
+-
+-- 
+2.34.0
+
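The inline assembly in the patch obscures the control flow, so here is a short
standalone C sketch of the same bisection scheme — a minimal illustration, not
the kernel code. It assumes little-endian byte order; addc64(), load64() and
csum_sketch() are hypothetical names; the carry is folded back after every
addition instead of once per adc chain (equivalent in one's-complement
arithmetic); memcpy() stands in for load_unaligned_zeropad(); and the
odd-address byte-swap fix-up of the real csum_partial() is omitted.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* One 64-bit one's-complement add: like "addq; adcq $0",
	 * the carry-out is folded back into the sum. */
	static uint64_t addc64(uint64_t a, uint64_t b)
	{
		uint64_t s = a + b;
		return s + (s < a);	/* s < a detects the carry-out */
	}

	/* Unaligned 64-bit little-endian load. */
	static uint64_t load64(const unsigned char *p)
	{
		uint64_t v;
		memcpy(&v, p, 8);
		return v;
	}

	/* Sketch of the new flow: a loop only for 64-byte blocks,
	 * then the remainder bisected as 32, 16, 8, and 1..7 bytes. */
	static uint32_t csum_sketch(const unsigned char *buff, size_t len,
				    uint32_t sum)
	{
		uint64_t t = sum;
		int i;

		while (len >= 64) {	/* main loop: 8 adds per block */
			for (i = 0; i < 8; i++)
				t = addc64(t, load64(buff + 8 * i));
			buff += 64;
			len -= 64;
		}
		if (len & 32) {
			for (i = 0; i < 4; i++)
				t = addc64(t, load64(buff + 8 * i));
			buff += 32;
		}
		if (len & 16) {
			t = addc64(t, load64(buff));
			t = addc64(t, load64(buff + 8));
			buff += 16;
		}
		if (len & 8) {
			t = addc64(t, load64(buff));
			buff += 8;
		}
		if (len & 7) {		/* trailing 1..7 bytes, zero-padded */
			uint64_t trail = 0;

			/* stand-in for load_unaligned_zeropad() + shifts */
			memcpy(&trail, buff, len & 7);
			t = addc64(t, trail);
		}
		/* fold 64 -> 32 twice, like add32_with_carry() */
		t = (t >> 32) + (t & 0xffffffff);
		t = (t >> 32) + (t & 0xffffffff);
		return (uint32_t)t;
	}

The payoff of the bisection is that any remainder under 64 bytes is handled by
at most four predictable tests (32/16/8/tail) instead of a data-dependent loop.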
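Because one's-complement addition is associative across word sizes, summing
64-bit words and folding must agree with a naive 16-bit word sum for every
length, which makes the sketch easy to sanity-check. The driver below builds on
the hypothetical csum_sketch() above and is likewise illustrative, not kernel
API; fold16() mirrors the spirit of csum_fold() minus the final bit inversion.

	#include <stdio.h>

	/* Naive RFC 1071-style reference: little-endian 16-bit words,
	 * trailing odd byte zero-padded. */
	static uint32_t csum_ref(const unsigned char *p, size_t len)
	{
		uint32_t sum = 0;

		for (; len > 1; p += 2, len -= 2)
			sum += (uint32_t)p[0] | ((uint32_t)p[1] << 8);
		if (len)
			sum += p[0];
		return sum;
	}

	/* Fold a 32-bit partial sum to 16 bits with end-around carry. */
	static uint16_t fold16(uint32_t csum)
	{
		uint32_t t = (csum >> 16) + (csum & 0xffff);
		return (uint16_t)((t >> 16) + (t & 0xffff));
	}

	int main(void)
	{
		unsigned char buf[163];
		size_t i, len;

		for (i = 0; i < sizeof(buf); i++)
			buf[i] = (unsigned char)(i * 37 + 11);

		/* every length exercises a different bisection path */
		for (len = 0; len <= sizeof(buf); len++) {
			if (fold16(csum_sketch(buf, len, 0)) !=
			    fold16(csum_ref(buf, len))) {
				printf("mismatch at len %zu\n", len);
				return 1;
			}
		}
		printf("bisected sum matches the 16-bit reference\n");
		return 0;
	}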