-rw-r--r--  .SRCINFO                                      |   2 +
-rw-r--r--  PKGBUILD                                      |   4 +
-rw-r--r--  x86-csum-Rewrite-optimize-csum_partial.patch  | 269 +++
3 files changed, 275 insertions(+), 0 deletions(-)
diff --git a/.SRCINFO b/.SRCINFO
@@ -31,6 +31,7 @@ pkgbase = linux-xanmod-rog
 	source = zstd-udpate-fixes.patch
 	source = x86-ACPI-State-Optimize-C3-entry-on-AMD-CPUs.patch
 	source = x86-change-default-to-spec_store_bypass_disable-prct.patch
+	source = x86-csum-Rewrite-optimize-csum_partial.patch
 	source = acpi-battery-Always-read-fresh-battery-state-on-update.patch
 	source = cfg80211-dont-WARN-if-a-self-managed-device.patch
 	source = HID-asus-Reduce-object-size-by-consolidating-calls.patch
@@ -64,6 +65,7 @@ pkgbase = linux-xanmod-rog
 	sha256sums = d636bd74a71b2d898b20246e3c013b853fd1a462ed622e7e90302d53b4157428
 	sha256sums = 923230ed8367e28adfdeed75d3cdba9eec6b781818c37f6f3d3eb64101d2e716
 	sha256sums = cc401107f1bf7b7d8e8a78ee594f9db4b6fa252b7239b6aa88f678aef84d935c
+	sha256sums = 261807a9bc838709bd04e65a83eba2fefd8554699e5dfc8da9a1ee8499807813
 	sha256sums = f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90
 	sha256sums = 3d8961438b5c8110588ff0b881d472fc71a4304d306808d78a4055a4150f351e
 	sha256sums = 544464bf0807b324120767d55867f03014a9fda4e1804768ca341be902d7ade4
diff --git a/PKGBUILD b/PKGBUILD
@@ -146,6 +146,9 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar
   # 5.16 spectre defaults
   "x86-change-default-to-spec_store_bypass_disable-prct.patch"
 
+  # 5.17 TCP csum optimization
+  "x86-csum-Rewrite-optimize-csum_partial.patch"
+
   # -- patch from Chromium developers; more accurately report battery state changes
   "acpi-battery-Always-read-fresh-battery-state-on-update.patch"
 
@@ -194,6 +197,7 @@ sha256sums=('57b2cf6991910e3b67a1b3490022e8a0674b6965c74c12da1e99d138d1991ee8'
             'd636bd74a71b2d898b20246e3c013b853fd1a462ed622e7e90302d53b4157428'
             '923230ed8367e28adfdeed75d3cdba9eec6b781818c37f6f3d3eb64101d2e716'
             'cc401107f1bf7b7d8e8a78ee594f9db4b6fa252b7239b6aa88f678aef84d935c'
+            '261807a9bc838709bd04e65a83eba2fefd8554699e5dfc8da9a1ee8499807813'
             'f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90'
             '3d8961438b5c8110588ff0b881d472fc71a4304d306808d78a4055a4150f351e'
             '544464bf0807b324120767d55867f03014a9fda4e1804768ca341be902d7ade4'
diff --git a/x86-csum-Rewrite-optimize-csum_partial.patch b/x86-csum-Rewrite-optimize-csum_partial.patch
new file mode 100644
index 000000000000..70a8faa9eec8
--- /dev/null
+++ b/x86-csum-Rewrite-optimize-csum_partial.patch
@@ -0,0 +1,269 @@
+From 81fd3a6fc35b5ac7669459569d918b6f8e29a97f Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 12 Nov 2021 08:19:50 -0800
+Subject: [PATCH] x86/csum: Rewrite/optimize csum_partial()
+
+With more NICs supporting CHECKSUM_COMPLETE and IPv6 being widely
+used, csum_partial() is heavily called on small numbers of bytes and
+consumes many cycles.
+
+The IPv6 header, for instance, is 40 bytes.
+
+Another thing to consider is that NET_IP_ALIGN is 0 on x86, meaning
+that network headers are not word-aligned, unless the driver forces
+this.
+
+This means that csum_partial() fetches one u16 to 'align the buffer',
+then performs three u64 additions with carry in a loop, then a
+remaining u32, then a remaining u16.
+
+With this new version, we loop only over the 64-byte blocks; the
+remainder is then bisected.
+
+Tested on various CPUs; all of them show a big reduction in
+csum_partial() cost (by 50 to 80%).
+
+Before:
+    4.16%  [kernel]  [k] csum_partial
+After:
+    0.83%  [kernel]  [k] csum_partial
+
+If run in a loop 1,000,000 times:
+
+Before:
+     26,922,913      cycles          # 3846130.429 GHz
+     80,302,961      instructions    # 2.98 insn per cycle
+     21,059,816      branches        # 3008545142.857 M/sec
+          2,896      branch-misses   # 0.01% of all branches
+After:
+     17,960,709      cycles          # 3592141.800 GHz
+     41,292,805      instructions    # 2.30 insn per cycle
+     11,058,119      branches        # 2211623800.000 M/sec
+          2,997      branch-misses   # 0.03% of all branches
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
+Link: https://lore.kernel.org/r/20211112161950.528886-1-eric.dumazet@gmail.com
+---
+ arch/x86/lib/csum-partial_64.c | 184 ++++++++++++++-------------------
+ 1 file changed, 78 insertions(+), 106 deletions(-)
+
+diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
+index e7925d668b68..5ec35626945b 100644
+--- a/arch/x86/lib/csum-partial_64.c
++++ b/arch/x86/lib/csum-partial_64.c
+@@ -9,6 +9,7 @@
+ #include <linux/compiler.h>
+ #include <linux/export.h>
+ #include <asm/checksum.h>
++#include <asm/word-at-a-time.h>
+ 
+ static inline unsigned short from32to16(unsigned a)
+ {
+@@ -21,120 +22,92 @@ static inline unsigned short from32to16(unsigned a)
+ }
+ 
+ /*
+- * Do a 64-bit checksum on an arbitrary memory area.
++ * Do a checksum on an arbitrary memory area.
+  * Returns a 32bit checksum.
+  *
+  * This isn't as time critical as it used to be because many NICs
+  * do hardware checksumming these days.
+- *
+- * Things tried and found to not make it faster:
+- * Manual Prefetching
+- * Unrolling to an 128 bytes inner loop.
+- * Using interleaving with more registers to break the carry chains.
+- */
+-static unsigned do_csum(const unsigned char *buff, unsigned len)
+-{
+-	unsigned odd, count;
+-	unsigned long result = 0;
+-
+-	if (unlikely(len == 0))
+-		return result;
+-	odd = 1 & (unsigned long) buff;
+-	if (unlikely(odd)) {
+-		result = *buff << 8;
+-		len--;
+-		buff++;
+-	}
+-	count = len >> 1;		/* nr of 16-bit words.. */
+-	if (count) {
+-		if (2 & (unsigned long) buff) {
+-			result += *(unsigned short *)buff;
+-			count--;
+-			len -= 2;
+-			buff += 2;
+-		}
+-		count >>= 1;		/* nr of 32-bit words.. */
+-		if (count) {
+-			unsigned long zero;
+-			unsigned count64;
+-			if (4 & (unsigned long) buff) {
+-				result += *(unsigned int *) buff;
+-				count--;
+-				len -= 4;
+-				buff += 4;
+-			}
+-			count >>= 1;	/* nr of 64-bit words.. */
+-
+-			/* main loop using 64byte blocks */
+-			zero = 0;
+-			count64 = count >> 3;
+-			while (count64) {
+-				asm("addq 0*8(%[src]),%[res]\n\t"
+-				    "adcq 1*8(%[src]),%[res]\n\t"
+-				    "adcq 2*8(%[src]),%[res]\n\t"
+-				    "adcq 3*8(%[src]),%[res]\n\t"
+-				    "adcq 4*8(%[src]),%[res]\n\t"
+-				    "adcq 5*8(%[src]),%[res]\n\t"
+-				    "adcq 6*8(%[src]),%[res]\n\t"
+-				    "adcq 7*8(%[src]),%[res]\n\t"
+-				    "adcq %[zero],%[res]"
+-				    : [res] "=r" (result)
+-				    : [src] "r" (buff), [zero] "r" (zero),
+-				    "[res]" (result));
+-				buff += 64;
+-				count64--;
+-			}
+-
+-			/* last up to 7 8byte blocks */
+-			count %= 8;
+-			while (count) {
+-				asm("addq %1,%0\n\t"
+-				    "adcq %2,%0\n"
+-				    : "=r" (result)
+-				    : "m" (*(unsigned long *)buff),
+-				    "r" (zero), "0" (result));
+-				--count;
+-				buff += 8;
+-			}
+-			result = add32_with_carry(result>>32,
+-						  result&0xffffffff);
+-
+-			if (len & 4) {
+-				result += *(unsigned int *) buff;
+-				buff += 4;
+-			}
+-		}
+-		if (len & 2) {
+-			result += *(unsigned short *) buff;
+-			buff += 2;
+-		}
+-	}
+-	if (len & 1)
+-		result += *buff;
+-	result = add32_with_carry(result>>32, result & 0xffffffff);
+-	if (unlikely(odd)) {
+-		result = from32to16(result);
+-		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+-	}
+-	return result;
+-}
+-
+-/*
+- * computes the checksum of a memory block at buff, length len,
+- * and adds in "sum" (32-bit)
+- *
+- * returns a 32-bit number suitable for feeding into itself
+- * or csum_tcpudp_magic
+- *
+- * this function must be called with even lengths, except
+- * for the last fragment, which may be odd
+  *
++ * Still, with CHECKSUM_COMPLETE this is called to compute
++ * checksums on IPv6 headers (40 bytes) and other small parts.
+  * it's best to have buff aligned on a 64-bit boundary
+  */
+ __wsum csum_partial(const void *buff, int len, __wsum sum)
+ {
+-	return (__force __wsum)add32_with_carry(do_csum(buff, len),
+-						(__force u32)sum);
++	u64 temp64 = (__force u64)sum;
++	unsigned odd, result;
++
++	odd = 1 & (unsigned long) buff;
++	if (unlikely(odd)) {
++		if (unlikely(len == 0))
++			return sum;
++		temp64 += (*(unsigned char *)buff << 8);
++		len--;
++		buff++;
++	}
++
++	while (unlikely(len >= 64)) {
++		asm("addq 0*8(%[src]),%[res]\n\t"
++		    "adcq 1*8(%[src]),%[res]\n\t"
++		    "adcq 2*8(%[src]),%[res]\n\t"
++		    "adcq 3*8(%[src]),%[res]\n\t"
++		    "adcq 4*8(%[src]),%[res]\n\t"
++		    "adcq 5*8(%[src]),%[res]\n\t"
++		    "adcq 6*8(%[src]),%[res]\n\t"
++		    "adcq 7*8(%[src]),%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [src] "r" (buff)
++		    : "memory");
++		buff += 64;
++		len -= 64;
++	}
++
++	if (len & 32) {
++		asm("addq 0*8(%[src]),%[res]\n\t"
++		    "adcq 1*8(%[src]),%[res]\n\t"
++		    "adcq 2*8(%[src]),%[res]\n\t"
++		    "adcq 3*8(%[src]),%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [src] "r" (buff)
++		    : "memory");
++		buff += 32;
++	}
++	if (len & 16) {
++		asm("addq 0*8(%[src]),%[res]\n\t"
++		    "adcq 1*8(%[src]),%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [src] "r" (buff)
++		    : "memory");
++		buff += 16;
++	}
++	if (len & 8) {
++		asm("addq 0*8(%[src]),%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [src] "r" (buff)
++		    : "memory");
++		buff += 8;
++	}
++	if (len & 7) {
++		unsigned int shift = (8 - (len & 7)) * 8;
++		unsigned long trail;
++
++		trail = (load_unaligned_zeropad(buff) << shift) >> shift;
++
++		asm("addq %[trail],%[res]\n\t"
++		    "adcq $0,%[res]"
++		    : [res] "+r" (temp64)
++		    : [trail] "r" (trail));
++	}
++	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
++	if (unlikely(odd)) {
++		result = from32to16(result);
++		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
++	}
++	return (__force __wsum)result;
+ }
+ EXPORT_SYMBOL(csum_partial);
+ 
+@@ -147,4 +120,3 @@ __sum16 ip_compute_csum(const void *buff, int len)
+ 	return csum_fold(csum_partial(buff,len,0));
+ }
+ EXPORT_SYMBOL(ip_compute_csum);
+-
+-- 
+2.34.0
+
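The inline assembly in the patch obscures the control flow, so here is a short
standalone C sketch of the same bisection scheme — a minimal illustration, not
the kernel code. It assumes little-endian byte order; addc64(), load64() and
csum_sketch() are hypothetical names; the carry is folded back after every
addition instead of once per adc chain (equivalent in one's-complement
arithmetic); memcpy() stands in for load_unaligned_zeropad(); and the
odd-address byte-swap fix-up of the real csum_partial() is omitted.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* One 64-bit one's-complement add: like "addq; adcq $0",
	 * the carry-out is folded back into the sum. */
	static uint64_t addc64(uint64_t a, uint64_t b)
	{
		uint64_t s = a + b;
		return s + (s < a);	/* s < a detects the carry-out */
	}

	/* Unaligned 64-bit little-endian load. */
	static uint64_t load64(const unsigned char *p)
	{
		uint64_t v;
		memcpy(&v, p, 8);
		return v;
	}

	/* Sketch of the new flow: a loop only for 64-byte blocks,
	 * then the remainder bisected as 32, 16, 8, and 1..7 bytes. */
	static uint32_t csum_sketch(const unsigned char *buff, size_t len,
				    uint32_t sum)
	{
		uint64_t t = sum;
		int i;

		while (len >= 64) {	/* main loop: 8 adds per block */
			for (i = 0; i < 8; i++)
				t = addc64(t, load64(buff + 8 * i));
			buff += 64;
			len -= 64;
		}
		if (len & 32) {
			for (i = 0; i < 4; i++)
				t = addc64(t, load64(buff + 8 * i));
			buff += 32;
		}
		if (len & 16) {
			t = addc64(t, load64(buff));
			t = addc64(t, load64(buff + 8));
			buff += 16;
		}
		if (len & 8) {
			t = addc64(t, load64(buff));
			buff += 8;
		}
		if (len & 7) {		/* trailing 1..7 bytes, zero-padded */
			uint64_t trail = 0;

			/* stand-in for load_unaligned_zeropad() + shifts */
			memcpy(&trail, buff, len & 7);
			t = addc64(t, trail);
		}
		/* fold 64 -> 32 twice, like add32_with_carry() */
		t = (t >> 32) + (t & 0xffffffff);
		t = (t >> 32) + (t & 0xffffffff);
		return (uint32_t)t;
	}

The payoff of the bisection is that any remainder under 64 bytes is handled by
at most four predictable tests (32/16/8/tail) instead of a data-dependent loop.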
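Because one's-complement addition is associative across word sizes, summing
64-bit words and folding must agree with a naive 16-bit word sum for every
length, which makes the sketch easy to sanity-check. The driver below builds on
the hypothetical csum_sketch() above and is likewise illustrative, not kernel
API; fold16() mirrors the spirit of csum_fold() minus the final bit inversion.

	#include <stdio.h>

	/* Naive RFC 1071-style reference: little-endian 16-bit words,
	 * trailing odd byte zero-padded. */
	static uint32_t csum_ref(const unsigned char *p, size_t len)
	{
		uint32_t sum = 0;

		for (; len > 1; p += 2, len -= 2)
			sum += (uint32_t)p[0] | ((uint32_t)p[1] << 8);
		if (len)
			sum += p[0];
		return sum;
	}

	/* Fold a 32-bit partial sum to 16 bits with end-around carry. */
	static uint16_t fold16(uint32_t csum)
	{
		uint32_t t = (csum >> 16) + (csum & 0xffff);
		return (uint16_t)((t >> 16) + (t & 0xffff));
	}

	int main(void)
	{
		unsigned char buf[163];
		size_t i, len;

		for (i = 0; i < sizeof(buf); i++)
			buf[i] = (unsigned char)(i * 37 + 11);

		/* every length exercises a different bisection path */
		for (len = 0; len <= sizeof(buf); len++) {
			if (fold16(csum_sketch(buf, len, 0)) !=
			    fold16(csum_ref(buf, len))) {
				printf("mismatch at len %zu\n", len);
				return 1;
			}
		}
		printf("bisected sum matches the 16-bit reference\n");
		return 0;
	}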