-rw-r--r--   .SRCINFO                                        2
-rw-r--r--   PKGBUILD                                        4
-rw-r--r--   xor-enable-auto-vectorization-in-Clang.patch 1485
3 files changed, 1491 insertions, 0 deletions
diff --git a/.SRCINFO b/.SRCINFO
index 0ca1d4266b75..bf4741ed40c5 100644
--- a/.SRCINFO
+++ b/.SRCINFO
@@ -28,6 +28,7 @@ pkgbase = linux-xanmod-rog
source = CONFIG_RCU_FAST_NO_HZ-removal-for-v5.17.patch
source = implement-threaded-console-printing.patch
source = Parallel-boot-v4-on-5.16.5.patch
+ source = xor-enable-auto-vectorization-in-Clang.patch
source = acpi-battery-Always-read-fresh-battery-state-on-update.patch
source = cfg80211-dont-WARN-if-a-self-managed-device.patch
source = HID-asus-Reduce-object-size-by-consolidating-calls.patch
@@ -54,6 +55,7 @@ pkgbase = linux-xanmod-rog
sha256sums = e74649fb883f2c8c3703c730e45119fca7126dc0599d6bc814de6dcf5a07e7cb
sha256sums = 81ce39928b98caf2be3cbeaf7f86305051523fea2ccad225d619bd3999d63ddf
sha256sums = ad73b179ea1a309205744fbf415a7b19108e1c1b452b913894752bb717165d81
+ sha256sums = 9fd097f24ddbc7fd3be031fb649b32645082580c0aa039c491bc5fe161c7a7ac
sha256sums = f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90
sha256sums = 3d8961438b5c8110588ff0b881d472fc71a4304d306808d78a4055a4150f351e
sha256sums = 544464bf0807b324120767d55867f03014a9fda4e1804768ca341be902d7ade4
diff --git a/PKGBUILD b/PKGBUILD
index 6745446a16d5..f98464615f9a 100644
--- a/PKGBUILD
+++ b/PKGBUILD
@@ -131,6 +131,9 @@ source=("https://cdn.kernel.org/pub/linux/kernel/v${_branch}/linux-${_major}.tar
# see: https://lore.kernel.org/lkml/20220201205328.123066-1-dwmw2@infradead.org/
"Parallel-boot-v4-on-5.16.5.patch"
+ # crypto-next: Clang auto-vectorization
+ "xor-enable-auto-vectorization-in-Clang.patch"
+
# -- patch from Chromium developers; more accurately report battery state changes
"acpi-battery-Always-read-fresh-battery-state-on-update.patch"
@@ -172,6 +175,7 @@ sha256sums=('027d7e8988bb69ac12ee92406c3be1fe13f990b1ca2249e226225cd1573308bb'
'e74649fb883f2c8c3703c730e45119fca7126dc0599d6bc814de6dcf5a07e7cb'
'81ce39928b98caf2be3cbeaf7f86305051523fea2ccad225d619bd3999d63ddf'
'ad73b179ea1a309205744fbf415a7b19108e1c1b452b913894752bb717165d81'
+ '9fd097f24ddbc7fd3be031fb649b32645082580c0aa039c491bc5fe161c7a7ac'
'f7a4bf6293912bfc4a20743e58a5a266be8c4dbe3c1862d196d3a3b45f2f7c90'
'3d8961438b5c8110588ff0b881d472fc71a4304d306808d78a4055a4150f351e'
'544464bf0807b324120767d55867f03014a9fda4e1804768ca341be902d7ade4'
diff --git a/xor-enable-auto-vectorization-in-Clang.patch b/xor-enable-auto-vectorization-in-Clang.patch
new file mode 100644
index 000000000000..00e6b0c20ef6
--- /dev/null
+++ b/xor-enable-auto-vectorization-in-Clang.patch
@@ -0,0 +1,1485 @@
+From c74c764e899bead1b71a7c9de4c1128f6d7c65bf Mon Sep 17 00:00:00 2001
+From: Scott B <arglebargle@arglebargle.dev>
+Date: Tue, 8 Feb 2022 15:31:24 -0800
+Subject: [PATCH] xor: enable auto-vectorization in Clang
+
+Message-Id: <20220205152346.237392-1-ardb@kernel.org>
+Squashed commit of the following:
+
+commit 20ed86ac97b7187cd825d0d1fcd6a15bf79b134e
+Author: Ard Biesheuvel <ardb@kernel.org>
+Date: Sat Feb 5 16:23:46 2022 +0100
+
+ crypto: arm/xor - make vectorized C code Clang-friendly
+
+ The ARM version of the accelerated XOR routines are simply the 8-way C
+ routines passed through the auto-vectorizer with SIMD codegen enabled.
+ This used to require GCC version 4.6 at least, but given that 5.1 is now
+ the baseline, this check is no longer necessary, and actually
+ misidentifies Clang as GCC < 4.6 as Clang defines the GCC major/minor as
+ well, but makes no attempt at doing this in a way that conveys feature
+ parity with a certain version of GCC (which would not be a great idea in
+ the first place).
+
+ So let's drop the version check, and make the auto-vectorize pragma
+ (which is based on a GCC-specific command line option) GCC-only. Since
+ Clang performs SIMD auto-vectorization by default at -O2, no pragma is
+ necessary here.
+
+ Tested-by: Nathan Chancellor <nathan@kernel.org>
+ Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+ Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
+ Link: https://github.com/ClangBuiltLinux/linux/issues/496
+ Link: https://github.com/ClangBuiltLinux/linux/issues/503
+
+commit 59e9aef814d268f029c92184c2beb1dbfbf0ae9e
+Author: Ard Biesheuvel <ardb@kernel.org>
+Date: Sat Feb 5 16:23:45 2022 +0100
+
+ lib/xor: make xor prototypes more friendly to compiler vectorization
+
+ Modern compilers are perfectly capable of extracting parallelism from
+ the XOR routines, provided that the prototypes reflect the nature of the
+ input accurately, in particular, the fact that the input vectors are
+ expected not to overlap. This is not documented explicitly, but is
+ implied by the interchangeability of the various C routines, some of
+ which use temporary variables while others don't: this means that these
+ routines only behave identically for non-overlapping inputs.
+
+ So let's decorate these input vectors with the __restrict modifier,
+ which informs the compiler that there is no overlap. While at it, make
+ the input-only vectors pointer-to-const as well.
+
+ Tested-by: Nathan Chancellor <nathan@kernel.org>
+ Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+ Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
+ Link: https://github.com/ClangBuiltLinux/linux/issues/563
+
+commit a16842e20fe29cdf325122b61ca0fd159510acbb
+Author: Ard Biesheuvel <ardb@kernel.org>
+Date: Mon Dec 13 15:02:52 2021 +0100
+
+ arm64/xor: use EOR3 instructions when available
+
+ Use the EOR3 instruction to implement xor_blocks() if the instruction is
+ available, which is the case if the CPU implements the SHA-3 extension.
+ This is about 20% faster on Apple M1 when using the 5-way version.
+
+ Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+ Link: https://lore.kernel.org/r/20211213140252.2856053-1-ardb@kernel.org
+ Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+---
+ arch/alpha/include/asm/xor.h | 53 +++++---
+ arch/arm/include/asm/xor.h | 42 ++++--
+ arch/arm/lib/xor-neon.c | 12 +-
+ arch/arm64/Kconfig | 6 +
+ arch/arm64/Makefile | 5 +
+ arch/arm64/include/asm/xor.h | 21 ++-
+ arch/arm64/lib/xor-neon.c | 177 +++++++++++++++++++++++--
+ arch/ia64/include/asm/xor.h | 21 ++-
+ arch/powerpc/include/asm/xor_altivec.h | 25 ++--
+ arch/powerpc/lib/xor_vmx.c | 28 ++--
+ arch/powerpc/lib/xor_vmx.h | 27 ++--
+ arch/powerpc/lib/xor_vmx_glue.c | 32 +++--
+ arch/s390/lib/xor.c | 21 ++-
+ arch/sparc/include/asm/xor_32.h | 21 ++-
+ arch/sparc/include/asm/xor_64.h | 42 ++++--
+ arch/x86/include/asm/xor.h | 42 ++++--
+ arch/x86/include/asm/xor_32.h | 42 ++++--
+ arch/x86/include/asm/xor_avx.h | 21 ++-
+ include/asm-generic/xor.h | 84 ++++++++----
+ include/linux/raid/xor.h | 21 ++-
+ 20 files changed, 533 insertions(+), 210 deletions(-)
+
+diff --git a/arch/alpha/include/asm/xor.h b/arch/alpha/include/asm/xor.h
+index 5aeb4fb3cb7c..e0de0c233ab9 100644
+--- a/arch/alpha/include/asm/xor.h
++++ b/arch/alpha/include/asm/xor.h
+@@ -5,24 +5,43 @@
+ * Optimized RAID-5 checksumming functions for alpha EV5 and EV6
+ */
+
+-extern void xor_alpha_2(unsigned long, unsigned long *, unsigned long *);
+-extern void xor_alpha_3(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *);
+-extern void xor_alpha_4(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *);
+-extern void xor_alpha_5(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *, unsigned long *);
++extern void
++xor_alpha_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2);
++extern void
++xor_alpha_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3);
++extern void
++xor_alpha_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4);
++extern void
++xor_alpha_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5);
+
+-extern void xor_alpha_prefetch_2(unsigned long, unsigned long *,
+- unsigned long *);
+-extern void xor_alpha_prefetch_3(unsigned long, unsigned long *,
+- unsigned long *, unsigned long *);
+-extern void xor_alpha_prefetch_4(unsigned long, unsigned long *,
+- unsigned long *, unsigned long *,
+- unsigned long *);
+-extern void xor_alpha_prefetch_5(unsigned long, unsigned long *,
+- unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *);
++extern void
++xor_alpha_prefetch_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2);
++extern void
++xor_alpha_prefetch_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3);
++extern void
++xor_alpha_prefetch_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4);
++extern void
++xor_alpha_prefetch_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5);
+
+ asm(" \n\
+ .text \n\
+diff --git a/arch/arm/include/asm/xor.h b/arch/arm/include/asm/xor.h
+index aefddec79286..669cad5194d3 100644
+--- a/arch/arm/include/asm/xor.h
++++ b/arch/arm/include/asm/xor.h
+@@ -44,7 +44,8 @@
+ : "0" (dst), "r" (a1), "r" (a2), "r" (a3), "r" (a4))
+
+ static void
+-xor_arm4regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_arm4regs_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ unsigned int lines = bytes / sizeof(unsigned long) / 4;
+ register unsigned int a1 __asm__("r4");
+@@ -64,8 +65,9 @@ xor_arm4regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_arm4regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_arm4regs_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ unsigned int lines = bytes / sizeof(unsigned long) / 4;
+ register unsigned int a1 __asm__("r4");
+@@ -86,8 +88,10 @@ xor_arm4regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_arm4regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_arm4regs_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ unsigned int lines = bytes / sizeof(unsigned long) / 2;
+ register unsigned int a1 __asm__("r8");
+@@ -105,8 +109,11 @@ xor_arm4regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_arm4regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_arm4regs_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ unsigned int lines = bytes / sizeof(unsigned long) / 2;
+ register unsigned int a1 __asm__("r8");
+@@ -146,7 +153,8 @@ static struct xor_block_template xor_block_arm4regs = {
+ extern struct xor_block_template const xor_block_neon_inner;
+
+ static void
+-xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ if (in_interrupt()) {
+ xor_arm4regs_2(bytes, p1, p2);
+@@ -158,8 +166,9 @@ xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ if (in_interrupt()) {
+ xor_arm4regs_3(bytes, p1, p2, p3);
+@@ -171,8 +180,10 @@ xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ if (in_interrupt()) {
+ xor_arm4regs_4(bytes, p1, p2, p3, p4);
+@@ -184,8 +195,11 @@ xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ if (in_interrupt()) {
+ xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
+diff --git a/arch/arm/lib/xor-neon.c b/arch/arm/lib/xor-neon.c
+index b99dd8e1c93f..522510baed49 100644
+--- a/arch/arm/lib/xor-neon.c
++++ b/arch/arm/lib/xor-neon.c
+@@ -17,17 +17,11 @@ MODULE_LICENSE("GPL");
+ /*
+ * Pull in the reference implementations while instructing GCC (through
+ * -ftree-vectorize) to attempt to exploit implicit parallelism and emit
+- * NEON instructions.
++ * NEON instructions. Clang does this by default at O2 so no pragma is
++ * needed.
+ */
+-#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
++#ifdef CONFIG_CC_IS_GCC
+ #pragma GCC optimize "tree-vectorize"
+-#else
+-/*
+- * While older versions of GCC do not generate incorrect code, they fail to
+- * recognize the parallel nature of these functions, and emit plain ARM code,
+- * which is known to be slower than the optimized ARM code in asm-arm/xor.h.
+- */
+-#warning This code requires at least version 4.6 of GCC
+ #endif
+
+ #pragma GCC diagnostic ignored "-Wunused-variable"
+diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
+index c4207cf9bb17..63d41ba4e716 100644
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -1545,6 +1545,12 @@ endmenu
+
+ menu "ARMv8.2 architectural features"
+
++config AS_HAS_ARMV8_2
++ def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
++
++config AS_HAS_SHA3
++ def_bool $(as-instr,.arch armv8.2-a+sha3)
++
+ config ARM64_PMEM
+ bool "Enable support for persistent memory"
+ select ARCH_HAS_PMEM_API
+diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
+index e8cfc5868aa8..2f1de88651e6 100644
+--- a/arch/arm64/Makefile
++++ b/arch/arm64/Makefile
+@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
+ include/generated/asm-offsets.h))
+ endif
+
++ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
++# make sure to pass the newest target architecture to -march.
++asm-arch := armv8.2-a
++endif
++
+ # Ensure that if the compiler supports branch protection we default it
+ # off, this will be overridden if we are using branch protection.
+ branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
+diff --git a/arch/arm64/include/asm/xor.h b/arch/arm64/include/asm/xor.h
+index 947f6a4f1aa0..befcd8a7abc9 100644
+--- a/arch/arm64/include/asm/xor.h
++++ b/arch/arm64/include/asm/xor.h
+@@ -16,7 +16,8 @@
+ extern struct xor_block_template const xor_block_inner_neon;
+
+ static void
+-xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ kernel_neon_begin();
+ xor_block_inner_neon.do_2(bytes, p1, p2);
+@@ -24,8 +25,9 @@ xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ kernel_neon_begin();
+ xor_block_inner_neon.do_3(bytes, p1, p2, p3);
+@@ -33,8 +35,10 @@ xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ kernel_neon_begin();
+ xor_block_inner_neon.do_4(bytes, p1, p2, p3, p4);
+@@ -42,8 +46,11 @@ xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ kernel_neon_begin();
+ xor_block_inner_neon.do_5(bytes, p1, p2, p3, p4, p5);
+diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
+index 11bf4f8aca68..96b171995d19 100644
+--- a/arch/arm64/lib/xor-neon.c
++++ b/arch/arm64/lib/xor-neon.c
+@@ -10,8 +10,8 @@
+ #include <linux/module.h>
+ #include <asm/neon-intrinsics.h>
+
+-void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
+- unsigned long *p2)
++void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+@@ -37,8 +37,9 @@ void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
+ } while (--lines > 0);
+ }
+
+-void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
+- unsigned long *p2, unsigned long *p3)
++void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+@@ -72,8 +73,10 @@ void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
+ } while (--lines > 0);
+ }
+
+-void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
+- unsigned long *p2, unsigned long *p3, unsigned long *p4)
++void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+@@ -115,9 +118,11 @@ void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
+ } while (--lines > 0);
+ }
+
+-void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
+- unsigned long *p2, unsigned long *p3,
+- unsigned long *p4, unsigned long *p5)
++void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ uint64_t *dp1 = (uint64_t *)p1;
+ uint64_t *dp2 = (uint64_t *)p2;
+@@ -167,7 +172,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
+ } while (--lines > 0);
+ }
+
+-struct xor_block_template const xor_block_inner_neon = {
++struct xor_block_template xor_block_inner_neon __ro_after_init = {
+ .name = "__inner_neon__",
+ .do_2 = xor_arm64_neon_2,
+ .do_3 = xor_arm64_neon_3,
+@@ -176,6 +181,158 @@ struct xor_block_template const xor_block_inner_neon = {
+ };
+ EXPORT_SYMBOL(xor_block_inner_neon);
+
++static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
++{
++ uint64x2_t res;
++
++ asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
++ "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
++ : "=w"(res) : "w"(p), "w"(q), "w"(r));
++ return res;
++}
++
++static void xor_arm64_eor3_3(unsigned long bytes,
++ unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
++{
++ uint64_t *dp1 = (uint64_t *)p1;
++ uint64_t *dp2 = (uint64_t *)p2;
++ uint64_t *dp3 = (uint64_t *)p3;
++
++ register uint64x2_t v0, v1, v2, v3;
++ long lines = bytes / (sizeof(uint64x2_t) * 4);
++
++ do {
++ /* p1 ^= p2 ^ p3 */
++ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
++ vld1q_u64(dp3 + 0));
++ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
++ vld1q_u64(dp3 + 2));
++ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
++ vld1q_u64(dp3 + 4));
++ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
++ vld1q_u64(dp3 + 6));
++
++ /* store */
++ vst1q_u64(dp1 + 0, v0);
++ vst1q_u64(dp1 + 2, v1);
++ vst1q_u64(dp1 + 4, v2);
++ vst1q_u64(dp1 + 6, v3);
++
++ dp1 += 8;
++ dp2 += 8;
++ dp3 += 8;
++ } while (--lines > 0);
++}
++
++static void xor_arm64_eor3_4(unsigned long bytes,
++ unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
++{
++ uint64_t *dp1 = (uint64_t *)p1;
++ uint64_t *dp2 = (uint64_t *)p2;
++ uint64_t *dp3 = (uint64_t *)p3;
++ uint64_t *dp4 = (uint64_t *)p4;
++
++ register uint64x2_t v0, v1, v2, v3;
++ long lines = bytes / (sizeof(uint64x2_t) * 4);
++
++ do {
++ /* p1 ^= p2 ^ p3 */
++ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
++ vld1q_u64(dp3 + 0));
++ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
++ vld1q_u64(dp3 + 2));
++ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
++ vld1q_u64(dp3 + 4));
++ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
++ vld1q_u64(dp3 + 6));
++
++ /* p1 ^= p4 */
++ v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
++ v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
++ v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
++ v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
++
++ /* store */
++ vst1q_u64(dp1 + 0, v0);
++ vst1q_u64(dp1 + 2, v1);
++ vst1q_u64(dp1 + 4, v2);
++ vst1q_u64(dp1 + 6, v3);
++
++ dp1 += 8;
++ dp2 += 8;
++ dp3 += 8;
++ dp4 += 8;
++ } while (--lines > 0);
++}
++
++static void xor_arm64_eor3_5(unsigned long bytes,
++ unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
++{
++ uint64_t *dp1 = (uint64_t *)p1;
++ uint64_t *dp2 = (uint64_t *)p2;
++ uint64_t *dp3 = (uint64_t *)p3;
++ uint64_t *dp4 = (uint64_t *)p4;
++ uint64_t *dp5 = (uint64_t *)p5;
++
++ register uint64x2_t v0, v1, v2, v3;
++ long lines = bytes / (sizeof(uint64x2_t) * 4);
++
++ do {
++ /* p1 ^= p2 ^ p3 */
++ v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
++ vld1q_u64(dp3 + 0));
++ v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
++ vld1q_u64(dp3 + 2));
++ v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
++ vld1q_u64(dp3 + 4));
++ v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
++ vld1q_u64(dp3 + 6));
++
++ /* p1 ^= p4 ^ p5 */
++ v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
++ v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
++ v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
++ v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
++
++ /* store */
++ vst1q_u64(dp1 + 0, v0);
++ vst1q_u64(dp1 + 2, v1);
++ vst1q_u64(dp1 + 4, v2);
++ vst1q_u64(dp1 + 6, v3);
++
++ dp1 += 8;
++ dp2 += 8;
++ dp3 += 8;
++ dp4 += 8;
++ dp5 += 8;
++ } while (--lines > 0);
++}
++
++static int __init xor_neon_init(void)
++{
++ if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
++ xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
++ xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
++ xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
++ }
++ return 0;
++}
++module_init(xor_neon_init);
++
++static void __exit xor_neon_exit(void)
++{
++}
++module_exit(xor_neon_exit);
++
+ MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
+ MODULE_DESCRIPTION("ARMv8 XOR Extensions");
+ MODULE_LICENSE("GPL");
+diff --git a/arch/ia64/include/asm/xor.h b/arch/ia64/include/asm/xor.h
+index 673051bf9d7d..6785f70d3208 100644
+--- a/arch/ia64/include/asm/xor.h
++++ b/arch/ia64/include/asm/xor.h
+@@ -4,13 +4,20 @@
+ */
+
+
+-extern void xor_ia64_2(unsigned long, unsigned long *, unsigned long *);
+-extern void xor_ia64_3(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *);
+-extern void xor_ia64_4(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *);
+-extern void xor_ia64_5(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *, unsigned long *);
++extern void xor_ia64_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2);
++extern void xor_ia64_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3);
++extern void xor_ia64_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4);
++extern void xor_ia64_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5);
+
+ static struct xor_block_template xor_block_ia64 = {
+ .name = "ia64",
+diff --git a/arch/powerpc/include/asm/xor_altivec.h b/arch/powerpc/include/asm/xor_altivec.h
+index 6ca923510b59..294620a25f80 100644
+--- a/arch/powerpc/include/asm/xor_altivec.h
++++ b/arch/powerpc/include/asm/xor_altivec.h
+@@ -3,17 +3,20 @@
+ #define _ASM_POWERPC_XOR_ALTIVEC_H
+
+ #ifdef CONFIG_ALTIVEC
+-
+-void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in);
+-void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in);
+-void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in,
+- unsigned long *v4_in);
+-void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in,
+- unsigned long *v4_in, unsigned long *v5_in);
++void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2);
++void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3);
++void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4);
++void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5);
+
+ #endif
+ #endif /* _ASM_POWERPC_XOR_ALTIVEC_H */
+diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c
+index 54e61979e80e..aab49d056d18 100644
+--- a/arch/powerpc/lib/xor_vmx.c
++++ b/arch/powerpc/lib/xor_vmx.c
+@@ -49,8 +49,9 @@ typedef vector signed char unative_t;
+ V1##_3 = vec_xor(V1##_3, V2##_3); \
+ } while (0)
+
+-void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in)
++void __xor_altivec_2(unsigned long bytes,
++ unsigned long * __restrict v1_in,
++ const unsigned long * __restrict v2_in)
+ {
+ DEFINE(v1);
+ DEFINE(v2);
+@@ -67,8 +68,10 @@ void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
+ } while (--lines > 0);
+ }
+
+-void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in)
++void __xor_altivec_3(unsigned long bytes,
++ unsigned long * __restrict v1_in,
++ const unsigned long * __restrict v2_in,
++ const unsigned long * __restrict v3_in)
+ {
+ DEFINE(v1);
+ DEFINE(v2);
+@@ -89,9 +92,11 @@ void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
+ } while (--lines > 0);
+ }
+
+-void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in,
+- unsigned long *v4_in)
++void __xor_altivec_4(unsigned long bytes,
++ unsigned long * __restrict v1_in,
++ const unsigned long * __restrict v2_in,
++ const unsigned long * __restrict v3_in,
++ const unsigned long * __restrict v4_in)
+ {
+ DEFINE(v1);
+ DEFINE(v2);
+@@ -116,9 +121,12 @@ void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
+ } while (--lines > 0);
+ }
+
+-void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in,
+- unsigned long *v4_in, unsigned long *v5_in)
++void __xor_altivec_5(unsigned long bytes,
++ unsigned long * __restrict v1_in,
++ const unsigned long * __restrict v2_in,
++ const unsigned long * __restrict v3_in,
++ const unsigned long * __restrict v4_in,
++ const unsigned long * __restrict v5_in)
+ {
+ DEFINE(v1);
+ DEFINE(v2);
+diff --git a/arch/powerpc/lib/xor_vmx.h b/arch/powerpc/lib/xor_vmx.h
+index 5c2b0839b179..573c41d90dac 100644
+--- a/arch/powerpc/lib/xor_vmx.h
++++ b/arch/powerpc/lib/xor_vmx.h
+@@ -6,16 +6,17 @@
+ * outside of the enable/disable altivec block.
+ */
+
+-void __xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in);
+-
+-void __xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in);
+-
+-void __xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in,
+- unsigned long *v4_in);
+-
+-void __xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in,
+- unsigned long *v4_in, unsigned long *v5_in);
++void __xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2);
++void __xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3);
++void __xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4);
++void __xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5);
+diff --git a/arch/powerpc/lib/xor_vmx_glue.c b/arch/powerpc/lib/xor_vmx_glue.c
+index 80dba916c367..35d917ece4d1 100644
+--- a/arch/powerpc/lib/xor_vmx_glue.c
++++ b/arch/powerpc/lib/xor_vmx_glue.c
+@@ -12,47 +12,51 @@
+ #include <asm/xor_altivec.h>
+ #include "xor_vmx.h"
+
+-void xor_altivec_2(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in)
++void xor_altivec_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ preempt_disable();
+ enable_kernel_altivec();
+- __xor_altivec_2(bytes, v1_in, v2_in);
++ __xor_altivec_2(bytes, p1, p2);
+ disable_kernel_altivec();
+ preempt_enable();
+ }
+ EXPORT_SYMBOL(xor_altivec_2);
+
+-void xor_altivec_3(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in)
++void xor_altivec_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ preempt_disable();
+ enable_kernel_altivec();
+- __xor_altivec_3(bytes, v1_in, v2_in, v3_in);
++ __xor_altivec_3(bytes, p1, p2, p3);
+ disable_kernel_altivec();
+ preempt_enable();
+ }
+ EXPORT_SYMBOL(xor_altivec_3);
+
+-void xor_altivec_4(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in,
+- unsigned long *v4_in)
++void xor_altivec_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ preempt_disable();
+ enable_kernel_altivec();
+- __xor_altivec_4(bytes, v1_in, v2_in, v3_in, v4_in);
++ __xor_altivec_4(bytes, p1, p2, p3, p4);
+ disable_kernel_altivec();
+ preempt_enable();
+ }
+ EXPORT_SYMBOL(xor_altivec_4);
+
+-void xor_altivec_5(unsigned long bytes, unsigned long *v1_in,
+- unsigned long *v2_in, unsigned long *v3_in,
+- unsigned long *v4_in, unsigned long *v5_in)
++void xor_altivec_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ preempt_disable();
+ enable_kernel_altivec();
+- __xor_altivec_5(bytes, v1_in, v2_in, v3_in, v4_in, v5_in);
++ __xor_altivec_5(bytes, p1, p2, p3, p4, p5);
+ disable_kernel_altivec();
+ preempt_enable();
+ }
+diff --git a/arch/s390/lib/xor.c b/arch/s390/lib/xor.c
+index a963c3d8ad0d..fb924a8041dc 100644
+--- a/arch/s390/lib/xor.c
++++ b/arch/s390/lib/xor.c
+@@ -11,7 +11,8 @@
+ #include <linux/raid/xor.h>
+ #include <asm/xor.h>
+
+-static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++static void xor_xc_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ asm volatile(
+ " larl 1,2f\n"
+@@ -32,8 +33,9 @@ static void xor_xc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ : "0", "1", "cc", "memory");
+ }
+
+-static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++static void xor_xc_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ asm volatile(
+ " larl 1,2f\n"
+@@ -58,8 +60,10 @@ static void xor_xc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ : : "0", "1", "cc", "memory");
+ }
+
+-static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++static void xor_xc_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ asm volatile(
+ " larl 1,2f\n"
+@@ -88,8 +92,11 @@ static void xor_xc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ : : "0", "1", "cc", "memory");
+ }
+
+-static void xor_xc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++static void xor_xc_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ asm volatile(
+ " larl 1,2f\n"
+diff --git a/arch/sparc/include/asm/xor_32.h b/arch/sparc/include/asm/xor_32.h
+index 3e5af37e4b9c..0351813cf3af 100644
+--- a/arch/sparc/include/asm/xor_32.h
++++ b/arch/sparc/include/asm/xor_32.h
+@@ -13,7 +13,8 @@
+ */
+
+ static void
+-sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++sparc_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ int lines = bytes / (sizeof (long)) / 8;
+
+@@ -50,8 +51,9 @@ sparc_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++sparc_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ int lines = bytes / (sizeof (long)) / 8;
+
+@@ -101,8 +103,10 @@ sparc_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++sparc_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ int lines = bytes / (sizeof (long)) / 8;
+
+@@ -165,8 +169,11 @@ sparc_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-sparc_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++sparc_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ int lines = bytes / (sizeof (long)) / 8;
+
+diff --git a/arch/sparc/include/asm/xor_64.h b/arch/sparc/include/asm/xor_64.h
+index 16169f3edcd5..caaddea8ad79 100644
+--- a/arch/sparc/include/asm/xor_64.h
++++ b/arch/sparc/include/asm/xor_64.h
+@@ -12,13 +12,20 @@
+
+ #include <asm/spitfire.h>
+
+-void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
+-void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *);
+-void xor_vis_4(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *);
+-void xor_vis_5(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *, unsigned long *);
++void xor_vis_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2);
++void xor_vis_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3);
++void xor_vis_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4);
++void xor_vis_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5);
+
+ /* XXX Ugh, write cheetah versions... -DaveM */
+
+@@ -30,13 +37,20 @@ static struct xor_block_template xor_block_VIS = {
+ .do_5 = xor_vis_5,
+ };
+
+-void xor_niagara_2(unsigned long, unsigned long *, unsigned long *);
+-void xor_niagara_3(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *);
+-void xor_niagara_4(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *);
+-void xor_niagara_5(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *, unsigned long *);
++void xor_niagara_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2);
++void xor_niagara_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3);
++void xor_niagara_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4);
++void xor_niagara_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5);
+
+ static struct xor_block_template xor_block_niagara = {
+ .name = "Niagara",
+diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
+index 2ee95a7769e6..7b0307acc410 100644
+--- a/arch/x86/include/asm/xor.h
++++ b/arch/x86/include/asm/xor.h
+@@ -57,7 +57,8 @@
+ op(i + 3, 3)
+
+ static void
+-xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ unsigned long lines = bytes >> 8;
+
+@@ -108,7 +109,8 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ unsigned long lines = bytes >> 8;
+
+@@ -142,8 +144,9 @@ xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ unsigned long lines = bytes >> 8;
+
+@@ -201,8 +204,9 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ unsigned long lines = bytes >> 8;
+
+@@ -238,8 +242,10 @@ xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ unsigned long lines = bytes >> 8;
+
+@@ -304,8 +310,10 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ unsigned long lines = bytes >> 8;
+
+@@ -343,8 +351,11 @@ xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ unsigned long lines = bytes >> 8;
+
+@@ -416,8 +427,11 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ unsigned long lines = bytes >> 8;
+
+diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
+index 67ceb790e639..7a6b9474591e 100644
+--- a/arch/x86/include/asm/xor_32.h
++++ b/arch/x86/include/asm/xor_32.h
+@@ -21,7 +21,8 @@
+ #include <asm/fpu/api.h>
+
+ static void
+-xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_pII_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ unsigned long lines = bytes >> 7;
+
+@@ -64,8 +65,9 @@ xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_pII_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ unsigned long lines = bytes >> 7;
+
+@@ -113,8 +115,10 @@ xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_pII_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ unsigned long lines = bytes >> 7;
+
+@@ -168,8 +172,11 @@ xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+
+
+ static void
+-xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_pII_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ unsigned long lines = bytes >> 7;
+
+@@ -248,7 +255,8 @@ xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ #undef BLOCK
+
+ static void
+-xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_p5_mmx_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ unsigned long lines = bytes >> 6;
+
+@@ -295,8 +303,9 @@ xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_p5_mmx_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ unsigned long lines = bytes >> 6;
+
+@@ -352,8 +361,10 @@ xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_p5_mmx_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ unsigned long lines = bytes >> 6;
+
+@@ -418,8 +429,11 @@ xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_p5_mmx_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ unsigned long lines = bytes >> 6;
+
+diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
+index 0c4e5b5e3852..7f81dd5897f4 100644
+--- a/arch/x86/include/asm/xor_avx.h
++++ b/arch/x86/include/asm/xor_avx.h
+@@ -26,7 +26,8 @@
+ BLOCK4(8) \
+ BLOCK4(12)
+
+-static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
++static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
++ const unsigned long * __restrict p1)
+ {
+ unsigned long lines = bytes >> 9;
+
+@@ -52,8 +53,9 @@ do { \
+ kernel_fpu_end();
+ }
+
+-static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+- unsigned long *p2)
++static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
++ const unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ unsigned long lines = bytes >> 9;
+
+@@ -82,8 +84,10 @@ do { \
+ kernel_fpu_end();
+ }
+
+-static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+- unsigned long *p2, unsigned long *p3)
++static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
++ const unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ unsigned long lines = bytes >> 9;
+
+@@ -115,8 +119,11 @@ do { \
+ kernel_fpu_end();
+ }
+
+-static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+- unsigned long *p2, unsigned long *p3, unsigned long *p4)
++static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
++ const unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ unsigned long lines = bytes >> 9;
+
+diff --git a/include/asm-generic/xor.h b/include/asm-generic/xor.h
+index b62a2a56a4d4..44509d48fca2 100644
+--- a/include/asm-generic/xor.h
++++ b/include/asm-generic/xor.h
+@@ -8,7 +8,8 @@
+ #include <linux/prefetch.h>
+
+ static void
+-xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_8regs_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ long lines = bytes / (sizeof (long)) / 8;
+
+@@ -27,8 +28,9 @@ xor_8regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_8regs_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ long lines = bytes / (sizeof (long)) / 8;
+
+@@ -48,8 +50,10 @@ xor_8regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_8regs_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ long lines = bytes / (sizeof (long)) / 8;
+
+@@ -70,8 +74,11 @@ xor_8regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_8regs_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ long lines = bytes / (sizeof (long)) / 8;
+
+@@ -93,7 +100,8 @@ xor_8regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_32regs_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ long lines = bytes / (sizeof (long)) / 8;
+
+@@ -129,8 +137,9 @@ xor_32regs_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_32regs_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ long lines = bytes / (sizeof (long)) / 8;
+
+@@ -175,8 +184,10 @@ xor_32regs_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_32regs_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ long lines = bytes / (sizeof (long)) / 8;
+
+@@ -230,8 +241,11 @@ xor_32regs_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_32regs_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ long lines = bytes / (sizeof (long)) / 8;
+
+@@ -294,7 +308,8 @@ xor_32regs_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_8regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_8regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ long lines = bytes / (sizeof (long)) / 8 - 1;
+ prefetchw(p1);
+@@ -320,8 +335,9 @@ xor_8regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_8regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_8regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ long lines = bytes / (sizeof (long)) / 8 - 1;
+ prefetchw(p1);
+@@ -350,8 +366,10 @@ xor_8regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_8regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_8regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ long lines = bytes / (sizeof (long)) / 8 - 1;
+
+@@ -384,8 +402,11 @@ xor_8regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_8regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_8regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ long lines = bytes / (sizeof (long)) / 8 - 1;
+
+@@ -421,7 +442,8 @@ xor_8regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_32regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
++xor_32regs_p_2(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2)
+ {
+ long lines = bytes / (sizeof (long)) / 8 - 1;
+
+@@ -466,8 +488,9 @@ xor_32regs_p_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+ }
+
+ static void
+-xor_32regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3)
++xor_32regs_p_3(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3)
+ {
+ long lines = bytes / (sizeof (long)) / 8 - 1;
+
+@@ -523,8 +546,10 @@ xor_32regs_p_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_32regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4)
++xor_32regs_p_4(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4)
+ {
+ long lines = bytes / (sizeof (long)) / 8 - 1;
+
+@@ -591,8 +616,11 @@ xor_32regs_p_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+ }
+
+ static void
+-xor_32regs_p_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+- unsigned long *p3, unsigned long *p4, unsigned long *p5)
++xor_32regs_p_5(unsigned long bytes, unsigned long * __restrict p1,
++ const unsigned long * __restrict p2,
++ const unsigned long * __restrict p3,
++ const unsigned long * __restrict p4,
++ const unsigned long * __restrict p5)
+ {
+ long lines = bytes / (sizeof (long)) / 8 - 1;
+
+diff --git a/include/linux/raid/xor.h b/include/linux/raid/xor.h
+index 2a9fee8ddae3..51b811b62322 100644
+--- a/include/linux/raid/xor.h
++++ b/include/linux/raid/xor.h
+@@ -11,13 +11,20 @@ struct xor_block_template {
+ struct xor_block_template *next;
+ const char *name;
+ int speed;
+- void (*do_2)(unsigned long, unsigned long *, unsigned long *);
+- void (*do_3)(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *);
+- void (*do_4)(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *);
+- void (*do_5)(unsigned long, unsigned long *, unsigned long *,
+- unsigned long *, unsigned long *, unsigned long *);
++ void (*do_2)(unsigned long, unsigned long * __restrict,
++ const unsigned long * __restrict);
++ void (*do_3)(unsigned long, unsigned long * __restrict,
++ const unsigned long * __restrict,
++ const unsigned long * __restrict);
++ void (*do_4)(unsigned long, unsigned long * __restrict,
++ const unsigned long * __restrict,
++ const unsigned long * __restrict,
++ const unsigned long * __restrict);
++ void (*do_5)(unsigned long, unsigned long * __restrict,
++ const unsigned long * __restrict,
++ const unsigned long * __restrict,
++ const unsigned long * __restrict,
++ const unsigned long * __restrict);
+ };
+
+ #endif
+--
+2.35.1
+
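For reference, a minimal standalone sketch (not part of the patch above, names hypothetical) of the prototype style the "lib/xor: make xor prototypes more friendly to compiler vectorization" commit introduces: with both pointers __restrict-qualified, the compiler knows p1 and p2 cannot alias, so GCC (with -O2 -ftree-vectorize) and Clang (at plain -O2) are free to auto-vectorize the scalar XOR loop.

#include <stddef.h>

/* Sketch only: same shape as the kernel's generic xor_8regs_2(), reduced to
 * a single loop.  p1 is the destination, p2 is input-only; the __restrict
 * qualifiers promise the two buffers do not overlap. */
void xor_2(size_t bytes,
           unsigned long * __restrict p1,
           const unsigned long * __restrict p2)
{
        size_t lines = bytes / sizeof(unsigned long);

        while (lines--)
                *p1++ ^= *p2++;
}

Compiling this with clang -O2 -S (or gcc -O2 -ftree-vectorize -S) and inspecting the assembly should show packed SIMD loads, XORs and stores on typical NEON/SSE targets, which is the codegen the patch relies on.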