diff options
18 files changed, 1199 insertions, 10294 deletions
@@ -1,5 +1,5 @@ pkgbase = linux-acs-manjaro - pkgver = 5.15.16 + pkgver = 5.16.2 pkgrel = 1 url = https://www.kernel.org/ arch = x86_64 @@ -17,24 +17,17 @@ pkgbase = linux-acs-manjaro makedepends = tar makedepends = xz options = !strip - source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.15.tar.xz - source = https://www.kernel.org/pub/linux/kernel/v5.x/patch-5.15.16.xz + source = https://www.kernel.org/pub/linux/kernel/v5.x/linux-5.16.tar.xz + source = https://www.kernel.org/pub/linux/kernel/v5.x/patch-5.16.2.xz source = config source = 0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-CLONE_NEWUSER.patch - source = 0002-PCI_Add_more_NVIDIA_controllers_to_the_MSI_masking_quirk.patch - source = 0003-iommu_intel_do_deep_dma-unmapping_to_avoid_kernel-flooding.patch - source = 0004-cpufreq_intel_pstate_ITMT_support_for_overclocked_system.patch - source = 0005-Bluetooth_btintel_Fix_bdaddress_comparison_with_garbage_value.patch - source = 0006-lg-laptop_Recognize_more_models.patch + source = 0002-Btintel_Fix_bdaddress_comparison_with_garbage_value.patch source = 0101-i2c-nuvoton-nc677x-hwmon-driver.patch - source = 0103-futex.patch - source = 0104-revert-xhci-Add-support-for-Renesas-controller-with-memory.patch source = 0105-quirk-kernel-org-bug-210681-firmware_rome_error.patch - source = 0108-drm_i915_Add_workaround_numbers_to_GEN7_COMMON_SLICE_CHICKEN1_whitelisting.patch::https://patchwork.freedesktop.org/patch/463650/raw/ - source = 0201-lenovo-wmi2.patch - source = 0301-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch - source = 0302-revert-fbcon-remove-no-op-fbcon_set_origin.patch - source = 0303-revert-fbcon-remove-soft-scrollback-code.patch + source = 0301-revert-garbage-collect-fbdev-scrolling-acceleration.patch + source = 0302-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch + source = 0303-revert-fbcon-remove-no-op-fbcon_set_origin.patch + source = 0304-revert-fbcon-remove-soft-scrollback-code.patch 
source = 0401-bootsplash.patch source = 0402-bootsplash.patch source = 0403-bootsplash.patch @@ -49,21 +42,14 @@ pkgbase = linux-acs-manjaro source = 0412-bootsplash.patch source = 0413-bootsplash.gitpatch source = 0999-acs.gitpatch - sha256sums = 57b2cf6991910e3b67a1b3490022e8a0674b6965c74c12da1e99d138d1991ee8 - sha256sums = 0817171996521675b3c1130568503f08d8b1672c955cc842200a21bf5914cd95 - sha256sums = 93320dbe5928e51fb777a4f13dd9a7364eb150d7983073f7dc159e89a6ffa747 + sha256sums = 027d7e8988bb69ac12ee92406c3be1fe13f990b1ca2249e226225cd1573308bb + sha256sums = 3a09c2f1ad410c09cf03921abeed1a6ca7c38138fb508171ee673d429d179171 + sha256sums = cb2d729cc20743014d9e3bd08facb9f5bdd19d9fa89014f415c61b4a6eb78e97 sha256sums = 986f8d802f37b72a54256f0ab84da83cb229388d58c0b6750f7c770818a18421 - sha256sums = e2823eff3355b7c88a3fa327ea2f84f23cbd36569e0a5f0f76599023f63a52ca - sha256sums = ce53090a4572cd6162d22225113082f7e4df5028a1230529d170460e26dcf849 - sha256sums = ab0360eac59329eb84f028c2f402ee4a17e4b3dfacb7957355e6178d35af87b9 - sha256sums = 76701599bbafa49b90ccb073ef29ce2dc3731566e8fa852bd1e9e7796e184754 - sha256sums = a2a0a0542055a6a921542fbb05cedb6eb6f3d3fb0c038bfb2304bfd3931a0f71 + sha256sums = b89188b1bc3516d54965dd36def6a2af3d81379e53ff7e527bbd91f77c6f191b sha256sums = 7823d7488f42bc4ed7dfae6d1014dbde679d8b862c9a3697a39ba0dae5918978 - sha256sums = 844e66a95d7df754c55ac2f1ce7e215b1e56e20ca095462d926a993d557b20e0 - sha256sums = d9330ea593829a6ef3b824db9570253280cbff7da2b4beb47cbc037824d1a29b sha256sums = 5e804e1f241ce542f3f0e83d274ede6aa4b0539e510fb9376f8106e8732ce69b - sha256sums = e8e6120035977903a7117ba215809b9b162b64a789848107513f219180baaada - sha256sums = 1d58ef2991c625f6f0eb33b4cb8303932f53f1c4694e42bae24c9cd36d2ad013 + sha256sums = 365d4225a7db60bd064ebbc34ce0ae582a0c378ad6c4cec7960a5ae4641a6757 sha256sums = 2b11905b63b05b25807dd64757c779da74dd4c37e36d3f7a46485b1ee5a9d326 sha256sums = 94a8538251ad148f1025cc3de446ce64f73dc32b01815426fb159c722e8fa5bc sha256sums = 
1f18c5c10a3c63e41ecd05ad34cd9f6653ba96e9f1049ce2b7bb6da2578ae710 @@ -80,7 +66,7 @@ pkgbase = linux-acs-manjaro sha256sums = 27471eee564ca3149dd271b0817719b5565a9594dc4d884fe3dc51a5f03832bc sha256sums = 60e295601e4fb33d9bf65f198c54c7eb07c0d1e91e2ad1e0dd6cd6e142cb266d sha256sums = 035ea4b2a7621054f4560471f45336b981538a40172d8f17285910d4e0e0b3ef - sha256sums = 6d6b327ec7c7798f628f98ab964f4457d3cf043bad2632eb8f27548478a83cc1 + sha256sums = 2542b5cea79ab5817ce3d30c54acd045966b9c14587bfb0b2f50d473da48a1d5 pkgname = linux-acs-manjaro pkgdesc = The Linux Manjaro standart kernel and modules with ACS patch diff --git a/0005-Bluetooth_btintel_Fix_bdaddress_comparison_with_garbage_value.patch b/0002-Btintel_Fix_bdaddress_comparison_with_garbage_value.patch index 38cf2bde55bd..80cd663cd131 100644 --- a/0005-Bluetooth_btintel_Fix_bdaddress_comparison_with_garbage_value.patch +++ b/0002-Btintel_Fix_bdaddress_comparison_with_garbage_value.patch @@ -1,4 +1,4 @@ -From ae3386d67597db29ad2ba2685815e224a39897bc Mon Sep 17 00:00:00 2001 +From efbb86e8bf678eb5a376deaa3b693fb7a21b8e41 Mon Sep 17 00:00:00 2001 From: Kiran K <kiran.k@intel.com> Date: Wed, 13 Oct 2021 13:35:11 +0530 Subject: [PATCH] Bluetooth: btintel: Fix bdaddress comparison with garbage @@ -16,10 +16,10 @@ Reviewed-by: Tedd Ho-Jeong An <tedd.an@intel.com> 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c -index f1705b46fc8898..b9055a3e61ed76 100644 +index 9359bff4729659..8f9109b40961f4 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c -@@ -2006,14 +2006,16 @@ static int btintel_prepare_fw_download_tlv(struct hci_dev *hdev, +@@ -2081,14 +2081,16 @@ static int btintel_prepare_fw_download_tlv(struct hci_dev *hdev, if (ver->img_type == 0x03) { btintel_clear_flag(hdev, INTEL_BOOTLOADER); btintel_check_bdaddr(hdev); @@ -44,7 +44,7 @@ index f1705b46fc8898..b9055a3e61ed76 100644 } btintel_get_fw_name_tlv(ver, fwname, sizeof(fwname), 
"sfi"); -@@ -2303,6 +2305,10 @@ static int btintel_setup_combined(struct hci_dev *hdev) +@@ -2466,6 +2468,10 @@ static int btintel_setup_combined(struct hci_dev *hdev) goto exit_error; } @@ -55,3 +55,4 @@ index f1705b46fc8898..b9055a3e61ed76 100644 /* For TLV type device, parse the tlv data */ err = btintel_parse_version_tlv(hdev, &ver_tlv, skb); if (err) { + diff --git a/0002-PCI_Add_more_NVIDIA_controllers_to_the_MSI_masking_quirk.patch b/0002-PCI_Add_more_NVIDIA_controllers_to_the_MSI_masking_quirk.patch deleted file mode 100644 index 01b324a03a17..000000000000 --- a/0002-PCI_Add_more_NVIDIA_controllers_to_the_MSI_masking_quirk.patch +++ /dev/null @@ -1,21 +0,0 @@ -From 1ac8f753e4249e6864c1c42070ba957ceef1f82a Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" <heftig@archlinux.org> -Date: Thu, 18 Nov 2021 22:53:31 +0100 -Subject: [PATCH] PCI: Add more NVIDIA controllers to the MSI masking quirk - -For: https://bugs.archlinux.org/task/72734 -For: https://bugs.archlinux.org/task/72777 ---- - drivers/pci/quirks.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 208fa03acdda00..7fdb7e9c2e12c4 100644 ---- a/drivers/pci/quirks.c -+++ b/drivers/pci/quirks.c -@@ -5802,3 +5802,5 @@ static void nvidia_ion_ahci_fixup(struct pci_dev *pdev) - pdev->dev_flags |= PCI_DEV_FLAGS_HAS_MSI_MASKING; - } - DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0ab8, nvidia_ion_ahci_fixup); -+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0ab9, nvidia_ion_ahci_fixup); -+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, 0x0d88, nvidia_ion_ahci_fixup); diff --git a/0003-iommu_intel_do_deep_dma-unmapping_to_avoid_kernel-flooding.patch b/0003-iommu_intel_do_deep_dma-unmapping_to_avoid_kernel-flooding.patch deleted file mode 100644 index bc9dc1857912..000000000000 --- a/0003-iommu_intel_do_deep_dma-unmapping_to_avoid_kernel-flooding.patch +++ /dev/null @@ -1,85 +0,0 @@ -From 74db74ec6ce112c6137d51610429e7ac9ea5b6c1 Mon Sep 17 
00:00:00 2001 -From: Ajay Garg <ajaygargnsit@gmail.com> -Date: Tue, 12 Oct 2021 19:26:53 +0530 -Subject: [PATCH] iommu: intel: do deep dma-unmapping, to avoid - kernel-flooding. - -Origins at : -https://lists.linuxfoundation.org/pipermail/iommu/2021-October/thread.html - -=== Changes from v1 => v2 === - -a) -Improved patch-description. - -b) -A more root-level fix, as suggested by - - 1. - Alex Williamson <alex.williamson@redhat.com> - - 2. - Lu Baolu <baolu.lu@linux.intel.com> - -=== Issue === - -Kernel-flooding is seen, when an x86_64 L1 guest (Ubuntu-21) is booted in qemu/kvm -on a x86_64 host (Ubuntu-21), with a host-pci-device attached. - -Following kind of logs, along with the stacktraces, cause the flood : - -...... - DMAR: ERROR: DMA PTE for vPFN 0x428ec already set (to 3f6ec003 not 3f6ec003) - DMAR: ERROR: DMA PTE for vPFN 0x428ed already set (to 3f6ed003 not 3f6ed003) - DMAR: ERROR: DMA PTE for vPFN 0x428ee already set (to 3f6ee003 not 3f6ee003) - DMAR: ERROR: DMA PTE for vPFN 0x428ef already set (to 3f6ef003 not 3f6ef003) - DMAR: ERROR: DMA PTE for vPFN 0x428f0 already set (to 3f6f0003 not 3f6f0003) -...... - -=== Current Behaviour, leading to the issue === - -Currently, when we do a dma-unmapping, we unmap/unlink the mappings, but -the pte-entries are not cleared. - -Thus, following sequencing would flood the kernel-logs : - -i) -A dma-unmapping makes the real/leaf-level pte-slot invalid, but the -pte-content itself is not cleared. - -ii) -Now, during some later dma-mapping procedure, as the pte-slot is about -to hold a new pte-value, the intel-iommu checks if a prior -pte-entry exists in the pte-slot. If it exists, it logs a kernel-error, -along with a corresponding stacktrace. - -iii) -Step ii) runs in abundance, and the kernel-logs run insane. - -=== Fix === - -We ensure that as part of a dma-unmapping, each (unmapped) pte-slot -is also cleared of its value/content (at the leaf-level, where the -real mapping from a iova => pfn mapping is stored). 
- -This completes a "deep" dma-unmapping. - -Signed-off-by: Ajay Garg <ajaygargnsit@gmail.com> -Link: https://lore.kernel.org/linux-iommu/20211012135653.3852-1-ajaygargnsit@gmail.com/ ---- - drivers/iommu/intel/iommu.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c -index 78f8c8e6803e97..d8da48a91ba3b2 100644 ---- a/drivers/iommu/intel/iommu.c -+++ b/drivers/iommu/intel/iommu.c -@@ -5092,6 +5092,8 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, - gather->freelist = domain_unmap(dmar_domain, start_pfn, - last_pfn, gather->freelist); - -+ dma_pte_clear_range(dmar_domain, start_pfn, last_pfn); -+ - if (dmar_domain->max_addr == iova + size) - dmar_domain->max_addr = iova; - diff --git a/0004-cpufreq_intel_pstate_ITMT_support_for_overclocked_system.patch b/0004-cpufreq_intel_pstate_ITMT_support_for_overclocked_system.patch deleted file mode 100644 index 1f7922e34722..000000000000 --- a/0004-cpufreq_intel_pstate_ITMT_support_for_overclocked_system.patch +++ /dev/null @@ -1,51 +0,0 @@ -From 62f1f7606485d450b23f86bc18dab101e7a2443d Mon Sep 17 00:00:00 2001 -From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> -Date: Thu, 18 Nov 2021 21:18:01 -0800 -Subject: [PATCH] cpufreq: intel_pstate: ITMT support for overclocked system - -On systems with overclocking enabled, CPPC Highest Performance can be -hard coded to 0xff. In this case even if we have cores with different -highest performance, ITMT can't be enabled as the current implementation -depends on CPPC Highest Performance. - -On such systems we can use MSR_HWP_CAPABILITIES maximum performance field -when CPPC.Highest Performance is 0xff. - -Due to legacy reasons, we can't solely depend on MSR_HWP_CAPABILITIES as -in some older systems CPPC Highest Performance is the only way to identify -different performing cores. 
- -Reported-by: Michael Larabel <Michael@MichaelLarabel.com> -Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> ---- - drivers/cpufreq/intel_pstate.c | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index e15c3bc17a55ce..8a2c6b58b6524f 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -335,6 +335,8 @@ static void intel_pstste_sched_itmt_work_fn(struct work_struct *work) - - static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn); - -+#define CPPC_MAX_PERF U8_MAX -+ - static void intel_pstate_set_itmt_prio(int cpu) - { - struct cppc_perf_caps cppc_perf; -@@ -345,6 +347,14 @@ static void intel_pstate_set_itmt_prio(int cpu) - if (ret) - return; - -+ /* -+ * On some systems with overclocking enabled, CPPC.highest_perf is hardcoded to 0xff. -+ * In this case we can't use CPPC.highest_perf to enable ITMT. -+ * In this case we can look at MSR_HWP_CAPABILITIES bits [8:0] to decide. -+ */ -+ if (cppc_perf.highest_perf == CPPC_MAX_PERF) -+ cppc_perf.highest_perf = HWP_HIGHEST_PERF(READ_ONCE(all_cpu_data[cpu]->hwp_cap_cached)); -+ - /* - * The priorities can be set regardless of whether or not - * sched_set_itmt_support(true) has been called and it is valid to diff --git a/0006-lg-laptop_Recognize_more_models.patch b/0006-lg-laptop_Recognize_more_models.patch deleted file mode 100644 index 8fbd217c36a2..000000000000 --- a/0006-lg-laptop_Recognize_more_models.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 675d4b66de78eec370cf5053eecdf00b26780af3 Mon Sep 17 00:00:00 2001 -From: Matan Ziv-Av <matan@svgalib.org> -Date: Tue, 23 Nov 2021 22:14:55 +0200 -Subject: [PATCH] lg-laptop: Recognize more models - -LG uses 5 instead of 0 in the third digit (second digit after 2019) of the year string to indicate newer models in the same year. Handle this case as well. 
- -Signed-off-by: Matan Ziv-Av <matan@svgalib.org> -For: https://bugs.archlinux.org/task/71772 ---- - drivers/platform/x86/lg-laptop.c | 12 ++++++++++++ - 1 file changed, 12 insertions(+) - -diff --git a/drivers/platform/x86/lg-laptop.c b/drivers/platform/x86/lg-laptop.c -index 88b551caeaaf41..d6f74d3a7605e2 100644 ---- a/drivers/platform/x86/lg-laptop.c -+++ b/drivers/platform/x86/lg-laptop.c -@@ -658,6 +658,18 @@ static int acpi_add(struct acpi_device *device) - if (product && strlen(product) > 4) - switch (product[4]) { - case '5': -+ if (strlen(product) > 5) -+ switch (product[5]) { -+ case 'N': -+ year = 2021; -+ break; -+ case '0': -+ year = 2016; -+ break; -+ default: -+ year = 2022; -+ } -+ break; - case '6': - year = 2016; - break; diff --git a/0102-iomap-iomap_bmap-should-accept-unwritten-maps.patch b/0102-iomap-iomap_bmap-should-accept-unwritten-maps.patch deleted file mode 100644 index 9ca50277e88c..000000000000 --- a/0102-iomap-iomap_bmap-should-accept-unwritten-maps.patch +++ /dev/null @@ -1,38 +0,0 @@ -From: Yuxuan Shui <yshuiv7@gmail.com> -To: viro@zeniv.linux.org.uk -Cc: linux-fsdevel@vger.kernel.org, Yuxuan Shui <yshuiv7@gmail.com> -Subject: [PATCH] iomap: iomap_bmap should accept unwritten maps -Date: Tue, 5 May 2020 19:36:08 +0100 -Message-ID: <20200505183608.10280-1-yshuiv7@gmail.com> (raw) - -commit ac58e4fb03f9d111d733a4ad379d06eef3a24705 moved ext4_bmap from -generic_block_bmap to iomap_bmap, this introduced a regression which -prevents some user from using previously working swapfiles. The kernel -will complain about holes while there is none. - -What is happening here is that the swapfile has unwritten mappings, -which is rejected by iomap_bmap, but was accepted by ext4_get_block. - -This commit makes sure iomap_bmap would accept unwritten mappings as -well. 
- -Signed-off-by: Yuxuan Shui <yshuiv7@gmail.com> ---- - fs/iomap/fiemap.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c -index d55e8f491a5e..fb488dcfa8c7 100644 ---- a/fs/iomap/fiemap.c -+++ b/fs/iomap/fiemap.c -@@ -115,7 +115,7 @@ iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, - { - sector_t *bno = data, addr; - -- if (iomap->type == IOMAP_MAPPED) { -+ if (iomap->type == IOMAP_MAPPED || iomap->type == IOMAP_UNWRITTEN) { - addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; - *bno = addr; - } --- -2.26.2 diff --git a/0103-futex.patch b/0103-futex.patch deleted file mode 100644 index d33f488ae054..000000000000 --- a/0103-futex.patch +++ /dev/null @@ -1,9811 +0,0 @@ -From 4dc2913212c08c6970f6e8971fd23b6328982f94 Mon Sep 17 00:00:00 2001 -From: Piotr Gorski <lucjan.lucjanov@gmail.com> -Date: Mon, 1 Nov 2021 12:11:04 +0100 -Subject: [PATCH] futex: resync from gitlab.collabora.com - -Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com> ---- - Documentation/userspace-api/futex2.rst | 86 + - Documentation/userspace-api/index.rst | 1 + - MAINTAINERS | 3 +- - arch/arm/tools/syscall.tbl | 1 + - arch/arm64/include/asm/unistd.h | 2 +- - arch/arm64/include/asm/unistd32.h | 2 + - arch/x86/entry/syscalls/syscall_32.tbl | 1 + - arch/x86/entry/syscalls/syscall_64.tbl | 1 + - include/linux/syscalls.h | 7 +- - include/uapi/asm-generic/unistd.h | 5 +- - include/uapi/linux/futex.h | 25 + - kernel/Makefile | 2 +- - kernel/futex.c | 4272 ----------------- - kernel/futex/Makefile | 3 + - kernel/futex/core.c | 1176 +++++ - kernel/futex/futex.h | 295 ++ - kernel/futex/pi.c | 1233 +++++ - kernel/futex/requeue.c | 897 ++++ - kernel/futex/syscalls.c | 396 ++ - kernel/futex/waitwake.c | 708 +++ - kernel/sys_ni.c | 3 +- - .../selftests/futex/functional/.gitignore | 1 + - .../selftests/futex/functional/Makefile | 3 +- - .../futex/functional/futex_wait_timeout.c | 21 +- - 
.../futex/functional/futex_wait_wouldblock.c | 41 +- - .../selftests/futex/functional/futex_waitv.c | 237 + - .../testing/selftests/futex/functional/run.sh | 3 + - .../selftests/futex/include/futex2test.h | 22 + - 28 files changed, 5163 insertions(+), 4284 deletions(-) - create mode 100644 Documentation/userspace-api/futex2.rst - delete mode 100644 kernel/futex.c - create mode 100644 kernel/futex/Makefile - create mode 100644 kernel/futex/core.c - create mode 100644 kernel/futex/futex.h - create mode 100644 kernel/futex/pi.c - create mode 100644 kernel/futex/requeue.c - create mode 100644 kernel/futex/syscalls.c - create mode 100644 kernel/futex/waitwake.c - create mode 100644 tools/testing/selftests/futex/functional/futex_waitv.c - create mode 100644 tools/testing/selftests/futex/include/futex2test.h - -diff --git a/Documentation/userspace-api/futex2.rst b/Documentation/userspace-api/futex2.rst -new file mode 100644 -index 000000000..7d37409df ---- /dev/null -+++ b/Documentation/userspace-api/futex2.rst -@@ -0,0 +1,86 @@ -+.. SPDX-License-Identifier: GPL-2.0 -+ -+====== -+futex2 -+====== -+ -+:Author: André Almeida <andrealmeid@collabora.com> -+ -+futex, or fast user mutex, is a set of syscalls to allow userspace to create -+performant synchronization mechanisms, such as mutexes, semaphores and -+conditional variables in userspace. C standard libraries, like glibc, uses it -+as a means to implement more high level interfaces like pthreads. -+ -+futex2 is a followup version of the initial futex syscall, designed to overcome -+limitations of the original interface.
-+ -+User API -+======== -+ -+``futex_waitv()`` -+----------------- -+ -+Wait on an array of futexes, wake on any:: -+ -+ futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes, -+ unsigned int flags, struct timespec *timeout, clockid_t clockid) -+ -+ struct futex_waitv { -+ __u64 val; -+ __u64 uaddr; -+ __u32 flags; -+ __u32 __reserved; -+ }; -+ -+Userspace sets an array of struct futex_waitv (up to a max of 128 entries), -+using ``uaddr`` for the address to wait for, ``val`` for the expected value -+and ``flags`` to specify the type (e.g. private) and size of futex. -+``__reserved`` needs to be 0, but it can be used for future extension. The -+pointer for the first item of the array is passed as ``waiters``. An invalid -+address for ``waiters`` or for any ``uaddr`` returns ``-EFAULT``. -+ -+If userspace has 32-bit pointers, it should do a explicit cast to make sure -+the upper bits are zeroed. ``uintptr_t`` does the tricky and it works for -+both 32/64-bit pointers. -+ -+``nr_futexes`` specifies the size of the array. Numbers out of [1, 128] -+interval will make the syscall return ``-EINVAL``. -+ -+The ``flags`` argument of the syscall needs to be 0, but it can be used for -+future extension. -+ -+For each entry in ``waiters`` array, the current value at ``uaddr`` is compared -+to ``val``. If it's different, the syscall undo all the work done so far and -+return ``-EAGAIN``. If all tests and verifications succeeds, syscall waits until -+one of the following happens: -+ -+- The timeout expires, returning ``-ETIMEOUT``. -+- A signal was sent to the sleeping task, returning ``-ERESTARTSYS``. -+- Some futex at the list was awaken, returning the index of some waked futex. -+ -+An example of how to use the interface can be found at ``tools/testing/selftests/futex/functional/futex_waitv.c``. -+ -+Timeout -+------- -+ -+``struct timespec *timeout`` argument is an optional argument that points to an -+absolute timeout. 
You need to specify the type of clock being used at -+``clockid`` argument. ``CLOCK_MONOTONIC`` and ``CLOCK_REALTIME`` are supported. -+This syscall accepts only 64bit timespec structs. -+ -+Types of futex -+-------------- -+ -+A futex can be either private or shared. Private is used for processes that -+shares the same memory space and the virtual address of the futex will be the -+same for all processes. This allows for optimizations in the kernel. To use -+private futexes, it's necessary to specify ``FUTEX_PRIVATE_FLAG`` in the futex -+flag. For processes that doesn't share the same memory space and therefore can -+have different virtual addresses for the same futex (using, for instance, a -+file-backed shared memory) requires different internal mechanisms to be get -+properly enqueued. This is the default behavior, and it works with both private -+and shared futexes. -+ -+Futexes can be of different sizes: 8, 16, 32 or 64 bits. Currently, the only -+supported one is 32 bit sized futex, and it need to be specified using -+``FUTEX_32`` flag. -diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst -index c432be070..a61eac0c7 100644 ---- a/Documentation/userspace-api/index.rst -+++ b/Documentation/userspace-api/index.rst -@@ -28,6 +28,7 @@ place where this information is gathered. - media/index - sysfs-platform_profile - vduse -+ futex2 - - .. 
only:: subproject and html - -diff --git a/MAINTAINERS b/MAINTAINERS -index 3b79fd441..dd165835f 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -7737,6 +7737,7 @@ M: Ingo Molnar <mingo@redhat.com> - R: Peter Zijlstra <peterz@infradead.org> - R: Darren Hart <dvhart@infradead.org> - R: Davidlohr Bueso <dave@stgolabs.net> -+R: André Almeida <andrealmeid@collabora.com> - L: linux-kernel@vger.kernel.org - S: Maintained - T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core -@@ -7744,7 +7745,7 @@ F: Documentation/locking/*futex* - F: include/asm-generic/futex.h - F: include/linux/futex.h - F: include/uapi/linux/futex.h --F: kernel/futex.c -+F: kernel/futex/* - F: tools/perf/bench/futex* - F: tools/testing/selftests/futex/ - -diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl -index e842209e1..543100151 100644 ---- a/arch/arm/tools/syscall.tbl -+++ b/arch/arm/tools/syscall.tbl -@@ -462,3 +462,4 @@ - 446 common landlock_restrict_self sys_landlock_restrict_self - # 447 reserved for memfd_secret - 448 common process_mrelease sys_process_mrelease -+449 common futex_waitv sys_futex_waitv -diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h -index 3cb206aea..6bdb5f5db 100644 ---- a/arch/arm64/include/asm/unistd.h -+++ b/arch/arm64/include/asm/unistd.h -@@ -38,7 +38,7 @@ - #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) - #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) - --#define __NR_compat_syscalls 449 -+#define __NR_compat_syscalls 450 - #endif - - #define __ARCH_WANT_SYS_CLONE -diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h -index 844f6ae58..41ea1195e 100644 ---- a/arch/arm64/include/asm/unistd32.h -+++ b/arch/arm64/include/asm/unistd32.h -@@ -903,6 +903,8 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule) - __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) - #define __NR_process_mrelease 448 -
__SYSCALL(__NR_process_mrelease, sys_process_mrelease) -+#define __NR_futex_waitv 449 -+__SYSCALL(__NR_futex_waitv, sys_futex_waitv) - - /* - * Please add new compat syscalls above this comment and update -diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index 960a021d5..7e2554369 100644 ---- a/arch/x86/entry/syscalls/syscall_32.tbl -+++ b/arch/x86/entry/syscalls/syscall_32.tbl -@@ -453,3 +453,4 @@ - 446 i386 landlock_restrict_self sys_landlock_restrict_self - 447 i386 memfd_secret sys_memfd_secret - 448 i386 process_mrelease sys_process_mrelease -+449 i386 futex_waitv sys_futex_waitv -diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl -index 18b5500ea..fe8f8dd15 100644 ---- a/arch/x86/entry/syscalls/syscall_64.tbl -+++ b/arch/x86/entry/syscalls/syscall_64.tbl -@@ -370,6 +370,7 @@ - 446 common landlock_restrict_self sys_landlock_restrict_self - 447 common memfd_secret sys_memfd_secret - 448 common process_mrelease sys_process_mrelease -+449 common futex_waitv sys_futex_waitv - - # - # Due to a historical design error, certain syscalls are numbered differently -diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 252243c77..528a478db 100644 ---- a/include/linux/syscalls.h -+++ b/include/linux/syscalls.h -@@ -58,6 +58,7 @@ struct mq_attr; - struct compat_stat; - struct old_timeval32; - struct robust_list_head; -+struct futex_waitv; - struct getcpu_cache; - struct old_linux_dirent; - struct perf_event_attr; -@@ -610,7 +611,7 @@ asmlinkage long sys_waitid(int which, pid_t pid, - asmlinkage long sys_set_tid_address(int __user *tidptr); - asmlinkage long sys_unshare(unsigned long unshare_flags); - --/* kernel/futex.c */ -+/* kernel/futex/syscalls.c */ - asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, - const struct __kernel_timespec __user *utime, - u32 __user *uaddr2, u32 val3); -@@ -623,6 +624,10 @@ asmlinkage long sys_get_robust_list(int pid, - 
asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, - size_t len); - -+asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, -+ unsigned int nr_futexes, unsigned int flags, -+ struct __kernel_timespec __user *timeout, clockid_t clockid); -+ - /* kernel/hrtimer.c */ - asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp, - struct __kernel_timespec __user *rmtp); -diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index 1c5fb86d4..4557a8b60 100644 ---- a/include/uapi/asm-generic/unistd.h -+++ b/include/uapi/asm-generic/unistd.h -@@ -880,8 +880,11 @@ __SYSCALL(__NR_memfd_secret, sys_memfd_secret) - #define __NR_process_mrelease 448 - __SYSCALL(__NR_process_mrelease, sys_process_mrelease) - -+#define __NR_futex_waitv 449 -+__SYSCALL(__NR_futex_waitv, sys_futex_waitv) -+ - #undef __NR_syscalls --#define __NR_syscalls 449 -+#define __NR_syscalls 450 - - /* - * 32 bit systems traditionally used different -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 235e5b2fa..71a5df8d2 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -43,6 +43,31 @@ - #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ - FUTEX_PRIVATE_FLAG) - -+/* -+ * Flags to specify the bit length of the futex word for futex2 syscalls. -+ * Currently, only 32 is supported. -+ */ -+#define FUTEX_32 2 -+ -+/* -+ * Max numbers of elements in a futex_waitv array -+ */ -+#define FUTEX_WAITV_MAX 128 -+ -+/** -+ * struct futex_waitv - A waiter for vectorized wait -+ * @val: Expected value at uaddr -+ * @uaddr: User address to wait on -+ * @flags: Flags for this waiter -+ * @__reserved: Reserved member to preserve data alignment. Should be 0. -+ */ -+struct futex_waitv { -+ __u64 val; -+ __u64 uaddr; -+ __u32 flags; -+ __u32 __reserved; -+}; -+ - /* - * Support for robust futexes: the kernel cleans up held futexes at - * thread exit time. 
-diff --git a/kernel/Makefile b/kernel/Makefile -index 4df609be4..3f6ab5d50 100644 ---- a/kernel/Makefile -+++ b/kernel/Makefile -@@ -59,7 +59,7 @@ obj-$(CONFIG_FREEZER) += freezer.o - obj-$(CONFIG_PROFILING) += profile.o - obj-$(CONFIG_STACKTRACE) += stacktrace.o - obj-y += time/ --obj-$(CONFIG_FUTEX) += futex.o -+obj-$(CONFIG_FUTEX) += futex/ - obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o - obj-$(CONFIG_SMP) += smp.o - ifneq ($(CONFIG_SMP),y) -diff --git a/kernel/futex.c b/kernel/futex.c -deleted file mode 100644 -index c15ad276f..000000000 ---- a/kernel/futex.c -+++ /dev/null -@@ -1,4272 +0,0 @@ --// SPDX-License-Identifier: GPL-2.0-or-later --/* -- * Fast Userspace Mutexes (which I call "Futexes!"). -- * (C) Rusty Russell, IBM 2002 -- * -- * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar -- * (C) Copyright 2003 Red Hat Inc, All Rights Reserved -- * -- * Removed page pinning, fix privately mapped COW pages and other cleanups -- * (C) Copyright 2003, 2004 Jamie Lokier -- * -- * Robust futex support started by Ingo Molnar -- * (C) Copyright 2006 Red Hat Inc, All Rights Reserved -- * Thanks to Thomas Gleixner for suggestions, analysis and fixes. -- * -- * PI-futex support started by Ingo Molnar and Thomas Gleixner -- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> -- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> -- * -- * PRIVATE futexes by Eric Dumazet -- * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> -- * -- * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> -- * Copyright (C) IBM Corporation, 2009 -- * Thanks to Thomas Gleixner for conceptual design and careful reviews. -- * -- * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly -- * enough at me, Linus for the original (flawed) idea, Matthew -- * Kirkwood for proof-of-concept implementation. -- * -- * "The futexes are also cursed." -- * "But they come in a choice of three flavours!" 
-- */ --#include <linux/compat.h> --#include <linux/jhash.h> --#include <linux/pagemap.h> --#include <linux/syscalls.h> --#include <linux/freezer.h> --#include <linux/memblock.h> --#include <linux/fault-inject.h> --#include <linux/time_namespace.h> -- --#include <asm/futex.h> -- --#include "locking/rtmutex_common.h" -- --/* -- * READ this before attempting to hack on futexes! -- * -- * Basic futex operation and ordering guarantees -- * ============================================= -- * -- * The waiter reads the futex value in user space and calls -- * futex_wait(). This function computes the hash bucket and acquires -- * the hash bucket lock. After that it reads the futex user space value -- * again and verifies that the data has not changed. If it has not changed -- * it enqueues itself into the hash bucket, releases the hash bucket lock -- * and schedules. -- * -- * The waker side modifies the user space value of the futex and calls -- * futex_wake(). This function computes the hash bucket and acquires the -- * hash bucket lock. Then it looks for waiters on that futex in the hash -- * bucket and wakes them. -- * -- * In futex wake up scenarios where no tasks are blocked on a futex, taking -- * the hb spinlock can be avoided and simply return. 
In order for this -- * optimization to work, ordering guarantees must exist so that the waiter -- * being added to the list is acknowledged when the list is concurrently being -- * checked by the waker, avoiding scenarios like the following: -- * -- * CPU 0 CPU 1 -- * val = *futex; -- * sys_futex(WAIT, futex, val); -- * futex_wait(futex, val); -- * uval = *futex; -- * *futex = newval; -- * sys_futex(WAKE, futex); -- * futex_wake(futex); -- * if (queue_empty()) -- * return; -- * if (uval == val) -- * lock(hash_bucket(futex)); -- * queue(); -- * unlock(hash_bucket(futex)); -- * schedule(); -- * -- * This would cause the waiter on CPU 0 to wait forever because it -- * missed the transition of the user space value from val to newval -- * and the waker did not find the waiter in the hash bucket queue. -- * -- * The correct serialization ensures that a waiter either observes -- * the changed user space value before blocking or is woken by a -- * concurrent waker: -- * -- * CPU 0 CPU 1 -- * val = *futex; -- * sys_futex(WAIT, futex, val); -- * futex_wait(futex, val); -- * -- * waiters++; (a) -- * smp_mb(); (A) <-- paired with -. -- * | -- * lock(hash_bucket(futex)); | -- * | -- * uval = *futex; | -- * | *futex = newval; -- * | sys_futex(WAKE, futex); -- * | futex_wake(futex); -- * | -- * `--------> smp_mb(); (B) -- * if (uval == val) -- * queue(); -- * unlock(hash_bucket(futex)); -- * schedule(); if (waiters) -- * lock(hash_bucket(futex)); -- * else wake_waiters(futex); -- * waiters--; (b) unlock(hash_bucket(futex)); -- * -- * Where (A) orders the waiters increment and the futex value read through -- * atomic operations (see hb_waiters_inc) and where (B) orders the write -- * to futex and the waiters read (see hb_waiters_pending()). 
-- * -- * This yields the following case (where X:=waiters, Y:=futex): -- * -- * X = Y = 0 -- * -- * w[X]=1 w[Y]=1 -- * MB MB -- * r[Y]=y r[X]=x -- * -- * Which guarantees that x==0 && y==0 is impossible; which translates back into -- * the guarantee that we cannot both miss the futex variable change and the -- * enqueue. -- * -- * Note that a new waiter is accounted for in (a) even when it is possible that -- * the wait call can return error, in which case we backtrack from it in (b). -- * Refer to the comment in queue_lock(). -- * -- * Similarly, in order to account for waiters being requeued on another -- * address we always increment the waiters for the destination bucket before -- * acquiring the lock. It then decrements them again after releasing it - -- * the code that actually moves the futex(es) between hash buckets (requeue_futex) -- * will do the additional required waiter count housekeeping. This is done for -- * double_lock_hb() and double_unlock_hb(), respectively. -- */ -- --#ifdef CONFIG_HAVE_FUTEX_CMPXCHG --#define futex_cmpxchg_enabled 1 --#else --static int __read_mostly futex_cmpxchg_enabled; --#endif -- --/* -- * Futex flags used to encode options to functions and preserve them across -- * restarts. -- */ --#ifdef CONFIG_MMU --# define FLAGS_SHARED 0x01 --#else --/* -- * NOMMU does not have per process address space. Let the compiler optimize -- * code away. 
-- */ --# define FLAGS_SHARED 0x00 --#endif --#define FLAGS_CLOCKRT 0x02 --#define FLAGS_HAS_TIMEOUT 0x04 -- --/* -- * Priority Inheritance state: -- */ --struct futex_pi_state { -- /* -- * list of 'owned' pi_state instances - these have to be -- * cleaned up in do_exit() if the task exits prematurely: -- */ -- struct list_head list; -- -- /* -- * The PI object: -- */ -- struct rt_mutex_base pi_mutex; -- -- struct task_struct *owner; -- refcount_t refcount; -- -- union futex_key key; --} __randomize_layout; -- --/** -- * struct futex_q - The hashed futex queue entry, one per waiting task -- * @list: priority-sorted list of tasks waiting on this futex -- * @task: the task waiting on the futex -- * @lock_ptr: the hash bucket lock -- * @key: the key the futex is hashed on -- * @pi_state: optional priority inheritance state -- * @rt_waiter: rt_waiter storage for use with requeue_pi -- * @requeue_pi_key: the requeue_pi target futex key -- * @bitset: bitset for the optional bitmasked wakeup -- * @requeue_state: State field for futex_requeue_pi() -- * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) -- * -- * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so -- * we can wake only the relevant ones (hashed queues may be shared). -- * -- * A futex_q has a woken state, just like tasks have TASK_RUNNING. -- * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. -- * The order of wakeup is always to make the first condition true, then -- * the second. -- * -- * PI futexes are typically woken before they are removed from the hash list via -- * the rt_mutex code. See unqueue_me_pi(). 
-- */ --struct futex_q { -- struct plist_node list; -- -- struct task_struct *task; -- spinlock_t *lock_ptr; -- union futex_key key; -- struct futex_pi_state *pi_state; -- struct rt_mutex_waiter *rt_waiter; -- union futex_key *requeue_pi_key; -- u32 bitset; -- atomic_t requeue_state; --#ifdef CONFIG_PREEMPT_RT -- struct rcuwait requeue_wait; --#endif --} __randomize_layout; -- --/* -- * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an -- * underlying rtmutex. The task which is about to be requeued could have -- * just woken up (timeout, signal). After the wake up the task has to -- * acquire hash bucket lock, which is held by the requeue code. As a task -- * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking -- * and the hash bucket lock blocking would collide and corrupt state. -- * -- * On !PREEMPT_RT this is not a problem and everything could be serialized -- * on hash bucket lock, but aside of having the benefit of common code, -- * this allows to avoid doing the requeue when the task is already on the -- * way out and taking the hash bucket lock of the original uaddr1 when the -- * requeue has been completed. -- * -- * The following state transitions are valid: -- * -- * On the waiter side: -- * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE -- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT -- * -- * On the requeue side: -- * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS -- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED -- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) -- * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED -- * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) -- * -- * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this -- * signals that the waiter is already on the way out. It also means that -- * the waiter is still on the 'wait' futex, i.e. uaddr1. 
-- * -- * The waiter side signals early wakeup to the requeue side either through -- * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending -- * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately -- * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, -- * which means the wakeup is interleaving with a requeue in progress it has -- * to wait for the requeue side to change the state. Either to DONE/LOCKED -- * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex -- * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by -- * the requeue side when the requeue attempt failed via deadlock detection -- * and therefore the waiter q is still on the uaddr1 futex. -- */ --enum { -- Q_REQUEUE_PI_NONE = 0, -- Q_REQUEUE_PI_IGNORE, -- Q_REQUEUE_PI_IN_PROGRESS, -- Q_REQUEUE_PI_WAIT, -- Q_REQUEUE_PI_DONE, -- Q_REQUEUE_PI_LOCKED, --}; -- --static const struct futex_q futex_q_init = { -- /* list gets initialized in queue_me()*/ -- .key = FUTEX_KEY_INIT, -- .bitset = FUTEX_BITSET_MATCH_ANY, -- .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), --}; -- --/* -- * Hash buckets are shared by all the futex_keys that hash to the same -- * location. Each key may have multiple futex_q structures, one for each task -- * waiting on a futex. -- */ --struct futex_hash_bucket { -- atomic_t waiters; -- spinlock_t lock; -- struct plist_head chain; --} ____cacheline_aligned_in_smp; -- --/* -- * The base of the bucket array and its size are always used together -- * (after initialization only in hash_futex()), so ensure that they -- * reside in the same cacheline. -- */ --static struct { -- struct futex_hash_bucket *queues; -- unsigned long hashsize; --} __futex_data __read_mostly __aligned(2*sizeof(long)); --#define futex_queues (__futex_data.queues) --#define futex_hashsize (__futex_data.hashsize) -- -- --/* -- * Fault injections for futexes. 
-- */ --#ifdef CONFIG_FAIL_FUTEX -- --static struct { -- struct fault_attr attr; -- -- bool ignore_private; --} fail_futex = { -- .attr = FAULT_ATTR_INITIALIZER, -- .ignore_private = false, --}; -- --static int __init setup_fail_futex(char *str) --{ -- return setup_fault_attr(&fail_futex.attr, str); --} --__setup("fail_futex=", setup_fail_futex); -- --static bool should_fail_futex(bool fshared) --{ -- if (fail_futex.ignore_private && !fshared) -- return false; -- -- return should_fail(&fail_futex.attr, 1); --} -- --#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -- --static int __init fail_futex_debugfs(void) --{ -- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; -- struct dentry *dir; -- -- dir = fault_create_debugfs_attr("fail_futex", NULL, -- &fail_futex.attr); -- if (IS_ERR(dir)) -- return PTR_ERR(dir); -- -- debugfs_create_bool("ignore-private", mode, dir, -- &fail_futex.ignore_private); -- return 0; --} -- --late_initcall(fail_futex_debugfs); -- --#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ -- --#else --static inline bool should_fail_futex(bool fshared) --{ -- return false; --} --#endif /* CONFIG_FAIL_FUTEX */ -- --#ifdef CONFIG_COMPAT --static void compat_exit_robust_list(struct task_struct *curr); --#endif -- --/* -- * Reflects a new waiter being added to the waitqueue. -- */ --static inline void hb_waiters_inc(struct futex_hash_bucket *hb) --{ --#ifdef CONFIG_SMP -- atomic_inc(&hb->waiters); -- /* -- * Full barrier (A), see the ordering comment above. -- */ -- smp_mb__after_atomic(); --#endif --} -- --/* -- * Reflects a waiter being removed from the waitqueue by wakeup -- * paths. -- */ --static inline void hb_waiters_dec(struct futex_hash_bucket *hb) --{ --#ifdef CONFIG_SMP -- atomic_dec(&hb->waiters); --#endif --} -- --static inline int hb_waiters_pending(struct futex_hash_bucket *hb) --{ --#ifdef CONFIG_SMP -- /* -- * Full barrier (B), see the ordering comment above. 
-- */ -- smp_mb(); -- return atomic_read(&hb->waiters); --#else -- return 1; --#endif --} -- --/** -- * hash_futex - Return the hash bucket in the global hash -- * @key: Pointer to the futex key for which the hash is calculated -- * -- * We hash on the keys returned from get_futex_key (see below) and return the -- * corresponding hash bucket in the global hash. -- */ --static struct futex_hash_bucket *hash_futex(union futex_key *key) --{ -- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, -- key->both.offset); -- -- return &futex_queues[hash & (futex_hashsize - 1)]; --} -- -- --/** -- * match_futex - Check whether two futex keys are equal -- * @key1: Pointer to key1 -- * @key2: Pointer to key2 -- * -- * Return 1 if two futex_keys are equal, 0 otherwise. -- */ --static inline int match_futex(union futex_key *key1, union futex_key *key2) --{ -- return (key1 && key2 -- && key1->both.word == key2->both.word -- && key1->both.ptr == key2->both.ptr -- && key1->both.offset == key2->both.offset); --} -- --enum futex_access { -- FUTEX_READ, -- FUTEX_WRITE --}; -- --/** -- * futex_setup_timer - set up the sleeping hrtimer. -- * @time: ptr to the given timeout value -- * @timeout: the hrtimer_sleeper structure to be set up -- * @flags: futex flags -- * @range_ns: optional range in ns -- * -- * Return: Initialized hrtimer_sleeper structure or NULL if no timeout -- * value given -- */ --static inline struct hrtimer_sleeper * --futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, -- int flags, u64 range_ns) --{ -- if (!time) -- return NULL; -- -- hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? -- CLOCK_REALTIME : CLOCK_MONOTONIC, -- HRTIMER_MODE_ABS); -- /* -- * If range_ns is 0, calling hrtimer_set_expires_range_ns() is -- * effectively the same as calling hrtimer_set_expires(). 
-- */ -- hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); -- -- return timeout; --} -- --/* -- * Generate a machine wide unique identifier for this inode. -- * -- * This relies on u64 not wrapping in the life-time of the machine; which with -- * 1ns resolution means almost 585 years. -- * -- * This further relies on the fact that a well formed program will not unmap -- * the file while it has a (shared) futex waiting on it. This mapping will have -- * a file reference which pins the mount and inode. -- * -- * If for some reason an inode gets evicted and read back in again, it will get -- * a new sequence number and will _NOT_ match, even though it is the exact same -- * file. -- * -- * It is important that match_futex() will never have a false-positive, esp. -- * for PI futexes that can mess up the state. The above argues that false-negatives -- * are only possible for malformed programs. -- */ --static u64 get_inode_sequence_number(struct inode *inode) --{ -- static atomic64_t i_seq; -- u64 old; -- -- /* Does the inode already have a sequence number? */ -- old = atomic64_read(&inode->i_sequence); -- if (likely(old)) -- return old; -- -- for (;;) { -- u64 new = atomic64_add_return(1, &i_seq); -- if (WARN_ON_ONCE(!new)) -- continue; -- -- old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); -- if (old) -- return old; -- return new; -- } --} -- --/** -- * get_futex_key() - Get parameters which are the keys for a futex -- * @uaddr: virtual address of the futex -- * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED -- * @key: address where result is stored. -- * @rw: mapping needs to be read/write (values: FUTEX_READ, -- * FUTEX_WRITE) -- * -- * Return: a negative error code or 0 -- * -- * The key words are stored in @key on success. 
-- * -- * For shared mappings (when @fshared), the key is: -- * -- * ( inode->i_sequence, page->index, offset_within_page ) -- * -- * [ also see get_inode_sequence_number() ] -- * -- * For private mappings (or when !@fshared), the key is: -- * -- * ( current->mm, address, 0 ) -- * -- * This allows (cross process, where applicable) identification of the futex -- * without keeping the page pinned for the duration of the FUTEX_WAIT. -- * -- * lock_page() might sleep, the caller should not hold a spinlock. -- */ --static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, -- enum futex_access rw) --{ -- unsigned long address = (unsigned long)uaddr; -- struct mm_struct *mm = current->mm; -- struct page *page, *tail; -- struct address_space *mapping; -- int err, ro = 0; -- -- /* -- * The futex address must be "naturally" aligned. -- */ -- key->both.offset = address % PAGE_SIZE; -- if (unlikely((address % sizeof(u32)) != 0)) -- return -EINVAL; -- address -= key->both.offset; -- -- if (unlikely(!access_ok(uaddr, sizeof(u32)))) -- return -EFAULT; -- -- if (unlikely(should_fail_futex(fshared))) -- return -EFAULT; -- -- /* -- * PROCESS_PRIVATE futexes are fast. -- * As the mm cannot disappear under us and the 'key' only needs -- * virtual address, we dont even have to find the underlying vma. -- * Note : We do have to check 'uaddr' is a valid user address, -- * but access_ok() should be faster than find_vma() -- */ -- if (!fshared) { -- key->private.mm = mm; -- key->private.address = address; -- return 0; -- } -- --again: -- /* Ignore any VERIFY_READ mapping (futex common case) */ -- if (unlikely(should_fail_futex(true))) -- return -EFAULT; -- -- err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); -- /* -- * If write access is not required (eg. FUTEX_WAIT), try -- * and get read-only access. 
-- */ -- if (err == -EFAULT && rw == FUTEX_READ) { -- err = get_user_pages_fast(address, 1, 0, &page); -- ro = 1; -- } -- if (err < 0) -- return err; -- else -- err = 0; -- -- /* -- * The treatment of mapping from this point on is critical. The page -- * lock protects many things but in this context the page lock -- * stabilizes mapping, prevents inode freeing in the shared -- * file-backed region case and guards against movement to swap cache. -- * -- * Strictly speaking the page lock is not needed in all cases being -- * considered here and page lock forces unnecessarily serialization -- * From this point on, mapping will be re-verified if necessary and -- * page lock will be acquired only if it is unavoidable -- * -- * Mapping checks require the head page for any compound page so the -- * head page and mapping is looked up now. For anonymous pages, it -- * does not matter if the page splits in the future as the key is -- * based on the address. For filesystem-backed pages, the tail is -- * required as the index of the page determines the key. For -- * base pages, there is no tail page and tail == page. -- */ -- tail = page; -- page = compound_head(page); -- mapping = READ_ONCE(page->mapping); -- -- /* -- * If page->mapping is NULL, then it cannot be a PageAnon -- * page; but it might be the ZERO_PAGE or in the gate area or -- * in a special mapping (all cases which we are happy to fail); -- * or it may have been a good file page when get_user_pages_fast -- * found it, but truncated or holepunched or subjected to -- * invalidate_complete_page2 before we got the page lock (also -- * cases which we are happy to fail). And we hold a reference, -- * so refcount care in invalidate_complete_page's remove_mapping -- * prevents drop_caches from setting mapping to NULL beneath us. 
-- * -- * The case we do have to guard against is when memory pressure made -- * shmem_writepage move it from filecache to swapcache beneath us: -- * an unlikely race, but we do need to retry for page->mapping. -- */ -- if (unlikely(!mapping)) { -- int shmem_swizzled; -- -- /* -- * Page lock is required to identify which special case above -- * applies. If this is really a shmem page then the page lock -- * will prevent unexpected transitions. -- */ -- lock_page(page); -- shmem_swizzled = PageSwapCache(page) || page->mapping; -- unlock_page(page); -- put_page(page); -- -- if (shmem_swizzled) -- goto again; -- -- return -EFAULT; -- } -- -- /* -- * Private mappings are handled in a simple way. -- * -- * If the futex key is stored on an anonymous page, then the associated -- * object is the mm which is implicitly pinned by the calling process. -- * -- * NOTE: When userspace waits on a MAP_SHARED mapping, even if -- * it's a read-only handle, it's expected that futexes attach to -- * the object not the particular process. -- */ -- if (PageAnon(page)) { -- /* -- * A RO anonymous page will never change and thus doesn't make -- * sense for futex operations. -- */ -- if (unlikely(should_fail_futex(true)) || ro) { -- err = -EFAULT; -- goto out; -- } -- -- key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ -- key->private.mm = mm; -- key->private.address = address; -- -- } else { -- struct inode *inode; -- -- /* -- * The associated futex object in this case is the inode and -- * the page->mapping must be traversed. Ordinarily this should -- * be stabilised under page lock but it's not strictly -- * necessary in this case as we just want to pin the inode, not -- * update the radix tree or anything like that. -- * -- * The RCU read lock is taken as the inode is finally freed -- * under RCU. If the mapping still matches expectations then the -- * mapping->host can be safely accessed as being a valid inode. 
-- */ -- rcu_read_lock(); -- -- if (READ_ONCE(page->mapping) != mapping) { -- rcu_read_unlock(); -- put_page(page); -- -- goto again; -- } -- -- inode = READ_ONCE(mapping->host); -- if (!inode) { -- rcu_read_unlock(); -- put_page(page); -- -- goto again; -- } -- -- key->both.offset |= FUT_OFF_INODE; /* inode-based key */ -- key->shared.i_seq = get_inode_sequence_number(inode); -- key->shared.pgoff = page_to_pgoff(tail); -- rcu_read_unlock(); -- } -- --out: -- put_page(page); -- return err; --} -- --/** -- * fault_in_user_writeable() - Fault in user address and verify RW access -- * @uaddr: pointer to faulting user space address -- * -- * Slow path to fixup the fault we just took in the atomic write -- * access to @uaddr. -- * -- * We have no generic implementation of a non-destructive write to the -- * user address. We know that we faulted in the atomic pagefault -- * disabled section so we can as well avoid the #PF overhead by -- * calling get_user_pages() right away. -- */ --static int fault_in_user_writeable(u32 __user *uaddr) --{ -- struct mm_struct *mm = current->mm; -- int ret; -- -- mmap_read_lock(mm); -- ret = fixup_user_fault(mm, (unsigned long)uaddr, -- FAULT_FLAG_WRITE, NULL); -- mmap_read_unlock(mm); -- -- return ret < 0 ? ret : 0; --} -- --/** -- * futex_top_waiter() - Return the highest priority waiter on a futex -- * @hb: the hash bucket the futex_q's reside in -- * @key: the futex key (to distinguish it from other futex futex_q's) -- * -- * Must be called with the hb lock held. 
-- */ --static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, -- union futex_key *key) --{ -- struct futex_q *this; -- -- plist_for_each_entry(this, &hb->chain, list) { -- if (match_futex(&this->key, key)) -- return this; -- } -- return NULL; --} -- --static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, -- u32 uval, u32 newval) --{ -- int ret; -- -- pagefault_disable(); -- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); -- pagefault_enable(); -- -- return ret; --} -- --static int get_futex_value_locked(u32 *dest, u32 __user *from) --{ -- int ret; -- -- pagefault_disable(); -- ret = __get_user(*dest, from); -- pagefault_enable(); -- -- return ret ? -EFAULT : 0; --} -- -- --/* -- * PI code: -- */ --static int refill_pi_state_cache(void) --{ -- struct futex_pi_state *pi_state; -- -- if (likely(current->pi_state_cache)) -- return 0; -- -- pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); -- -- if (!pi_state) -- return -ENOMEM; -- -- INIT_LIST_HEAD(&pi_state->list); -- /* pi_mutex gets initialized later */ -- pi_state->owner = NULL; -- refcount_set(&pi_state->refcount, 1); -- pi_state->key = FUTEX_KEY_INIT; -- -- current->pi_state_cache = pi_state; -- -- return 0; --} -- --static struct futex_pi_state *alloc_pi_state(void) --{ -- struct futex_pi_state *pi_state = current->pi_state_cache; -- -- WARN_ON(!pi_state); -- current->pi_state_cache = NULL; -- -- return pi_state; --} -- --static void pi_state_update_owner(struct futex_pi_state *pi_state, -- struct task_struct *new_owner) --{ -- struct task_struct *old_owner = pi_state->owner; -- -- lockdep_assert_held(&pi_state->pi_mutex.wait_lock); -- -- if (old_owner) { -- raw_spin_lock(&old_owner->pi_lock); -- WARN_ON(list_empty(&pi_state->list)); -- list_del_init(&pi_state->list); -- raw_spin_unlock(&old_owner->pi_lock); -- } -- -- if (new_owner) { -- raw_spin_lock(&new_owner->pi_lock); -- WARN_ON(!list_empty(&pi_state->list)); -- list_add(&pi_state->list, 
&new_owner->pi_state_list); -- pi_state->owner = new_owner; -- raw_spin_unlock(&new_owner->pi_lock); -- } --} -- --static void get_pi_state(struct futex_pi_state *pi_state) --{ -- WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); --} -- --/* -- * Drops a reference to the pi_state object and frees or caches it -- * when the last reference is gone. -- */ --static void put_pi_state(struct futex_pi_state *pi_state) --{ -- if (!pi_state) -- return; -- -- if (!refcount_dec_and_test(&pi_state->refcount)) -- return; -- -- /* -- * If pi_state->owner is NULL, the owner is most probably dying -- * and has cleaned up the pi_state already -- */ -- if (pi_state->owner) { -- unsigned long flags; -- -- raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); -- pi_state_update_owner(pi_state, NULL); -- rt_mutex_proxy_unlock(&pi_state->pi_mutex); -- raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); -- } -- -- if (current->pi_state_cache) { -- kfree(pi_state); -- } else { -- /* -- * pi_state->list is already empty. -- * clear pi_state->owner. -- * refcount is at 0 - put it back to 1. -- */ -- pi_state->owner = NULL; -- refcount_set(&pi_state->refcount, 1); -- current->pi_state_cache = pi_state; -- } --} -- --#ifdef CONFIG_FUTEX_PI -- --/* -- * This task is holding PI mutexes at exit time => bad. -- * Kernel cleans up PI-state, but userspace is likely hosed. -- * (Robust-futex cleanup is separate and might save the day for userspace.) 
-- */ --static void exit_pi_state_list(struct task_struct *curr) --{ -- struct list_head *next, *head = &curr->pi_state_list; -- struct futex_pi_state *pi_state; -- struct futex_hash_bucket *hb; -- union futex_key key = FUTEX_KEY_INIT; -- -- if (!futex_cmpxchg_enabled) -- return; -- /* -- * We are a ZOMBIE and nobody can enqueue itself on -- * pi_state_list anymore, but we have to be careful -- * versus waiters unqueueing themselves: -- */ -- raw_spin_lock_irq(&curr->pi_lock); -- while (!list_empty(head)) { -- next = head->next; -- pi_state = list_entry(next, struct futex_pi_state, list); -- key = pi_state->key; -- hb = hash_futex(&key); -- -- /* -- * We can race against put_pi_state() removing itself from the -- * list (a waiter going away). put_pi_state() will first -- * decrement the reference count and then modify the list, so -- * its possible to see the list entry but fail this reference -- * acquire. -- * -- * In that case; drop the locks to let put_pi_state() make -- * progress and retry the loop. 
-- */ -- if (!refcount_inc_not_zero(&pi_state->refcount)) { -- raw_spin_unlock_irq(&curr->pi_lock); -- cpu_relax(); -- raw_spin_lock_irq(&curr->pi_lock); -- continue; -- } -- raw_spin_unlock_irq(&curr->pi_lock); -- -- spin_lock(&hb->lock); -- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -- raw_spin_lock(&curr->pi_lock); -- /* -- * We dropped the pi-lock, so re-check whether this -- * task still owns the PI-state: -- */ -- if (head->next != next) { -- /* retain curr->pi_lock for the loop invariant */ -- raw_spin_unlock(&pi_state->pi_mutex.wait_lock); -- spin_unlock(&hb->lock); -- put_pi_state(pi_state); -- continue; -- } -- -- WARN_ON(pi_state->owner != curr); -- WARN_ON(list_empty(&pi_state->list)); -- list_del_init(&pi_state->list); -- pi_state->owner = NULL; -- -- raw_spin_unlock(&curr->pi_lock); -- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -- spin_unlock(&hb->lock); -- -- rt_mutex_futex_unlock(&pi_state->pi_mutex); -- put_pi_state(pi_state); -- -- raw_spin_lock_irq(&curr->pi_lock); -- } -- raw_spin_unlock_irq(&curr->pi_lock); --} --#else --static inline void exit_pi_state_list(struct task_struct *curr) { } --#endif -- --/* -- * We need to check the following states: -- * -- * Waiter | pi_state | pi->owner | uTID | uODIED | ? -- * -- * [1] NULL | --- | --- | 0 | 0/1 | Valid -- * [2] NULL | --- | --- | >0 | 0/1 | Valid -- * -- * [3] Found | NULL | -- | Any | 0/1 | Invalid -- * -- * [4] Found | Found | NULL | 0 | 1 | Valid -- * [5] Found | Found | NULL | >0 | 1 | Invalid -- * -- * [6] Found | Found | task | 0 | 1 | Valid -- * -- * [7] Found | Found | NULL | Any | 0 | Invalid -- * -- * [8] Found | Found | task | ==taskTID | 0/1 | Valid -- * [9] Found | Found | task | 0 | 0 | Invalid -- * [10] Found | Found | task | !=taskTID | 0/1 | Invalid -- * -- * [1] Indicates that the kernel can acquire the futex atomically. We -- * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. -- * -- * [2] Valid, if TID does not belong to a kernel thread. 
If no matching -- * thread is found then it indicates that the owner TID has died. -- * -- * [3] Invalid. The waiter is queued on a non PI futex -- * -- * [4] Valid state after exit_robust_list(), which sets the user space -- * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. -- * -- * [5] The user space value got manipulated between exit_robust_list() -- * and exit_pi_state_list() -- * -- * [6] Valid state after exit_pi_state_list() which sets the new owner in -- * the pi_state but cannot access the user space value. -- * -- * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. -- * -- * [8] Owner and user space value match -- * -- * [9] There is no transient state which sets the user space TID to 0 -- * except exit_robust_list(), but this is indicated by the -- * FUTEX_OWNER_DIED bit. See [4] -- * -- * [10] There is no transient state which leaves owner and user space -- * TID out of sync. Except one error case where the kernel is denied -- * write access to the user address, see fixup_pi_state_owner(). -- * -- * -- * Serialization and lifetime rules: -- * -- * hb->lock: -- * -- * hb -> futex_q, relation -- * futex_q -> pi_state, relation -- * -- * (cannot be raw because hb can contain arbitrary amount -- * of futex_q's) -- * -- * pi_mutex->wait_lock: -- * -- * {uval, pi_state} -- * -- * (and pi_mutex 'obviously') -- * -- * p->pi_lock: -- * -- * p->pi_state_list -> pi_state->list, relation -- * pi_mutex->owner -> pi_state->owner, relation -- * -- * pi_state->refcount: -- * -- * pi_state lifetime -- * -- * -- * Lock order: -- * -- * hb->lock -- * pi_mutex->wait_lock -- * p->pi_lock -- * -- */ -- --/* -- * Validate that the existing waiter has a pi_state and sanity check -- * the pi_state against the user space value. If correct, attach to -- * it. 
-- */ --static int attach_to_pi_state(u32 __user *uaddr, u32 uval, -- struct futex_pi_state *pi_state, -- struct futex_pi_state **ps) --{ -- pid_t pid = uval & FUTEX_TID_MASK; -- u32 uval2; -- int ret; -- -- /* -- * Userspace might have messed up non-PI and PI futexes [3] -- */ -- if (unlikely(!pi_state)) -- return -EINVAL; -- -- /* -- * We get here with hb->lock held, and having found a -- * futex_top_waiter(). This means that futex_lock_pi() of said futex_q -- * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), -- * which in turn means that futex_lock_pi() still has a reference on -- * our pi_state. -- * -- * The waiter holding a reference on @pi_state also protects against -- * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() -- * and futex_wait_requeue_pi() as it cannot go to 0 and consequently -- * free pi_state before we can take a reference ourselves. -- */ -- WARN_ON(!refcount_read(&pi_state->refcount)); -- -- /* -- * Now that we have a pi_state, we can acquire wait_lock -- * and do the state validation. -- */ -- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -- -- /* -- * Since {uval, pi_state} is serialized by wait_lock, and our current -- * uval was read without holding it, it can have changed. Verify it -- * still is what we expect it to be, otherwise retry the entire -- * operation. -- */ -- if (get_futex_value_locked(&uval2, uaddr)) -- goto out_efault; -- -- if (uval != uval2) -- goto out_eagain; -- -- /* -- * Handle the owner died case: -- */ -- if (uval & FUTEX_OWNER_DIED) { -- /* -- * exit_pi_state_list sets owner to NULL and wakes the -- * topmost waiter. The task which acquires the -- * pi_state->rt_mutex will fixup owner. -- */ -- if (!pi_state->owner) { -- /* -- * No pi state owner, but the user space TID -- * is not 0. Inconsistent state. [5] -- */ -- if (pid) -- goto out_einval; -- /* -- * Take a ref on the state and return success. 
[4] -- */ -- goto out_attach; -- } -- -- /* -- * If TID is 0, then either the dying owner has not -- * yet executed exit_pi_state_list() or some waiter -- * acquired the rtmutex in the pi state, but did not -- * yet fixup the TID in user space. -- * -- * Take a ref on the state and return success. [6] -- */ -- if (!pid) -- goto out_attach; -- } else { -- /* -- * If the owner died bit is not set, then the pi_state -- * must have an owner. [7] -- */ -- if (!pi_state->owner) -- goto out_einval; -- } -- -- /* -- * Bail out if user space manipulated the futex value. If pi -- * state exists then the owner TID must be the same as the -- * user space TID. [9/10] -- */ -- if (pid != task_pid_vnr(pi_state->owner)) -- goto out_einval; -- --out_attach: -- get_pi_state(pi_state); -- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -- *ps = pi_state; -- return 0; -- --out_einval: -- ret = -EINVAL; -- goto out_error; -- --out_eagain: -- ret = -EAGAIN; -- goto out_error; -- --out_efault: -- ret = -EFAULT; -- goto out_error; -- --out_error: -- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -- return ret; --} -- --/** -- * wait_for_owner_exiting - Block until the owner has exited -- * @ret: owner's current futex lock status -- * @exiting: Pointer to the exiting task -- * -- * Caller must hold a refcount on @exiting. -- */ --static void wait_for_owner_exiting(int ret, struct task_struct *exiting) --{ -- if (ret != -EBUSY) { -- WARN_ON_ONCE(exiting); -- return; -- } -- -- if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) -- return; -- -- mutex_lock(&exiting->futex_exit_mutex); -- /* -- * No point in doing state checking here. If the waiter got here -- * while the task was in exec()->exec_futex_release() then it can -- * have any FUTEX_STATE_* value when the waiter has acquired the -- * mutex. OK, if running, EXITING or DEAD if it reached exit() -- * already. Highly unlikely and not a problem. Just one more round -- * through the futex maze. 
-- */ -- mutex_unlock(&exiting->futex_exit_mutex); -- -- put_task_struct(exiting); --} -- --static int handle_exit_race(u32 __user *uaddr, u32 uval, -- struct task_struct *tsk) --{ -- u32 uval2; -- -- /* -- * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the -- * caller that the alleged owner is busy. -- */ -- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) -- return -EBUSY; -- -- /* -- * Reread the user space value to handle the following situation: -- * -- * CPU0 CPU1 -- * -- * sys_exit() sys_futex() -- * do_exit() futex_lock_pi() -- * futex_lock_pi_atomic() -- * exit_signals(tsk) No waiters: -- * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID -- * mm_release(tsk) Set waiter bit -- * exit_robust_list(tsk) { *uaddr = 0x80000PID; -- * Set owner died attach_to_pi_owner() { -- * *uaddr = 0xC0000000; tsk = get_task(PID); -- * } if (!tsk->flags & PF_EXITING) { -- * ... attach(); -- * tsk->futex_state = } else { -- * FUTEX_STATE_DEAD; if (tsk->futex_state != -- * FUTEX_STATE_DEAD) -- * return -EAGAIN; -- * return -ESRCH; <--- FAIL -- * } -- * -- * Returning ESRCH unconditionally is wrong here because the -- * user space value has been changed by the exiting task. -- * -- * The same logic applies to the case where the exiting task is -- * already gone. -- */ -- if (get_futex_value_locked(&uval2, uaddr)) -- return -EFAULT; -- -- /* If the user space value has changed, try again. */ -- if (uval2 != uval) -- return -EAGAIN; -- -- /* -- * The exiting task did not have a robust list, the robust list was -- * corrupted or the user space value in *uaddr is simply bogus. -- * Give up and tell user space. -- */ -- return -ESRCH; --} -- --static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, -- struct futex_pi_state **ps) --{ -- /* -- * No existing pi state. First waiter. [2] -- * -- * This creates pi_state, we have hb->lock held, this means nothing can -- * observe this state, wait_lock is irrelevant. 
-- */ -- struct futex_pi_state *pi_state = alloc_pi_state(); -- -- /* -- * Initialize the pi_mutex in locked state and make @p -- * the owner of it: -- */ -- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); -- -- /* Store the key for possible exit cleanups: */ -- pi_state->key = *key; -- -- WARN_ON(!list_empty(&pi_state->list)); -- list_add(&pi_state->list, &p->pi_state_list); -- /* -- * Assignment without holding pi_state->pi_mutex.wait_lock is safe -- * because there is no concurrency as the object is not published yet. -- */ -- pi_state->owner = p; -- -- *ps = pi_state; --} --/* -- * Lookup the task for the TID provided from user space and attach to -- * it after doing proper sanity checks. -- */ --static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, -- struct futex_pi_state **ps, -- struct task_struct **exiting) --{ -- pid_t pid = uval & FUTEX_TID_MASK; -- struct task_struct *p; -- -- /* -- * We are the first waiter - try to look up the real owner and attach -- * the new pi_state to it, but bail out when TID = 0 [1] -- * -- * The !pid check is paranoid. None of the call sites should end up -- * with pid == 0, but better safe than sorry. Let the caller retry -- */ -- if (!pid) -- return -EAGAIN; -- p = find_get_task_by_vpid(pid); -- if (!p) -- return handle_exit_race(uaddr, uval, NULL); -- -- if (unlikely(p->flags & PF_KTHREAD)) { -- put_task_struct(p); -- return -EPERM; -- } -- -- /* -- * We need to look at the task state to figure out, whether the -- * task is exiting. To protect against the change of the task state -- * in futex_exit_release(), we do this protected by p->pi_lock: -- */ -- raw_spin_lock_irq(&p->pi_lock); -- if (unlikely(p->futex_state != FUTEX_STATE_OK)) { -- /* -- * The task is on the way out. 
When the futex state is -- * FUTEX_STATE_DEAD, we know that the task has finished -- * the cleanup: -- */ -- int ret = handle_exit_race(uaddr, uval, p); -- -- raw_spin_unlock_irq(&p->pi_lock); -- /* -- * If the owner task is between FUTEX_STATE_EXITING and -- * FUTEX_STATE_DEAD then store the task pointer and keep -- * the reference on the task struct. The calling code will -- * drop all locks, wait for the task to reach -- * FUTEX_STATE_DEAD and then drop the refcount. This is -- * required to prevent a live lock when the current task -- * preempted the exiting task between the two states. -- */ -- if (ret == -EBUSY) -- *exiting = p; -- else -- put_task_struct(p); -- return ret; -- } -- -- __attach_to_pi_owner(p, key, ps); -- raw_spin_unlock_irq(&p->pi_lock); -- -- put_task_struct(p); -- -- return 0; --} -- --static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) --{ -- int err; -- u32 curval; -- -- if (unlikely(should_fail_futex(true))) -- return -EFAULT; -- -- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); -- if (unlikely(err)) -- return err; -- -- /* If user space value changed, let the caller retry */ -- return curval != uval ? -EAGAIN : 0; --} -- --/** -- * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex -- * @uaddr: the pi futex user address -- * @hb: the pi futex hash bucket -- * @key: the futex key associated with uaddr and hb -- * @ps: the pi_state pointer where we store the result of the -- * lookup -- * @task: the task to perform the atomic lock work for. This will -- * be "current" except in the case of requeue pi. -- * @exiting: Pointer to store the task pointer of the owner task -- * which is in the middle of exiting -- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) -- * -- * Return: -- * - 0 - ready to wait; -- * - 1 - acquired the lock; -- * - <0 - error -- * -- * The hb->lock must be held by the caller. -- * -- * @exiting is only set when the return value is -EBUSY. 
If so, this holds -- * a refcount on the exiting task on return and the caller needs to drop it -- * after waiting for the exit to complete. -- */ --static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, -- union futex_key *key, -- struct futex_pi_state **ps, -- struct task_struct *task, -- struct task_struct **exiting, -- int set_waiters) --{ -- u32 uval, newval, vpid = task_pid_vnr(task); -- struct futex_q *top_waiter; -- int ret; -- -- /* -- * Read the user space value first so we can validate a few -- * things before proceeding further. -- */ -- if (get_futex_value_locked(&uval, uaddr)) -- return -EFAULT; -- -- if (unlikely(should_fail_futex(true))) -- return -EFAULT; -- -- /* -- * Detect deadlocks. -- */ -- if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) -- return -EDEADLK; -- -- if ((unlikely(should_fail_futex(true)))) -- return -EDEADLK; -- -- /* -- * Lookup existing state first. If it exists, try to attach to -- * its pi_state. -- */ -- top_waiter = futex_top_waiter(hb, key); -- if (top_waiter) -- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); -- -- /* -- * No waiter and user TID is 0. We are here because the -- * waiters or the owner died bit is set or called from -- * requeue_cmp_pi or for whatever reason something took the -- * syscall. -- */ -- if (!(uval & FUTEX_TID_MASK)) { -- /* -- * We take over the futex. No other waiters and the user space -- * TID is 0. We preserve the owner died bit. -- */ -- newval = uval & FUTEX_OWNER_DIED; -- newval |= vpid; -- -- /* The futex requeue_pi code can enforce the waiters bit */ -- if (set_waiters) -- newval |= FUTEX_WAITERS; -- -- ret = lock_pi_update_atomic(uaddr, uval, newval); -- if (ret) -- return ret; -- -- /* -- * If the waiter bit was requested the caller also needs PI -- * state attached to the new owner of the user space futex. 
-- * -- * @task is guaranteed to be alive and it cannot be exiting -- * because it is either sleeping or waiting in -- * futex_requeue_pi_wakeup_sync(). -- * -- * No need to do the full attach_to_pi_owner() exercise -- * because @task is known and valid. -- */ -- if (set_waiters) { -- raw_spin_lock_irq(&task->pi_lock); -- __attach_to_pi_owner(task, key, ps); -- raw_spin_unlock_irq(&task->pi_lock); -- } -- return 1; -- } -- -- /* -- * First waiter. Set the waiters bit before attaching ourself to -- * the owner. If owner tries to unlock, it will be forced into -- * the kernel and blocked on hb->lock. -- */ -- newval = uval | FUTEX_WAITERS; -- ret = lock_pi_update_atomic(uaddr, uval, newval); -- if (ret) -- return ret; -- /* -- * If the update of the user space value succeeded, we try to -- * attach to the owner. If that fails, no harm done, we only -- * set the FUTEX_WAITERS bit in the user space variable. -- */ -- return attach_to_pi_owner(uaddr, newval, key, ps, exiting); --} -- --/** -- * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket -- * @q: The futex_q to unqueue -- * -- * The q->lock_ptr must not be NULL and must be held by the caller. -- */ --static void __unqueue_futex(struct futex_q *q) --{ -- struct futex_hash_bucket *hb; -- -- if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) -- return; -- lockdep_assert_held(q->lock_ptr); -- -- hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); -- plist_del(&q->list, &hb->chain); -- hb_waiters_dec(hb); --} -- --/* -- * The hash bucket lock must be held when this is called. -- * Afterwards, the futex_q must not be accessed. Callers -- * must ensure to later call wake_up_q() for the actual -- * wakeups to occur. 
-- */ --static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) --{ -- struct task_struct *p = q->task; -- -- if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) -- return; -- -- get_task_struct(p); -- __unqueue_futex(q); -- /* -- * The waiting task can free the futex_q as soon as q->lock_ptr = NULL -- * is written, without taking any locks. This is possible in the event -- * of a spurious wakeup, for example. A memory barrier is required here -- * to prevent the following store to lock_ptr from getting ahead of the -- * plist_del in __unqueue_futex(). -- */ -- smp_store_release(&q->lock_ptr, NULL); -- -- /* -- * Queue the task for later wakeup for after we've released -- * the hb->lock. -- */ -- wake_q_add_safe(wake_q, p); --} -- --/* -- * Caller must hold a reference on @pi_state. -- */ --static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) --{ -- struct rt_mutex_waiter *top_waiter; -- struct task_struct *new_owner; -- bool postunlock = false; -- DEFINE_RT_WAKE_Q(wqh); -- u32 curval, newval; -- int ret = 0; -- -- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); -- if (WARN_ON_ONCE(!top_waiter)) { -- /* -- * As per the comment in futex_unlock_pi() this should not happen. -- * -- * When this happens, give up our locks and try again, giving -- * the futex_lock_pi() instance time to complete, either by -- * waiting on the rtmutex or removing itself from the futex -- * queue. -- */ -- ret = -EAGAIN; -- goto out_unlock; -- } -- -- new_owner = top_waiter->task; -- -- /* -- * We pass it to the next owner. The WAITERS bit is always kept -- * enabled while there is PI state around. We cleanup the owner -- * died bit, because we are the owner. 
-- */ -- newval = FUTEX_WAITERS | task_pid_vnr(new_owner); -- -- if (unlikely(should_fail_futex(true))) { -- ret = -EFAULT; -- goto out_unlock; -- } -- -- ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); -- if (!ret && (curval != uval)) { -- /* -- * If a unconditional UNLOCK_PI operation (user space did not -- * try the TID->0 transition) raced with a waiter setting the -- * FUTEX_WAITERS flag between get_user() and locking the hash -- * bucket lock, retry the operation. -- */ -- if ((FUTEX_TID_MASK & curval) == uval) -- ret = -EAGAIN; -- else -- ret = -EINVAL; -- } -- -- if (!ret) { -- /* -- * This is a point of no return; once we modified the uval -- * there is no going back and subsequent operations must -- * not fail. -- */ -- pi_state_update_owner(pi_state, new_owner); -- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); -- } -- --out_unlock: -- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -- -- if (postunlock) -- rt_mutex_postunlock(&wqh); -- -- return ret; --} -- --/* -- * Express the locking dependencies for lockdep: -- */ --static inline void --double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) --{ -- if (hb1 <= hb2) { -- spin_lock(&hb1->lock); -- if (hb1 < hb2) -- spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); -- } else { /* hb1 > hb2 */ -- spin_lock(&hb2->lock); -- spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); -- } --} -- --static inline void --double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) --{ -- spin_unlock(&hb1->lock); -- if (hb1 != hb2) -- spin_unlock(&hb2->lock); --} -- --/* -- * Wake up waiters matching bitset queued on this futex (uaddr). 
-- */ --static int --futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) --{ -- struct futex_hash_bucket *hb; -- struct futex_q *this, *next; -- union futex_key key = FUTEX_KEY_INIT; -- int ret; -- DEFINE_WAKE_Q(wake_q); -- -- if (!bitset) -- return -EINVAL; -- -- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); -- if (unlikely(ret != 0)) -- return ret; -- -- hb = hash_futex(&key); -- -- /* Make sure we really have tasks to wakeup */ -- if (!hb_waiters_pending(hb)) -- return ret; -- -- spin_lock(&hb->lock); -- -- plist_for_each_entry_safe(this, next, &hb->chain, list) { -- if (match_futex (&this->key, &key)) { -- if (this->pi_state || this->rt_waiter) { -- ret = -EINVAL; -- break; -- } -- -- /* Check if one of the bits is set in both bitsets */ -- if (!(this->bitset & bitset)) -- continue; -- -- mark_wake_futex(&wake_q, this); -- if (++ret >= nr_wake) -- break; -- } -- } -- -- spin_unlock(&hb->lock); -- wake_up_q(&wake_q); -- return ret; --} -- --static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) --{ -- unsigned int op = (encoded_op & 0x70000000) >> 28; -- unsigned int cmp = (encoded_op & 0x0f000000) >> 24; -- int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); -- int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); -- int oldval, ret; -- -- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { -- if (oparg < 0 || oparg > 31) { -- char comm[sizeof(current->comm)]; -- /* -- * kill this print and return -EINVAL when userspace -- * is sane again -- */ -- pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", -- get_task_comm(comm, current), oparg); -- oparg &= 31; -- } -- oparg = 1 << oparg; -- } -- -- pagefault_disable(); -- ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); -- pagefault_enable(); -- if (ret) -- return ret; -- -- switch (cmp) { -- case FUTEX_OP_CMP_EQ: -- return oldval == cmparg; -- case FUTEX_OP_CMP_NE: -- return oldval != cmparg; 
-- case FUTEX_OP_CMP_LT: -- return oldval < cmparg; -- case FUTEX_OP_CMP_GE: -- return oldval >= cmparg; -- case FUTEX_OP_CMP_LE: -- return oldval <= cmparg; -- case FUTEX_OP_CMP_GT: -- return oldval > cmparg; -- default: -- return -ENOSYS; -- } --} -- --/* -- * Wake up all waiters hashed on the physical page that is mapped -- * to this virtual address: -- */ --static int --futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, -- int nr_wake, int nr_wake2, int op) --{ -- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; -- struct futex_hash_bucket *hb1, *hb2; -- struct futex_q *this, *next; -- int ret, op_ret; -- DEFINE_WAKE_Q(wake_q); -- --retry: -- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); -- if (unlikely(ret != 0)) -- return ret; -- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); -- if (unlikely(ret != 0)) -- return ret; -- -- hb1 = hash_futex(&key1); -- hb2 = hash_futex(&key2); -- --retry_private: -- double_lock_hb(hb1, hb2); -- op_ret = futex_atomic_op_inuser(op, uaddr2); -- if (unlikely(op_ret < 0)) { -- double_unlock_hb(hb1, hb2); -- -- if (!IS_ENABLED(CONFIG_MMU) || -- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { -- /* -- * we don't get EFAULT from MMU faults if we don't have -- * an MMU, but we might get them from range checking -- */ -- ret = op_ret; -- return ret; -- } -- -- if (op_ret == -EFAULT) { -- ret = fault_in_user_writeable(uaddr2); -- if (ret) -- return ret; -- } -- -- cond_resched(); -- if (!(flags & FLAGS_SHARED)) -- goto retry_private; -- goto retry; -- } -- -- plist_for_each_entry_safe(this, next, &hb1->chain, list) { -- if (match_futex (&this->key, &key1)) { -- if (this->pi_state || this->rt_waiter) { -- ret = -EINVAL; -- goto out_unlock; -- } -- mark_wake_futex(&wake_q, this); -- if (++ret >= nr_wake) -- break; -- } -- } -- -- if (op_ret > 0) { -- op_ret = 0; -- plist_for_each_entry_safe(this, next, &hb2->chain, list) { -- if (match_futex (&this->key, 
&key2)) { -- if (this->pi_state || this->rt_waiter) { -- ret = -EINVAL; -- goto out_unlock; -- } -- mark_wake_futex(&wake_q, this); -- if (++op_ret >= nr_wake2) -- break; -- } -- } -- ret += op_ret; -- } -- --out_unlock: -- double_unlock_hb(hb1, hb2); -- wake_up_q(&wake_q); -- return ret; --} -- --/** -- * requeue_futex() - Requeue a futex_q from one hb to another -- * @q: the futex_q to requeue -- * @hb1: the source hash_bucket -- * @hb2: the target hash_bucket -- * @key2: the new key for the requeued futex_q -- */ --static inline --void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, -- struct futex_hash_bucket *hb2, union futex_key *key2) --{ -- -- /* -- * If key1 and key2 hash to the same bucket, no need to -- * requeue. -- */ -- if (likely(&hb1->chain != &hb2->chain)) { -- plist_del(&q->list, &hb1->chain); -- hb_waiters_dec(hb1); -- hb_waiters_inc(hb2); -- plist_add(&q->list, &hb2->chain); -- q->lock_ptr = &hb2->lock; -- } -- q->key = *key2; --} -- --static inline bool futex_requeue_pi_prepare(struct futex_q *q, -- struct futex_pi_state *pi_state) --{ -- int old, new; -- -- /* -- * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has -- * already set Q_REQUEUE_PI_IGNORE to signal that requeue should -- * ignore the waiter. -- */ -- old = atomic_read_acquire(&q->requeue_state); -- do { -- if (old == Q_REQUEUE_PI_IGNORE) -- return false; -- -- /* -- * futex_proxy_trylock_atomic() might have set it to -- * IN_PROGRESS and a interleaved early wake to WAIT. -- * -- * It was considered to have an extra state for that -- * trylock, but that would just add more conditionals -- * all over the place for a dubious value. 
-- */ -- if (old != Q_REQUEUE_PI_NONE) -- break; -- -- new = Q_REQUEUE_PI_IN_PROGRESS; -- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); -- -- q->pi_state = pi_state; -- return true; --} -- --static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) --{ -- int old, new; -- -- old = atomic_read_acquire(&q->requeue_state); -- do { -- if (old == Q_REQUEUE_PI_IGNORE) -- return; -- -- if (locked >= 0) { -- /* Requeue succeeded. Set DONE or LOCKED */ -- WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && -- old != Q_REQUEUE_PI_WAIT); -- new = Q_REQUEUE_PI_DONE + locked; -- } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { -- /* Deadlock, no early wakeup interleave */ -- new = Q_REQUEUE_PI_NONE; -- } else { -- /* Deadlock, early wakeup interleave. */ -- WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); -- new = Q_REQUEUE_PI_IGNORE; -- } -- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); -- --#ifdef CONFIG_PREEMPT_RT -- /* If the waiter interleaved with the requeue let it know */ -- if (unlikely(old == Q_REQUEUE_PI_WAIT)) -- rcuwait_wake_up(&q->requeue_wait); --#endif --} -- --static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) --{ -- int old, new; -- -- old = atomic_read_acquire(&q->requeue_state); -- do { -- /* Is requeue done already? */ -- if (old >= Q_REQUEUE_PI_DONE) -- return old; -- -- /* -- * If not done, then tell the requeue code to either ignore -- * the waiter or to wake it up once the requeue is done. 
-- */ -- new = Q_REQUEUE_PI_WAIT; -- if (old == Q_REQUEUE_PI_NONE) -- new = Q_REQUEUE_PI_IGNORE; -- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); -- -- /* If the requeue was in progress, wait for it to complete */ -- if (old == Q_REQUEUE_PI_IN_PROGRESS) { --#ifdef CONFIG_PREEMPT_RT -- rcuwait_wait_event(&q->requeue_wait, -- atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, -- TASK_UNINTERRUPTIBLE); --#else -- (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); --#endif -- } -- -- /* -- * Requeue is now either prohibited or complete. Reread state -- * because during the wait above it might have changed. Nothing -- * will modify q->requeue_state after this point. -- */ -- return atomic_read(&q->requeue_state); --} -- --/** -- * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue -- * @q: the futex_q -- * @key: the key of the requeue target futex -- * @hb: the hash_bucket of the requeue target futex -- * -- * During futex_requeue, with requeue_pi=1, it is possible to acquire the -- * target futex if it is uncontended or via a lock steal. -- * -- * 1) Set @q::key to the requeue target futex key so the waiter can detect -- * the wakeup on the right futex. -- * -- * 2) Dequeue @q from the hash bucket. -- * -- * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock -- * acquisition. -- * -- * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that -- * the waiter has to fixup the pi state. -- * -- * 5) Complete the requeue state so the waiter can make progress. After -- * this point the waiter task can return from the syscall immediately in -- * case that the pi state does not have to be fixed up. -- * -- * 6) Wake the waiter task. -- * -- * Must be called with both q->lock_ptr and hb->lock held. 
-- */ --static inline --void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, -- struct futex_hash_bucket *hb) --{ -- q->key = *key; -- -- __unqueue_futex(q); -- -- WARN_ON(!q->rt_waiter); -- q->rt_waiter = NULL; -- -- q->lock_ptr = &hb->lock; -- -- /* Signal locked state to the waiter */ -- futex_requeue_pi_complete(q, 1); -- wake_up_state(q->task, TASK_NORMAL); --} -- --/** -- * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter -- * @pifutex: the user address of the to futex -- * @hb1: the from futex hash bucket, must be locked by the caller -- * @hb2: the to futex hash bucket, must be locked by the caller -- * @key1: the from futex key -- * @key2: the to futex key -- * @ps: address to store the pi_state pointer -- * @exiting: Pointer to store the task pointer of the owner task -- * which is in the middle of exiting -- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) -- * -- * Try and get the lock on behalf of the top waiter if we can do it atomically. -- * Wake the top waiter if we succeed. If the caller specified set_waiters, -- * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. -- * hb1 and hb2 must be held by the caller. -- * -- * @exiting is only set when the return value is -EBUSY. If so, this holds -- * a refcount on the exiting task on return and the caller needs to drop it -- * after waiting for the exit to complete. 
-- * -- * Return: -- * - 0 - failed to acquire the lock atomically; -- * - >0 - acquired the lock, return value is vpid of the top_waiter -- * - <0 - error -- */ --static int --futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, -- struct futex_hash_bucket *hb2, union futex_key *key1, -- union futex_key *key2, struct futex_pi_state **ps, -- struct task_struct **exiting, int set_waiters) --{ -- struct futex_q *top_waiter = NULL; -- u32 curval; -- int ret; -- -- if (get_futex_value_locked(&curval, pifutex)) -- return -EFAULT; -- -- if (unlikely(should_fail_futex(true))) -- return -EFAULT; -- -- /* -- * Find the top_waiter and determine if there are additional waiters. -- * If the caller intends to requeue more than 1 waiter to pifutex, -- * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, -- * as we have means to handle the possible fault. If not, don't set -- * the bit unnecessarily as it will force the subsequent unlock to enter -- * the kernel. -- */ -- top_waiter = futex_top_waiter(hb1, key1); -- -- /* There are no waiters, nothing for us to do. */ -- if (!top_waiter) -- return 0; -- -- /* -- * Ensure that this is a waiter sitting in futex_wait_requeue_pi() -- * and waiting on the 'waitqueue' futex which is always !PI. -- */ -- if (!top_waiter->rt_waiter || top_waiter->pi_state) -- return -EINVAL; -- -- /* Ensure we requeue to the expected futex. */ -- if (!match_futex(top_waiter->requeue_pi_key, key2)) -- return -EINVAL; -- -- /* Ensure that this does not race against an early wakeup */ -- if (!futex_requeue_pi_prepare(top_waiter, NULL)) -- return -EAGAIN; -- -- /* -- * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit -- * in the contended case or if @set_waiters is true. -- * -- * In the contended case PI state is attached to the lock owner. If -- * the user space lock can be acquired then PI state is attached to -- * the new owner (@top_waiter->task) when @set_waiters is true. 
-- */ -- ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, -- exiting, set_waiters); -- if (ret == 1) { -- /* -- * Lock was acquired in user space and PI state was -- * attached to @top_waiter->task. That means state is fully -- * consistent and the waiter can return to user space -- * immediately after the wakeup. -- */ -- requeue_pi_wake_futex(top_waiter, key2, hb2); -- } else if (ret < 0) { -- /* Rewind top_waiter::requeue_state */ -- futex_requeue_pi_complete(top_waiter, ret); -- } else { -- /* -- * futex_lock_pi_atomic() did not acquire the user space -- * futex, but managed to establish the proxy lock and pi -- * state. top_waiter::requeue_state cannot be fixed up here -- * because the waiter is not enqueued on the rtmutex -- * yet. This is handled at the callsite depending on the -- * result of rt_mutex_start_proxy_lock() which is -- * guaranteed to be reached with this function returning 0. -- */ -- } -- return ret; --} -- --/** -- * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 -- * @uaddr1: source futex user address -- * @flags: futex flags (FLAGS_SHARED, etc.) -- * @uaddr2: target futex user address -- * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) -- * @nr_requeue: number of waiters to requeue (0-INT_MAX) -- * @cmpval: @uaddr1 expected value (or %NULL) -- * @requeue_pi: if we are attempting to requeue from a non-pi futex to a -- * pi futex (pi to pi requeue is not supported) -- * -- * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire -- * uaddr2 atomically on behalf of the top waiter. 
-- * -- * Return: -- * - >=0 - on success, the number of tasks requeued or woken; -- * - <0 - on error -- */ --static int futex_requeue(u32 __user *uaddr1, unsigned int flags, -- u32 __user *uaddr2, int nr_wake, int nr_requeue, -- u32 *cmpval, int requeue_pi) --{ -- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; -- int task_count = 0, ret; -- struct futex_pi_state *pi_state = NULL; -- struct futex_hash_bucket *hb1, *hb2; -- struct futex_q *this, *next; -- DEFINE_WAKE_Q(wake_q); -- -- if (nr_wake < 0 || nr_requeue < 0) -- return -EINVAL; -- -- /* -- * When PI not supported: return -ENOSYS if requeue_pi is true, -- * consequently the compiler knows requeue_pi is always false past -- * this point which will optimize away all the conditional code -- * further down. -- */ -- if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) -- return -ENOSYS; -- -- if (requeue_pi) { -- /* -- * Requeue PI only works on two distinct uaddrs. This -- * check is only valid for private futexes. See below. -- */ -- if (uaddr1 == uaddr2) -- return -EINVAL; -- -- /* -- * futex_requeue() allows the caller to define the number -- * of waiters to wake up via the @nr_wake argument. With -- * REQUEUE_PI, waking up more than one waiter is creating -- * more problems than it solves. Waking up a waiter makes -- * only sense if the PI futex @uaddr2 is uncontended as -- * this allows the requeue code to acquire the futex -- * @uaddr2 before waking the waiter. The waiter can then -- * return to user space without further action. A secondary -- * wakeup would just make the futex_wait_requeue_pi() -- * handling more complex, because that code would have to -- * look up pi_state and do more or less all the handling -- * which the requeue code has to do for the to be requeued -- * waiters. So restrict the number of waiters to wake to -- * one, and only wake it up when the PI futex is -- * uncontended. Otherwise requeue it and let the unlock of -- * the PI futex handle the wakeup. 
-- * -- * All REQUEUE_PI users, e.g. pthread_cond_signal() and -- * pthread_cond_broadcast() must use nr_wake=1. -- */ -- if (nr_wake != 1) -- return -EINVAL; -- -- /* -- * requeue_pi requires a pi_state, try to allocate it now -- * without any locks in case it fails. -- */ -- if (refill_pi_state_cache()) -- return -ENOMEM; -- } -- --retry: -- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); -- if (unlikely(ret != 0)) -- return ret; -- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, -- requeue_pi ? FUTEX_WRITE : FUTEX_READ); -- if (unlikely(ret != 0)) -- return ret; -- -- /* -- * The check above which compares uaddrs is not sufficient for -- * shared futexes. We need to compare the keys: -- */ -- if (requeue_pi && match_futex(&key1, &key2)) -- return -EINVAL; -- -- hb1 = hash_futex(&key1); -- hb2 = hash_futex(&key2); -- --retry_private: -- hb_waiters_inc(hb2); -- double_lock_hb(hb1, hb2); -- -- if (likely(cmpval != NULL)) { -- u32 curval; -- -- ret = get_futex_value_locked(&curval, uaddr1); -- -- if (unlikely(ret)) { -- double_unlock_hb(hb1, hb2); -- hb_waiters_dec(hb2); -- -- ret = get_user(curval, uaddr1); -- if (ret) -- return ret; -- -- if (!(flags & FLAGS_SHARED)) -- goto retry_private; -- -- goto retry; -- } -- if (curval != *cmpval) { -- ret = -EAGAIN; -- goto out_unlock; -- } -- } -- -- if (requeue_pi) { -- struct task_struct *exiting = NULL; -- -- /* -- * Attempt to acquire uaddr2 and wake the top waiter. If we -- * intend to requeue waiters, force setting the FUTEX_WAITERS -- * bit. We force this here where we are able to easily handle -- * faults rather in the requeue loop below. -- * -- * Updates topwaiter::requeue_state if a top waiter exists. -- */ -- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, -- &key2, &pi_state, -- &exiting, nr_requeue); -- -- /* -- * At this point the top_waiter has either taken uaddr2 or -- * is waiting on it. 
In both cases pi_state has been -- * established and an initial refcount on it. In case of an -- * error there's nothing. -- * -- * The top waiter's requeue_state is up to date: -- * -- * - If the lock was acquired atomically (ret == 1), then -- * the state is Q_REQUEUE_PI_LOCKED. -- * -- * The top waiter has been dequeued and woken up and can -- * return to user space immediately. The kernel/user -- * space state is consistent. In case that there must be -- * more waiters requeued the WAITERS bit in the user -- * space futex is set so the top waiter task has to go -- * into the syscall slowpath to unlock the futex. This -- * will block until this requeue operation has been -- * completed and the hash bucket locks have been -- * dropped. -- * -- * - If the trylock failed with an error (ret < 0) then -- * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing -- * happened", or Q_REQUEUE_PI_IGNORE when there was an -- * interleaved early wakeup. -- * -- * - If the trylock did not succeed (ret == 0) then the -- * state is either Q_REQUEUE_PI_IN_PROGRESS or -- * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. -- * This will be cleaned up in the loop below, which -- * cannot fail because futex_proxy_trylock_atomic() did -- * the same sanity checks for requeue_pi as the loop -- * below does. -- */ -- switch (ret) { -- case 0: -- /* We hold a reference on the pi state. */ -- break; -- -- case 1: -- /* -- * futex_proxy_trylock_atomic() acquired the user space -- * futex. Adjust task_count. -- */ -- task_count++; -- ret = 0; -- break; -- -- /* -- * If the above failed, then pi_state is NULL and -- * waiter::requeue_state is correct. -- */ -- case -EFAULT: -- double_unlock_hb(hb1, hb2); -- hb_waiters_dec(hb2); -- ret = fault_in_user_writeable(uaddr2); -- if (!ret) -- goto retry; -- return ret; -- case -EBUSY: -- case -EAGAIN: -- /* -- * Two reasons for this: -- * - EBUSY: Owner is exiting and we just wait for the -- * exit to complete. 
-- * - EAGAIN: The user space value changed. -- */ -- double_unlock_hb(hb1, hb2); -- hb_waiters_dec(hb2); -- /* -- * Handle the case where the owner is in the middle of -- * exiting. Wait for the exit to complete otherwise -- * this task might loop forever, aka. live lock. -- */ -- wait_for_owner_exiting(ret, exiting); -- cond_resched(); -- goto retry; -- default: -- goto out_unlock; -- } -- } -- -- plist_for_each_entry_safe(this, next, &hb1->chain, list) { -- if (task_count - nr_wake >= nr_requeue) -- break; -- -- if (!match_futex(&this->key, &key1)) -- continue; -- -- /* -- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always -- * be paired with each other and no other futex ops. -- * -- * We should never be requeueing a futex_q with a pi_state, -- * which is awaiting a futex_unlock_pi(). -- */ -- if ((requeue_pi && !this->rt_waiter) || -- (!requeue_pi && this->rt_waiter) || -- this->pi_state) { -- ret = -EINVAL; -- break; -- } -- -- /* Plain futexes just wake or requeue and are done */ -- if (!requeue_pi) { -- if (++task_count <= nr_wake) -- mark_wake_futex(&wake_q, this); -- else -- requeue_futex(this, hb1, hb2, &key2); -- continue; -- } -- -- /* Ensure we requeue to the expected futex for requeue_pi. */ -- if (!match_futex(this->requeue_pi_key, &key2)) { -- ret = -EINVAL; -- break; -- } -- -- /* -- * Requeue nr_requeue waiters and possibly one more in the case -- * of requeue_pi if we couldn't acquire the lock atomically. -- * -- * Prepare the waiter to take the rt_mutex. Take a refcount -- * on the pi_state and store the pointer in the futex_q -- * object of the waiter. -- */ -- get_pi_state(pi_state); -- -- /* Don't requeue when the waiter is already on the way out. */ -- if (!futex_requeue_pi_prepare(this, pi_state)) { -- /* -- * Early woken waiter signaled that it is on the -- * way out. Drop the pi_state reference and try the -- * next waiter. @this->pi_state is still NULL. 
-- */ -- put_pi_state(pi_state); -- continue; -- } -- -- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, -- this->rt_waiter, -- this->task); -- -- if (ret == 1) { -- /* -- * We got the lock. We do neither drop the refcount -- * on pi_state nor clear this->pi_state because the -- * waiter needs the pi_state for cleaning up the -- * user space value. It will drop the refcount -- * after doing so. this::requeue_state is updated -- * in the wakeup as well. -- */ -- requeue_pi_wake_futex(this, &key2, hb2); -- task_count++; -- } else if (!ret) { -- /* Waiter is queued, move it to hb2 */ -- requeue_futex(this, hb1, hb2, &key2); -- futex_requeue_pi_complete(this, 0); -- task_count++; -- } else { -- /* -- * rt_mutex_start_proxy_lock() detected a potential -- * deadlock when we tried to queue that waiter. -- * Drop the pi_state reference which we took above -- * and remove the pointer to the state from the -- * waiters futex_q object. -- */ -- this->pi_state = NULL; -- put_pi_state(pi_state); -- futex_requeue_pi_complete(this, ret); -- /* -- * We stop queueing more waiters and let user space -- * deal with the mess. -- */ -- break; -- } -- } -- -- /* -- * We took an extra initial reference to the pi_state in -- * futex_proxy_trylock_atomic(). We need to drop it here again. -- */ -- put_pi_state(pi_state); -- --out_unlock: -- double_unlock_hb(hb1, hb2); -- wake_up_q(&wake_q); -- hb_waiters_dec(hb2); -- return ret ? ret : task_count; --} -- --/* The key must be already stored in q->key. */ --static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) -- __acquires(&hb->lock) --{ -- struct futex_hash_bucket *hb; -- -- hb = hash_futex(&q->key); -- -- /* -- * Increment the counter before taking the lock so that -- * a potential waker won't miss a to-be-slept task that is -- * waiting for the spinlock. This is safe as all queue_lock() -- * users end up calling queue_me(). 
Similarly, for housekeeping, -- * decrement the counter at queue_unlock() when some error has -- * occurred and we don't end up adding the task to the list. -- */ -- hb_waiters_inc(hb); /* implies smp_mb(); (A) */ -- -- q->lock_ptr = &hb->lock; -- -- spin_lock(&hb->lock); -- return hb; --} -- --static inline void --queue_unlock(struct futex_hash_bucket *hb) -- __releases(&hb->lock) --{ -- spin_unlock(&hb->lock); -- hb_waiters_dec(hb); --} -- --static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) --{ -- int prio; -- -- /* -- * The priority used to register this element is -- * - either the real thread-priority for the real-time threads -- * (i.e. threads with a priority lower than MAX_RT_PRIO) -- * - or MAX_RT_PRIO for non-RT threads. -- * Thus, all RT-threads are woken first in priority order, and -- * the others are woken last, in FIFO order. -- */ -- prio = min(current->normal_prio, MAX_RT_PRIO); -- -- plist_node_init(&q->list, prio); -- plist_add(&q->list, &hb->chain); -- q->task = current; --} -- --/** -- * queue_me() - Enqueue the futex_q on the futex_hash_bucket -- * @q: The futex_q to enqueue -- * @hb: The destination hash bucket -- * -- * The hb->lock must be held by the caller, and is released here. A call to -- * queue_me() is typically paired with exactly one call to unqueue_me(). The -- * exceptions involve the PI related operations, which may use unqueue_me_pi() -- * or nothing if the unqueue is done as part of the wake process and the unqueue -- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for -- * an example). -- */ --static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) -- __releases(&hb->lock) --{ -- __queue_me(q, hb); -- spin_unlock(&hb->lock); --} -- --/** -- * unqueue_me() - Remove the futex_q from its futex_hash_bucket -- * @q: The futex_q to unqueue -- * -- * The q->lock_ptr must not be held by the caller. 
A call to unqueue_me() must -- * be paired with exactly one earlier call to queue_me(). -- * -- * Return: -- * - 1 - if the futex_q was still queued (and we removed unqueued it); -- * - 0 - if the futex_q was already removed by the waking thread -- */ --static int unqueue_me(struct futex_q *q) --{ -- spinlock_t *lock_ptr; -- int ret = 0; -- -- /* In the common case we don't take the spinlock, which is nice. */ --retry: -- /* -- * q->lock_ptr can change between this read and the following spin_lock. -- * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and -- * optimizing lock_ptr out of the logic below. -- */ -- lock_ptr = READ_ONCE(q->lock_ptr); -- if (lock_ptr != NULL) { -- spin_lock(lock_ptr); -- /* -- * q->lock_ptr can change between reading it and -- * spin_lock(), causing us to take the wrong lock. This -- * corrects the race condition. -- * -- * Reasoning goes like this: if we have the wrong lock, -- * q->lock_ptr must have changed (maybe several times) -- * between reading it and the spin_lock(). It can -- * change again after the spin_lock() but only if it was -- * already changed before the spin_lock(). It cannot, -- * however, change back to the original value. Therefore -- * we can detect whether we acquired the correct lock. -- */ -- if (unlikely(lock_ptr != q->lock_ptr)) { -- spin_unlock(lock_ptr); -- goto retry; -- } -- __unqueue_futex(q); -- -- BUG_ON(q->pi_state); -- -- spin_unlock(lock_ptr); -- ret = 1; -- } -- -- return ret; --} -- --/* -- * PI futexes can not be requeued and must remove themselves from the -- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. 
-- */ --static void unqueue_me_pi(struct futex_q *q) --{ -- __unqueue_futex(q); -- -- BUG_ON(!q->pi_state); -- put_pi_state(q->pi_state); -- q->pi_state = NULL; --} -- --static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, -- struct task_struct *argowner) --{ -- struct futex_pi_state *pi_state = q->pi_state; -- struct task_struct *oldowner, *newowner; -- u32 uval, curval, newval, newtid; -- int err = 0; -- -- oldowner = pi_state->owner; -- -- /* -- * We are here because either: -- * -- * - we stole the lock and pi_state->owner needs updating to reflect -- * that (@argowner == current), -- * -- * or: -- * -- * - someone stole our lock and we need to fix things to point to the -- * new owner (@argowner == NULL). -- * -- * Either way, we have to replace the TID in the user space variable. -- * This must be atomic as we have to preserve the owner died bit here. -- * -- * Note: We write the user space value _before_ changing the pi_state -- * because we can fault here. Imagine swapped out pages or a fork -- * that marked all the anonymous memory readonly for cow. -- * -- * Modifying pi_state _before_ the user space value would leave the -- * pi_state in an inconsistent state when we fault here, because we -- * need to drop the locks to handle the fault. This might be observed -- * in the PID checks when attaching to PI state . -- */ --retry: -- if (!argowner) { -- if (oldowner != current) { -- /* -- * We raced against a concurrent self; things are -- * already fixed up. Nothing to do. -- */ -- return 0; -- } -- -- if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { -- /* We got the lock. pi_state is correct. Tell caller. */ -- return 1; -- } -- -- /* -- * The trylock just failed, so either there is an owner or -- * there is a higher priority waiter than this one. -- */ -- newowner = rt_mutex_owner(&pi_state->pi_mutex); -- /* -- * If the higher priority waiter has not yet taken over the -- * rtmutex then newowner is NULL. 
We can't return here with -- * that state because it's inconsistent vs. the user space -- * state. So drop the locks and try again. It's a valid -- * situation and not any different from the other retry -- * conditions. -- */ -- if (unlikely(!newowner)) { -- err = -EAGAIN; -- goto handle_err; -- } -- } else { -- WARN_ON_ONCE(argowner != current); -- if (oldowner == current) { -- /* -- * We raced against a concurrent self; things are -- * already fixed up. Nothing to do. -- */ -- return 1; -- } -- newowner = argowner; -- } -- -- newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; -- /* Owner died? */ -- if (!pi_state->owner) -- newtid |= FUTEX_OWNER_DIED; -- -- err = get_futex_value_locked(&uval, uaddr); -- if (err) -- goto handle_err; -- -- for (;;) { -- newval = (uval & FUTEX_OWNER_DIED) | newtid; -- -- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); -- if (err) -- goto handle_err; -- -- if (curval == uval) -- break; -- uval = curval; -- } -- -- /* -- * We fixed up user space. Now we need to fix the pi_state -- * itself. -- */ -- pi_state_update_owner(pi_state, newowner); -- -- return argowner == current; -- -- /* -- * In order to reschedule or handle a page fault, we need to drop the -- * locks here. In the case of a fault, this gives the other task -- * (either the highest priority waiter itself or the task which stole -- * the rtmutex) the chance to try the fixup of the pi_state. So once we -- * are back from handling the fault we need to check the pi_state after -- * reacquiring the locks and before trying to do another fixup. When -- * the fixup has been done already we simply return. -- * -- * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely -- * drop hb->lock since the caller owns the hb -> futex_q relation. -- * Dropping the pi_mutex->wait_lock requires the state revalidate. 
-- */ --handle_err: -- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -- spin_unlock(q->lock_ptr); -- -- switch (err) { -- case -EFAULT: -- err = fault_in_user_writeable(uaddr); -- break; -- -- case -EAGAIN: -- cond_resched(); -- err = 0; -- break; -- -- default: -- WARN_ON_ONCE(1); -- break; -- } -- -- spin_lock(q->lock_ptr); -- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -- -- /* -- * Check if someone else fixed it for us: -- */ -- if (pi_state->owner != oldowner) -- return argowner == current; -- -- /* Retry if err was -EAGAIN or the fault in succeeded */ -- if (!err) -- goto retry; -- -- /* -- * fault_in_user_writeable() failed so user state is immutable. At -- * best we can make the kernel state consistent but user state will -- * be most likely hosed and any subsequent unlock operation will be -- * rejected due to PI futex rule [10]. -- * -- * Ensure that the rtmutex owner is also the pi_state owner despite -- * the user space value claiming something different. There is no -- * point in unlocking the rtmutex if current is the owner as it -- * would need to wait until the next waiter has taken the rtmutex -- * to guarantee consistent state. Keep it simple. Userspace asked -- * for this wreckaged state. -- * -- * The rtmutex has an owner - either current or some other -- * task. See the EAGAIN loop above. 
-- */ -- pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); -- -- return err; --} -- --static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, -- struct task_struct *argowner) --{ -- struct futex_pi_state *pi_state = q->pi_state; -- int ret; -- -- lockdep_assert_held(q->lock_ptr); -- -- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -- ret = __fixup_pi_state_owner(uaddr, q, argowner); -- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -- return ret; --} -- --static long futex_wait_restart(struct restart_block *restart); -- --/** -- * fixup_owner() - Post lock pi_state and corner case management -- * @uaddr: user address of the futex -- * @q: futex_q (contains pi_state and access to the rt_mutex) -- * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) -- * -- * After attempting to lock an rt_mutex, this function is called to cleanup -- * the pi_state owner as well as handle race conditions that may allow us to -- * acquire the lock. Must be called with the hb lock held. -- * -- * Return: -- * - 1 - success, lock taken; -- * - 0 - success, lock not taken; -- * - <0 - on error (-EFAULT) -- */ --static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) --{ -- if (locked) { -- /* -- * Got the lock. We might not be the anticipated owner if we -- * did a lock-steal - fix up the PI-state in that case: -- * -- * Speculative pi_state->owner read (we don't hold wait_lock); -- * since we own the lock pi_state->owner == current is the -- * stable state, anything else needs more attention. -- */ -- if (q->pi_state->owner != current) -- return fixup_pi_state_owner(uaddr, q, current); -- return 1; -- } -- -- /* -- * If we didn't get the lock; check if anybody stole it from us. In -- * that case, we need to fix up the uval to point to them instead of -- * us, otherwise bad things happen. [10] -- * -- * Another speculative read; pi_state->owner == current is unstable -- * but needs our attention. 
-- */ -- if (q->pi_state->owner == current) -- return fixup_pi_state_owner(uaddr, q, NULL); -- -- /* -- * Paranoia check. If we did not take the lock, then we should not be -- * the owner of the rt_mutex. Warn and establish consistent state. -- */ -- if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) -- return fixup_pi_state_owner(uaddr, q, current); -- -- return 0; --} -- --/** -- * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal -- * @hb: the futex hash bucket, must be locked by the caller -- * @q: the futex_q to queue up on -- * @timeout: the prepared hrtimer_sleeper, or null for no timeout -- */ --static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, -- struct hrtimer_sleeper *timeout) --{ -- /* -- * The task state is guaranteed to be set before another task can -- * wake it. set_current_state() is implemented using smp_store_mb() and -- * queue_me() calls spin_unlock() upon completion, both serializing -- * access to the hash list and forcing another memory barrier. -- */ -- set_current_state(TASK_INTERRUPTIBLE); -- queue_me(q, hb); -- -- /* Arm the timer */ -- if (timeout) -- hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); -- -- /* -- * If we have been removed from the hash list, then another task -- * has tried to wake us, and we can skip the call to schedule(). -- */ -- if (likely(!plist_node_empty(&q->list))) { -- /* -- * If the timer has already expired, current will already be -- * flagged for rescheduling. Only call schedule if there -- * is no timeout, or if it has yet to expire. -- */ -- if (!timeout || timeout->task) -- freezable_schedule(); -- } -- __set_current_state(TASK_RUNNING); --} -- --/** -- * futex_wait_setup() - Prepare to wait on a futex -- * @uaddr: the futex userspace address -- * @val: the expected value -- * @flags: futex flags (FLAGS_SHARED, etc.) 
-- * @q: the associated futex_q -- * @hb: storage for hash_bucket pointer to be returned to caller -- * -- * Setup the futex_q and locate the hash_bucket. Get the futex value and -- * compare it with the expected value. Handle atomic faults internally. -- * Return with the hb lock held on success, and unlocked on failure. -- * -- * Return: -- * - 0 - uaddr contains val and hb has been locked; -- * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked -- */ --static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, -- struct futex_q *q, struct futex_hash_bucket **hb) --{ -- u32 uval; -- int ret; -- -- /* -- * Access the page AFTER the hash-bucket is locked. -- * Order is important: -- * -- * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); -- * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } -- * -- * The basic logical guarantee of a futex is that it blocks ONLY -- * if cond(var) is known to be true at the time of blocking, for -- * any cond. If we locked the hash-bucket after testing *uaddr, that -- * would open a race condition where we could block indefinitely with -- * cond(var) false, which would violate the guarantee. -- * -- * On the other hand, we insert q and release the hash-bucket only -- * after testing *uaddr. This guarantees that futex_wait() will NOT -- * absorb a wakeup if *uaddr does not match the desired values -- * while the syscall executes. 
-- */ --retry: -- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); -- if (unlikely(ret != 0)) -- return ret; -- --retry_private: -- *hb = queue_lock(q); -- -- ret = get_futex_value_locked(&uval, uaddr); -- -- if (ret) { -- queue_unlock(*hb); -- -- ret = get_user(uval, uaddr); -- if (ret) -- return ret; -- -- if (!(flags & FLAGS_SHARED)) -- goto retry_private; -- -- goto retry; -- } -- -- if (uval != val) { -- queue_unlock(*hb); -- ret = -EWOULDBLOCK; -- } -- -- return ret; --} -- --static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, -- ktime_t *abs_time, u32 bitset) --{ -- struct hrtimer_sleeper timeout, *to; -- struct restart_block *restart; -- struct futex_hash_bucket *hb; -- struct futex_q q = futex_q_init; -- int ret; -- -- if (!bitset) -- return -EINVAL; -- q.bitset = bitset; -- -- to = futex_setup_timer(abs_time, &timeout, flags, -- current->timer_slack_ns); --retry: -- /* -- * Prepare to wait on uaddr. On success, it holds hb->lock and q -- * is initialized. -- */ -- ret = futex_wait_setup(uaddr, val, flags, &q, &hb); -- if (ret) -- goto out; -- -- /* queue_me and wait for wakeup, timeout, or a signal. */ -- futex_wait_queue_me(hb, &q, to); -- -- /* If we were woken (and unqueued), we succeeded, whatever. */ -- ret = 0; -- if (!unqueue_me(&q)) -- goto out; -- ret = -ETIMEDOUT; -- if (to && !to->task) -- goto out; -- -- /* -- * We expect signal_pending(current), but we might be the -- * victim of a spurious wakeup as well. 
-- */ -- if (!signal_pending(current)) -- goto retry; -- -- ret = -ERESTARTSYS; -- if (!abs_time) -- goto out; -- -- restart = &current->restart_block; -- restart->futex.uaddr = uaddr; -- restart->futex.val = val; -- restart->futex.time = *abs_time; -- restart->futex.bitset = bitset; -- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; -- -- ret = set_restart_fn(restart, futex_wait_restart); -- --out: -- if (to) { -- hrtimer_cancel(&to->timer); -- destroy_hrtimer_on_stack(&to->timer); -- } -- return ret; --} -- -- --static long futex_wait_restart(struct restart_block *restart) --{ -- u32 __user *uaddr = restart->futex.uaddr; -- ktime_t t, *tp = NULL; -- -- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { -- t = restart->futex.time; -- tp = &t; -- } -- restart->fn = do_no_restart_syscall; -- -- return (long)futex_wait(uaddr, restart->futex.flags, -- restart->futex.val, tp, restart->futex.bitset); --} -- -- --/* -- * Userspace tried a 0 -> TID atomic transition of the futex value -- * and failed. The kernel side here does the whole locking operation: -- * if there are waiters then it will block as a consequence of relying -- * on rt-mutexes, it does PI, etc. (Due to races the kernel might see -- * a 0 value of the futex too.). -- * -- * Also serves as futex trylock_pi()'ing, and due semantics. 
-- */ --static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, -- ktime_t *time, int trylock) --{ -- struct hrtimer_sleeper timeout, *to; -- struct task_struct *exiting = NULL; -- struct rt_mutex_waiter rt_waiter; -- struct futex_hash_bucket *hb; -- struct futex_q q = futex_q_init; -- int res, ret; -- -- if (!IS_ENABLED(CONFIG_FUTEX_PI)) -- return -ENOSYS; -- -- if (refill_pi_state_cache()) -- return -ENOMEM; -- -- to = futex_setup_timer(time, &timeout, flags, 0); -- --retry: -- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); -- if (unlikely(ret != 0)) -- goto out; -- --retry_private: -- hb = queue_lock(&q); -- -- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, -- &exiting, 0); -- if (unlikely(ret)) { -- /* -- * Atomic work succeeded and we got the lock, -- * or failed. Either way, we do _not_ block. -- */ -- switch (ret) { -- case 1: -- /* We got the lock. */ -- ret = 0; -- goto out_unlock_put_key; -- case -EFAULT: -- goto uaddr_faulted; -- case -EBUSY: -- case -EAGAIN: -- /* -- * Two reasons for this: -- * - EBUSY: Task is exiting and we just wait for the -- * exit to complete. -- * - EAGAIN: The user space value changed. -- */ -- queue_unlock(hb); -- /* -- * Handle the case where the owner is in the middle of -- * exiting. Wait for the exit to complete otherwise -- * this task might loop forever, aka. live lock. -- */ -- wait_for_owner_exiting(ret, exiting); -- cond_resched(); -- goto retry; -- default: -- goto out_unlock_put_key; -- } -- } -- -- WARN_ON(!q.pi_state); -- -- /* -- * Only actually queue now that the atomic ops are done: -- */ -- __queue_me(&q, hb); -- -- if (trylock) { -- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); -- /* Fixup the trylock return value: */ -- ret = ret ? 
0 : -EWOULDBLOCK; -- goto no_block; -- } -- -- rt_mutex_init_waiter(&rt_waiter); -- -- /* -- * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not -- * hold it while doing rt_mutex_start_proxy(), because then it will -- * include hb->lock in the blocking chain, even through we'll not in -- * fact hold it while blocking. This will lead it to report -EDEADLK -- * and BUG when futex_unlock_pi() interleaves with this. -- * -- * Therefore acquire wait_lock while holding hb->lock, but drop the -- * latter before calling __rt_mutex_start_proxy_lock(). This -- * interleaves with futex_unlock_pi() -- which does a similar lock -- * handoff -- such that the latter can observe the futex_q::pi_state -- * before __rt_mutex_start_proxy_lock() is done. -- */ -- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); -- spin_unlock(q.lock_ptr); -- /* -- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter -- * such that futex_unlock_pi() is guaranteed to observe the waiter when -- * it sees the futex_q::pi_state. -- */ -- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); -- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); -- -- if (ret) { -- if (ret == 1) -- ret = 0; -- goto cleanup; -- } -- -- if (unlikely(to)) -- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); -- -- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); -- --cleanup: -- spin_lock(q.lock_ptr); -- /* -- * If we failed to acquire the lock (deadlock/signal/timeout), we must -- * first acquire the hb->lock before removing the lock from the -- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait -- * lists consistent. -- * -- * In particular; it is important that futex_unlock_pi() can not -- * observe this inconsistency. 
-- */ -- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) -- ret = 0; -- --no_block: -- /* -- * Fixup the pi_state owner and possibly acquire the lock if we -- * haven't already. -- */ -- res = fixup_owner(uaddr, &q, !ret); -- /* -- * If fixup_owner() returned an error, propagate that. If it acquired -- * the lock, clear our -ETIMEDOUT or -EINTR. -- */ -- if (res) -- ret = (res < 0) ? res : 0; -- -- unqueue_me_pi(&q); -- spin_unlock(q.lock_ptr); -- goto out; -- --out_unlock_put_key: -- queue_unlock(hb); -- --out: -- if (to) { -- hrtimer_cancel(&to->timer); -- destroy_hrtimer_on_stack(&to->timer); -- } -- return ret != -EINTR ? ret : -ERESTARTNOINTR; -- --uaddr_faulted: -- queue_unlock(hb); -- -- ret = fault_in_user_writeable(uaddr); -- if (ret) -- goto out; -- -- if (!(flags & FLAGS_SHARED)) -- goto retry_private; -- -- goto retry; --} -- --/* -- * Userspace attempted a TID -> 0 atomic transition, and failed. -- * This is the in-kernel slowpath: we look up the PI state (if any), -- * and do the rt-mutex unlock. -- */ --static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) --{ -- u32 curval, uval, vpid = task_pid_vnr(current); -- union futex_key key = FUTEX_KEY_INIT; -- struct futex_hash_bucket *hb; -- struct futex_q *top_waiter; -- int ret; -- -- if (!IS_ENABLED(CONFIG_FUTEX_PI)) -- return -ENOSYS; -- --retry: -- if (get_user(uval, uaddr)) -- return -EFAULT; -- /* -- * We release only a lock we actually own: -- */ -- if ((uval & FUTEX_TID_MASK) != vpid) -- return -EPERM; -- -- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); -- if (ret) -- return ret; -- -- hb = hash_futex(&key); -- spin_lock(&hb->lock); -- -- /* -- * Check waiters first. We do not trust user space values at -- * all and we at least want to know if user space fiddled -- * with the futex value instead of blindly unlocking. 
-- */ -- top_waiter = futex_top_waiter(hb, &key); -- if (top_waiter) { -- struct futex_pi_state *pi_state = top_waiter->pi_state; -- -- ret = -EINVAL; -- if (!pi_state) -- goto out_unlock; -- -- /* -- * If current does not own the pi_state then the futex is -- * inconsistent and user space fiddled with the futex value. -- */ -- if (pi_state->owner != current) -- goto out_unlock; -- -- get_pi_state(pi_state); -- /* -- * By taking wait_lock while still holding hb->lock, we ensure -- * there is no point where we hold neither; and therefore -- * wake_futex_pi() must observe a state consistent with what we -- * observed. -- * -- * In particular; this forces __rt_mutex_start_proxy() to -- * complete such that we're guaranteed to observe the -- * rt_waiter. Also see the WARN in wake_futex_pi(). -- */ -- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -- spin_unlock(&hb->lock); -- -- /* drops pi_state->pi_mutex.wait_lock */ -- ret = wake_futex_pi(uaddr, uval, pi_state); -- -- put_pi_state(pi_state); -- -- /* -- * Success, we're done! No tricky corner cases. -- */ -- if (!ret) -- return ret; -- /* -- * The atomic access to the futex value generated a -- * pagefault, so retry the user-access and the wakeup: -- */ -- if (ret == -EFAULT) -- goto pi_faulted; -- /* -- * A unconditional UNLOCK_PI op raced against a waiter -- * setting the FUTEX_WAITERS bit. Try again. -- */ -- if (ret == -EAGAIN) -- goto pi_retry; -- /* -- * wake_futex_pi has detected invalid state. Tell user -- * space. -- */ -- return ret; -- } -- -- /* -- * We have no kernel internal state, i.e. no waiters in the -- * kernel. Waiters which are about to queue themselves are stuck -- * on hb->lock. So we can safely ignore them. We do neither -- * preserve the WAITERS bit not the OWNER_DIED one. We are the -- * owner. 
-- */ -- if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { -- spin_unlock(&hb->lock); -- switch (ret) { -- case -EFAULT: -- goto pi_faulted; -- -- case -EAGAIN: -- goto pi_retry; -- -- default: -- WARN_ON_ONCE(1); -- return ret; -- } -- } -- -- /* -- * If uval has changed, let user space handle it. -- */ -- ret = (curval == uval) ? 0 : -EAGAIN; -- --out_unlock: -- spin_unlock(&hb->lock); -- return ret; -- --pi_retry: -- cond_resched(); -- goto retry; -- --pi_faulted: -- -- ret = fault_in_user_writeable(uaddr); -- if (!ret) -- goto retry; -- -- return ret; --} -- --/** -- * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex -- * @hb: the hash_bucket futex_q was original enqueued on -- * @q: the futex_q woken while waiting to be requeued -- * @timeout: the timeout associated with the wait (NULL if none) -- * -- * Determine the cause for the early wakeup. -- * -- * Return: -- * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR -- */ --static inline --int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, -- struct futex_q *q, -- struct hrtimer_sleeper *timeout) --{ -- int ret; -- -- /* -- * With the hb lock held, we avoid races while we process the wakeup. -- * We only need to hold hb (and not hb2) to ensure atomicity as the -- * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. -- * It can't be requeued from uaddr2 to something else since we don't -- * support a PI aware source futex for requeue. -- */ -- WARN_ON_ONCE(&hb->lock != q->lock_ptr); -- -- /* -- * We were woken prior to requeue by a timeout or a signal. -- * Unqueue the futex_q and determine which it was. 
-- */ -- plist_del(&q->list, &hb->chain); -- hb_waiters_dec(hb); -- -- /* Handle spurious wakeups gracefully */ -- ret = -EWOULDBLOCK; -- if (timeout && !timeout->task) -- ret = -ETIMEDOUT; -- else if (signal_pending(current)) -- ret = -ERESTARTNOINTR; -- return ret; --} -- --/** -- * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 -- * @uaddr: the futex we initially wait on (non-pi) -- * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be -- * the same type, no requeueing from private to shared, etc. -- * @val: the expected value of uaddr -- * @abs_time: absolute timeout -- * @bitset: 32 bit wakeup bitset set by userspace, defaults to all -- * @uaddr2: the pi futex we will take prior to returning to user-space -- * -- * The caller will wait on uaddr and will be requeued by futex_requeue() to -- * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake -- * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to -- * userspace. This ensures the rt_mutex maintains an owner when it has waiters; -- * without one, the pi logic would not know which task to boost/deboost, if -- * there was a need to. -- * -- * We call schedule in futex_wait_queue_me() when we enqueue and return there -- * via the following-- -- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() -- * 2) wakeup on uaddr2 after a requeue -- * 3) signal -- * 4) timeout -- * -- * If 3, cleanup and return -ERESTARTNOINTR. -- * -- * If 2, we may then block on trying to take the rt_mutex and return via: -- * 5) successful lock -- * 6) signal -- * 7) timeout -- * 8) other lock acquisition failure -- * -- * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). -- * -- * If 4 or 7, we cleanup and return with -ETIMEDOUT. 
-- * -- * Return: -- * - 0 - On success; -- * - <0 - On error -- */ --static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, -- u32 val, ktime_t *abs_time, u32 bitset, -- u32 __user *uaddr2) --{ -- struct hrtimer_sleeper timeout, *to; -- struct rt_mutex_waiter rt_waiter; -- struct futex_hash_bucket *hb; -- union futex_key key2 = FUTEX_KEY_INIT; -- struct futex_q q = futex_q_init; -- struct rt_mutex_base *pi_mutex; -- int res, ret; -- -- if (!IS_ENABLED(CONFIG_FUTEX_PI)) -- return -ENOSYS; -- -- if (uaddr == uaddr2) -- return -EINVAL; -- -- if (!bitset) -- return -EINVAL; -- -- to = futex_setup_timer(abs_time, &timeout, flags, -- current->timer_slack_ns); -- -- /* -- * The waiter is allocated on our stack, manipulated by the requeue -- * code while we sleep on uaddr. -- */ -- rt_mutex_init_waiter(&rt_waiter); -- -- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); -- if (unlikely(ret != 0)) -- goto out; -- -- q.bitset = bitset; -- q.rt_waiter = &rt_waiter; -- q.requeue_pi_key = &key2; -- -- /* -- * Prepare to wait on uaddr. On success, it holds hb->lock and q -- * is initialized. -- */ -- ret = futex_wait_setup(uaddr, val, flags, &q, &hb); -- if (ret) -- goto out; -- -- /* -- * The check above which compares uaddrs is not sufficient for -- * shared futexes. We need to compare the keys: -- */ -- if (match_futex(&q.key, &key2)) { -- queue_unlock(hb); -- ret = -EINVAL; -- goto out; -- } -- -- /* Queue the futex_q, drop the hb lock, wait for wakeup. 
*/ -- futex_wait_queue_me(hb, &q, to); -- -- switch (futex_requeue_pi_wakeup_sync(&q)) { -- case Q_REQUEUE_PI_IGNORE: -- /* The waiter is still on uaddr1 */ -- spin_lock(&hb->lock); -- ret = handle_early_requeue_pi_wakeup(hb, &q, to); -- spin_unlock(&hb->lock); -- break; -- -- case Q_REQUEUE_PI_LOCKED: -- /* The requeue acquired the lock */ -- if (q.pi_state && (q.pi_state->owner != current)) { -- spin_lock(q.lock_ptr); -- ret = fixup_owner(uaddr2, &q, true); -- /* -- * Drop the reference to the pi state which the -- * requeue_pi() code acquired for us. -- */ -- put_pi_state(q.pi_state); -- spin_unlock(q.lock_ptr); -- /* -- * Adjust the return value. It's either -EFAULT or -- * success (1) but the caller expects 0 for success. -- */ -- ret = ret < 0 ? ret : 0; -- } -- break; -- -- case Q_REQUEUE_PI_DONE: -- /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ -- pi_mutex = &q.pi_state->pi_mutex; -- ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); -- -- /* Current is not longer pi_blocked_on */ -- spin_lock(q.lock_ptr); -- if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) -- ret = 0; -- -- debug_rt_mutex_free_waiter(&rt_waiter); -- /* -- * Fixup the pi_state owner and possibly acquire the lock if we -- * haven't already. -- */ -- res = fixup_owner(uaddr2, &q, !ret); -- /* -- * If fixup_owner() returned an error, propagate that. If it -- * acquired the lock, clear -ETIMEDOUT or -EINTR. -- */ -- if (res) -- ret = (res < 0) ? res : 0; -- -- unqueue_me_pi(&q); -- spin_unlock(q.lock_ptr); -- -- if (ret == -EINTR) { -- /* -- * We've already been requeued, but cannot restart -- * by calling futex_lock_pi() directly. We could -- * restart this syscall, but it would detect that -- * the user space "val" changed and return -- * -EWOULDBLOCK. Save the overhead of the restart -- * and return -EWOULDBLOCK directly. 
-- */ -- ret = -EWOULDBLOCK; -- } -- break; -- default: -- BUG(); -- } -- --out: -- if (to) { -- hrtimer_cancel(&to->timer); -- destroy_hrtimer_on_stack(&to->timer); -- } -- return ret; --} -- --/* -- * Support for robust futexes: the kernel cleans up held futexes at -- * thread exit time. -- * -- * Implementation: user-space maintains a per-thread list of locks it -- * is holding. Upon do_exit(), the kernel carefully walks this list, -- * and marks all locks that are owned by this thread with the -- * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is -- * always manipulated with the lock held, so the list is private and -- * per-thread. Userspace also maintains a per-thread 'list_op_pending' -- * field, to allow the kernel to clean up if the thread dies after -- * acquiring the lock, but just before it could have added itself to -- * the list. There can only be one such pending lock. -- */ -- --/** -- * sys_set_robust_list() - Set the robust-futex list head of a task -- * @head: pointer to the list-head -- * @len: length of the list-head, as userspace expects -- */ --SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, -- size_t, len) --{ -- if (!futex_cmpxchg_enabled) -- return -ENOSYS; -- /* -- * The kernel knows only one size for now: -- */ -- if (unlikely(len != sizeof(*head))) -- return -EINVAL; -- -- current->robust_list = head; -- -- return 0; --} -- --/** -- * sys_get_robust_list() - Get the robust-futex list head of a task -- * @pid: pid of the process [zero for current task] -- * @head_ptr: pointer to a list-head pointer, the kernel fills it in -- * @len_ptr: pointer to a length field, the kernel fills in the header size -- */ --SYSCALL_DEFINE3(get_robust_list, int, pid, -- struct robust_list_head __user * __user *, head_ptr, -- size_t __user *, len_ptr) --{ -- struct robust_list_head __user *head; -- unsigned long ret; -- struct task_struct *p; -- -- if (!futex_cmpxchg_enabled) -- return -ENOSYS; -- -- 
rcu_read_lock(); -- -- ret = -ESRCH; -- if (!pid) -- p = current; -- else { -- p = find_task_by_vpid(pid); -- if (!p) -- goto err_unlock; -- } -- -- ret = -EPERM; -- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) -- goto err_unlock; -- -- head = p->robust_list; -- rcu_read_unlock(); -- -- if (put_user(sizeof(*head), len_ptr)) -- return -EFAULT; -- return put_user(head, head_ptr); -- --err_unlock: -- rcu_read_unlock(); -- -- return ret; --} -- --/* Constants for the pending_op argument of handle_futex_death */ --#define HANDLE_DEATH_PENDING true --#define HANDLE_DEATH_LIST false -- --/* -- * Process a futex-list entry, check whether it's owned by the -- * dying task, and do notification if so: -- */ --static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, -- bool pi, bool pending_op) --{ -- u32 uval, nval, mval; -- int err; -- -- /* Futex address must be 32bit aligned */ -- if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) -- return -1; -- --retry: -- if (get_user(uval, uaddr)) -- return -1; -- -- /* -- * Special case for regular (non PI) futexes. The unlock path in -- * user space has two race scenarios: -- * -- * 1. The unlock path releases the user space futex value and -- * before it can execute the futex() syscall to wake up -- * waiters it is killed. -- * -- * 2. A woken up waiter is killed before it can acquire the -- * futex in user space. -- * -- * In both cases the TID validation below prevents a wakeup of -- * potential waiters which can cause these waiters to block -- * forever. -- * -- * In both cases the following conditions are met: -- * -- * 1) task->robust_list->list_op_pending != NULL -- * @pending_op == true -- * 2) User space futex value == 0 -- * 3) Regular futex: @pi == false -- * -- * If these conditions are met, it is safe to attempt waking up a -- * potential waiter without touching the user space futex value and -- * trying to set the OWNER_DIED bit. 
The user space futex value is -- * uncontended and the rest of the user space mutex state is -- * consistent, so a woken waiter will just take over the -- * uncontended futex. Setting the OWNER_DIED bit would create -- * inconsistent state and malfunction of the user space owner died -- * handling. -- */ -- if (pending_op && !pi && !uval) { -- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); -- return 0; -- } -- -- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) -- return 0; -- -- /* -- * Ok, this dying thread is truly holding a futex -- * of interest. Set the OWNER_DIED bit atomically -- * via cmpxchg, and if the value had FUTEX_WAITERS -- * set, wake up a waiter (if any). (We have to do a -- * futex_wake() even if OWNER_DIED is already set - -- * to handle the rare but possible case of recursive -- * thread-death.) The rest of the cleanup is done in -- * userspace. -- */ -- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; -- -- /* -- * We are not holding a lock here, but we want to have -- * the pagefault_disable/enable() protection because -- * we want to handle the fault gracefully. If the -- * access fails we try to fault in the futex with R/W -- * verification via get_user_pages. get_user() above -- * does not guarantee R/W access. If that fails we -- * give up and leave the futex locked. -- */ -- if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { -- switch (err) { -- case -EFAULT: -- if (fault_in_user_writeable(uaddr)) -- return -1; -- goto retry; -- -- case -EAGAIN: -- cond_resched(); -- goto retry; -- -- default: -- WARN_ON_ONCE(1); -- return err; -- } -- } -- -- if (nval != uval) -- goto retry; -- -- /* -- * Wake robust non-PI futexes here. The wakeup of -- * PI futexes happens in exit_pi_state(): -- */ -- if (!pi && (uval & FUTEX_WAITERS)) -- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); -- -- return 0; --} -- --/* -- * Fetch a robust-list pointer. 
Bit 0 signals PI futexes: -- */ --static inline int fetch_robust_entry(struct robust_list __user **entry, -- struct robust_list __user * __user *head, -- unsigned int *pi) --{ -- unsigned long uentry; -- -- if (get_user(uentry, (unsigned long __user *)head)) -- return -EFAULT; -- -- *entry = (void __user *)(uentry & ~1UL); -- *pi = uentry & 1; -- -- return 0; --} -- --/* -- * Walk curr->robust_list (very carefully, it's a userspace list!) -- * and mark any locks found there dead, and notify any waiters. -- * -- * We silently return on any sign of list-walking problem. -- */ --static void exit_robust_list(struct task_struct *curr) --{ -- struct robust_list_head __user *head = curr->robust_list; -- struct robust_list __user *entry, *next_entry, *pending; -- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; -- unsigned int next_pi; -- unsigned long futex_offset; -- int rc; -- -- if (!futex_cmpxchg_enabled) -- return; -- -- /* -- * Fetch the list head (which was registered earlier, via -- * sys_set_robust_list()): -- */ -- if (fetch_robust_entry(&entry, &head->list.next, &pi)) -- return; -- /* -- * Fetch the relative futex offset: -- */ -- if (get_user(futex_offset, &head->futex_offset)) -- return; -- /* -- * Fetch any possibly pending lock-add first, and handle it -- * if it exists: -- */ -- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) -- return; -- -- next_entry = NULL; /* avoid warning with gcc */ -- while (entry != &head->list) { -- /* -- * Fetch the next entry in the list before calling -- * handle_futex_death: -- */ -- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); -- /* -- * A pending lock might already be on the list, so -- * don't process it twice: -- */ -- if (entry != pending) { -- if (handle_futex_death((void __user *)entry + futex_offset, -- curr, pi, HANDLE_DEATH_LIST)) -- return; -- } -- if (rc) -- return; -- entry = next_entry; -- pi = next_pi; -- /* -- * Avoid excessively long or circular lists: -- */ -- if (!--limit) 
-- break; -- -- cond_resched(); -- } -- -- if (pending) { -- handle_futex_death((void __user *)pending + futex_offset, -- curr, pip, HANDLE_DEATH_PENDING); -- } --} -- --static void futex_cleanup(struct task_struct *tsk) --{ -- if (unlikely(tsk->robust_list)) { -- exit_robust_list(tsk); -- tsk->robust_list = NULL; -- } -- --#ifdef CONFIG_COMPAT -- if (unlikely(tsk->compat_robust_list)) { -- compat_exit_robust_list(tsk); -- tsk->compat_robust_list = NULL; -- } --#endif -- -- if (unlikely(!list_empty(&tsk->pi_state_list))) -- exit_pi_state_list(tsk); --} -- --/** -- * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD -- * @tsk: task to set the state on -- * -- * Set the futex exit state of the task lockless. The futex waiter code -- * observes that state when a task is exiting and loops until the task has -- * actually finished the futex cleanup. The worst case for this is that the -- * waiter runs through the wait loop until the state becomes visible. -- * -- * This is called from the recursive fault handling path in do_exit(). -- * -- * This is best effort. Either the futex exit code has run already or -- * not. If the OWNER_DIED bit has been set on the futex then the waiter can -- * take it over. If not, the problem is pushed back to user space. If the -- * futex exit code did not run yet, then an already queued waiter might -- * block forever, but there is nothing which can be done about that. 
-- */ --void futex_exit_recursive(struct task_struct *tsk) --{ -- /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ -- if (tsk->futex_state == FUTEX_STATE_EXITING) -- mutex_unlock(&tsk->futex_exit_mutex); -- tsk->futex_state = FUTEX_STATE_DEAD; --} -- --static void futex_cleanup_begin(struct task_struct *tsk) --{ -- /* -- * Prevent various race issues against a concurrent incoming waiter -- * including live locks by forcing the waiter to block on -- * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in -- * attach_to_pi_owner(). -- */ -- mutex_lock(&tsk->futex_exit_mutex); -- -- /* -- * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. -- * -- * This ensures that all subsequent checks of tsk->futex_state in -- * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with -- * tsk->pi_lock held. -- * -- * It guarantees also that a pi_state which was queued right before -- * the state change under tsk->pi_lock by a concurrent waiter must -- * be observed in exit_pi_state_list(). -- */ -- raw_spin_lock_irq(&tsk->pi_lock); -- tsk->futex_state = FUTEX_STATE_EXITING; -- raw_spin_unlock_irq(&tsk->pi_lock); --} -- --static void futex_cleanup_end(struct task_struct *tsk, int state) --{ -- /* -- * Lockless store. The only side effect is that an observer might -- * take another loop until it becomes visible. -- */ -- tsk->futex_state = state; -- /* -- * Drop the exit protection. This unblocks waiters which observed -- * FUTEX_STATE_EXITING to reevaluate the state. -- */ -- mutex_unlock(&tsk->futex_exit_mutex); --} -- --void futex_exec_release(struct task_struct *tsk) --{ -- /* -- * The state handling is done for consistency, but in the case of -- * exec() there is no way to prevent further damage as the PID stays -- * the same. But for the unlikely and arguably buggy case that a -- * futex is held on exec(), this provides at least as much state -- * consistency protection which is possible. 
-- */ -- futex_cleanup_begin(tsk); -- futex_cleanup(tsk); -- /* -- * Reset the state to FUTEX_STATE_OK. The task is alive and about -- * exec a new binary. -- */ -- futex_cleanup_end(tsk, FUTEX_STATE_OK); --} -- --void futex_exit_release(struct task_struct *tsk) --{ -- futex_cleanup_begin(tsk); -- futex_cleanup(tsk); -- futex_cleanup_end(tsk, FUTEX_STATE_DEAD); --} -- --long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, -- u32 __user *uaddr2, u32 val2, u32 val3) --{ -- int cmd = op & FUTEX_CMD_MASK; -- unsigned int flags = 0; -- -- if (!(op & FUTEX_PRIVATE_FLAG)) -- flags |= FLAGS_SHARED; -- -- if (op & FUTEX_CLOCK_REALTIME) { -- flags |= FLAGS_CLOCKRT; -- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && -- cmd != FUTEX_LOCK_PI2) -- return -ENOSYS; -- } -- -- switch (cmd) { -- case FUTEX_LOCK_PI: -- case FUTEX_LOCK_PI2: -- case FUTEX_UNLOCK_PI: -- case FUTEX_TRYLOCK_PI: -- case FUTEX_WAIT_REQUEUE_PI: -- case FUTEX_CMP_REQUEUE_PI: -- if (!futex_cmpxchg_enabled) -- return -ENOSYS; -- } -- -- switch (cmd) { -- case FUTEX_WAIT: -- val3 = FUTEX_BITSET_MATCH_ANY; -- fallthrough; -- case FUTEX_WAIT_BITSET: -- return futex_wait(uaddr, flags, val, timeout, val3); -- case FUTEX_WAKE: -- val3 = FUTEX_BITSET_MATCH_ANY; -- fallthrough; -- case FUTEX_WAKE_BITSET: -- return futex_wake(uaddr, flags, val, val3); -- case FUTEX_REQUEUE: -- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); -- case FUTEX_CMP_REQUEUE: -- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); -- case FUTEX_WAKE_OP: -- return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); -- case FUTEX_LOCK_PI: -- flags |= FLAGS_CLOCKRT; -- fallthrough; -- case FUTEX_LOCK_PI2: -- return futex_lock_pi(uaddr, flags, timeout, 0); -- case FUTEX_UNLOCK_PI: -- return futex_unlock_pi(uaddr, flags); -- case FUTEX_TRYLOCK_PI: -- return futex_lock_pi(uaddr, flags, NULL, 1); -- case FUTEX_WAIT_REQUEUE_PI: -- val3 = FUTEX_BITSET_MATCH_ANY; -- return 
futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, -- uaddr2); -- case FUTEX_CMP_REQUEUE_PI: -- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); -- } -- return -ENOSYS; --} -- --static __always_inline bool futex_cmd_has_timeout(u32 cmd) --{ -- switch (cmd) { -- case FUTEX_WAIT: -- case FUTEX_LOCK_PI: -- case FUTEX_LOCK_PI2: -- case FUTEX_WAIT_BITSET: -- case FUTEX_WAIT_REQUEUE_PI: -- return true; -- } -- return false; --} -- --static __always_inline int --futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) --{ -- if (!timespec64_valid(ts)) -- return -EINVAL; -- -- *t = timespec64_to_ktime(*ts); -- if (cmd == FUTEX_WAIT) -- *t = ktime_add_safe(ktime_get(), *t); -- else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) -- *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); -- return 0; --} -- --SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, -- const struct __kernel_timespec __user *, utime, -- u32 __user *, uaddr2, u32, val3) --{ -- int ret, cmd = op & FUTEX_CMD_MASK; -- ktime_t t, *tp = NULL; -- struct timespec64 ts; -- -- if (utime && futex_cmd_has_timeout(cmd)) { -- if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) -- return -EFAULT; -- if (get_timespec64(&ts, utime)) -- return -EFAULT; -- ret = futex_init_timeout(cmd, op, &ts, &t); -- if (ret) -- return ret; -- tp = &t; -- } -- -- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); --} -- --#ifdef CONFIG_COMPAT --/* -- * Fetch a robust-list pointer. 
Bit 0 signals PI futexes: -- */ --static inline int --compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, -- compat_uptr_t __user *head, unsigned int *pi) --{ -- if (get_user(*uentry, head)) -- return -EFAULT; -- -- *entry = compat_ptr((*uentry) & ~1); -- *pi = (unsigned int)(*uentry) & 1; -- -- return 0; --} -- --static void __user *futex_uaddr(struct robust_list __user *entry, -- compat_long_t futex_offset) --{ -- compat_uptr_t base = ptr_to_compat(entry); -- void __user *uaddr = compat_ptr(base + futex_offset); -- -- return uaddr; --} -- --/* -- * Walk curr->robust_list (very carefully, it's a userspace list!) -- * and mark any locks found there dead, and notify any waiters. -- * -- * We silently return on any sign of list-walking problem. -- */ --static void compat_exit_robust_list(struct task_struct *curr) --{ -- struct compat_robust_list_head __user *head = curr->compat_robust_list; -- struct robust_list __user *entry, *next_entry, *pending; -- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; -- unsigned int next_pi; -- compat_uptr_t uentry, next_uentry, upending; -- compat_long_t futex_offset; -- int rc; -- -- if (!futex_cmpxchg_enabled) -- return; -- -- /* -- * Fetch the list head (which was registered earlier, via -- * sys_set_robust_list()): -- */ -- if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) -- return; -- /* -- * Fetch the relative futex offset: -- */ -- if (get_user(futex_offset, &head->futex_offset)) -- return; -- /* -- * Fetch any possibly pending lock-add first, and handle it -- * if it exists: -- */ -- if (compat_fetch_robust_entry(&upending, &pending, -- &head->list_op_pending, &pip)) -- return; -- -- next_entry = NULL; /* avoid warning with gcc */ -- while (entry != (struct robust_list __user *) &head->list) { -- /* -- * Fetch the next entry in the list before calling -- * handle_futex_death: -- */ -- rc = compat_fetch_robust_entry(&next_uentry, &next_entry, -- (compat_uptr_t __user 
*)&entry->next, &next_pi); -- /* -- * A pending lock might already be on the list, so -- * dont process it twice: -- */ -- if (entry != pending) { -- void __user *uaddr = futex_uaddr(entry, futex_offset); -- -- if (handle_futex_death(uaddr, curr, pi, -- HANDLE_DEATH_LIST)) -- return; -- } -- if (rc) -- return; -- uentry = next_uentry; -- entry = next_entry; -- pi = next_pi; -- /* -- * Avoid excessively long or circular lists: -- */ -- if (!--limit) -- break; -- -- cond_resched(); -- } -- if (pending) { -- void __user *uaddr = futex_uaddr(pending, futex_offset); -- -- handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); -- } --} -- --COMPAT_SYSCALL_DEFINE2(set_robust_list, -- struct compat_robust_list_head __user *, head, -- compat_size_t, len) --{ -- if (!futex_cmpxchg_enabled) -- return -ENOSYS; -- -- if (unlikely(len != sizeof(*head))) -- return -EINVAL; -- -- current->compat_robust_list = head; -- -- return 0; --} -- --COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, -- compat_uptr_t __user *, head_ptr, -- compat_size_t __user *, len_ptr) --{ -- struct compat_robust_list_head __user *head; -- unsigned long ret; -- struct task_struct *p; -- -- if (!futex_cmpxchg_enabled) -- return -ENOSYS; -- -- rcu_read_lock(); -- -- ret = -ESRCH; -- if (!pid) -- p = current; -- else { -- p = find_task_by_vpid(pid); -- if (!p) -- goto err_unlock; -- } -- -- ret = -EPERM; -- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) -- goto err_unlock; -- -- head = p->compat_robust_list; -- rcu_read_unlock(); -- -- if (put_user(sizeof(*head), len_ptr)) -- return -EFAULT; -- return put_user(ptr_to_compat(head), head_ptr); -- --err_unlock: -- rcu_read_unlock(); -- -- return ret; --} --#endif /* CONFIG_COMPAT */ -- --#ifdef CONFIG_COMPAT_32BIT_TIME --SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, -- const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, -- u32, val3) --{ -- int ret, cmd = op & FUTEX_CMD_MASK; -- ktime_t t, *tp = NULL; -- struct 
timespec64 ts; -- -- if (utime && futex_cmd_has_timeout(cmd)) { -- if (get_old_timespec32(&ts, utime)) -- return -EFAULT; -- ret = futex_init_timeout(cmd, op, &ts, &t); -- if (ret) -- return ret; -- tp = &t; -- } -- -- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); --} --#endif /* CONFIG_COMPAT_32BIT_TIME */ -- --static void __init futex_detect_cmpxchg(void) --{ --#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -- u32 curval; -- -- /* -- * This will fail and we want it. Some arch implementations do -- * runtime detection of the futex_atomic_cmpxchg_inatomic() -- * functionality. We want to know that before we call in any -- * of the complex code paths. Also we want to prevent -- * registration of robust lists in that case. NULL is -- * guaranteed to fault and we get -EFAULT on functional -- * implementation, the non-functional ones will return -- * -ENOSYS. -- */ -- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) -- futex_cmpxchg_enabled = 1; --#endif --} -- --static int __init futex_init(void) --{ -- unsigned int futex_shift; -- unsigned long i; -- --#if CONFIG_BASE_SMALL -- futex_hashsize = 16; --#else -- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); --#endif -- -- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), -- futex_hashsize, 0, -- futex_hashsize < 256 ? 
HASH_SMALL : 0, -- &futex_shift, NULL, -- futex_hashsize, futex_hashsize); -- futex_hashsize = 1UL << futex_shift; -- -- futex_detect_cmpxchg(); -- -- for (i = 0; i < futex_hashsize; i++) { -- atomic_set(&futex_queues[i].waiters, 0); -- plist_head_init(&futex_queues[i].chain); -- spin_lock_init(&futex_queues[i].lock); -- } -- -- return 0; --} --core_initcall(futex_init); -diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile -new file mode 100644 -index 000000000..b77188d1f ---- /dev/null -+++ b/kernel/futex/Makefile -@@ -0,0 +1,3 @@ -+# SPDX-License-Identifier: GPL-2.0 -+ -+obj-y += core.o syscalls.o pi.o requeue.o waitwake.o -diff --git a/kernel/futex/core.c b/kernel/futex/core.c -new file mode 100644 -index 000000000..25d8a88b3 ---- /dev/null -+++ b/kernel/futex/core.c -@@ -0,0 +1,1176 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+/* -+ * Fast Userspace Mutexes (which I call "Futexes!"). -+ * (C) Rusty Russell, IBM 2002 -+ * -+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar -+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved -+ * -+ * Removed page pinning, fix privately mapped COW pages and other cleanups -+ * (C) Copyright 2003, 2004 Jamie Lokier -+ * -+ * Robust futex support started by Ingo Molnar -+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved -+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes. -+ * -+ * PI-futex support started by Ingo Molnar and Thomas Gleixner -+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> -+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> -+ * -+ * PRIVATE futexes by Eric Dumazet -+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> -+ * -+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> -+ * Copyright (C) IBM Corporation, 2009 -+ * Thanks to Thomas Gleixner for conceptual design and careful reviews. 
-+ * -+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly -+ * enough at me, Linus for the original (flawed) idea, Matthew -+ * Kirkwood for proof-of-concept implementation. -+ * -+ * "The futexes are also cursed." -+ * "But they come in a choice of three flavours!" -+ */ -+#include <linux/compat.h> -+#include <linux/jhash.h> -+#include <linux/pagemap.h> -+#include <linux/memblock.h> -+#include <linux/fault-inject.h> -+#include <linux/slab.h> -+ -+#include "futex.h" -+#include "../locking/rtmutex_common.h" -+ -+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -+int __read_mostly futex_cmpxchg_enabled; -+#endif -+ -+ -+/* -+ * The base of the bucket array and its size are always used together -+ * (after initialization only in futex_hash()), so ensure that they -+ * reside in the same cacheline. -+ */ -+static struct { -+ struct futex_hash_bucket *queues; -+ unsigned long hashsize; -+} __futex_data __read_mostly __aligned(2*sizeof(long)); -+#define futex_queues (__futex_data.queues) -+#define futex_hashsize (__futex_data.hashsize) -+ -+ -+/* -+ * Fault injections for futexes. 
-+ */ -+#ifdef CONFIG_FAIL_FUTEX -+ -+static struct { -+ struct fault_attr attr; -+ -+ bool ignore_private; -+} fail_futex = { -+ .attr = FAULT_ATTR_INITIALIZER, -+ .ignore_private = false, -+}; -+ -+static int __init setup_fail_futex(char *str) -+{ -+ return setup_fault_attr(&fail_futex.attr, str); -+} -+__setup("fail_futex=", setup_fail_futex); -+ -+bool should_fail_futex(bool fshared) -+{ -+ if (fail_futex.ignore_private && !fshared) -+ return false; -+ -+ return should_fail(&fail_futex.attr, 1); -+} -+ -+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS -+ -+static int __init fail_futex_debugfs(void) -+{ -+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; -+ struct dentry *dir; -+ -+ dir = fault_create_debugfs_attr("fail_futex", NULL, -+ &fail_futex.attr); -+ if (IS_ERR(dir)) -+ return PTR_ERR(dir); -+ -+ debugfs_create_bool("ignore-private", mode, dir, -+ &fail_futex.ignore_private); -+ return 0; -+} -+ -+late_initcall(fail_futex_debugfs); -+ -+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ -+ -+#endif /* CONFIG_FAIL_FUTEX */ -+ -+/** -+ * futex_hash - Return the hash bucket in the global hash -+ * @key: Pointer to the futex key for which the hash is calculated -+ * -+ * We hash on the keys returned from get_futex_key (see below) and return the -+ * corresponding hash bucket in the global hash. -+ */ -+struct futex_hash_bucket *futex_hash(union futex_key *key) -+{ -+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, -+ key->both.offset); -+ -+ return &futex_queues[hash & (futex_hashsize - 1)]; -+} -+ -+ -+/** -+ * futex_setup_timer - set up the sleeping hrtimer. 
-+ * @time: ptr to the given timeout value -+ * @timeout: the hrtimer_sleeper structure to be set up -+ * @flags: futex flags -+ * @range_ns: optional range in ns -+ * -+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout -+ * value given -+ */ -+struct hrtimer_sleeper * -+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, -+ int flags, u64 range_ns) -+{ -+ if (!time) -+ return NULL; -+ -+ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? -+ CLOCK_REALTIME : CLOCK_MONOTONIC, -+ HRTIMER_MODE_ABS); -+ /* -+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is -+ * effectively the same as calling hrtimer_set_expires(). -+ */ -+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); -+ -+ return timeout; -+} -+ -+/* -+ * Generate a machine wide unique identifier for this inode. -+ * -+ * This relies on u64 not wrapping in the life-time of the machine; which with -+ * 1ns resolution means almost 585 years. -+ * -+ * This further relies on the fact that a well formed program will not unmap -+ * the file while it has a (shared) futex waiting on it. This mapping will have -+ * a file reference which pins the mount and inode. -+ * -+ * If for some reason an inode gets evicted and read back in again, it will get -+ * a new sequence number and will _NOT_ match, even though it is the exact same -+ * file. -+ * -+ * It is important that futex_match() will never have a false-positive, esp. -+ * for PI futexes that can mess up the state. The above argues that false-negatives -+ * are only possible for malformed programs. -+ */ -+static u64 get_inode_sequence_number(struct inode *inode) -+{ -+ static atomic64_t i_seq; -+ u64 old; -+ -+ /* Does the inode already have a sequence number? 
*/ -+ old = atomic64_read(&inode->i_sequence); -+ if (likely(old)) -+ return old; -+ -+ for (;;) { -+ u64 new = atomic64_add_return(1, &i_seq); -+ if (WARN_ON_ONCE(!new)) -+ continue; -+ -+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); -+ if (old) -+ return old; -+ return new; -+ } -+} -+ -+/** -+ * get_futex_key() - Get parameters which are the keys for a futex -+ * @uaddr: virtual address of the futex -+ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED -+ * @key: address where result is stored. -+ * @rw: mapping needs to be read/write (values: FUTEX_READ, -+ * FUTEX_WRITE) -+ * -+ * Return: a negative error code or 0 -+ * -+ * The key words are stored in @key on success. -+ * -+ * For shared mappings (when @fshared), the key is: -+ * -+ * ( inode->i_sequence, page->index, offset_within_page ) -+ * -+ * [ also see get_inode_sequence_number() ] -+ * -+ * For private mappings (or when !@fshared), the key is: -+ * -+ * ( current->mm, address, 0 ) -+ * -+ * This allows (cross process, where applicable) identification of the futex -+ * without keeping the page pinned for the duration of the FUTEX_WAIT. -+ * -+ * lock_page() might sleep, the caller should not hold a spinlock. -+ */ -+int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, -+ enum futex_access rw) -+{ -+ unsigned long address = (unsigned long)uaddr; -+ struct mm_struct *mm = current->mm; -+ struct page *page, *tail; -+ struct address_space *mapping; -+ int err, ro = 0; -+ -+ /* -+ * The futex address must be "naturally" aligned. -+ */ -+ key->both.offset = address % PAGE_SIZE; -+ if (unlikely((address % sizeof(u32)) != 0)) -+ return -EINVAL; -+ address -= key->both.offset; -+ -+ if (unlikely(!access_ok(uaddr, sizeof(u32)))) -+ return -EFAULT; -+ -+ if (unlikely(should_fail_futex(fshared))) -+ return -EFAULT; -+ -+ /* -+ * PROCESS_PRIVATE futexes are fast. 
-+ * As the mm cannot disappear under us and the 'key' only needs -+ * virtual address, we dont even have to find the underlying vma. -+ * Note : We do have to check 'uaddr' is a valid user address, -+ * but access_ok() should be faster than find_vma() -+ */ -+ if (!fshared) { -+ key->private.mm = mm; -+ key->private.address = address; -+ return 0; -+ } -+ -+again: -+ /* Ignore any VERIFY_READ mapping (futex common case) */ -+ if (unlikely(should_fail_futex(true))) -+ return -EFAULT; -+ -+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); -+ /* -+ * If write access is not required (eg. FUTEX_WAIT), try -+ * and get read-only access. -+ */ -+ if (err == -EFAULT && rw == FUTEX_READ) { -+ err = get_user_pages_fast(address, 1, 0, &page); -+ ro = 1; -+ } -+ if (err < 0) -+ return err; -+ else -+ err = 0; -+ -+ /* -+ * The treatment of mapping from this point on is critical. The page -+ * lock protects many things but in this context the page lock -+ * stabilizes mapping, prevents inode freeing in the shared -+ * file-backed region case and guards against movement to swap cache. -+ * -+ * Strictly speaking the page lock is not needed in all cases being -+ * considered here and page lock forces unnecessarily serialization -+ * From this point on, mapping will be re-verified if necessary and -+ * page lock will be acquired only if it is unavoidable -+ * -+ * Mapping checks require the head page for any compound page so the -+ * head page and mapping is looked up now. For anonymous pages, it -+ * does not matter if the page splits in the future as the key is -+ * based on the address. For filesystem-backed pages, the tail is -+ * required as the index of the page determines the key. For -+ * base pages, there is no tail page and tail == page. 
-+ */ -+ tail = page; -+ page = compound_head(page); -+ mapping = READ_ONCE(page->mapping); -+ -+ /* -+ * If page->mapping is NULL, then it cannot be a PageAnon -+ * page; but it might be the ZERO_PAGE or in the gate area or -+ * in a special mapping (all cases which we are happy to fail); -+ * or it may have been a good file page when get_user_pages_fast -+ * found it, but truncated or holepunched or subjected to -+ * invalidate_complete_page2 before we got the page lock (also -+ * cases which we are happy to fail). And we hold a reference, -+ * so refcount care in invalidate_complete_page's remove_mapping -+ * prevents drop_caches from setting mapping to NULL beneath us. -+ * -+ * The case we do have to guard against is when memory pressure made -+ * shmem_writepage move it from filecache to swapcache beneath us: -+ * an unlikely race, but we do need to retry for page->mapping. -+ */ -+ if (unlikely(!mapping)) { -+ int shmem_swizzled; -+ -+ /* -+ * Page lock is required to identify which special case above -+ * applies. If this is really a shmem page then the page lock -+ * will prevent unexpected transitions. -+ */ -+ lock_page(page); -+ shmem_swizzled = PageSwapCache(page) || page->mapping; -+ unlock_page(page); -+ put_page(page); -+ -+ if (shmem_swizzled) -+ goto again; -+ -+ return -EFAULT; -+ } -+ -+ /* -+ * Private mappings are handled in a simple way. -+ * -+ * If the futex key is stored on an anonymous page, then the associated -+ * object is the mm which is implicitly pinned by the calling process. -+ * -+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if -+ * it's a read-only handle, it's expected that futexes attach to -+ * the object not the particular process. -+ */ -+ if (PageAnon(page)) { -+ /* -+ * A RO anonymous page will never change and thus doesn't make -+ * sense for futex operations. 
-+ */ -+ if (unlikely(should_fail_futex(true)) || ro) { -+ err = -EFAULT; -+ goto out; -+ } -+ -+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ -+ key->private.mm = mm; -+ key->private.address = address; -+ -+ } else { -+ struct inode *inode; -+ -+ /* -+ * The associated futex object in this case is the inode and -+ * the page->mapping must be traversed. Ordinarily this should -+ * be stabilised under page lock but it's not strictly -+ * necessary in this case as we just want to pin the inode, not -+ * update the radix tree or anything like that. -+ * -+ * The RCU read lock is taken as the inode is finally freed -+ * under RCU. If the mapping still matches expectations then the -+ * mapping->host can be safely accessed as being a valid inode. -+ */ -+ rcu_read_lock(); -+ -+ if (READ_ONCE(page->mapping) != mapping) { -+ rcu_read_unlock(); -+ put_page(page); -+ -+ goto again; -+ } -+ -+ inode = READ_ONCE(mapping->host); -+ if (!inode) { -+ rcu_read_unlock(); -+ put_page(page); -+ -+ goto again; -+ } -+ -+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */ -+ key->shared.i_seq = get_inode_sequence_number(inode); -+ key->shared.pgoff = page_to_pgoff(tail); -+ rcu_read_unlock(); -+ } -+ -+out: -+ put_page(page); -+ return err; -+} -+ -+/** -+ * fault_in_user_writeable() - Fault in user address and verify RW access -+ * @uaddr: pointer to faulting user space address -+ * -+ * Slow path to fixup the fault we just took in the atomic write -+ * access to @uaddr. -+ * -+ * We have no generic implementation of a non-destructive write to the -+ * user address. We know that we faulted in the atomic pagefault -+ * disabled section so we can as well avoid the #PF overhead by -+ * calling get_user_pages() right away. 
-+ */ -+int fault_in_user_writeable(u32 __user *uaddr) -+{ -+ struct mm_struct *mm = current->mm; -+ int ret; -+ -+ mmap_read_lock(mm); -+ ret = fixup_user_fault(mm, (unsigned long)uaddr, -+ FAULT_FLAG_WRITE, NULL); -+ mmap_read_unlock(mm); -+ -+ return ret < 0 ? ret : 0; -+} -+ -+/** -+ * futex_top_waiter() - Return the highest priority waiter on a futex -+ * @hb: the hash bucket the futex_q's reside in -+ * @key: the futex key (to distinguish it from other futex futex_q's) -+ * -+ * Must be called with the hb lock held. -+ */ -+struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) -+{ -+ struct futex_q *this; -+ -+ plist_for_each_entry(this, &hb->chain, list) { -+ if (futex_match(&this->key, key)) -+ return this; -+ } -+ return NULL; -+} -+ -+int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) -+{ -+ int ret; -+ -+ pagefault_disable(); -+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); -+ pagefault_enable(); -+ -+ return ret; -+} -+ -+int futex_get_value_locked(u32 *dest, u32 __user *from) -+{ -+ int ret; -+ -+ pagefault_disable(); -+ ret = __get_user(*dest, from); -+ pagefault_enable(); -+ -+ return ret ? -EFAULT : 0; -+} -+ -+/** -+ * wait_for_owner_exiting - Block until the owner has exited -+ * @ret: owner's current futex lock status -+ * @exiting: Pointer to the exiting task -+ * -+ * Caller must hold a refcount on @exiting. -+ */ -+void wait_for_owner_exiting(int ret, struct task_struct *exiting) -+{ -+ if (ret != -EBUSY) { -+ WARN_ON_ONCE(exiting); -+ return; -+ } -+ -+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) -+ return; -+ -+ mutex_lock(&exiting->futex_exit_mutex); -+ /* -+ * No point in doing state checking here. If the waiter got here -+ * while the task was in exec()->exec_futex_release() then it can -+ * have any FUTEX_STATE_* value when the waiter has acquired the -+ * mutex. OK, if running, EXITING or DEAD if it reached exit() -+ * already. 
Highly unlikely and not a problem. Just one more round -+ * through the futex maze. -+ */ -+ mutex_unlock(&exiting->futex_exit_mutex); -+ -+ put_task_struct(exiting); -+} -+ -+/** -+ * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket -+ * @q: The futex_q to unqueue -+ * -+ * The q->lock_ptr must not be NULL and must be held by the caller. -+ */ -+void __futex_unqueue(struct futex_q *q) -+{ -+ struct futex_hash_bucket *hb; -+ -+ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) -+ return; -+ lockdep_assert_held(q->lock_ptr); -+ -+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); -+ plist_del(&q->list, &hb->chain); -+ futex_hb_waiters_dec(hb); -+} -+ -+/* The key must be already stored in q->key. */ -+struct futex_hash_bucket *futex_q_lock(struct futex_q *q) -+ __acquires(&hb->lock) -+{ -+ struct futex_hash_bucket *hb; -+ -+ hb = futex_hash(&q->key); -+ -+ /* -+ * Increment the counter before taking the lock so that -+ * a potential waker won't miss a to-be-slept task that is -+ * waiting for the spinlock. This is safe as all futex_q_lock() -+ * users end up calling futex_queue(). Similarly, for housekeeping, -+ * decrement the counter at futex_q_unlock() when some error has -+ * occurred and we don't end up adding the task to the list. -+ */ -+ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ -+ -+ q->lock_ptr = &hb->lock; -+ -+ spin_lock(&hb->lock); -+ return hb; -+} -+ -+void futex_q_unlock(struct futex_hash_bucket *hb) -+ __releases(&hb->lock) -+{ -+ spin_unlock(&hb->lock); -+ futex_hb_waiters_dec(hb); -+} -+ -+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) -+{ -+ int prio; -+ -+ /* -+ * The priority used to register this element is -+ * - either the real thread-priority for the real-time threads -+ * (i.e. threads with a priority lower than MAX_RT_PRIO) -+ * - or MAX_RT_PRIO for non-RT threads. 
-+ * Thus, all RT-threads are woken first in priority order, and -+ * the others are woken last, in FIFO order. -+ */ -+ prio = min(current->normal_prio, MAX_RT_PRIO); -+ -+ plist_node_init(&q->list, prio); -+ plist_add(&q->list, &hb->chain); -+ q->task = current; -+} -+ -+/** -+ * futex_unqueue() - Remove the futex_q from its futex_hash_bucket -+ * @q: The futex_q to unqueue -+ * -+ * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must -+ * be paired with exactly one earlier call to futex_queue(). -+ * -+ * Return: -+ * - 1 - if the futex_q was still queued (and we removed unqueued it); -+ * - 0 - if the futex_q was already removed by the waking thread -+ */ -+int futex_unqueue(struct futex_q *q) -+{ -+ spinlock_t *lock_ptr; -+ int ret = 0; -+ -+ /* In the common case we don't take the spinlock, which is nice. */ -+retry: -+ /* -+ * q->lock_ptr can change between this read and the following spin_lock. -+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and -+ * optimizing lock_ptr out of the logic below. -+ */ -+ lock_ptr = READ_ONCE(q->lock_ptr); -+ if (lock_ptr != NULL) { -+ spin_lock(lock_ptr); -+ /* -+ * q->lock_ptr can change between reading it and -+ * spin_lock(), causing us to take the wrong lock. This -+ * corrects the race condition. -+ * -+ * Reasoning goes like this: if we have the wrong lock, -+ * q->lock_ptr must have changed (maybe several times) -+ * between reading it and the spin_lock(). It can -+ * change again after the spin_lock() but only if it was -+ * already changed before the spin_lock(). It cannot, -+ * however, change back to the original value. Therefore -+ * we can detect whether we acquired the correct lock. 
-+ */ -+ if (unlikely(lock_ptr != q->lock_ptr)) { -+ spin_unlock(lock_ptr); -+ goto retry; -+ } -+ __futex_unqueue(q); -+ -+ BUG_ON(q->pi_state); -+ -+ spin_unlock(lock_ptr); -+ ret = 1; -+ } -+ -+ return ret; -+} -+ -+/* -+ * PI futexes can not be requeued and must remove themselves from the -+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. -+ */ -+void futex_unqueue_pi(struct futex_q *q) -+{ -+ __futex_unqueue(q); -+ -+ BUG_ON(!q->pi_state); -+ put_pi_state(q->pi_state); -+ q->pi_state = NULL; -+} -+ -+/* Constants for the pending_op argument of handle_futex_death */ -+#define HANDLE_DEATH_PENDING true -+#define HANDLE_DEATH_LIST false -+ -+/* -+ * Process a futex-list entry, check whether it's owned by the -+ * dying task, and do notification if so: -+ */ -+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, -+ bool pi, bool pending_op) -+{ -+ u32 uval, nval, mval; -+ int err; -+ -+ /* Futex address must be 32bit aligned */ -+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) -+ return -1; -+ -+retry: -+ if (get_user(uval, uaddr)) -+ return -1; -+ -+ /* -+ * Special case for regular (non PI) futexes. The unlock path in -+ * user space has two race scenarios: -+ * -+ * 1. The unlock path releases the user space futex value and -+ * before it can execute the futex() syscall to wake up -+ * waiters it is killed. -+ * -+ * 2. A woken up waiter is killed before it can acquire the -+ * futex in user space. -+ * -+ * In both cases the TID validation below prevents a wakeup of -+ * potential waiters which can cause these waiters to block -+ * forever. 
-+ * -+ * In both cases the following conditions are met: -+ * -+ * 1) task->robust_list->list_op_pending != NULL -+ * @pending_op == true -+ * 2) User space futex value == 0 -+ * 3) Regular futex: @pi == false -+ * -+ * If these conditions are met, it is safe to attempt waking up a -+ * potential waiter without touching the user space futex value and -+ * trying to set the OWNER_DIED bit. The user space futex value is -+ * uncontended and the rest of the user space mutex state is -+ * consistent, so a woken waiter will just take over the -+ * uncontended futex. Setting the OWNER_DIED bit would create -+ * inconsistent state and malfunction of the user space owner died -+ * handling. -+ */ -+ if (pending_op && !pi && !uval) { -+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); -+ return 0; -+ } -+ -+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) -+ return 0; -+ -+ /* -+ * Ok, this dying thread is truly holding a futex -+ * of interest. Set the OWNER_DIED bit atomically -+ * via cmpxchg, and if the value had FUTEX_WAITERS -+ * set, wake up a waiter (if any). (We have to do a -+ * futex_wake() even if OWNER_DIED is already set - -+ * to handle the rare but possible case of recursive -+ * thread-death.) The rest of the cleanup is done in -+ * userspace. -+ */ -+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; -+ -+ /* -+ * We are not holding a lock here, but we want to have -+ * the pagefault_disable/enable() protection because -+ * we want to handle the fault gracefully. If the -+ * access fails we try to fault in the futex with R/W -+ * verification via get_user_pages. get_user() above -+ * does not guarantee R/W access. If that fails we -+ * give up and leave the futex locked. 
-+ */ -+ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { -+ switch (err) { -+ case -EFAULT: -+ if (fault_in_user_writeable(uaddr)) -+ return -1; -+ goto retry; -+ -+ case -EAGAIN: -+ cond_resched(); -+ goto retry; -+ -+ default: -+ WARN_ON_ONCE(1); -+ return err; -+ } -+ } -+ -+ if (nval != uval) -+ goto retry; -+ -+ /* -+ * Wake robust non-PI futexes here. The wakeup of -+ * PI futexes happens in exit_pi_state(): -+ */ -+ if (!pi && (uval & FUTEX_WAITERS)) -+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); -+ -+ return 0; -+} -+ -+/* -+ * Fetch a robust-list pointer. Bit 0 signals PI futexes: -+ */ -+static inline int fetch_robust_entry(struct robust_list __user **entry, -+ struct robust_list __user * __user *head, -+ unsigned int *pi) -+{ -+ unsigned long uentry; -+ -+ if (get_user(uentry, (unsigned long __user *)head)) -+ return -EFAULT; -+ -+ *entry = (void __user *)(uentry & ~1UL); -+ *pi = uentry & 1; -+ -+ return 0; -+} -+ -+/* -+ * Walk curr->robust_list (very carefully, it's a userspace list!) -+ * and mark any locks found there dead, and notify any waiters. -+ * -+ * We silently return on any sign of list-walking problem. 
-+ */ -+static void exit_robust_list(struct task_struct *curr) -+{ -+ struct robust_list_head __user *head = curr->robust_list; -+ struct robust_list __user *entry, *next_entry, *pending; -+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; -+ unsigned int next_pi; -+ unsigned long futex_offset; -+ int rc; -+ -+ if (!futex_cmpxchg_enabled) -+ return; -+ -+ /* -+ * Fetch the list head (which was registered earlier, via -+ * sys_set_robust_list()): -+ */ -+ if (fetch_robust_entry(&entry, &head->list.next, &pi)) -+ return; -+ /* -+ * Fetch the relative futex offset: -+ */ -+ if (get_user(futex_offset, &head->futex_offset)) -+ return; -+ /* -+ * Fetch any possibly pending lock-add first, and handle it -+ * if it exists: -+ */ -+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) -+ return; -+ -+ next_entry = NULL; /* avoid warning with gcc */ -+ while (entry != &head->list) { -+ /* -+ * Fetch the next entry in the list before calling -+ * handle_futex_death: -+ */ -+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); -+ /* -+ * A pending lock might already be on the list, so -+ * don't process it twice: -+ */ -+ if (entry != pending) { -+ if (handle_futex_death((void __user *)entry + futex_offset, -+ curr, pi, HANDLE_DEATH_LIST)) -+ return; -+ } -+ if (rc) -+ return; -+ entry = next_entry; -+ pi = next_pi; -+ /* -+ * Avoid excessively long or circular lists: -+ */ -+ if (!--limit) -+ break; -+ -+ cond_resched(); -+ } -+ -+ if (pending) { -+ handle_futex_death((void __user *)pending + futex_offset, -+ curr, pip, HANDLE_DEATH_PENDING); -+ } -+} -+ -+#ifdef CONFIG_COMPAT -+static void __user *futex_uaddr(struct robust_list __user *entry, -+ compat_long_t futex_offset) -+{ -+ compat_uptr_t base = ptr_to_compat(entry); -+ void __user *uaddr = compat_ptr(base + futex_offset); -+ -+ return uaddr; -+} -+ -+/* -+ * Fetch a robust-list pointer. 
Bit 0 signals PI futexes: -+ */ -+static inline int -+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, -+ compat_uptr_t __user *head, unsigned int *pi) -+{ -+ if (get_user(*uentry, head)) -+ return -EFAULT; -+ -+ *entry = compat_ptr((*uentry) & ~1); -+ *pi = (unsigned int)(*uentry) & 1; -+ -+ return 0; -+} -+ -+/* -+ * Walk curr->robust_list (very carefully, it's a userspace list!) -+ * and mark any locks found there dead, and notify any waiters. -+ * -+ * We silently return on any sign of list-walking problem. -+ */ -+static void compat_exit_robust_list(struct task_struct *curr) -+{ -+ struct compat_robust_list_head __user *head = curr->compat_robust_list; -+ struct robust_list __user *entry, *next_entry, *pending; -+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; -+ unsigned int next_pi; -+ compat_uptr_t uentry, next_uentry, upending; -+ compat_long_t futex_offset; -+ int rc; -+ -+ if (!futex_cmpxchg_enabled) -+ return; -+ -+ /* -+ * Fetch the list head (which was registered earlier, via -+ * sys_set_robust_list()): -+ */ -+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) -+ return; -+ /* -+ * Fetch the relative futex offset: -+ */ -+ if (get_user(futex_offset, &head->futex_offset)) -+ return; -+ /* -+ * Fetch any possibly pending lock-add first, and handle it -+ * if it exists: -+ */ -+ if (compat_fetch_robust_entry(&upending, &pending, -+ &head->list_op_pending, &pip)) -+ return; -+ -+ next_entry = NULL; /* avoid warning with gcc */ -+ while (entry != (struct robust_list __user *) &head->list) { -+ /* -+ * Fetch the next entry in the list before calling -+ * handle_futex_death: -+ */ -+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry, -+ (compat_uptr_t __user *)&entry->next, &next_pi); -+ /* -+ * A pending lock might already be on the list, so -+ * dont process it twice: -+ */ -+ if (entry != pending) { -+ void __user *uaddr = futex_uaddr(entry, futex_offset); -+ -+ if 
(handle_futex_death(uaddr, curr, pi, -+ HANDLE_DEATH_LIST)) -+ return; -+ } -+ if (rc) -+ return; -+ uentry = next_uentry; -+ entry = next_entry; -+ pi = next_pi; -+ /* -+ * Avoid excessively long or circular lists: -+ */ -+ if (!--limit) -+ break; -+ -+ cond_resched(); -+ } -+ if (pending) { -+ void __user *uaddr = futex_uaddr(pending, futex_offset); -+ -+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); -+ } -+} -+#endif -+ -+#ifdef CONFIG_FUTEX_PI -+ -+/* -+ * This task is holding PI mutexes at exit time => bad. -+ * Kernel cleans up PI-state, but userspace is likely hosed. -+ * (Robust-futex cleanup is separate and might save the day for userspace.) -+ */ -+static void exit_pi_state_list(struct task_struct *curr) -+{ -+ struct list_head *next, *head = &curr->pi_state_list; -+ struct futex_pi_state *pi_state; -+ struct futex_hash_bucket *hb; -+ union futex_key key = FUTEX_KEY_INIT; -+ -+ if (!futex_cmpxchg_enabled) -+ return; -+ /* -+ * We are a ZOMBIE and nobody can enqueue itself on -+ * pi_state_list anymore, but we have to be careful -+ * versus waiters unqueueing themselves: -+ */ -+ raw_spin_lock_irq(&curr->pi_lock); -+ while (!list_empty(head)) { -+ next = head->next; -+ pi_state = list_entry(next, struct futex_pi_state, list); -+ key = pi_state->key; -+ hb = futex_hash(&key); -+ -+ /* -+ * We can race against put_pi_state() removing itself from the -+ * list (a waiter going away). put_pi_state() will first -+ * decrement the reference count and then modify the list, so -+ * its possible to see the list entry but fail this reference -+ * acquire. -+ * -+ * In that case; drop the locks to let put_pi_state() make -+ * progress and retry the loop. 
-+ */ -+ if (!refcount_inc_not_zero(&pi_state->refcount)) { -+ raw_spin_unlock_irq(&curr->pi_lock); -+ cpu_relax(); -+ raw_spin_lock_irq(&curr->pi_lock); -+ continue; -+ } -+ raw_spin_unlock_irq(&curr->pi_lock); -+ -+ spin_lock(&hb->lock); -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ raw_spin_lock(&curr->pi_lock); -+ /* -+ * We dropped the pi-lock, so re-check whether this -+ * task still owns the PI-state: -+ */ -+ if (head->next != next) { -+ /* retain curr->pi_lock for the loop invariant */ -+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); -+ spin_unlock(&hb->lock); -+ put_pi_state(pi_state); -+ continue; -+ } -+ -+ WARN_ON(pi_state->owner != curr); -+ WARN_ON(list_empty(&pi_state->list)); -+ list_del_init(&pi_state->list); -+ pi_state->owner = NULL; -+ -+ raw_spin_unlock(&curr->pi_lock); -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ spin_unlock(&hb->lock); -+ -+ rt_mutex_futex_unlock(&pi_state->pi_mutex); -+ put_pi_state(pi_state); -+ -+ raw_spin_lock_irq(&curr->pi_lock); -+ } -+ raw_spin_unlock_irq(&curr->pi_lock); -+} -+#else -+static inline void exit_pi_state_list(struct task_struct *curr) { } -+#endif -+ -+static void futex_cleanup(struct task_struct *tsk) -+{ -+ if (unlikely(tsk->robust_list)) { -+ exit_robust_list(tsk); -+ tsk->robust_list = NULL; -+ } -+ -+#ifdef CONFIG_COMPAT -+ if (unlikely(tsk->compat_robust_list)) { -+ compat_exit_robust_list(tsk); -+ tsk->compat_robust_list = NULL; -+ } -+#endif -+ -+ if (unlikely(!list_empty(&tsk->pi_state_list))) -+ exit_pi_state_list(tsk); -+} -+ -+/** -+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD -+ * @tsk: task to set the state on -+ * -+ * Set the futex exit state of the task lockless. The futex waiter code -+ * observes that state when a task is exiting and loops until the task has -+ * actually finished the futex cleanup. The worst case for this is that the -+ * waiter runs through the wait loop until the state becomes visible. 
-+ * -+ * This is called from the recursive fault handling path in do_exit(). -+ * -+ * This is best effort. Either the futex exit code has run already or -+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can -+ * take it over. If not, the problem is pushed back to user space. If the -+ * futex exit code did not run yet, then an already queued waiter might -+ * block forever, but there is nothing which can be done about that. -+ */ -+void futex_exit_recursive(struct task_struct *tsk) -+{ -+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ -+ if (tsk->futex_state == FUTEX_STATE_EXITING) -+ mutex_unlock(&tsk->futex_exit_mutex); -+ tsk->futex_state = FUTEX_STATE_DEAD; -+} -+ -+static void futex_cleanup_begin(struct task_struct *tsk) -+{ -+ /* -+ * Prevent various race issues against a concurrent incoming waiter -+ * including live locks by forcing the waiter to block on -+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in -+ * attach_to_pi_owner(). -+ */ -+ mutex_lock(&tsk->futex_exit_mutex); -+ -+ /* -+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. -+ * -+ * This ensures that all subsequent checks of tsk->futex_state in -+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with -+ * tsk->pi_lock held. -+ * -+ * It guarantees also that a pi_state which was queued right before -+ * the state change under tsk->pi_lock by a concurrent waiter must -+ * be observed in exit_pi_state_list(). -+ */ -+ raw_spin_lock_irq(&tsk->pi_lock); -+ tsk->futex_state = FUTEX_STATE_EXITING; -+ raw_spin_unlock_irq(&tsk->pi_lock); -+} -+ -+static void futex_cleanup_end(struct task_struct *tsk, int state) -+{ -+ /* -+ * Lockless store. The only side effect is that an observer might -+ * take another loop until it becomes visible. -+ */ -+ tsk->futex_state = state; -+ /* -+ * Drop the exit protection. This unblocks waiters which observed -+ * FUTEX_STATE_EXITING to reevaluate the state. 
-+ */ -+ mutex_unlock(&tsk->futex_exit_mutex); -+} -+ -+void futex_exec_release(struct task_struct *tsk) -+{ -+ /* -+ * The state handling is done for consistency, but in the case of -+ * exec() there is no way to prevent further damage as the PID stays -+ * the same. But for the unlikely and arguably buggy case that a -+ * futex is held on exec(), this provides at least as much state -+ * consistency protection which is possible. -+ */ -+ futex_cleanup_begin(tsk); -+ futex_cleanup(tsk); -+ /* -+ * Reset the state to FUTEX_STATE_OK. The task is alive and about -+ * exec a new binary. -+ */ -+ futex_cleanup_end(tsk, FUTEX_STATE_OK); -+} -+ -+void futex_exit_release(struct task_struct *tsk) -+{ -+ futex_cleanup_begin(tsk); -+ futex_cleanup(tsk); -+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD); -+} -+ -+static void __init futex_detect_cmpxchg(void) -+{ -+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -+ u32 curval; -+ -+ /* -+ * This will fail and we want it. Some arch implementations do -+ * runtime detection of the futex_atomic_cmpxchg_inatomic() -+ * functionality. We want to know that before we call in any -+ * of the complex code paths. Also we want to prevent -+ * registration of robust lists in that case. NULL is -+ * guaranteed to fault and we get -EFAULT on functional -+ * implementation, the non-functional ones will return -+ * -ENOSYS. -+ */ -+ if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT) -+ futex_cmpxchg_enabled = 1; -+#endif -+} -+ -+static int __init futex_init(void) -+{ -+ unsigned int futex_shift; -+ unsigned long i; -+ -+#if CONFIG_BASE_SMALL -+ futex_hashsize = 16; -+#else -+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); -+#endif -+ -+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), -+ futex_hashsize, 0, -+ futex_hashsize < 256 ? 
HASH_SMALL : 0, -+ &futex_shift, NULL, -+ futex_hashsize, futex_hashsize); -+ futex_hashsize = 1UL << futex_shift; -+ -+ futex_detect_cmpxchg(); -+ -+ for (i = 0; i < futex_hashsize; i++) { -+ atomic_set(&futex_queues[i].waiters, 0); -+ plist_head_init(&futex_queues[i].chain); -+ spin_lock_init(&futex_queues[i].lock); -+ } -+ -+ return 0; -+} -+core_initcall(futex_init); -diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h -new file mode 100644 -index 000000000..948fcf317 ---- /dev/null -+++ b/kernel/futex/futex.h -@@ -0,0 +1,295 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _FUTEX_H -+#define _FUTEX_H -+ -+#include <linux/futex.h> -+#include <linux/sched/wake_q.h> -+ -+#include <asm/futex.h> -+ -+/* -+ * Futex flags used to encode options to functions and preserve them across -+ * restarts. -+ */ -+#ifdef CONFIG_MMU -+# define FLAGS_SHARED 0x01 -+#else -+/* -+ * NOMMU does not have per process address space. Let the compiler optimize -+ * code away. -+ */ -+# define FLAGS_SHARED 0x00 -+#endif -+#define FLAGS_CLOCKRT 0x02 -+#define FLAGS_HAS_TIMEOUT 0x04 -+ -+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG -+#define futex_cmpxchg_enabled 1 -+#else -+extern int __read_mostly futex_cmpxchg_enabled; -+#endif -+ -+#ifdef CONFIG_FAIL_FUTEX -+extern bool should_fail_futex(bool fshared); -+#else -+static inline bool should_fail_futex(bool fshared) -+{ -+ return false; -+} -+#endif -+ -+/* -+ * Hash buckets are shared by all the futex_keys that hash to the same -+ * location. Each key may have multiple futex_q structures, one for each task -+ * waiting on a futex. 
-+ */ -+struct futex_hash_bucket { -+ atomic_t waiters; -+ spinlock_t lock; -+ struct plist_head chain; -+} ____cacheline_aligned_in_smp; -+ -+/* -+ * Priority Inheritance state: -+ */ -+struct futex_pi_state { -+ /* -+ * list of 'owned' pi_state instances - these have to be -+ * cleaned up in do_exit() if the task exits prematurely: -+ */ -+ struct list_head list; -+ -+ /* -+ * The PI object: -+ */ -+ struct rt_mutex_base pi_mutex; -+ -+ struct task_struct *owner; -+ refcount_t refcount; -+ -+ union futex_key key; -+} __randomize_layout; -+ -+/** -+ * struct futex_q - The hashed futex queue entry, one per waiting task -+ * @list: priority-sorted list of tasks waiting on this futex -+ * @task: the task waiting on the futex -+ * @lock_ptr: the hash bucket lock -+ * @key: the key the futex is hashed on -+ * @pi_state: optional priority inheritance state -+ * @rt_waiter: rt_waiter storage for use with requeue_pi -+ * @requeue_pi_key: the requeue_pi target futex key -+ * @bitset: bitset for the optional bitmasked wakeup -+ * @requeue_state: State field for futex_requeue_pi() -+ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only) -+ * -+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so -+ * we can wake only the relevant ones (hashed queues may be shared). -+ * -+ * A futex_q has a woken state, just like tasks have TASK_RUNNING. -+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. -+ * The order of wakeup is always to make the first condition true, then -+ * the second. -+ * -+ * PI futexes are typically woken before they are removed from the hash list via -+ * the rt_mutex code. See futex_unqueue_pi(). 
-+ */ -+struct futex_q { -+ struct plist_node list; -+ -+ struct task_struct *task; -+ spinlock_t *lock_ptr; -+ union futex_key key; -+ struct futex_pi_state *pi_state; -+ struct rt_mutex_waiter *rt_waiter; -+ union futex_key *requeue_pi_key; -+ u32 bitset; -+ atomic_t requeue_state; -+#ifdef CONFIG_PREEMPT_RT -+ struct rcuwait requeue_wait; -+#endif -+} __randomize_layout; -+ -+extern const struct futex_q futex_q_init; -+ -+enum futex_access { -+ FUTEX_READ, -+ FUTEX_WRITE -+}; -+ -+extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, -+ enum futex_access rw); -+ -+extern struct hrtimer_sleeper * -+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, -+ int flags, u64 range_ns); -+ -+extern struct futex_hash_bucket *futex_hash(union futex_key *key); -+ -+/** -+ * futex_match - Check whether two futex keys are equal -+ * @key1: Pointer to key1 -+ * @key2: Pointer to key2 -+ * -+ * Return 1 if two futex_keys are equal, 0 otherwise. -+ */ -+static inline int futex_match(union futex_key *key1, union futex_key *key2) -+{ -+ return (key1 && key2 -+ && key1->both.word == key2->both.word -+ && key1->both.ptr == key2->both.ptr -+ && key1->both.offset == key2->both.offset); -+} -+ -+extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, -+ struct futex_q *q, struct futex_hash_bucket **hb); -+extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, -+ struct hrtimer_sleeper *timeout); -+extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); -+ -+extern int fault_in_user_writeable(u32 __user *uaddr); -+extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval); -+extern int futex_get_value_locked(u32 *dest, u32 __user *from); -+extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); -+ -+extern void __futex_unqueue(struct futex_q *q); -+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket 
*hb); -+extern int futex_unqueue(struct futex_q *q); -+ -+/** -+ * futex_queue() - Enqueue the futex_q on the futex_hash_bucket -+ * @q: The futex_q to enqueue -+ * @hb: The destination hash bucket -+ * -+ * The hb->lock must be held by the caller, and is released here. A call to -+ * futex_queue() is typically paired with exactly one call to futex_unqueue(). The -+ * exceptions involve the PI related operations, which may use futex_unqueue_pi() -+ * or nothing if the unqueue is done as part of the wake process and the unqueue -+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for -+ * an example). -+ */ -+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) -+ __releases(&hb->lock) -+{ -+ __futex_queue(q, hb); -+ spin_unlock(&hb->lock); -+} -+ -+extern void futex_unqueue_pi(struct futex_q *q); -+ -+extern void wait_for_owner_exiting(int ret, struct task_struct *exiting); -+ -+/* -+ * Reflects a new waiter being added to the waitqueue. -+ */ -+static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb) -+{ -+#ifdef CONFIG_SMP -+ atomic_inc(&hb->waiters); -+ /* -+ * Full barrier (A), see the ordering comment above. -+ */ -+ smp_mb__after_atomic(); -+#endif -+} -+ -+/* -+ * Reflects a waiter being removed from the waitqueue by wakeup -+ * paths. -+ */ -+static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb) -+{ -+#ifdef CONFIG_SMP -+ atomic_dec(&hb->waiters); -+#endif -+} -+ -+static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * Full barrier (B), see the ordering comment above. 
-+ */ -+ smp_mb(); -+ return atomic_read(&hb->waiters); -+#else -+ return 1; -+#endif -+} -+ -+extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q); -+extern void futex_q_unlock(struct futex_hash_bucket *hb); -+ -+ -+extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, -+ union futex_key *key, -+ struct futex_pi_state **ps, -+ struct task_struct *task, -+ struct task_struct **exiting, -+ int set_waiters); -+ -+extern int refill_pi_state_cache(void); -+extern void get_pi_state(struct futex_pi_state *pi_state); -+extern void put_pi_state(struct futex_pi_state *pi_state); -+extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked); -+ -+/* -+ * Express the locking dependencies for lockdep: -+ */ -+static inline void -+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -+{ -+ if (hb1 > hb2) -+ swap(hb1, hb2); -+ -+ spin_lock(&hb1->lock); -+ if (hb1 != hb2) -+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); -+} -+ -+static inline void -+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) -+{ -+ spin_unlock(&hb1->lock); -+ if (hb1 != hb2) -+ spin_unlock(&hb2->lock); -+} -+ -+/* syscalls */ -+ -+extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 -+ val, ktime_t *abs_time, u32 bitset, u32 __user -+ *uaddr2); -+ -+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags, -+ u32 __user *uaddr2, int nr_wake, int nr_requeue, -+ u32 *cmpval, int requeue_pi); -+ -+extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, -+ ktime_t *abs_time, u32 bitset); -+ -+/** -+ * struct futex_vector - Auxiliary struct for futex_waitv() -+ * @w: Userspace provided data -+ * @q: Kernel side data -+ * -+ * Struct used to build an array with all data need for futex_waitv() -+ */ -+struct futex_vector { -+ struct futex_waitv w; -+ struct futex_q q; -+}; -+ -+extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count, -+ 
struct hrtimer_sleeper *to); -+ -+extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset); -+ -+extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags, -+ u32 __user *uaddr2, int nr_wake, int nr_wake2, int op); -+ -+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags); -+ -+extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock); -+ -+#endif /* _FUTEX_H */ -diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c -new file mode 100644 -index 000000000..183b28c32 ---- /dev/null -+++ b/kernel/futex/pi.c -@@ -0,0 +1,1233 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+ -+#include <linux/slab.h> -+#include <linux/sched/task.h> -+ -+#include "futex.h" -+#include "../locking/rtmutex_common.h" -+ -+/* -+ * PI code: -+ */ -+int refill_pi_state_cache(void) -+{ -+ struct futex_pi_state *pi_state; -+ -+ if (likely(current->pi_state_cache)) -+ return 0; -+ -+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); -+ -+ if (!pi_state) -+ return -ENOMEM; -+ -+ INIT_LIST_HEAD(&pi_state->list); -+ /* pi_mutex gets initialized later */ -+ pi_state->owner = NULL; -+ refcount_set(&pi_state->refcount, 1); -+ pi_state->key = FUTEX_KEY_INIT; -+ -+ current->pi_state_cache = pi_state; -+ -+ return 0; -+} -+ -+static struct futex_pi_state *alloc_pi_state(void) -+{ -+ struct futex_pi_state *pi_state = current->pi_state_cache; -+ -+ WARN_ON(!pi_state); -+ current->pi_state_cache = NULL; -+ -+ return pi_state; -+} -+ -+static void pi_state_update_owner(struct futex_pi_state *pi_state, -+ struct task_struct *new_owner) -+{ -+ struct task_struct *old_owner = pi_state->owner; -+ -+ lockdep_assert_held(&pi_state->pi_mutex.wait_lock); -+ -+ if (old_owner) { -+ raw_spin_lock(&old_owner->pi_lock); -+ WARN_ON(list_empty(&pi_state->list)); -+ list_del_init(&pi_state->list); -+ raw_spin_unlock(&old_owner->pi_lock); -+ } -+ -+ if (new_owner) { -+ raw_spin_lock(&new_owner->pi_lock); -+ WARN_ON(!list_empty(&pi_state->list)); 
-+ list_add(&pi_state->list, &new_owner->pi_state_list); -+ pi_state->owner = new_owner; -+ raw_spin_unlock(&new_owner->pi_lock); -+ } -+} -+ -+void get_pi_state(struct futex_pi_state *pi_state) -+{ -+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); -+} -+ -+/* -+ * Drops a reference to the pi_state object and frees or caches it -+ * when the last reference is gone. -+ */ -+void put_pi_state(struct futex_pi_state *pi_state) -+{ -+ if (!pi_state) -+ return; -+ -+ if (!refcount_dec_and_test(&pi_state->refcount)) -+ return; -+ -+ /* -+ * If pi_state->owner is NULL, the owner is most probably dying -+ * and has cleaned up the pi_state already -+ */ -+ if (pi_state->owner) { -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); -+ pi_state_update_owner(pi_state, NULL); -+ rt_mutex_proxy_unlock(&pi_state->pi_mutex); -+ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); -+ } -+ -+ if (current->pi_state_cache) { -+ kfree(pi_state); -+ } else { -+ /* -+ * pi_state->list is already empty. -+ * clear pi_state->owner. -+ * refcount is at 0 - put it back to 1. -+ */ -+ pi_state->owner = NULL; -+ refcount_set(&pi_state->refcount, 1); -+ current->pi_state_cache = pi_state; -+ } -+} -+ -+/* -+ * We need to check the following states: -+ * -+ * Waiter | pi_state | pi->owner | uTID | uODIED | ? -+ * -+ * [1] NULL | --- | --- | 0 | 0/1 | Valid -+ * [2] NULL | --- | --- | >0 | 0/1 | Valid -+ * -+ * [3] Found | NULL | -- | Any | 0/1 | Invalid -+ * -+ * [4] Found | Found | NULL | 0 | 1 | Valid -+ * [5] Found | Found | NULL | >0 | 1 | Invalid -+ * -+ * [6] Found | Found | task | 0 | 1 | Valid -+ * -+ * [7] Found | Found | NULL | Any | 0 | Invalid -+ * -+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid -+ * [9] Found | Found | task | 0 | 0 | Invalid -+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid -+ * -+ * [1] Indicates that the kernel can acquire the futex atomically. 
We -+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. -+ * -+ * [2] Valid, if TID does not belong to a kernel thread. If no matching -+ * thread is found then it indicates that the owner TID has died. -+ * -+ * [3] Invalid. The waiter is queued on a non PI futex -+ * -+ * [4] Valid state after exit_robust_list(), which sets the user space -+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. -+ * -+ * [5] The user space value got manipulated between exit_robust_list() -+ * and exit_pi_state_list() -+ * -+ * [6] Valid state after exit_pi_state_list() which sets the new owner in -+ * the pi_state but cannot access the user space value. -+ * -+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. -+ * -+ * [8] Owner and user space value match -+ * -+ * [9] There is no transient state which sets the user space TID to 0 -+ * except exit_robust_list(), but this is indicated by the -+ * FUTEX_OWNER_DIED bit. See [4] -+ * -+ * [10] There is no transient state which leaves owner and user space -+ * TID out of sync. Except one error case where the kernel is denied -+ * write access to the user address, see fixup_pi_state_owner(). -+ * -+ * -+ * Serialization and lifetime rules: -+ * -+ * hb->lock: -+ * -+ * hb -> futex_q, relation -+ * futex_q -> pi_state, relation -+ * -+ * (cannot be raw because hb can contain arbitrary amount -+ * of futex_q's) -+ * -+ * pi_mutex->wait_lock: -+ * -+ * {uval, pi_state} -+ * -+ * (and pi_mutex 'obviously') -+ * -+ * p->pi_lock: -+ * -+ * p->pi_state_list -> pi_state->list, relation -+ * pi_mutex->owner -> pi_state->owner, relation -+ * -+ * pi_state->refcount: -+ * -+ * pi_state lifetime -+ * -+ * -+ * Lock order: -+ * -+ * hb->lock -+ * pi_mutex->wait_lock -+ * p->pi_lock -+ * -+ */ -+ -+/* -+ * Validate that the existing waiter has a pi_state and sanity check -+ * the pi_state against the user space value. If correct, attach to -+ * it. 
-+ */ -+static int attach_to_pi_state(u32 __user *uaddr, u32 uval, -+ struct futex_pi_state *pi_state, -+ struct futex_pi_state **ps) -+{ -+ pid_t pid = uval & FUTEX_TID_MASK; -+ u32 uval2; -+ int ret; -+ -+ /* -+ * Userspace might have messed up non-PI and PI futexes [3] -+ */ -+ if (unlikely(!pi_state)) -+ return -EINVAL; -+ -+ /* -+ * We get here with hb->lock held, and having found a -+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q -+ * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(), -+ * which in turn means that futex_lock_pi() still has a reference on -+ * our pi_state. -+ * -+ * The waiter holding a reference on @pi_state also protects against -+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() -+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently -+ * free pi_state before we can take a reference ourselves. -+ */ -+ WARN_ON(!refcount_read(&pi_state->refcount)); -+ -+ /* -+ * Now that we have a pi_state, we can acquire wait_lock -+ * and do the state validation. -+ */ -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ -+ /* -+ * Since {uval, pi_state} is serialized by wait_lock, and our current -+ * uval was read without holding it, it can have changed. Verify it -+ * still is what we expect it to be, otherwise retry the entire -+ * operation. -+ */ -+ if (futex_get_value_locked(&uval2, uaddr)) -+ goto out_efault; -+ -+ if (uval != uval2) -+ goto out_eagain; -+ -+ /* -+ * Handle the owner died case: -+ */ -+ if (uval & FUTEX_OWNER_DIED) { -+ /* -+ * exit_pi_state_list sets owner to NULL and wakes the -+ * topmost waiter. The task which acquires the -+ * pi_state->rt_mutex will fixup owner. -+ */ -+ if (!pi_state->owner) { -+ /* -+ * No pi state owner, but the user space TID -+ * is not 0. Inconsistent state. [5] -+ */ -+ if (pid) -+ goto out_einval; -+ /* -+ * Take a ref on the state and return success. 
[4] -+ */ -+ goto out_attach; -+ } -+ -+ /* -+ * If TID is 0, then either the dying owner has not -+ * yet executed exit_pi_state_list() or some waiter -+ * acquired the rtmutex in the pi state, but did not -+ * yet fixup the TID in user space. -+ * -+ * Take a ref on the state and return success. [6] -+ */ -+ if (!pid) -+ goto out_attach; -+ } else { -+ /* -+ * If the owner died bit is not set, then the pi_state -+ * must have an owner. [7] -+ */ -+ if (!pi_state->owner) -+ goto out_einval; -+ } -+ -+ /* -+ * Bail out if user space manipulated the futex value. If pi -+ * state exists then the owner TID must be the same as the -+ * user space TID. [9/10] -+ */ -+ if (pid != task_pid_vnr(pi_state->owner)) -+ goto out_einval; -+ -+out_attach: -+ get_pi_state(pi_state); -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ *ps = pi_state; -+ return 0; -+ -+out_einval: -+ ret = -EINVAL; -+ goto out_error; -+ -+out_eagain: -+ ret = -EAGAIN; -+ goto out_error; -+ -+out_efault: -+ ret = -EFAULT; -+ goto out_error; -+ -+out_error: -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ return ret; -+} -+ -+static int handle_exit_race(u32 __user *uaddr, u32 uval, -+ struct task_struct *tsk) -+{ -+ u32 uval2; -+ -+ /* -+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the -+ * caller that the alleged owner is busy. -+ */ -+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD) -+ return -EBUSY; -+ -+ /* -+ * Reread the user space value to handle the following situation: -+ * -+ * CPU0 CPU1 -+ * -+ * sys_exit() sys_futex() -+ * do_exit() futex_lock_pi() -+ * futex_lock_pi_atomic() -+ * exit_signals(tsk) No waiters: -+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID -+ * mm_release(tsk) Set waiter bit -+ * exit_robust_list(tsk) { *uaddr = 0x80000PID; -+ * Set owner died attach_to_pi_owner() { -+ * *uaddr = 0xC0000000; tsk = get_task(PID); -+ * } if (!tsk->flags & PF_EXITING) { -+ * ... 
attach(); -+ * tsk->futex_state = } else { -+ * FUTEX_STATE_DEAD; if (tsk->futex_state != -+ * FUTEX_STATE_DEAD) -+ * return -EAGAIN; -+ * return -ESRCH; <--- FAIL -+ * } -+ * -+ * Returning ESRCH unconditionally is wrong here because the -+ * user space value has been changed by the exiting task. -+ * -+ * The same logic applies to the case where the exiting task is -+ * already gone. -+ */ -+ if (futex_get_value_locked(&uval2, uaddr)) -+ return -EFAULT; -+ -+ /* If the user space value has changed, try again. */ -+ if (uval2 != uval) -+ return -EAGAIN; -+ -+ /* -+ * The exiting task did not have a robust list, the robust list was -+ * corrupted or the user space value in *uaddr is simply bogus. -+ * Give up and tell user space. -+ */ -+ return -ESRCH; -+} -+ -+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, -+ struct futex_pi_state **ps) -+{ -+ /* -+ * No existing pi state. First waiter. [2] -+ * -+ * This creates pi_state, we have hb->lock held, this means nothing can -+ * observe this state, wait_lock is irrelevant. -+ */ -+ struct futex_pi_state *pi_state = alloc_pi_state(); -+ -+ /* -+ * Initialize the pi_mutex in locked state and make @p -+ * the owner of it: -+ */ -+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); -+ -+ /* Store the key for possible exit cleanups: */ -+ pi_state->key = *key; -+ -+ WARN_ON(!list_empty(&pi_state->list)); -+ list_add(&pi_state->list, &p->pi_state_list); -+ /* -+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe -+ * because there is no concurrency as the object is not published yet. -+ */ -+ pi_state->owner = p; -+ -+ *ps = pi_state; -+} -+/* -+ * Lookup the task for the TID provided from user space and attach to -+ * it after doing proper sanity checks. 
-+ */ -+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, -+ struct futex_pi_state **ps, -+ struct task_struct **exiting) -+{ -+ pid_t pid = uval & FUTEX_TID_MASK; -+ struct task_struct *p; -+ -+ /* -+ * We are the first waiter - try to look up the real owner and attach -+ * the new pi_state to it, but bail out when TID = 0 [1] -+ * -+ * The !pid check is paranoid. None of the call sites should end up -+ * with pid == 0, but better safe than sorry. Let the caller retry -+ */ -+ if (!pid) -+ return -EAGAIN; -+ p = find_get_task_by_vpid(pid); -+ if (!p) -+ return handle_exit_race(uaddr, uval, NULL); -+ -+ if (unlikely(p->flags & PF_KTHREAD)) { -+ put_task_struct(p); -+ return -EPERM; -+ } -+ -+ /* -+ * We need to look at the task state to figure out, whether the -+ * task is exiting. To protect against the change of the task state -+ * in futex_exit_release(), we do this protected by p->pi_lock: -+ */ -+ raw_spin_lock_irq(&p->pi_lock); -+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) { -+ /* -+ * The task is on the way out. When the futex state is -+ * FUTEX_STATE_DEAD, we know that the task has finished -+ * the cleanup: -+ */ -+ int ret = handle_exit_race(uaddr, uval, p); -+ -+ raw_spin_unlock_irq(&p->pi_lock); -+ /* -+ * If the owner task is between FUTEX_STATE_EXITING and -+ * FUTEX_STATE_DEAD then store the task pointer and keep -+ * the reference on the task struct. The calling code will -+ * drop all locks, wait for the task to reach -+ * FUTEX_STATE_DEAD and then drop the refcount. This is -+ * required to prevent a live lock when the current task -+ * preempted the exiting task between the two states. 
-+ */ -+ if (ret == -EBUSY) -+ *exiting = p; -+ else -+ put_task_struct(p); -+ return ret; -+ } -+ -+ __attach_to_pi_owner(p, key, ps); -+ raw_spin_unlock_irq(&p->pi_lock); -+ -+ put_task_struct(p); -+ -+ return 0; -+} -+ -+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) -+{ -+ int err; -+ u32 curval; -+ -+ if (unlikely(should_fail_futex(true))) -+ return -EFAULT; -+ -+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); -+ if (unlikely(err)) -+ return err; -+ -+ /* If user space value changed, let the caller retry */ -+ return curval != uval ? -EAGAIN : 0; -+} -+ -+/** -+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex -+ * @uaddr: the pi futex user address -+ * @hb: the pi futex hash bucket -+ * @key: the futex key associated with uaddr and hb -+ * @ps: the pi_state pointer where we store the result of the -+ * lookup -+ * @task: the task to perform the atomic lock work for. This will -+ * be "current" except in the case of requeue pi. -+ * @exiting: Pointer to store the task pointer of the owner task -+ * which is in the middle of exiting -+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) -+ * -+ * Return: -+ * - 0 - ready to wait; -+ * - 1 - acquired the lock; -+ * - <0 - error -+ * -+ * The hb->lock must be held by the caller. -+ * -+ * @exiting is only set when the return value is -EBUSY. If so, this holds -+ * a refcount on the exiting task on return and the caller needs to drop it -+ * after waiting for the exit to complete. -+ */ -+int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, -+ union futex_key *key, -+ struct futex_pi_state **ps, -+ struct task_struct *task, -+ struct task_struct **exiting, -+ int set_waiters) -+{ -+ u32 uval, newval, vpid = task_pid_vnr(task); -+ struct futex_q *top_waiter; -+ int ret; -+ -+ /* -+ * Read the user space value first so we can validate a few -+ * things before proceeding further. 
-+ */ -+ if (futex_get_value_locked(&uval, uaddr)) -+ return -EFAULT; -+ -+ if (unlikely(should_fail_futex(true))) -+ return -EFAULT; -+ -+ /* -+ * Detect deadlocks. -+ */ -+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) -+ return -EDEADLK; -+ -+ if ((unlikely(should_fail_futex(true)))) -+ return -EDEADLK; -+ -+ /* -+ * Lookup existing state first. If it exists, try to attach to -+ * its pi_state. -+ */ -+ top_waiter = futex_top_waiter(hb, key); -+ if (top_waiter) -+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); -+ -+ /* -+ * No waiter and user TID is 0. We are here because the -+ * waiters or the owner died bit is set or called from -+ * requeue_cmp_pi or for whatever reason something took the -+ * syscall. -+ */ -+ if (!(uval & FUTEX_TID_MASK)) { -+ /* -+ * We take over the futex. No other waiters and the user space -+ * TID is 0. We preserve the owner died bit. -+ */ -+ newval = uval & FUTEX_OWNER_DIED; -+ newval |= vpid; -+ -+ /* The futex requeue_pi code can enforce the waiters bit */ -+ if (set_waiters) -+ newval |= FUTEX_WAITERS; -+ -+ ret = lock_pi_update_atomic(uaddr, uval, newval); -+ if (ret) -+ return ret; -+ -+ /* -+ * If the waiter bit was requested the caller also needs PI -+ * state attached to the new owner of the user space futex. -+ * -+ * @task is guaranteed to be alive and it cannot be exiting -+ * because it is either sleeping or waiting in -+ * futex_requeue_pi_wakeup_sync(). -+ * -+ * No need to do the full attach_to_pi_owner() exercise -+ * because @task is known and valid. -+ */ -+ if (set_waiters) { -+ raw_spin_lock_irq(&task->pi_lock); -+ __attach_to_pi_owner(task, key, ps); -+ raw_spin_unlock_irq(&task->pi_lock); -+ } -+ return 1; -+ } -+ -+ /* -+ * First waiter. Set the waiters bit before attaching ourself to -+ * the owner. If owner tries to unlock, it will be forced into -+ * the kernel and blocked on hb->lock. 
-+ */ -+ newval = uval | FUTEX_WAITERS; -+ ret = lock_pi_update_atomic(uaddr, uval, newval); -+ if (ret) -+ return ret; -+ /* -+ * If the update of the user space value succeeded, we try to -+ * attach to the owner. If that fails, no harm done, we only -+ * set the FUTEX_WAITERS bit in the user space variable. -+ */ -+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting); -+} -+ -+/* -+ * Caller must hold a reference on @pi_state. -+ */ -+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) -+{ -+ struct rt_mutex_waiter *top_waiter; -+ struct task_struct *new_owner; -+ bool postunlock = false; -+ DEFINE_RT_WAKE_Q(wqh); -+ u32 curval, newval; -+ int ret = 0; -+ -+ top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); -+ if (WARN_ON_ONCE(!top_waiter)) { -+ /* -+ * As per the comment in futex_unlock_pi() this should not happen. -+ * -+ * When this happens, give up our locks and try again, giving -+ * the futex_lock_pi() instance time to complete, either by -+ * waiting on the rtmutex or removing itself from the futex -+ * queue. -+ */ -+ ret = -EAGAIN; -+ goto out_unlock; -+ } -+ -+ new_owner = top_waiter->task; -+ -+ /* -+ * We pass it to the next owner. The WAITERS bit is always kept -+ * enabled while there is PI state around. We cleanup the owner -+ * died bit, because we are the owner. -+ */ -+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); -+ -+ if (unlikely(should_fail_futex(true))) { -+ ret = -EFAULT; -+ goto out_unlock; -+ } -+ -+ ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); -+ if (!ret && (curval != uval)) { -+ /* -+ * If a unconditional UNLOCK_PI operation (user space did not -+ * try the TID->0 transition) raced with a waiter setting the -+ * FUTEX_WAITERS flag between get_user() and locking the hash -+ * bucket lock, retry the operation. 
-+ */ -+ if ((FUTEX_TID_MASK & curval) == uval) -+ ret = -EAGAIN; -+ else -+ ret = -EINVAL; -+ } -+ -+ if (!ret) { -+ /* -+ * This is a point of no return; once we modified the uval -+ * there is no going back and subsequent operations must -+ * not fail. -+ */ -+ pi_state_update_owner(pi_state, new_owner); -+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh); -+ } -+ -+out_unlock: -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ -+ if (postunlock) -+ rt_mutex_postunlock(&wqh); -+ -+ return ret; -+} -+ -+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, -+ struct task_struct *argowner) -+{ -+ struct futex_pi_state *pi_state = q->pi_state; -+ struct task_struct *oldowner, *newowner; -+ u32 uval, curval, newval, newtid; -+ int err = 0; -+ -+ oldowner = pi_state->owner; -+ -+ /* -+ * We are here because either: -+ * -+ * - we stole the lock and pi_state->owner needs updating to reflect -+ * that (@argowner == current), -+ * -+ * or: -+ * -+ * - someone stole our lock and we need to fix things to point to the -+ * new owner (@argowner == NULL). -+ * -+ * Either way, we have to replace the TID in the user space variable. -+ * This must be atomic as we have to preserve the owner died bit here. -+ * -+ * Note: We write the user space value _before_ changing the pi_state -+ * because we can fault here. Imagine swapped out pages or a fork -+ * that marked all the anonymous memory readonly for cow. -+ * -+ * Modifying pi_state _before_ the user space value would leave the -+ * pi_state in an inconsistent state when we fault here, because we -+ * need to drop the locks to handle the fault. This might be observed -+ * in the PID checks when attaching to PI state . -+ */ -+retry: -+ if (!argowner) { -+ if (oldowner != current) { -+ /* -+ * We raced against a concurrent self; things are -+ * already fixed up. Nothing to do. -+ */ -+ return 0; -+ } -+ -+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { -+ /* We got the lock. 
pi_state is correct. Tell caller. */ -+ return 1; -+ } -+ -+ /* -+ * The trylock just failed, so either there is an owner or -+ * there is a higher priority waiter than this one. -+ */ -+ newowner = rt_mutex_owner(&pi_state->pi_mutex); -+ /* -+ * If the higher priority waiter has not yet taken over the -+ * rtmutex then newowner is NULL. We can't return here with -+ * that state because it's inconsistent vs. the user space -+ * state. So drop the locks and try again. It's a valid -+ * situation and not any different from the other retry -+ * conditions. -+ */ -+ if (unlikely(!newowner)) { -+ err = -EAGAIN; -+ goto handle_err; -+ } -+ } else { -+ WARN_ON_ONCE(argowner != current); -+ if (oldowner == current) { -+ /* -+ * We raced against a concurrent self; things are -+ * already fixed up. Nothing to do. -+ */ -+ return 1; -+ } -+ newowner = argowner; -+ } -+ -+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; -+ /* Owner died? */ -+ if (!pi_state->owner) -+ newtid |= FUTEX_OWNER_DIED; -+ -+ err = futex_get_value_locked(&uval, uaddr); -+ if (err) -+ goto handle_err; -+ -+ for (;;) { -+ newval = (uval & FUTEX_OWNER_DIED) | newtid; -+ -+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval); -+ if (err) -+ goto handle_err; -+ -+ if (curval == uval) -+ break; -+ uval = curval; -+ } -+ -+ /* -+ * We fixed up user space. Now we need to fix the pi_state -+ * itself. -+ */ -+ pi_state_update_owner(pi_state, newowner); -+ -+ return argowner == current; -+ -+ /* -+ * In order to reschedule or handle a page fault, we need to drop the -+ * locks here. In the case of a fault, this gives the other task -+ * (either the highest priority waiter itself or the task which stole -+ * the rtmutex) the chance to try the fixup of the pi_state. So once we -+ * are back from handling the fault we need to check the pi_state after -+ * reacquiring the locks and before trying to do another fixup. When -+ * the fixup has been done already we simply return. 
-+ * -+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely -+ * drop hb->lock since the caller owns the hb -> futex_q relation. -+ * Dropping the pi_mutex->wait_lock requires the state revalidate. -+ */ -+handle_err: -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ spin_unlock(q->lock_ptr); -+ -+ switch (err) { -+ case -EFAULT: -+ err = fault_in_user_writeable(uaddr); -+ break; -+ -+ case -EAGAIN: -+ cond_resched(); -+ err = 0; -+ break; -+ -+ default: -+ WARN_ON_ONCE(1); -+ break; -+ } -+ -+ spin_lock(q->lock_ptr); -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ -+ /* -+ * Check if someone else fixed it for us: -+ */ -+ if (pi_state->owner != oldowner) -+ return argowner == current; -+ -+ /* Retry if err was -EAGAIN or the fault in succeeded */ -+ if (!err) -+ goto retry; -+ -+ /* -+ * fault_in_user_writeable() failed so user state is immutable. At -+ * best we can make the kernel state consistent but user state will -+ * be most likely hosed and any subsequent unlock operation will be -+ * rejected due to PI futex rule [10]. -+ * -+ * Ensure that the rtmutex owner is also the pi_state owner despite -+ * the user space value claiming something different. There is no -+ * point in unlocking the rtmutex if current is the owner as it -+ * would need to wait until the next waiter has taken the rtmutex -+ * to guarantee consistent state. Keep it simple. Userspace asked -+ * for this wreckaged state. -+ * -+ * The rtmutex has an owner - either current or some other -+ * task. See the EAGAIN loop above. 
-+ */ -+ pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); -+ -+ return err; -+} -+ -+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, -+ struct task_struct *argowner) -+{ -+ struct futex_pi_state *pi_state = q->pi_state; -+ int ret; -+ -+ lockdep_assert_held(q->lock_ptr); -+ -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ ret = __fixup_pi_state_owner(uaddr, q, argowner); -+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -+ return ret; -+} -+ -+/** -+ * fixup_pi_owner() - Post lock pi_state and corner case management -+ * @uaddr: user address of the futex -+ * @q: futex_q (contains pi_state and access to the rt_mutex) -+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) -+ * -+ * After attempting to lock an rt_mutex, this function is called to cleanup -+ * the pi_state owner as well as handle race conditions that may allow us to -+ * acquire the lock. Must be called with the hb lock held. -+ * -+ * Return: -+ * - 1 - success, lock taken; -+ * - 0 - success, lock not taken; -+ * - <0 - on error (-EFAULT) -+ */ -+int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked) -+{ -+ if (locked) { -+ /* -+ * Got the lock. We might not be the anticipated owner if we -+ * did a lock-steal - fix up the PI-state in that case: -+ * -+ * Speculative pi_state->owner read (we don't hold wait_lock); -+ * since we own the lock pi_state->owner == current is the -+ * stable state, anything else needs more attention. -+ */ -+ if (q->pi_state->owner != current) -+ return fixup_pi_state_owner(uaddr, q, current); -+ return 1; -+ } -+ -+ /* -+ * If we didn't get the lock; check if anybody stole it from us. In -+ * that case, we need to fix up the uval to point to them instead of -+ * us, otherwise bad things happen. [10] -+ * -+ * Another speculative read; pi_state->owner == current is unstable -+ * but needs our attention. 
-+ */ -+ if (q->pi_state->owner == current) -+ return fixup_pi_state_owner(uaddr, q, NULL); -+ -+ /* -+ * Paranoia check. If we did not take the lock, then we should not be -+ * the owner of the rt_mutex. Warn and establish consistent state. -+ */ -+ if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) -+ return fixup_pi_state_owner(uaddr, q, current); -+ -+ return 0; -+} -+ -+/* -+ * Userspace tried a 0 -> TID atomic transition of the futex value -+ * and failed. The kernel side here does the whole locking operation: -+ * if there are waiters then it will block as a consequence of relying -+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see -+ * a 0 value of the futex too.). -+ * -+ * Also serves as futex trylock_pi()'ing, and due semantics. -+ */ -+int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ struct task_struct *exiting = NULL; -+ struct rt_mutex_waiter rt_waiter; -+ struct futex_hash_bucket *hb; -+ struct futex_q q = futex_q_init; -+ int res, ret; -+ -+ if (!IS_ENABLED(CONFIG_FUTEX_PI)) -+ return -ENOSYS; -+ -+ if (refill_pi_state_cache()) -+ return -ENOMEM; -+ -+ to = futex_setup_timer(time, &timeout, flags, 0); -+ -+retry: -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); -+ if (unlikely(ret != 0)) -+ goto out; -+ -+retry_private: -+ hb = futex_q_lock(&q); -+ -+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, -+ &exiting, 0); -+ if (unlikely(ret)) { -+ /* -+ * Atomic work succeeded and we got the lock, -+ * or failed. Either way, we do _not_ block. -+ */ -+ switch (ret) { -+ case 1: -+ /* We got the lock. */ -+ ret = 0; -+ goto out_unlock_put_key; -+ case -EFAULT: -+ goto uaddr_faulted; -+ case -EBUSY: -+ case -EAGAIN: -+ /* -+ * Two reasons for this: -+ * - EBUSY: Task is exiting and we just wait for the -+ * exit to complete. -+ * - EAGAIN: The user space value changed. 
-+ */ -+ futex_q_unlock(hb); -+ /* -+ * Handle the case where the owner is in the middle of -+ * exiting. Wait for the exit to complete otherwise -+ * this task might loop forever, aka. live lock. -+ */ -+ wait_for_owner_exiting(ret, exiting); -+ cond_resched(); -+ goto retry; -+ default: -+ goto out_unlock_put_key; -+ } -+ } -+ -+ WARN_ON(!q.pi_state); -+ -+ /* -+ * Only actually queue now that the atomic ops are done: -+ */ -+ __futex_queue(&q, hb); -+ -+ if (trylock) { -+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); -+ /* Fixup the trylock return value: */ -+ ret = ret ? 0 : -EWOULDBLOCK; -+ goto no_block; -+ } -+ -+ rt_mutex_init_waiter(&rt_waiter); -+ -+ /* -+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not -+ * hold it while doing rt_mutex_start_proxy(), because then it will -+ * include hb->lock in the blocking chain, even through we'll not in -+ * fact hold it while blocking. This will lead it to report -EDEADLK -+ * and BUG when futex_unlock_pi() interleaves with this. -+ * -+ * Therefore acquire wait_lock while holding hb->lock, but drop the -+ * latter before calling __rt_mutex_start_proxy_lock(). This -+ * interleaves with futex_unlock_pi() -- which does a similar lock -+ * handoff -- such that the latter can observe the futex_q::pi_state -+ * before __rt_mutex_start_proxy_lock() is done. -+ */ -+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); -+ spin_unlock(q.lock_ptr); -+ /* -+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter -+ * such that futex_unlock_pi() is guaranteed to observe the waiter when -+ * it sees the futex_q::pi_state. 
-+ */ -+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); -+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); -+ -+ if (ret) { -+ if (ret == 1) -+ ret = 0; -+ goto cleanup; -+ } -+ -+ if (unlikely(to)) -+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); -+ -+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); -+ -+cleanup: -+ spin_lock(q.lock_ptr); -+ /* -+ * If we failed to acquire the lock (deadlock/signal/timeout), we must -+ * first acquire the hb->lock before removing the lock from the -+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait -+ * lists consistent. -+ * -+ * In particular; it is important that futex_unlock_pi() can not -+ * observe this inconsistency. -+ */ -+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) -+ ret = 0; -+ -+no_block: -+ /* -+ * Fixup the pi_state owner and possibly acquire the lock if we -+ * haven't already. -+ */ -+ res = fixup_pi_owner(uaddr, &q, !ret); -+ /* -+ * If fixup_pi_owner() returned an error, propagate that. If it acquired -+ * the lock, clear our -ETIMEDOUT or -EINTR. -+ */ -+ if (res) -+ ret = (res < 0) ? res : 0; -+ -+ futex_unqueue_pi(&q); -+ spin_unlock(q.lock_ptr); -+ goto out; -+ -+out_unlock_put_key: -+ futex_q_unlock(hb); -+ -+out: -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ return ret != -EINTR ? ret : -ERESTARTNOINTR; -+ -+uaddr_faulted: -+ futex_q_unlock(hb); -+ -+ ret = fault_in_user_writeable(uaddr); -+ if (ret) -+ goto out; -+ -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; -+ -+ goto retry; -+} -+ -+/* -+ * Userspace attempted a TID -> 0 atomic transition, and failed. -+ * This is the in-kernel slowpath: we look up the PI state (if any), -+ * and do the rt-mutex unlock. 
-+ */ -+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) -+{ -+ u32 curval, uval, vpid = task_pid_vnr(current); -+ union futex_key key = FUTEX_KEY_INIT; -+ struct futex_hash_bucket *hb; -+ struct futex_q *top_waiter; -+ int ret; -+ -+ if (!IS_ENABLED(CONFIG_FUTEX_PI)) -+ return -ENOSYS; -+ -+retry: -+ if (get_user(uval, uaddr)) -+ return -EFAULT; -+ /* -+ * We release only a lock we actually own: -+ */ -+ if ((uval & FUTEX_TID_MASK) != vpid) -+ return -EPERM; -+ -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); -+ if (ret) -+ return ret; -+ -+ hb = futex_hash(&key); -+ spin_lock(&hb->lock); -+ -+ /* -+ * Check waiters first. We do not trust user space values at -+ * all and we at least want to know if user space fiddled -+ * with the futex value instead of blindly unlocking. -+ */ -+ top_waiter = futex_top_waiter(hb, &key); -+ if (top_waiter) { -+ struct futex_pi_state *pi_state = top_waiter->pi_state; -+ -+ ret = -EINVAL; -+ if (!pi_state) -+ goto out_unlock; -+ -+ /* -+ * If current does not own the pi_state then the futex is -+ * inconsistent and user space fiddled with the futex value. -+ */ -+ if (pi_state->owner != current) -+ goto out_unlock; -+ -+ get_pi_state(pi_state); -+ /* -+ * By taking wait_lock while still holding hb->lock, we ensure -+ * there is no point where we hold neither; and therefore -+ * wake_futex_p() must observe a state consistent with what we -+ * observed. -+ * -+ * In particular; this forces __rt_mutex_start_proxy() to -+ * complete such that we're guaranteed to observe the -+ * rt_waiter. Also see the WARN in wake_futex_pi(). -+ */ -+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); -+ spin_unlock(&hb->lock); -+ -+ /* drops pi_state->pi_mutex.wait_lock */ -+ ret = wake_futex_pi(uaddr, uval, pi_state); -+ -+ put_pi_state(pi_state); -+ -+ /* -+ * Success, we're done! No tricky corner cases. 
-+ */ -+ if (!ret) -+ return ret; -+ /* -+ * The atomic access to the futex value generated a -+ * pagefault, so retry the user-access and the wakeup: -+ */ -+ if (ret == -EFAULT) -+ goto pi_faulted; -+ /* -+ * A unconditional UNLOCK_PI op raced against a waiter -+ * setting the FUTEX_WAITERS bit. Try again. -+ */ -+ if (ret == -EAGAIN) -+ goto pi_retry; -+ /* -+ * wake_futex_pi has detected invalid state. Tell user -+ * space. -+ */ -+ return ret; -+ } -+ -+ /* -+ * We have no kernel internal state, i.e. no waiters in the -+ * kernel. Waiters which are about to queue themselves are stuck -+ * on hb->lock. So we can safely ignore them. We do neither -+ * preserve the WAITERS bit not the OWNER_DIED one. We are the -+ * owner. -+ */ -+ if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) { -+ spin_unlock(&hb->lock); -+ switch (ret) { -+ case -EFAULT: -+ goto pi_faulted; -+ -+ case -EAGAIN: -+ goto pi_retry; -+ -+ default: -+ WARN_ON_ONCE(1); -+ return ret; -+ } -+ } -+ -+ /* -+ * If uval has changed, let user space handle it. -+ */ -+ ret = (curval == uval) ? 0 : -EAGAIN; -+ -+out_unlock: -+ spin_unlock(&hb->lock); -+ return ret; -+ -+pi_retry: -+ cond_resched(); -+ goto retry; -+ -+pi_faulted: -+ -+ ret = fault_in_user_writeable(uaddr); -+ if (!ret) -+ goto retry; -+ -+ return ret; -+} -+ -diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c -new file mode 100644 -index 000000000..cba8b1a6a ---- /dev/null -+++ b/kernel/futex/requeue.c -@@ -0,0 +1,897 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+ -+#include <linux/sched/signal.h> -+ -+#include "futex.h" -+#include "../locking/rtmutex_common.h" -+ -+/* -+ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an -+ * underlying rtmutex. The task which is about to be requeued could have -+ * just woken up (timeout, signal). After the wake up the task has to -+ * acquire hash bucket lock, which is held by the requeue code. 
As a task -+ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking -+ * and the hash bucket lock blocking would collide and corrupt state. -+ * -+ * On !PREEMPT_RT this is not a problem and everything could be serialized -+ * on hash bucket lock, but aside of having the benefit of common code, -+ * this allows to avoid doing the requeue when the task is already on the -+ * way out and taking the hash bucket lock of the original uaddr1 when the -+ * requeue has been completed. -+ * -+ * The following state transitions are valid: -+ * -+ * On the waiter side: -+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE -+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT -+ * -+ * On the requeue side: -+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS -+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED -+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed) -+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED -+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed) -+ * -+ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this -+ * signals that the waiter is already on the way out. It also means that -+ * the waiter is still on the 'wait' futex, i.e. uaddr1. -+ * -+ * The waiter side signals early wakeup to the requeue side either through -+ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending -+ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately -+ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT, -+ * which means the wakeup is interleaving with a requeue in progress it has -+ * to wait for the requeue side to change the state. Either to DONE/LOCKED -+ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex -+ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by -+ * the requeue side when the requeue attempt failed via deadlock detection -+ * and therefore the waiter q is still on the uaddr1 futex. 
-+ */ -+enum { -+ Q_REQUEUE_PI_NONE = 0, -+ Q_REQUEUE_PI_IGNORE, -+ Q_REQUEUE_PI_IN_PROGRESS, -+ Q_REQUEUE_PI_WAIT, -+ Q_REQUEUE_PI_DONE, -+ Q_REQUEUE_PI_LOCKED, -+}; -+ -+const struct futex_q futex_q_init = { -+ /* list gets initialized in futex_queue()*/ -+ .key = FUTEX_KEY_INIT, -+ .bitset = FUTEX_BITSET_MATCH_ANY, -+ .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE), -+}; -+ -+/** -+ * requeue_futex() - Requeue a futex_q from one hb to another -+ * @q: the futex_q to requeue -+ * @hb1: the source hash_bucket -+ * @hb2: the target hash_bucket -+ * @key2: the new key for the requeued futex_q -+ */ -+static inline -+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, -+ struct futex_hash_bucket *hb2, union futex_key *key2) -+{ -+ -+ /* -+ * If key1 and key2 hash to the same bucket, no need to -+ * requeue. -+ */ -+ if (likely(&hb1->chain != &hb2->chain)) { -+ plist_del(&q->list, &hb1->chain); -+ futex_hb_waiters_dec(hb1); -+ futex_hb_waiters_inc(hb2); -+ plist_add(&q->list, &hb2->chain); -+ q->lock_ptr = &hb2->lock; -+ } -+ q->key = *key2; -+} -+ -+static inline bool futex_requeue_pi_prepare(struct futex_q *q, -+ struct futex_pi_state *pi_state) -+{ -+ int old, new; -+ -+ /* -+ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has -+ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should -+ * ignore the waiter. -+ */ -+ old = atomic_read_acquire(&q->requeue_state); -+ do { -+ if (old == Q_REQUEUE_PI_IGNORE) -+ return false; -+ -+ /* -+ * futex_proxy_trylock_atomic() might have set it to -+ * IN_PROGRESS and a interleaved early wake to WAIT. -+ * -+ * It was considered to have an extra state for that -+ * trylock, but that would just add more conditionals -+ * all over the place for a dubious value. 
-+ */ -+ if (old != Q_REQUEUE_PI_NONE) -+ break; -+ -+ new = Q_REQUEUE_PI_IN_PROGRESS; -+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); -+ -+ q->pi_state = pi_state; -+ return true; -+} -+ -+static inline void futex_requeue_pi_complete(struct futex_q *q, int locked) -+{ -+ int old, new; -+ -+ old = atomic_read_acquire(&q->requeue_state); -+ do { -+ if (old == Q_REQUEUE_PI_IGNORE) -+ return; -+ -+ if (locked >= 0) { -+ /* Requeue succeeded. Set DONE or LOCKED */ -+ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS && -+ old != Q_REQUEUE_PI_WAIT); -+ new = Q_REQUEUE_PI_DONE + locked; -+ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) { -+ /* Deadlock, no early wakeup interleave */ -+ new = Q_REQUEUE_PI_NONE; -+ } else { -+ /* Deadlock, early wakeup interleave. */ -+ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT); -+ new = Q_REQUEUE_PI_IGNORE; -+ } -+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); -+ -+#ifdef CONFIG_PREEMPT_RT -+ /* If the waiter interleaved with the requeue let it know */ -+ if (unlikely(old == Q_REQUEUE_PI_WAIT)) -+ rcuwait_wake_up(&q->requeue_wait); -+#endif -+} -+ -+static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) -+{ -+ int old, new; -+ -+ old = atomic_read_acquire(&q->requeue_state); -+ do { -+ /* Is requeue done already? */ -+ if (old >= Q_REQUEUE_PI_DONE) -+ return old; -+ -+ /* -+ * If not done, then tell the requeue code to either ignore -+ * the waiter or to wake it up once the requeue is done. 
-+ */ -+ new = Q_REQUEUE_PI_WAIT; -+ if (old == Q_REQUEUE_PI_NONE) -+ new = Q_REQUEUE_PI_IGNORE; -+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new)); -+ -+ /* If the requeue was in progress, wait for it to complete */ -+ if (old == Q_REQUEUE_PI_IN_PROGRESS) { -+#ifdef CONFIG_PREEMPT_RT -+ rcuwait_wait_event(&q->requeue_wait, -+ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT, -+ TASK_UNINTERRUPTIBLE); -+#else -+ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT); -+#endif -+ } -+ -+ /* -+ * Requeue is now either prohibited or complete. Reread state -+ * because during the wait above it might have changed. Nothing -+ * will modify q->requeue_state after this point. -+ */ -+ return atomic_read(&q->requeue_state); -+} -+ -+/** -+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue -+ * @q: the futex_q -+ * @key: the key of the requeue target futex -+ * @hb: the hash_bucket of the requeue target futex -+ * -+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the -+ * target futex if it is uncontended or via a lock steal. -+ * -+ * 1) Set @q::key to the requeue target futex key so the waiter can detect -+ * the wakeup on the right futex. -+ * -+ * 2) Dequeue @q from the hash bucket. -+ * -+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock -+ * acquisition. -+ * -+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that -+ * the waiter has to fixup the pi state. -+ * -+ * 5) Complete the requeue state so the waiter can make progress. After -+ * this point the waiter task can return from the syscall immediately in -+ * case that the pi state does not have to be fixed up. -+ * -+ * 6) Wake the waiter task. -+ * -+ * Must be called with both q->lock_ptr and hb->lock held. 
-+ */ -+static inline -+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, -+ struct futex_hash_bucket *hb) -+{ -+ q->key = *key; -+ -+ __futex_unqueue(q); -+ -+ WARN_ON(!q->rt_waiter); -+ q->rt_waiter = NULL; -+ -+ q->lock_ptr = &hb->lock; -+ -+ /* Signal locked state to the waiter */ -+ futex_requeue_pi_complete(q, 1); -+ wake_up_state(q->task, TASK_NORMAL); -+} -+ -+/** -+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter -+ * @pifutex: the user address of the to futex -+ * @hb1: the from futex hash bucket, must be locked by the caller -+ * @hb2: the to futex hash bucket, must be locked by the caller -+ * @key1: the from futex key -+ * @key2: the to futex key -+ * @ps: address to store the pi_state pointer -+ * @exiting: Pointer to store the task pointer of the owner task -+ * which is in the middle of exiting -+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) -+ * -+ * Try and get the lock on behalf of the top waiter if we can do it atomically. -+ * Wake the top waiter if we succeed. If the caller specified set_waiters, -+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. -+ * hb1 and hb2 must be held by the caller. -+ * -+ * @exiting is only set when the return value is -EBUSY. If so, this holds -+ * a refcount on the exiting task on return and the caller needs to drop it -+ * after waiting for the exit to complete. 
-+ * -+ * Return: -+ * - 0 - failed to acquire the lock atomically; -+ * - >0 - acquired the lock, return value is vpid of the top_waiter -+ * - <0 - error -+ */ -+static int -+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, -+ struct futex_hash_bucket *hb2, union futex_key *key1, -+ union futex_key *key2, struct futex_pi_state **ps, -+ struct task_struct **exiting, int set_waiters) -+{ -+ struct futex_q *top_waiter = NULL; -+ u32 curval; -+ int ret; -+ -+ if (futex_get_value_locked(&curval, pifutex)) -+ return -EFAULT; -+ -+ if (unlikely(should_fail_futex(true))) -+ return -EFAULT; -+ -+ /* -+ * Find the top_waiter and determine if there are additional waiters. -+ * If the caller intends to requeue more than 1 waiter to pifutex, -+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, -+ * as we have means to handle the possible fault. If not, don't set -+ * the bit unnecessarily as it will force the subsequent unlock to enter -+ * the kernel. -+ */ -+ top_waiter = futex_top_waiter(hb1, key1); -+ -+ /* There are no waiters, nothing for us to do. */ -+ if (!top_waiter) -+ return 0; -+ -+ /* -+ * Ensure that this is a waiter sitting in futex_wait_requeue_pi() -+ * and waiting on the 'waitqueue' futex which is always !PI. -+ */ -+ if (!top_waiter->rt_waiter || top_waiter->pi_state) -+ return -EINVAL; -+ -+ /* Ensure we requeue to the expected futex. */ -+ if (!futex_match(top_waiter->requeue_pi_key, key2)) -+ return -EINVAL; -+ -+ /* Ensure that this does not race against an early wakeup */ -+ if (!futex_requeue_pi_prepare(top_waiter, NULL)) -+ return -EAGAIN; -+ -+ /* -+ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit -+ * in the contended case or if @set_waiters is true. -+ * -+ * In the contended case PI state is attached to the lock owner. If -+ * the user space lock can be acquired then PI state is attached to -+ * the new owner (@top_waiter->task) when @set_waiters is true. 
-+ */ -+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, -+ exiting, set_waiters); -+ if (ret == 1) { -+ /* -+ * Lock was acquired in user space and PI state was -+ * attached to @top_waiter->task. That means state is fully -+ * consistent and the waiter can return to user space -+ * immediately after the wakeup. -+ */ -+ requeue_pi_wake_futex(top_waiter, key2, hb2); -+ } else if (ret < 0) { -+ /* Rewind top_waiter::requeue_state */ -+ futex_requeue_pi_complete(top_waiter, ret); -+ } else { -+ /* -+ * futex_lock_pi_atomic() did not acquire the user space -+ * futex, but managed to establish the proxy lock and pi -+ * state. top_waiter::requeue_state cannot be fixed up here -+ * because the waiter is not enqueued on the rtmutex -+ * yet. This is handled at the callsite depending on the -+ * result of rt_mutex_start_proxy_lock() which is -+ * guaranteed to be reached with this function returning 0. -+ */ -+ } -+ return ret; -+} -+ -+/** -+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 -+ * @uaddr1: source futex user address -+ * @flags: futex flags (FLAGS_SHARED, etc.) -+ * @uaddr2: target futex user address -+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) -+ * @nr_requeue: number of waiters to requeue (0-INT_MAX) -+ * @cmpval: @uaddr1 expected value (or %NULL) -+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a -+ * pi futex (pi to pi requeue is not supported) -+ * -+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire -+ * uaddr2 atomically on behalf of the top waiter. 
-+ * -+ * Return: -+ * - >=0 - on success, the number of tasks requeued or woken; -+ * - <0 - on error -+ */ -+int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, -+ int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) -+{ -+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; -+ int task_count = 0, ret; -+ struct futex_pi_state *pi_state = NULL; -+ struct futex_hash_bucket *hb1, *hb2; -+ struct futex_q *this, *next; -+ DEFINE_WAKE_Q(wake_q); -+ -+ if (nr_wake < 0 || nr_requeue < 0) -+ return -EINVAL; -+ -+ /* -+ * When PI not supported: return -ENOSYS if requeue_pi is true, -+ * consequently the compiler knows requeue_pi is always false past -+ * this point which will optimize away all the conditional code -+ * further down. -+ */ -+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi) -+ return -ENOSYS; -+ -+ if (requeue_pi) { -+ /* -+ * Requeue PI only works on two distinct uaddrs. This -+ * check is only valid for private futexes. See below. -+ */ -+ if (uaddr1 == uaddr2) -+ return -EINVAL; -+ -+ /* -+ * futex_requeue() allows the caller to define the number -+ * of waiters to wake up via the @nr_wake argument. With -+ * REQUEUE_PI, waking up more than one waiter is creating -+ * more problems than it solves. Waking up a waiter makes -+ * only sense if the PI futex @uaddr2 is uncontended as -+ * this allows the requeue code to acquire the futex -+ * @uaddr2 before waking the waiter. The waiter can then -+ * return to user space without further action. A secondary -+ * wakeup would just make the futex_wait_requeue_pi() -+ * handling more complex, because that code would have to -+ * look up pi_state and do more or less all the handling -+ * which the requeue code has to do for the to be requeued -+ * waiters. So restrict the number of waiters to wake to -+ * one, and only wake it up when the PI futex is -+ * uncontended. Otherwise requeue it and let the unlock of -+ * the PI futex handle the wakeup. 
-+ * -+ * All REQUEUE_PI users, e.g. pthread_cond_signal() and -+ * pthread_cond_broadcast() must use nr_wake=1. -+ */ -+ if (nr_wake != 1) -+ return -EINVAL; -+ -+ /* -+ * requeue_pi requires a pi_state, try to allocate it now -+ * without any locks in case it fails. -+ */ -+ if (refill_pi_state_cache()) -+ return -ENOMEM; -+ } -+ -+retry: -+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, -+ requeue_pi ? FUTEX_WRITE : FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ /* -+ * The check above which compares uaddrs is not sufficient for -+ * shared futexes. We need to compare the keys: -+ */ -+ if (requeue_pi && futex_match(&key1, &key2)) -+ return -EINVAL; -+ -+ hb1 = futex_hash(&key1); -+ hb2 = futex_hash(&key2); -+ -+retry_private: -+ futex_hb_waiters_inc(hb2); -+ double_lock_hb(hb1, hb2); -+ -+ if (likely(cmpval != NULL)) { -+ u32 curval; -+ -+ ret = futex_get_value_locked(&curval, uaddr1); -+ -+ if (unlikely(ret)) { -+ double_unlock_hb(hb1, hb2); -+ futex_hb_waiters_dec(hb2); -+ -+ ret = get_user(curval, uaddr1); -+ if (ret) -+ return ret; -+ -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; -+ -+ goto retry; -+ } -+ if (curval != *cmpval) { -+ ret = -EAGAIN; -+ goto out_unlock; -+ } -+ } -+ -+ if (requeue_pi) { -+ struct task_struct *exiting = NULL; -+ -+ /* -+ * Attempt to acquire uaddr2 and wake the top waiter. If we -+ * intend to requeue waiters, force setting the FUTEX_WAITERS -+ * bit. We force this here where we are able to easily handle -+ * faults rather in the requeue loop below. -+ * -+ * Updates topwaiter::requeue_state if a top waiter exists. -+ */ -+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, -+ &key2, &pi_state, -+ &exiting, nr_requeue); -+ -+ /* -+ * At this point the top_waiter has either taken uaddr2 or -+ * is waiting on it. 
In both cases pi_state has been -+ * established and an initial refcount on it. In case of an -+ * error there's nothing. -+ * -+ * The top waiter's requeue_state is up to date: -+ * -+ * - If the lock was acquired atomically (ret == 1), then -+ * the state is Q_REQUEUE_PI_LOCKED. -+ * -+ * The top waiter has been dequeued and woken up and can -+ * return to user space immediately. The kernel/user -+ * space state is consistent. In case that there must be -+ * more waiters requeued the WAITERS bit in the user -+ * space futex is set so the top waiter task has to go -+ * into the syscall slowpath to unlock the futex. This -+ * will block until this requeue operation has been -+ * completed and the hash bucket locks have been -+ * dropped. -+ * -+ * - If the trylock failed with an error (ret < 0) then -+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing -+ * happened", or Q_REQUEUE_PI_IGNORE when there was an -+ * interleaved early wakeup. -+ * -+ * - If the trylock did not succeed (ret == 0) then the -+ * state is either Q_REQUEUE_PI_IN_PROGRESS or -+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved. -+ * This will be cleaned up in the loop below, which -+ * cannot fail because futex_proxy_trylock_atomic() did -+ * the same sanity checks for requeue_pi as the loop -+ * below does. -+ */ -+ switch (ret) { -+ case 0: -+ /* We hold a reference on the pi state. */ -+ break; -+ -+ case 1: -+ /* -+ * futex_proxy_trylock_atomic() acquired the user space -+ * futex. Adjust task_count. -+ */ -+ task_count++; -+ ret = 0; -+ break; -+ -+ /* -+ * If the above failed, then pi_state is NULL and -+ * waiter::requeue_state is correct. -+ */ -+ case -EFAULT: -+ double_unlock_hb(hb1, hb2); -+ futex_hb_waiters_dec(hb2); -+ ret = fault_in_user_writeable(uaddr2); -+ if (!ret) -+ goto retry; -+ return ret; -+ case -EBUSY: -+ case -EAGAIN: -+ /* -+ * Two reasons for this: -+ * - EBUSY: Owner is exiting and we just wait for the -+ * exit to complete. 
-+ * - EAGAIN: The user space value changed. -+ */ -+ double_unlock_hb(hb1, hb2); -+ futex_hb_waiters_dec(hb2); -+ /* -+ * Handle the case where the owner is in the middle of -+ * exiting. Wait for the exit to complete otherwise -+ * this task might loop forever, aka. live lock. -+ */ -+ wait_for_owner_exiting(ret, exiting); -+ cond_resched(); -+ goto retry; -+ default: -+ goto out_unlock; -+ } -+ } -+ -+ plist_for_each_entry_safe(this, next, &hb1->chain, list) { -+ if (task_count - nr_wake >= nr_requeue) -+ break; -+ -+ if (!futex_match(&this->key, &key1)) -+ continue; -+ -+ /* -+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always -+ * be paired with each other and no other futex ops. -+ * -+ * We should never be requeueing a futex_q with a pi_state, -+ * which is awaiting a futex_unlock_pi(). -+ */ -+ if ((requeue_pi && !this->rt_waiter) || -+ (!requeue_pi && this->rt_waiter) || -+ this->pi_state) { -+ ret = -EINVAL; -+ break; -+ } -+ -+ /* Plain futexes just wake or requeue and are done */ -+ if (!requeue_pi) { -+ if (++task_count <= nr_wake) -+ futex_wake_mark(&wake_q, this); -+ else -+ requeue_futex(this, hb1, hb2, &key2); -+ continue; -+ } -+ -+ /* Ensure we requeue to the expected futex for requeue_pi. */ -+ if (!futex_match(this->requeue_pi_key, &key2)) { -+ ret = -EINVAL; -+ break; -+ } -+ -+ /* -+ * Requeue nr_requeue waiters and possibly one more in the case -+ * of requeue_pi if we couldn't acquire the lock atomically. -+ * -+ * Prepare the waiter to take the rt_mutex. Take a refcount -+ * on the pi_state and store the pointer in the futex_q -+ * object of the waiter. -+ */ -+ get_pi_state(pi_state); -+ -+ /* Don't requeue when the waiter is already on the way out. */ -+ if (!futex_requeue_pi_prepare(this, pi_state)) { -+ /* -+ * Early woken waiter signaled that it is on the -+ * way out. Drop the pi_state reference and try the -+ * next waiter. @this->pi_state is still NULL. 
-+ */ -+ put_pi_state(pi_state); -+ continue; -+ } -+ -+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, -+ this->rt_waiter, -+ this->task); -+ -+ if (ret == 1) { -+ /* -+ * We got the lock. We do neither drop the refcount -+ * on pi_state nor clear this->pi_state because the -+ * waiter needs the pi_state for cleaning up the -+ * user space value. It will drop the refcount -+ * after doing so. this::requeue_state is updated -+ * in the wakeup as well. -+ */ -+ requeue_pi_wake_futex(this, &key2, hb2); -+ task_count++; -+ } else if (!ret) { -+ /* Waiter is queued, move it to hb2 */ -+ requeue_futex(this, hb1, hb2, &key2); -+ futex_requeue_pi_complete(this, 0); -+ task_count++; -+ } else { -+ /* -+ * rt_mutex_start_proxy_lock() detected a potential -+ * deadlock when we tried to queue that waiter. -+ * Drop the pi_state reference which we took above -+ * and remove the pointer to the state from the -+ * waiters futex_q object. -+ */ -+ this->pi_state = NULL; -+ put_pi_state(pi_state); -+ futex_requeue_pi_complete(this, ret); -+ /* -+ * We stop queueing more waiters and let user space -+ * deal with the mess. -+ */ -+ break; -+ } -+ } -+ -+ /* -+ * We took an extra initial reference to the pi_state in -+ * futex_proxy_trylock_atomic(). We need to drop it here again. -+ */ -+ put_pi_state(pi_state); -+ -+out_unlock: -+ double_unlock_hb(hb1, hb2); -+ wake_up_q(&wake_q); -+ futex_hb_waiters_dec(hb2); -+ return ret ? ret : task_count; -+} -+ -+/** -+ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex -+ * @hb: the hash_bucket futex_q was original enqueued on -+ * @q: the futex_q woken while waiting to be requeued -+ * @timeout: the timeout associated with the wait (NULL if none) -+ * -+ * Determine the cause for the early wakeup. 
-+ * -+ * Return: -+ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR -+ */ -+static inline -+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, -+ struct futex_q *q, -+ struct hrtimer_sleeper *timeout) -+{ -+ int ret; -+ -+ /* -+ * With the hb lock held, we avoid races while we process the wakeup. -+ * We only need to hold hb (and not hb2) to ensure atomicity as the -+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. -+ * It can't be requeued from uaddr2 to something else since we don't -+ * support a PI aware source futex for requeue. -+ */ -+ WARN_ON_ONCE(&hb->lock != q->lock_ptr); -+ -+ /* -+ * We were woken prior to requeue by a timeout or a signal. -+ * Unqueue the futex_q and determine which it was. -+ */ -+ plist_del(&q->list, &hb->chain); -+ futex_hb_waiters_dec(hb); -+ -+ /* Handle spurious wakeups gracefully */ -+ ret = -EWOULDBLOCK; -+ if (timeout && !timeout->task) -+ ret = -ETIMEDOUT; -+ else if (signal_pending(current)) -+ ret = -ERESTARTNOINTR; -+ return ret; -+} -+ -+/** -+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 -+ * @uaddr: the futex we initially wait on (non-pi) -+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be -+ * the same type, no requeueing from private to shared, etc. -+ * @val: the expected value of uaddr -+ * @abs_time: absolute timeout -+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all -+ * @uaddr2: the pi futex we will take prior to returning to user-space -+ * -+ * The caller will wait on uaddr and will be requeued by futex_requeue() to -+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake -+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to -+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters; -+ * without one, the pi logic would not know which task to boost/deboost, if -+ * there was a need to. 
-+ * -+ * We call schedule in futex_wait_queue() when we enqueue and return there -+ * via the following-- -+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() -+ * 2) wakeup on uaddr2 after a requeue -+ * 3) signal -+ * 4) timeout -+ * -+ * If 3, cleanup and return -ERESTARTNOINTR. -+ * -+ * If 2, we may then block on trying to take the rt_mutex and return via: -+ * 5) successful lock -+ * 6) signal -+ * 7) timeout -+ * 8) other lock acquisition failure -+ * -+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). -+ * -+ * If 4 or 7, we cleanup and return with -ETIMEDOUT. -+ * -+ * Return: -+ * - 0 - On success; -+ * - <0 - On error -+ */ -+int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, -+ u32 val, ktime_t *abs_time, u32 bitset, -+ u32 __user *uaddr2) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ struct rt_mutex_waiter rt_waiter; -+ struct futex_hash_bucket *hb; -+ union futex_key key2 = FUTEX_KEY_INIT; -+ struct futex_q q = futex_q_init; -+ struct rt_mutex_base *pi_mutex; -+ int res, ret; -+ -+ if (!IS_ENABLED(CONFIG_FUTEX_PI)) -+ return -ENOSYS; -+ -+ if (uaddr == uaddr2) -+ return -EINVAL; -+ -+ if (!bitset) -+ return -EINVAL; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, -+ current->timer_slack_ns); -+ -+ /* -+ * The waiter is allocated on our stack, manipulated by the requeue -+ * code while we sleep on uaddr. -+ */ -+ rt_mutex_init_waiter(&rt_waiter); -+ -+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); -+ if (unlikely(ret != 0)) -+ goto out; -+ -+ q.bitset = bitset; -+ q.rt_waiter = &rt_waiter; -+ q.requeue_pi_key = &key2; -+ -+ /* -+ * Prepare to wait on uaddr. On success, it holds hb->lock and q -+ * is initialized. -+ */ -+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); -+ if (ret) -+ goto out; -+ -+ /* -+ * The check above which compares uaddrs is not sufficient for -+ * shared futexes. 
We need to compare the keys: -+ */ -+ if (futex_match(&q.key, &key2)) { -+ futex_q_unlock(hb); -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */ -+ futex_wait_queue(hb, &q, to); -+ -+ switch (futex_requeue_pi_wakeup_sync(&q)) { -+ case Q_REQUEUE_PI_IGNORE: -+ /* The waiter is still on uaddr1 */ -+ spin_lock(&hb->lock); -+ ret = handle_early_requeue_pi_wakeup(hb, &q, to); -+ spin_unlock(&hb->lock); -+ break; -+ -+ case Q_REQUEUE_PI_LOCKED: -+ /* The requeue acquired the lock */ -+ if (q.pi_state && (q.pi_state->owner != current)) { -+ spin_lock(q.lock_ptr); -+ ret = fixup_pi_owner(uaddr2, &q, true); -+ /* -+ * Drop the reference to the pi state which the -+ * requeue_pi() code acquired for us. -+ */ -+ put_pi_state(q.pi_state); -+ spin_unlock(q.lock_ptr); -+ /* -+ * Adjust the return value. It's either -EFAULT or -+ * success (1) but the caller expects 0 for success. -+ */ -+ ret = ret < 0 ? ret : 0; -+ } -+ break; -+ -+ case Q_REQUEUE_PI_DONE: -+ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */ -+ pi_mutex = &q.pi_state->pi_mutex; -+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); -+ -+ /* Current is not longer pi_blocked_on */ -+ spin_lock(q.lock_ptr); -+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) -+ ret = 0; -+ -+ debug_rt_mutex_free_waiter(&rt_waiter); -+ /* -+ * Fixup the pi_state owner and possibly acquire the lock if we -+ * haven't already. -+ */ -+ res = fixup_pi_owner(uaddr2, &q, !ret); -+ /* -+ * If fixup_pi_owner() returned an error, propagate that. If it -+ * acquired the lock, clear -ETIMEDOUT or -EINTR. -+ */ -+ if (res) -+ ret = (res < 0) ? res : 0; -+ -+ futex_unqueue_pi(&q); -+ spin_unlock(q.lock_ptr); -+ -+ if (ret == -EINTR) { -+ /* -+ * We've already been requeued, but cannot restart -+ * by calling futex_lock_pi() directly. We could -+ * restart this syscall, but it would detect that -+ * the user space "val" changed and return -+ * -EWOULDBLOCK. 
Save the overhead of the restart -+ * and return -EWOULDBLOCK directly. -+ */ -+ ret = -EWOULDBLOCK; -+ } -+ break; -+ default: -+ BUG(); -+ } -+ -+out: -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ return ret; -+} -+ -diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c -new file mode 100644 -index 000000000..368e9c17f ---- /dev/null -+++ b/kernel/futex/syscalls.c -@@ -0,0 +1,396 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+ -+#include <linux/compat.h> -+#include <linux/syscalls.h> -+#include <linux/time_namespace.h> -+ -+#include "futex.h" -+ -+/* -+ * Support for robust futexes: the kernel cleans up held futexes at -+ * thread exit time. -+ * -+ * Implementation: user-space maintains a per-thread list of locks it -+ * is holding. Upon do_exit(), the kernel carefully walks this list, -+ * and marks all locks that are owned by this thread with the -+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is -+ * always manipulated with the lock held, so the list is private and -+ * per-thread. Userspace also maintains a per-thread 'list_op_pending' -+ * field, to allow the kernel to clean up if the thread dies after -+ * acquiring the lock, but just before it could have added itself to -+ * the list. There can only be one such pending lock. 
-+ */ -+ -+/** -+ * sys_set_robust_list() - Set the robust-futex list head of a task -+ * @head: pointer to the list-head -+ * @len: length of the list-head, as userspace expects -+ */ -+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, -+ size_t, len) -+{ -+ if (!futex_cmpxchg_enabled) -+ return -ENOSYS; -+ /* -+ * The kernel knows only one size for now: -+ */ -+ if (unlikely(len != sizeof(*head))) -+ return -EINVAL; -+ -+ current->robust_list = head; -+ -+ return 0; -+} -+ -+/** -+ * sys_get_robust_list() - Get the robust-futex list head of a task -+ * @pid: pid of the process [zero for current task] -+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in -+ * @len_ptr: pointer to a length field, the kernel fills in the header size -+ */ -+SYSCALL_DEFINE3(get_robust_list, int, pid, -+ struct robust_list_head __user * __user *, head_ptr, -+ size_t __user *, len_ptr) -+{ -+ struct robust_list_head __user *head; -+ unsigned long ret; -+ struct task_struct *p; -+ -+ if (!futex_cmpxchg_enabled) -+ return -ENOSYS; -+ -+ rcu_read_lock(); -+ -+ ret = -ESRCH; -+ if (!pid) -+ p = current; -+ else { -+ p = find_task_by_vpid(pid); -+ if (!p) -+ goto err_unlock; -+ } -+ -+ ret = -EPERM; -+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) -+ goto err_unlock; -+ -+ head = p->robust_list; -+ rcu_read_unlock(); -+ -+ if (put_user(sizeof(*head), len_ptr)) -+ return -EFAULT; -+ return put_user(head, head_ptr); -+ -+err_unlock: -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, -+ u32 __user *uaddr2, u32 val2, u32 val3) -+{ -+ int cmd = op & FUTEX_CMD_MASK; -+ unsigned int flags = 0; -+ -+ if (!(op & FUTEX_PRIVATE_FLAG)) -+ flags |= FLAGS_SHARED; -+ -+ if (op & FUTEX_CLOCK_REALTIME) { -+ flags |= FLAGS_CLOCKRT; -+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && -+ cmd != FUTEX_LOCK_PI2) -+ return -ENOSYS; -+ } -+ -+ switch (cmd) { -+ case FUTEX_LOCK_PI: -+ 
case FUTEX_LOCK_PI2: -+ case FUTEX_UNLOCK_PI: -+ case FUTEX_TRYLOCK_PI: -+ case FUTEX_WAIT_REQUEUE_PI: -+ case FUTEX_CMP_REQUEUE_PI: -+ if (!futex_cmpxchg_enabled) -+ return -ENOSYS; -+ } -+ -+ switch (cmd) { -+ case FUTEX_WAIT: -+ val3 = FUTEX_BITSET_MATCH_ANY; -+ fallthrough; -+ case FUTEX_WAIT_BITSET: -+ return futex_wait(uaddr, flags, val, timeout, val3); -+ case FUTEX_WAKE: -+ val3 = FUTEX_BITSET_MATCH_ANY; -+ fallthrough; -+ case FUTEX_WAKE_BITSET: -+ return futex_wake(uaddr, flags, val, val3); -+ case FUTEX_REQUEUE: -+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); -+ case FUTEX_CMP_REQUEUE: -+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); -+ case FUTEX_WAKE_OP: -+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); -+ case FUTEX_LOCK_PI: -+ flags |= FLAGS_CLOCKRT; -+ fallthrough; -+ case FUTEX_LOCK_PI2: -+ return futex_lock_pi(uaddr, flags, timeout, 0); -+ case FUTEX_UNLOCK_PI: -+ return futex_unlock_pi(uaddr, flags); -+ case FUTEX_TRYLOCK_PI: -+ return futex_lock_pi(uaddr, flags, NULL, 1); -+ case FUTEX_WAIT_REQUEUE_PI: -+ val3 = FUTEX_BITSET_MATCH_ANY; -+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, -+ uaddr2); -+ case FUTEX_CMP_REQUEUE_PI: -+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); -+ } -+ return -ENOSYS; -+} -+ -+static __always_inline bool futex_cmd_has_timeout(u32 cmd) -+{ -+ switch (cmd) { -+ case FUTEX_WAIT: -+ case FUTEX_LOCK_PI: -+ case FUTEX_LOCK_PI2: -+ case FUTEX_WAIT_BITSET: -+ case FUTEX_WAIT_REQUEUE_PI: -+ return true; -+ } -+ return false; -+} -+ -+static __always_inline int -+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) -+{ -+ if (!timespec64_valid(ts)) -+ return -EINVAL; -+ -+ *t = timespec64_to_ktime(*ts); -+ if (cmd == FUTEX_WAIT) -+ *t = ktime_add_safe(ktime_get(), *t); -+ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) -+ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); -+ return 0; -+} -+ 
-+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, -+ const struct __kernel_timespec __user *, utime, -+ u32 __user *, uaddr2, u32, val3) -+{ -+ int ret, cmd = op & FUTEX_CMD_MASK; -+ ktime_t t, *tp = NULL; -+ struct timespec64 ts; -+ -+ if (utime && futex_cmd_has_timeout(cmd)) { -+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) -+ return -EFAULT; -+ if (get_timespec64(&ts, utime)) -+ return -EFAULT; -+ ret = futex_init_timeout(cmd, op, &ts, &t); -+ if (ret) -+ return ret; -+ tp = &t; -+ } -+ -+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); -+} -+ -+/* Mask of available flags for each futex in futex_waitv list */ -+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) -+ -+/** -+ * futex_parse_waitv - Parse a waitv array from userspace -+ * @futexv: Kernel side list of waiters to be filled -+ * @uwaitv: Userspace list to be parsed -+ * @nr_futexes: Length of futexv -+ * -+ * Return: Error code on failure, 0 on success -+ */ -+static int futex_parse_waitv(struct futex_vector *futexv, -+ struct futex_waitv __user *uwaitv, -+ unsigned int nr_futexes) -+{ -+ struct futex_waitv aux; -+ unsigned int i; -+ -+ for (i = 0; i < nr_futexes; i++) { -+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) -+ return -EFAULT; -+ -+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) -+ return -EINVAL; -+ -+ if (!(aux.flags & FUTEX_32)) -+ return -EINVAL; -+ -+ futexv[i].w.flags = aux.flags; -+ futexv[i].w.val = aux.val; -+ futexv[i].w.uaddr = aux.uaddr; -+ futexv[i].q = futex_q_init; -+ } -+ -+ return 0; -+} -+ -+/** -+ * sys_futex_waitv - Wait on a list of futexes -+ * @waiters: List of futexes to wait on -+ * @nr_futexes: Length of futexv -+ * @flags: Flag for timeout (monotonic/realtime) -+ * @timeout: Optional absolute timeout. -+ * @clockid: Clock to be used for the timeout, realtime or monotonic. -+ * -+ * Given an array of `struct futex_waitv`, wait on each uaddr. 
The thread wakes -+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately -+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for the -+ * operation. Each waiter has individual flags. The `flags` argument for the -+ * syscall should be used solely for specifying the timeout as realtime, if -+ * needed. Flags for private futexes, sizes, etc. should be used on the -+ * individual flags of each waiter. -+ * -+ * Returns the array index of one of the awaken futexes. There's no given -+ * information of how many were awakened, or any particular attribute of it (if -+ * it's the first awakened, if it is of the smaller index...). -+ */ -+ -+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, -+ unsigned int, nr_futexes, unsigned int, flags, -+ struct __kernel_timespec __user *, timeout, clockid_t, clockid) -+{ -+ struct hrtimer_sleeper to; -+ struct futex_vector *futexv; -+ struct timespec64 ts; -+ ktime_t time; -+ int ret; -+ -+ /* This syscall supports no flags for now */ -+ if (flags) -+ return -EINVAL; -+ -+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) -+ return -EINVAL; -+ -+ if (timeout) { -+ int flag_clkid = 0, flag_init = 0; -+ -+ if (clockid == CLOCK_REALTIME) { -+ flag_clkid = FLAGS_CLOCKRT; -+ flag_init = FUTEX_CLOCK_REALTIME; -+ } -+ -+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) -+ return -EINVAL; -+ -+ if (get_timespec64(&ts, timeout)) -+ return -EFAULT; -+ -+ /* -+ * Since there's no opcode for futex_waitv, use -+ * FUTEX_WAIT_BITSET that uses absolute timeout as well -+ */ -+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); -+ if (ret) -+ return ret; -+ -+ futex_setup_timer(&time, &to, flag_clkid, 0); -+ } -+ -+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); -+ if (!futexv) -+ return -ENOMEM; -+ -+ ret = futex_parse_waitv(futexv, waiters, nr_futexes); -+ if (!ret) -+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? 
&to : NULL); -+ -+ if (timeout) { -+ hrtimer_cancel(&to.timer); -+ destroy_hrtimer_on_stack(&to.timer); -+ } -+ -+ kfree(futexv); -+ return ret; -+} -+ -+#ifdef CONFIG_COMPAT -+COMPAT_SYSCALL_DEFINE2(set_robust_list, -+ struct compat_robust_list_head __user *, head, -+ compat_size_t, len) -+{ -+ if (!futex_cmpxchg_enabled) -+ return -ENOSYS; -+ -+ if (unlikely(len != sizeof(*head))) -+ return -EINVAL; -+ -+ current->compat_robust_list = head; -+ -+ return 0; -+} -+ -+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, -+ compat_uptr_t __user *, head_ptr, -+ compat_size_t __user *, len_ptr) -+{ -+ struct compat_robust_list_head __user *head; -+ unsigned long ret; -+ struct task_struct *p; -+ -+ if (!futex_cmpxchg_enabled) -+ return -ENOSYS; -+ -+ rcu_read_lock(); -+ -+ ret = -ESRCH; -+ if (!pid) -+ p = current; -+ else { -+ p = find_task_by_vpid(pid); -+ if (!p) -+ goto err_unlock; -+ } -+ -+ ret = -EPERM; -+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) -+ goto err_unlock; -+ -+ head = p->compat_robust_list; -+ rcu_read_unlock(); -+ -+ if (put_user(sizeof(*head), len_ptr)) -+ return -EFAULT; -+ return put_user(ptr_to_compat(head), head_ptr); -+ -+err_unlock: -+ rcu_read_unlock(); -+ -+ return ret; -+} -+#endif /* CONFIG_COMPAT */ -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, -+ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, -+ u32, val3) -+{ -+ int ret, cmd = op & FUTEX_CMD_MASK; -+ ktime_t t, *tp = NULL; -+ struct timespec64 ts; -+ -+ if (utime && futex_cmd_has_timeout(cmd)) { -+ if (get_old_timespec32(&ts, utime)) -+ return -EFAULT; -+ ret = futex_init_timeout(cmd, op, &ts, &t); -+ if (ret) -+ return ret; -+ tp = &t; -+ } -+ -+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); -+} -+#endif /* CONFIG_COMPAT_32BIT_TIME */ -+ -diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c -new file mode 100644 -index 000000000..b45597aab ---- /dev/null 
-+++ b/kernel/futex/waitwake.c -@@ -0,0 +1,708 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+ -+#include <linux/sched/task.h> -+#include <linux/sched/signal.h> -+#include <linux/freezer.h> -+ -+#include "futex.h" -+ -+/* -+ * READ this before attempting to hack on futexes! -+ * -+ * Basic futex operation and ordering guarantees -+ * ============================================= -+ * -+ * The waiter reads the futex value in user space and calls -+ * futex_wait(). This function computes the hash bucket and acquires -+ * the hash bucket lock. After that it reads the futex user space value -+ * again and verifies that the data has not changed. If it has not changed -+ * it enqueues itself into the hash bucket, releases the hash bucket lock -+ * and schedules. -+ * -+ * The waker side modifies the user space value of the futex and calls -+ * futex_wake(). This function computes the hash bucket and acquires the -+ * hash bucket lock. Then it looks for waiters on that futex in the hash -+ * bucket and wakes them. -+ * -+ * In futex wake up scenarios where no tasks are blocked on a futex, taking -+ * the hb spinlock can be avoided and simply return. In order for this -+ * optimization to work, ordering guarantees must exist so that the waiter -+ * being added to the list is acknowledged when the list is concurrently being -+ * checked by the waker, avoiding scenarios like the following: -+ * -+ * CPU 0 CPU 1 -+ * val = *futex; -+ * sys_futex(WAIT, futex, val); -+ * futex_wait(futex, val); -+ * uval = *futex; -+ * *futex = newval; -+ * sys_futex(WAKE, futex); -+ * futex_wake(futex); -+ * if (queue_empty()) -+ * return; -+ * if (uval == val) -+ * lock(hash_bucket(futex)); -+ * queue(); -+ * unlock(hash_bucket(futex)); -+ * schedule(); -+ * -+ * This would cause the waiter on CPU 0 to wait forever because it -+ * missed the transition of the user space value from val to newval -+ * and the waker did not find the waiter in the hash bucket queue. 
-+ * -+ * The correct serialization ensures that a waiter either observes -+ * the changed user space value before blocking or is woken by a -+ * concurrent waker: -+ * -+ * CPU 0 CPU 1 -+ * val = *futex; -+ * sys_futex(WAIT, futex, val); -+ * futex_wait(futex, val); -+ * -+ * waiters++; (a) -+ * smp_mb(); (A) <-- paired with -. -+ * | -+ * lock(hash_bucket(futex)); | -+ * | -+ * uval = *futex; | -+ * | *futex = newval; -+ * | sys_futex(WAKE, futex); -+ * | futex_wake(futex); -+ * | -+ * `--------> smp_mb(); (B) -+ * if (uval == val) -+ * queue(); -+ * unlock(hash_bucket(futex)); -+ * schedule(); if (waiters) -+ * lock(hash_bucket(futex)); -+ * else wake_waiters(futex); -+ * waiters--; (b) unlock(hash_bucket(futex)); -+ * -+ * Where (A) orders the waiters increment and the futex value read through -+ * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write -+ * to futex and the waiters read (see futex_hb_waiters_pending()). -+ * -+ * This yields the following case (where X:=waiters, Y:=futex): -+ * -+ * X = Y = 0 -+ * -+ * w[X]=1 w[Y]=1 -+ * MB MB -+ * r[Y]=y r[X]=x -+ * -+ * Which guarantees that x==0 && y==0 is impossible; which translates back into -+ * the guarantee that we cannot both miss the futex variable change and the -+ * enqueue. -+ * -+ * Note that a new waiter is accounted for in (a) even when it is possible that -+ * the wait call can return error, in which case we backtrack from it in (b). -+ * Refer to the comment in futex_q_lock(). -+ * -+ * Similarly, in order to account for waiters being requeued on another -+ * address we always increment the waiters for the destination bucket before -+ * acquiring the lock. It then decrements them again after releasing it - -+ * the code that actually moves the futex(es) between hash buckets (requeue_futex) -+ * will do the additional required waiter count housekeeping. This is done for -+ * double_lock_hb() and double_unlock_hb(), respectively. 
-+ */ -+ -+/* -+ * The hash bucket lock must be held when this is called. -+ * Afterwards, the futex_q must not be accessed. Callers -+ * must ensure to later call wake_up_q() for the actual -+ * wakeups to occur. -+ */ -+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) -+{ -+ struct task_struct *p = q->task; -+ -+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) -+ return; -+ -+ get_task_struct(p); -+ __futex_unqueue(q); -+ /* -+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL -+ * is written, without taking any locks. This is possible in the event -+ * of a spurious wakeup, for example. A memory barrier is required here -+ * to prevent the following store to lock_ptr from getting ahead of the -+ * plist_del in __futex_unqueue(). -+ */ -+ smp_store_release(&q->lock_ptr, NULL); -+ -+ /* -+ * Queue the task for later wakeup for after we've released -+ * the hb->lock. -+ */ -+ wake_q_add_safe(wake_q, p); -+} -+ -+/* -+ * Wake up waiters matching bitset queued on this futex (uaddr). 
-+ */ -+int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) -+{ -+ struct futex_hash_bucket *hb; -+ struct futex_q *this, *next; -+ union futex_key key = FUTEX_KEY_INIT; -+ int ret; -+ DEFINE_WAKE_Q(wake_q); -+ -+ if (!bitset) -+ return -EINVAL; -+ -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ hb = futex_hash(&key); -+ -+ /* Make sure we really have tasks to wakeup */ -+ if (!futex_hb_waiters_pending(hb)) -+ return ret; -+ -+ spin_lock(&hb->lock); -+ -+ plist_for_each_entry_safe(this, next, &hb->chain, list) { -+ if (futex_match (&this->key, &key)) { -+ if (this->pi_state || this->rt_waiter) { -+ ret = -EINVAL; -+ break; -+ } -+ -+ /* Check if one of the bits is set in both bitsets */ -+ if (!(this->bitset & bitset)) -+ continue; -+ -+ futex_wake_mark(&wake_q, this); -+ if (++ret >= nr_wake) -+ break; -+ } -+ } -+ -+ spin_unlock(&hb->lock); -+ wake_up_q(&wake_q); -+ return ret; -+} -+ -+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) -+{ -+ unsigned int op = (encoded_op & 0x70000000) >> 28; -+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24; -+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); -+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); -+ int oldval, ret; -+ -+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { -+ if (oparg < 0 || oparg > 31) { -+ char comm[sizeof(current->comm)]; -+ /* -+ * kill this print and return -EINVAL when userspace -+ * is sane again -+ */ -+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", -+ get_task_comm(comm, current), oparg); -+ oparg &= 31; -+ } -+ oparg = 1 << oparg; -+ } -+ -+ pagefault_disable(); -+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); -+ pagefault_enable(); -+ if (ret) -+ return ret; -+ -+ switch (cmp) { -+ case FUTEX_OP_CMP_EQ: -+ return oldval == cmparg; -+ case FUTEX_OP_CMP_NE: -+ return oldval != cmparg; -+ 
case FUTEX_OP_CMP_LT: -+ return oldval < cmparg; -+ case FUTEX_OP_CMP_GE: -+ return oldval >= cmparg; -+ case FUTEX_OP_CMP_LE: -+ return oldval <= cmparg; -+ case FUTEX_OP_CMP_GT: -+ return oldval > cmparg; -+ default: -+ return -ENOSYS; -+ } -+} -+ -+/* -+ * Wake up all waiters hashed on the physical page that is mapped -+ * to this virtual address: -+ */ -+int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, -+ int nr_wake, int nr_wake2, int op) -+{ -+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; -+ struct futex_hash_bucket *hb1, *hb2; -+ struct futex_q *this, *next; -+ int ret, op_ret; -+ DEFINE_WAKE_Q(wake_q); -+ -+retry: -+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+ hb1 = futex_hash(&key1); -+ hb2 = futex_hash(&key2); -+ -+retry_private: -+ double_lock_hb(hb1, hb2); -+ op_ret = futex_atomic_op_inuser(op, uaddr2); -+ if (unlikely(op_ret < 0)) { -+ double_unlock_hb(hb1, hb2); -+ -+ if (!IS_ENABLED(CONFIG_MMU) || -+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { -+ /* -+ * we don't get EFAULT from MMU faults if we don't have -+ * an MMU, but we might get them from range checking -+ */ -+ ret = op_ret; -+ return ret; -+ } -+ -+ if (op_ret == -EFAULT) { -+ ret = fault_in_user_writeable(uaddr2); -+ if (ret) -+ return ret; -+ } -+ -+ cond_resched(); -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; -+ goto retry; -+ } -+ -+ plist_for_each_entry_safe(this, next, &hb1->chain, list) { -+ if (futex_match (&this->key, &key1)) { -+ if (this->pi_state || this->rt_waiter) { -+ ret = -EINVAL; -+ goto out_unlock; -+ } -+ futex_wake_mark(&wake_q, this); -+ if (++ret >= nr_wake) -+ break; -+ } -+ } -+ -+ if (op_ret > 0) { -+ op_ret = 0; -+ plist_for_each_entry_safe(this, next, &hb2->chain, list) { -+ if (futex_match (&this->key, &key2)) { 
-+ if (this->pi_state || this->rt_waiter) { -+ ret = -EINVAL; -+ goto out_unlock; -+ } -+ futex_wake_mark(&wake_q, this); -+ if (++op_ret >= nr_wake2) -+ break; -+ } -+ } -+ ret += op_ret; -+ } -+ -+out_unlock: -+ double_unlock_hb(hb1, hb2); -+ wake_up_q(&wake_q); -+ return ret; -+} -+ -+static long futex_wait_restart(struct restart_block *restart); -+ -+/** -+ * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal -+ * @hb: the futex hash bucket, must be locked by the caller -+ * @q: the futex_q to queue up on -+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout -+ */ -+void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, -+ struct hrtimer_sleeper *timeout) -+{ -+ /* -+ * The task state is guaranteed to be set before another task can -+ * wake it. set_current_state() is implemented using smp_store_mb() and -+ * futex_queue() calls spin_unlock() upon completion, both serializing -+ * access to the hash list and forcing another memory barrier. -+ */ -+ set_current_state(TASK_INTERRUPTIBLE); -+ futex_queue(q, hb); -+ -+ /* Arm the timer */ -+ if (timeout) -+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); -+ -+ /* -+ * If we have been removed from the hash list, then another task -+ * has tried to wake us, and we can skip the call to schedule(). -+ */ -+ if (likely(!plist_node_empty(&q->list))) { -+ /* -+ * If the timer has already expired, current will already be -+ * flagged for rescheduling. Only call schedule if there -+ * is no timeout, or if it has yet to expire. -+ */ -+ if (!timeout || timeout->task) -+ freezable_schedule(); -+ } -+ __set_current_state(TASK_RUNNING); -+} -+ -+/** -+ * unqueue_multiple - Remove various futexes from their hash bucket -+ * @v: The list of futexes to unqueue -+ * @count: Number of futexes in the list -+ * -+ * Helper to unqueue a list of futexes. This can't fail. 
-+ * -+ * Return: -+ * - >=0 - Index of the last futex that was awoken; -+ * - -1 - No futex was awoken -+ */ -+static int unqueue_multiple(struct futex_vector *v, int count) -+{ -+ int ret = -1, i; -+ -+ for (i = 0; i < count; i++) { -+ if (!futex_unqueue(&v[i].q)) -+ ret = i; -+ } -+ -+ return ret; -+} -+ -+/** -+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes -+ * @vs: The futex list to wait on -+ * @count: The size of the list -+ * @awaken: Index of the last awoken futex, if any. Used to notify the -+ * caller that it can return this index to userspace (return parameter) -+ * -+ * Prepare multiple futexes in a single step and enqueue them. This may fail if -+ * the futex list is invalid or if any futex was already awoken. On success the -+ * task is ready to interruptible sleep. -+ * -+ * Return: -+ * - 1 - One of the futexes was awaken by another thread -+ * - 0 - Success -+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL -+ */ -+static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *awaken) -+{ -+ struct futex_hash_bucket *hb; -+ bool retry = false; -+ int ret, i; -+ u32 uval; -+ -+ /* -+ * Enqueuing multiple futexes is tricky, because we need to enqueue -+ * each futex in the list before dealing with the next one to avoid -+ * deadlocking on the hash bucket. But, before enqueuing, we need to -+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't -+ * absorb any awake events, which cannot be done before the -+ * get_futex_key of the next key, because it calls get_user_pages, -+ * which can sleep. Thus, we fetch the list of futexes keys in two -+ * steps, by first pinning all the memory keys in the futex key, and -+ * only then we read each key and queue the corresponding futex. -+ * -+ * Private futexes doesn't need to recalculate hash in retry, so skip -+ * get_futex_key() when retrying. 
-+ */ -+retry: -+ for (i = 0; i < count; i++) { -+ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) -+ continue; -+ -+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), -+ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), -+ &vs[i].q.key, FUTEX_READ); -+ -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; -+ struct futex_q *q = &vs[i].q; -+ u32 val = (u32)vs[i].w.val; -+ -+ hb = futex_q_lock(q); -+ ret = futex_get_value_locked(&uval, uaddr); -+ -+ if (!ret && uval == val) { -+ /* -+ * The bucket lock can't be held while dealing with the -+ * next futex. Queue each futex at this moment so hb can -+ * be unlocked. -+ */ -+ futex_queue(q, hb); -+ continue; -+ } -+ -+ futex_q_unlock(hb); -+ __set_current_state(TASK_RUNNING); -+ -+ /* -+ * Even if something went wrong, if we find out that a futex -+ * was awaken, we don't return error and return this index to -+ * userspace -+ */ -+ *awaken = unqueue_multiple(vs, i); -+ if (*awaken >= 0) -+ return 1; -+ -+ if (ret) { -+ /* -+ * If we need to handle a page fault, we need to do so -+ * without any lock and any enqueued futex (otherwise -+ * we could lose some wakeup). So we do it here, after -+ * undoing all the work done so far. In success, we -+ * retry all the work. -+ */ -+ if (get_user(uval, uaddr)) -+ return -EFAULT; -+ -+ retry = true; -+ goto retry; -+ } -+ -+ if (uval != val) -+ return -EWOULDBLOCK; -+ } -+ -+ return 0; -+} -+ -+/** -+ * futex_sleep_multiple - Check sleeping conditions and sleep -+ * @vs: List of futexes to wait for -+ * @count: Length of vs -+ * @to: Timeout -+ * -+ * Sleep if and only if the timeout hasn't expired and no futex on the list has -+ * been awaken. 
-+ */ -+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, -+ struct hrtimer_sleeper *to) -+{ -+ if (to && !to->task) -+ return; -+ -+ for (; count; count--, vs++) { -+ if (!READ_ONCE(vs->q.lock_ptr)) -+ return; -+ } -+ -+ freezable_schedule(); -+} -+ -+/** -+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes -+ * @vs: The list of futexes to wait on -+ * @count: The number of objects -+ * @to: Timeout before giving up and returning to userspace -+ * -+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function -+ * sleeps on a group of futexes and returns on the first futex that is -+ * wake, or after the timeout has elapsed. -+ * -+ * Return: -+ * - >=0 - Hint to the futex that was awoken -+ * - <0 - On error -+ */ -+int futex_wait_multiple(struct futex_vector *vs, unsigned int count, -+ struct hrtimer_sleeper *to) -+{ -+ int ret, hint = 0; -+ -+ if (to) -+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); -+ -+ while (1) { -+ ret = futex_wait_multiple_setup(vs, count, &hint); -+ if (ret) { -+ if (ret > 0) { -+ /* A futex was awaken during setup */ -+ ret = hint; -+ } -+ return ret; -+ } -+ -+ futex_sleep_multiple(vs, count, to); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ ret = unqueue_multiple(vs, count); -+ if (ret >= 0) -+ return ret; -+ -+ if (to && !to->task) -+ return -ETIMEDOUT; -+ else if (signal_pending(current)) -+ return -ERESTARTSYS; -+ /* -+ * The final case is a spurious wakeup, for -+ * which just retry. -+ */ -+ } -+} -+ -+/** -+ * futex_wait_setup() - Prepare to wait on a futex -+ * @uaddr: the futex userspace address -+ * @val: the expected value -+ * @flags: futex flags (FLAGS_SHARED, etc.) -+ * @q: the associated futex_q -+ * @hb: storage for hash_bucket pointer to be returned to caller -+ * -+ * Setup the futex_q and locate the hash_bucket. Get the futex value and -+ * compare it with the expected value. Handle atomic faults internally. 
-+ * Return with the hb lock held on success, and unlocked on failure. -+ * -+ * Return: -+ * - 0 - uaddr contains val and hb has been locked; -+ * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked -+ */ -+int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, -+ struct futex_q *q, struct futex_hash_bucket **hb) -+{ -+ u32 uval; -+ int ret; -+ -+ /* -+ * Access the page AFTER the hash-bucket is locked. -+ * Order is important: -+ * -+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); -+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } -+ * -+ * The basic logical guarantee of a futex is that it blocks ONLY -+ * if cond(var) is known to be true at the time of blocking, for -+ * any cond. If we locked the hash-bucket after testing *uaddr, that -+ * would open a race condition where we could block indefinitely with -+ * cond(var) false, which would violate the guarantee. -+ * -+ * On the other hand, we insert q and release the hash-bucket only -+ * after testing *uaddr. This guarantees that futex_wait() will NOT -+ * absorb a wakeup if *uaddr does not match the desired values -+ * while the syscall executes. 
-+ */ -+retry: -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; -+ -+retry_private: -+ *hb = futex_q_lock(q); -+ -+ ret = futex_get_value_locked(&uval, uaddr); -+ -+ if (ret) { -+ futex_q_unlock(*hb); -+ -+ ret = get_user(uval, uaddr); -+ if (ret) -+ return ret; -+ -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; -+ -+ goto retry; -+ } -+ -+ if (uval != val) { -+ futex_q_unlock(*hb); -+ ret = -EWOULDBLOCK; -+ } -+ -+ return ret; -+} -+ -+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ struct restart_block *restart; -+ struct futex_hash_bucket *hb; -+ struct futex_q q = futex_q_init; -+ int ret; -+ -+ if (!bitset) -+ return -EINVAL; -+ q.bitset = bitset; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, -+ current->timer_slack_ns); -+retry: -+ /* -+ * Prepare to wait on uaddr. On success, it holds hb->lock and q -+ * is initialized. -+ */ -+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); -+ if (ret) -+ goto out; -+ -+ /* futex_queue and wait for wakeup, timeout, or a signal. */ -+ futex_wait_queue(hb, &q, to); -+ -+ /* If we were woken (and unqueued), we succeeded, whatever. */ -+ ret = 0; -+ if (!futex_unqueue(&q)) -+ goto out; -+ ret = -ETIMEDOUT; -+ if (to && !to->task) -+ goto out; -+ -+ /* -+ * We expect signal_pending(current), but we might be the -+ * victim of a spurious wakeup as well. 
-+ */ -+ if (!signal_pending(current)) -+ goto retry; -+ -+ ret = -ERESTARTSYS; -+ if (!abs_time) -+ goto out; -+ -+ restart = &current->restart_block; -+ restart->futex.uaddr = uaddr; -+ restart->futex.val = val; -+ restart->futex.time = *abs_time; -+ restart->futex.bitset = bitset; -+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; -+ -+ ret = set_restart_fn(restart, futex_wait_restart); -+ -+out: -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ return ret; -+} -+ -+static long futex_wait_restart(struct restart_block *restart) -+{ -+ u32 __user *uaddr = restart->futex.uaddr; -+ ktime_t t, *tp = NULL; -+ -+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { -+ t = restart->futex.time; -+ tp = &t; -+ } -+ restart->fn = do_no_restart_syscall; -+ -+ return (long)futex_wait(uaddr, restart->futex.flags, -+ restart->futex.val, tp, restart->futex.bitset); -+} -+ -diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index f43d89d92..d1944258c 100644 ---- a/kernel/sys_ni.c -+++ b/kernel/sys_ni.c -@@ -143,13 +143,14 @@ COND_SYSCALL(capset); - /* __ARCH_WANT_SYS_CLONE3 */ - COND_SYSCALL(clone3); - --/* kernel/futex.c */ -+/* kernel/futex/syscalls.c */ - COND_SYSCALL(futex); - COND_SYSCALL(futex_time32); - COND_SYSCALL(set_robust_list); - COND_SYSCALL_COMPAT(set_robust_list); - COND_SYSCALL(get_robust_list); - COND_SYSCALL_COMPAT(get_robust_list); -+COND_SYSCALL(futex_waitv); - - /* kernel/hrtimer.c */ - -diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore -index 0e78b49d0..fbcbdb696 100644 ---- a/tools/testing/selftests/futex/functional/.gitignore -+++ b/tools/testing/selftests/futex/functional/.gitignore -@@ -8,3 +8,4 @@ futex_wait_uninitialized_heap - futex_wait_wouldblock - futex_wait - futex_requeue -+futex_waitv -diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile -index bd1fec59e..5cc38de9d 100644 ---- 
a/tools/testing/selftests/futex/functional/Makefile -+++ b/tools/testing/selftests/futex/functional/Makefile -@@ -17,7 +17,8 @@ TEST_GEN_FILES := \ - futex_wait_uninitialized_heap \ - futex_wait_private_mapped_file \ - futex_wait \ -- futex_requeue -+ futex_requeue \ -+ futex_waitv - - TEST_PROGS := run.sh - -diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c -index 1f8f6daaf..3651ce17b 100644 ---- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c -+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c -@@ -17,6 +17,7 @@ - - #include <pthread.h> - #include "futextest.h" -+#include "futex2test.h" - #include "logging.h" - - #define TEST_NAME "futex-wait-timeout" -@@ -96,6 +97,12 @@ int main(int argc, char *argv[]) - struct timespec to; - pthread_t thread; - int c; -+ struct futex_waitv waitv = { -+ .uaddr = (uintptr_t)&f1, -+ .val = f1, -+ .flags = FUTEX_32, -+ .__reserved = 0 -+ }; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { -@@ -118,7 +125,7 @@ int main(int argc, char *argv[]) - } - - ksft_print_header(); -- ksft_set_plan(7); -+ ksft_set_plan(9); - ksft_print_msg("%s: Block on a futex and wait for timeout\n", - basename(argv[0])); - ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); -@@ -175,6 +182,18 @@ int main(int argc, char *argv[]) - res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME); - test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS); - -+ /* futex_waitv with CLOCK_MONOTONIC */ -+ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns)) -+ return RET_FAIL; -+ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); -+ test_timeout(res, &ret, "futex_waitv monotonic", ETIMEDOUT); -+ -+ /* futex_waitv with CLOCK_REALTIME */ -+ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns)) -+ return RET_FAIL; -+ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME); -+ test_timeout(res, 
&ret, "futex_waitv realtime", ETIMEDOUT); -+ - ksft_print_cnts(); - return ret; - } -diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -index 0ae390ff8..7d7a6a06c 100644 ---- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -+++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -@@ -22,6 +22,7 @@ - #include <string.h> - #include <time.h> - #include "futextest.h" -+#include "futex2test.h" - #include "logging.h" - - #define TEST_NAME "futex-wait-wouldblock" -@@ -42,6 +43,12 @@ int main(int argc, char *argv[]) - futex_t f1 = FUTEX_INITIALIZER; - int res, ret = RET_PASS; - int c; -+ struct futex_waitv waitv = { -+ .uaddr = (uintptr_t)&f1, -+ .val = f1+1, -+ .flags = FUTEX_32, -+ .__reserved = 0 -+ }; - - while ((c = getopt(argc, argv, "cht:v:")) != -1) { - switch (c) { -@@ -61,18 +68,44 @@ int main(int argc, char *argv[]) - } - - ksft_print_header(); -- ksft_set_plan(1); -+ ksft_set_plan(2); - ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", - basename(argv[0])); - - info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); - res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG); - if (!res || errno != EWOULDBLOCK) { -- fail("futex_wait returned: %d %s\n", -- res ? errno : res, res ? strerror(errno) : ""); -+ ksft_test_result_fail("futex_wait returned: %d %s\n", -+ res ? errno : res, -+ res ? 
strerror(errno) : ""); - ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_wait\n"); - } - -- print_result(TEST_NAME, ret); -+ if (clock_gettime(CLOCK_MONOTONIC, &to)) { -+ error("clock_gettime failed\n", errno); -+ return errno; -+ } -+ -+ to.tv_nsec += timeout_ns; -+ -+ if (to.tv_nsec >= 1000000000) { -+ to.tv_sec++; -+ to.tv_nsec -= 1000000000; -+ } -+ -+ info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1); -+ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC); -+ if (!res || errno != EWOULDBLOCK) { -+ ksft_test_result_pass("futex_waitv returned: %d %s\n", -+ res ? errno : res, -+ res ? strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_waitv\n"); -+ } -+ -+ ksft_print_cnts(); - return ret; - } -diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c -new file mode 100644 -index 000000000..a94337f67 ---- /dev/null -+++ b/tools/testing/selftests/futex/functional/futex_waitv.c -@@ -0,0 +1,237 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+/* -+ * futex_waitv() test by AndrĂ© Almeida <andrealmeid@collabora.com> -+ * -+ * Copyright 2021 Collabora Ltd. 
-+ */ -+ -+#include <errno.h> -+#include <error.h> -+#include <getopt.h> -+#include <stdio.h> -+#include <stdlib.h> -+#include <string.h> -+#include <time.h> -+#include <pthread.h> -+#include <stdint.h> -+#include <sys/shm.h> -+#include "futextest.h" -+#include "futex2test.h" -+#include "logging.h" -+ -+#define TEST_NAME "futex-wait" -+#define WAKE_WAIT_US 10000 -+#define NR_FUTEXES 30 -+static struct futex_waitv waitv[NR_FUTEXES]; -+u_int32_t futexes[NR_FUTEXES] = {0}; -+ -+void usage(char *prog) -+{ -+ printf("Usage: %s\n", prog); -+ printf(" -c Use color\n"); -+ printf(" -h Display this help message\n"); -+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", -+ VQUIET, VCRITICAL, VINFO); -+} -+ -+void *waiterfn(void *arg) -+{ -+ struct timespec to; -+ int res; -+ -+ /* setting absolute timeout for futex2 */ -+ if (clock_gettime(CLOCK_MONOTONIC, &to)) -+ error("gettime64 failed\n", errno); -+ -+ to.tv_sec++; -+ -+ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); -+ if (res < 0) { -+ ksft_test_result_fail("futex_waitv returned: %d %s\n", -+ errno, strerror(errno)); -+ } else if (res != NR_FUTEXES - 1) { -+ ksft_test_result_fail("futex_waitv returned: %d, expecting %d\n", -+ res, NR_FUTEXES - 1); -+ } -+ -+ return NULL; -+} -+ -+int main(int argc, char *argv[]) -+{ -+ pthread_t waiter; -+ int res, ret = RET_PASS; -+ struct timespec to; -+ int c, i; -+ -+ while ((c = getopt(argc, argv, "cht:v:")) != -1) { -+ switch (c) { -+ case 'c': -+ log_color(1); -+ break; -+ case 'h': -+ usage(basename(argv[0])); -+ exit(0); -+ case 'v': -+ log_verbosity(atoi(optarg)); -+ break; -+ default: -+ usage(basename(argv[0])); -+ exit(1); -+ } -+ } -+ -+ ksft_print_header(); -+ ksft_set_plan(7); -+ ksft_print_msg("%s: Test FUTEX_WAITV\n", -+ basename(argv[0])); -+ -+ for (i = 0; i < NR_FUTEXES; i++) { -+ waitv[i].uaddr = (uintptr_t)&futexes[i]; -+ waitv[i].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG; -+ waitv[i].val = 0; -+ waitv[i].__reserved = 0; -+ } -+ -+ /* 
Private waitv */ -+ if (pthread_create(&waiter, NULL, waiterfn, NULL)) -+ error("pthread_create failed\n", errno); -+ -+ usleep(WAKE_WAIT_US); -+ -+ res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, FUTEX_PRIVATE_FLAG); -+ if (res != 1) { -+ ksft_test_result_fail("futex_wake private returned: %d %s\n", -+ res ? errno : res, -+ res ? strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_waitv private\n"); -+ } -+ -+ /* Shared waitv */ -+ for (i = 0; i < NR_FUTEXES; i++) { -+ int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); -+ -+ if (shm_id < 0) { -+ perror("shmget"); -+ exit(1); -+ } -+ -+ unsigned int *shared_data = shmat(shm_id, NULL, 0); -+ -+ *shared_data = 0; -+ waitv[i].uaddr = (uintptr_t)shared_data; -+ waitv[i].flags = FUTEX_32; -+ waitv[i].val = 0; -+ waitv[i].__reserved = 0; -+ } -+ -+ if (pthread_create(&waiter, NULL, waiterfn, NULL)) -+ error("pthread_create failed\n", errno); -+ -+ usleep(WAKE_WAIT_US); -+ -+ res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, 0); -+ if (res != 1) { -+ ksft_test_result_fail("futex_wake shared returned: %d %s\n", -+ res ? errno : res, -+ res ? strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_waitv shared\n"); -+ } -+ -+ for (i = 0; i < NR_FUTEXES; i++) -+ shmdt(u64_to_ptr(waitv[i].uaddr)); -+ -+ /* Testing a waiter without FUTEX_32 flag */ -+ waitv[0].flags = FUTEX_PRIVATE_FLAG; -+ -+ if (clock_gettime(CLOCK_MONOTONIC, &to)) -+ error("gettime64 failed\n", errno); -+ -+ to.tv_sec++; -+ -+ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); -+ if (res == EINVAL) { -+ ksft_test_result_fail("futex_waitv private returned: %d %s\n", -+ res ? errno : res, -+ res ? 
strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_waitv without FUTEX_32\n"); -+ } -+ -+ /* Testing a waiter with an unaligned address */ -+ waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32; -+ waitv[0].uaddr = 1; -+ -+ if (clock_gettime(CLOCK_MONOTONIC, &to)) -+ error("gettime64 failed\n", errno); -+ -+ to.tv_sec++; -+ -+ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); -+ if (res == EINVAL) { -+ ksft_test_result_fail("futex_wake private returned: %d %s\n", -+ res ? errno : res, -+ res ? strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_waitv with an unaligned address\n"); -+ } -+ -+ /* Testing a NULL address for waiters.uaddr */ -+ waitv[0].uaddr = 0x00000000; -+ -+ if (clock_gettime(CLOCK_MONOTONIC, &to)) -+ error("gettime64 failed\n", errno); -+ -+ to.tv_sec++; -+ -+ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); -+ if (res == EINVAL) { -+ ksft_test_result_fail("futex_waitv private returned: %d %s\n", -+ res ? errno : res, -+ res ? strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_waitv NULL address in waitv.uaddr\n"); -+ } -+ -+ /* Testing a NULL address for *waiters */ -+ if (clock_gettime(CLOCK_MONOTONIC, &to)) -+ error("gettime64 failed\n", errno); -+ -+ to.tv_sec++; -+ -+ res = futex_waitv(NULL, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC); -+ if (res == EINVAL) { -+ ksft_test_result_fail("futex_waitv private returned: %d %s\n", -+ res ? errno : res, -+ res ? strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_waitv NULL address in *waiters\n"); -+ } -+ -+ /* Testing an invalid clockid */ -+ if (clock_gettime(CLOCK_MONOTONIC, &to)) -+ error("gettime64 failed\n", errno); -+ -+ to.tv_sec++; -+ -+ res = futex_waitv(NULL, NR_FUTEXES, 0, &to, CLOCK_TAI); -+ if (res == EINVAL) { -+ ksft_test_result_fail("futex_waitv private returned: %d %s\n", -+ res ? errno : res, -+ res ? 
strerror(errno) : ""); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_waitv invalid clockid\n"); -+ } -+ -+ ksft_print_cnts(); -+ return ret; -+} -diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh -index 11a9d6229..5ccd599da 100755 ---- a/tools/testing/selftests/futex/functional/run.sh -+++ b/tools/testing/selftests/futex/functional/run.sh -@@ -79,3 +79,6 @@ echo - - echo - ./futex_requeue $COLOR -+ -+echo -+./futex_waitv $COLOR -diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h -new file mode 100644 -index 000000000..9d305520e ---- /dev/null -+++ b/tools/testing/selftests/futex/include/futex2test.h -@@ -0,0 +1,22 @@ -+/* SPDX-License-Identifier: GPL-2.0-or-later */ -+/* -+ * Futex2 library addons for futex tests -+ * -+ * Copyright 2021 Collabora Ltd. -+ */ -+#include <stdint.h> -+ -+#define u64_to_ptr(x) ((void *)(uintptr_t)(x)) -+ -+/** -+ * futex_waitv - Wait at multiple futexes, wake on any -+ * @waiters: Array of waiters -+ * @nr_waiters: Length of waiters array -+ * @flags: Operation flags -+ * @timo: Optional timeout for operation -+ */ -+static inline int futex_waitv(volatile struct futex_waitv *waiters, unsigned long nr_waiters, -+ unsigned long flags, struct timespec *timo, clockid_t clockid) -+{ -+ return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid); -+} --- -2.33.1.711.g9d530dc002 - - diff --git a/0104-revert-xhci-Add-support-for-Renesas-controller-with-memory.patch b/0104-revert-xhci-Add-support-for-Renesas-controller-with-memory.patch deleted file mode 100644 index dafb17784fdb..000000000000 --- a/0104-revert-xhci-Add-support-for-Renesas-controller-with-memory.patch +++ /dev/null @@ -1,89 +0,0 @@ ---- b/drivers/usb/host/xhci-pci.c -+++ a/drivers/usb/host/xhci-pci.c -@@ -636,14 +636,7 @@ - { /* end: all zeroes */ } - }; - MODULE_DEVICE_TABLE(pci, pci_ids); -- --/* -- * Without 
CONFIG_USB_XHCI_PCI_RENESAS renesas_xhci_check_request_fw() won't -- * load firmware, so don't encumber the xhci-pci driver with it. -- */ --#if IS_ENABLED(CONFIG_USB_XHCI_PCI_RENESAS) - MODULE_FIRMWARE("renesas_usb_fw.mem"); --#endif - - /* pci driver glue; this is a "new style" PCI driver module */ - static struct pci_driver xhci_pci_driver = { ---- b/drivers/usb/host/xhci-pci.c -+++ a/drivers/usb/host/xhci-pci.c -@@ -16,7 +16,6 @@ - - #include "xhci.h" - #include "xhci-trace.h" --#include "xhci-pci.h" - - #define SSIC_PORT_NUM 2 - #define SSIC_PORT_CFG2 0x880c -@@ -92,16 +91,7 @@ static int xhci_pci_reinit(struct xhci_h - - static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) - { -- struct pci_dev *pdev = to_pci_dev(dev); -- struct xhci_driver_data *driver_data; -- const struct pci_device_id *id; -- -- id = pci_match_id(pdev->driver->id_table, pdev); -- -- if (id && id->driver_data) { -- driver_data = (struct xhci_driver_data *)id->driver_data; -- xhci->quirks |= driver_data->quirks; -- } -+ struct pci_dev *pdev = to_pci_dev(dev); - - /* Look for vendor-specific quirks */ - if (pdev->vendor == PCI_VENDOR_ID_FRESCO_LOGIC && -@@ -346,16 +336,8 @@ static int xhci_pci_probe(struct pci_dev - int retval; - struct xhci_hcd *xhci; - struct usb_hcd *hcd; -- struct xhci_driver_data *driver_data; - struct reset_control *reset; - -- driver_data = (struct xhci_driver_data *)id->driver_data; -- if (driver_data && driver_data->quirks & XHCI_RENESAS_FW_QUIRK) { -- retval = renesas_xhci_check_request_fw(dev, id); -- if (retval) -- return retval; -- } -- - reset = devm_reset_control_get_optional_exclusive(&dev->dev, NULL); - if (IS_ERR(reset)) - return PTR_ERR(reset); -@@ -578,26 +557,14 @@ static void xhci_pci_shutdown(struct usb - - /*-------------------------------------------------------------------------*/ - --static const struct xhci_driver_data reneses_data = { -- .quirks = XHCI_RENESAS_FW_QUIRK, -- .firmware = "renesas_usb_fw.mem", --}; -- - /* PCI 
driver selection metadata; PCI hotplugging uses this */ - static const struct pci_device_id pci_ids[] = { -- { PCI_DEVICE(0x1912, 0x0014), -- .driver_data = (unsigned long)&reneses_data, -- }, -- { PCI_DEVICE(0x1912, 0x0015), -- .driver_data = (unsigned long)&reneses_data, -- }, - /* handle any USB 3.0 xHCI controller */ - { PCI_DEVICE_CLASS(PCI_CLASS_SERIAL_USB_XHCI, ~0), - }, - { /* end: all zeroes */ } - }; - MODULE_DEVICE_TABLE(pci, pci_ids); --MODULE_FIRMWARE("renesas_usb_fw.mem"); - - /* pci driver glue; this is a "new style" PCI driver module */ - static struct pci_driver xhci_pci_driver = { diff --git a/0108-drm_i915_Add_workaround_numbers_to_GEN7_COMMON_SLICE_CHICKEN1_whitelisting.patch b/0108-drm_i915_Add_workaround_numbers_to_GEN7_COMMON_SLICE_CHICKEN1_whitelisting.patch deleted file mode 100644 index 7e59a4802e0a..000000000000 --- a/0108-drm_i915_Add_workaround_numbers_to_GEN7_COMMON_SLICE_CHICKEN1_whitelisting.patch +++ /dev/null @@ -1,17 +0,0 @@ -diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c -index cd2935b9e7c81..c3211325c2d3e 100644 ---- a/drivers/gpu/drm/i915/gt/intel_workarounds.c -+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c -@@ -1869,7 +1869,11 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine) - RING_FORCE_TO_NONPRIV_ACCESS_RD | - RING_FORCE_TO_NONPRIV_RANGE_4); - -- /* Wa_1808121037:tgl */ -+ /* -+ * Wa_1808121037:tgl -+ * Wa_14012131227:dg1 -+ * Wa_1508744258:tgl,rkl,dg1,adl-s,adl-p -+ */ - whitelist_reg(w, GEN7_COMMON_SLICE_CHICKEN1); - - /* Wa_1806527549:tgl */ diff --git a/0201-lenovo-wmi2.patch b/0201-lenovo-wmi2.patch deleted file mode 100644 index c6b1b0603651..000000000000 --- a/0201-lenovo-wmi2.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c -index 791449a2370f..45d9010aafcf 100644 ---- a/drivers/platform/x86/wmi.c -+++ b/drivers/platform/x86/wmi.c -@@ -1081,7 +1081,8 @@ static int 
wmi_create_device(struct device *wmi_bus_dev, - wblock->dev.dev.bus = &wmi_bus_type; - wblock->dev.dev.parent = wmi_bus_dev; - -- dev_set_name(&wblock->dev.dev, "%pUL", gblock->guid); -+ dev_set_name(&wblock->dev.dev, "%s-%pUL", -+ dev_name(&wblock->acpi_device->dev), gblock->guid); - - device_initialize(&wblock->dev.dev); - - diff --git a/0301-revert-garbage-collect-fbdev-scrolling-acceleration.patch b/0301-revert-garbage-collect-fbdev-scrolling-acceleration.patch new file mode 100644 index 000000000000..280ed9645c31 --- /dev/null +++ b/0301-revert-garbage-collect-fbdev-scrolling-acceleration.patch @@ -0,0 +1,1038 @@ +--- b/Documentation/gpu/todo.rst ++++ a/Documentation/gpu/todo.rst +@@ -314,19 +314,16 @@ + Garbage collect fbdev scrolling acceleration + -------------------------------------------- + ++Scroll acceleration is disabled in fbcon by hard-wiring p->scrollmode = ++SCROLL_REDRAW. There's a ton of code this will allow us to remove: +-Scroll acceleration has been disabled in fbcon. Now it works as the old +-SCROLL_REDRAW mode. A ton of code was removed in fbcon.c and the hook bmove was +-removed from fbcon_ops. +-Remaining tasks: + ++- lots of code in fbcon.c ++ ++- a bunch of the hooks in fbcon_ops, maybe the remaining hooks could be called +-- a bunch of the hooks in fbcon_ops could be removed or simplified by calling + directly instead of the function table (with a switch on p->rotate) + + - fb_copyarea is unused after this, and can be deleted from all drivers + +-- after that, fb_copyarea can be deleted from fb_ops in include/linux/fb.h as +- well as cfb_copyarea +- + Note that not all acceleration code can be deleted, since clearing and cursor + support is still accelerated, which might be good candidates for further + deletion projects. 
+--- b/drivers/video/fbdev/core/bitblit.c ++++ a/drivers/video/fbdev/core/bitblit.c +@@ -43,6 +43,21 @@ + } + } + ++static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy, ++ int sx, int dy, int dx, int height, int width) ++{ ++ struct fb_copyarea area; ++ ++ area.sx = sx * vc->vc_font.width; ++ area.sy = sy * vc->vc_font.height; ++ area.dx = dx * vc->vc_font.width; ++ area.dy = dy * vc->vc_font.height; ++ area.height = height * vc->vc_font.height; ++ area.width = width * vc->vc_font.width; ++ ++ info->fbops->fb_copyarea(info, &area); ++} ++ + static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width) + { +@@ -378,6 +393,7 @@ + + void fbcon_set_bitops(struct fbcon_ops *ops) + { ++ ops->bmove = bit_bmove; + ops->clear = bit_clear; + ops->putcs = bit_putcs; + ops->clear_margins = bit_clear_margins; +--- b/drivers/video/fbdev/core/fbcon.c ++++ a/drivers/video/fbdev/core/fbcon.c +@@ -173,6 +173,8 @@ + int count, int ypos, int xpos); + static void fbcon_clear_margins(struct vc_data *vc, int bottom_only); + static void fbcon_cursor(struct vc_data *vc, int mode); ++static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, ++ int height, int width); + static int fbcon_switch(struct vc_data *vc); + static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch); + static void fbcon_set_palette(struct vc_data *vc, const unsigned char *table); +@@ -180,8 +182,16 @@ + /* + * Internal routines + */ ++static __inline__ void ywrap_up(struct vc_data *vc, int count); ++static __inline__ void ywrap_down(struct vc_data *vc, int count); ++static __inline__ void ypan_up(struct vc_data *vc, int count); ++static __inline__ void ypan_down(struct vc_data *vc, int count); ++static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, int sx, ++ int dy, int dx, int height, int width, u_int y_break); + static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, + int 
unit); ++static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, ++ int line, int count, int dy); + static void fbcon_modechanged(struct fb_info *info); + static void fbcon_set_all_vcs(struct fb_info *info); + static void fbcon_start(void); +@@ -1125,6 +1135,14 @@ + + ops->graphics = 0; + ++ /* ++ * No more hw acceleration for fbcon. ++ * ++ * FIXME: Garbage collect all the now dead code after sufficient time ++ * has passed. ++ */ ++ p->scrollmode = SCROLL_REDRAW; ++ + /* + * ++guenther: console.c:vc_allocate() relies on initializing + * vc_{cols,rows}, but we must not set those if we are only +@@ -1211,13 +1229,14 @@ + * This system is now divided into two levels because of complications + * caused by hardware scrolling. Top level functions: + * ++ * fbcon_bmove(), fbcon_clear(), fbcon_putc(), fbcon_clear_margins() +- * fbcon_clear(), fbcon_putc(), fbcon_clear_margins() + * + * handles y values in range [0, scr_height-1] that correspond to real + * screen positions. y_wrap shift means that first line of bitmap may be + * anywhere on this display. These functions convert lineoffsets to + * bitmap offsets and deal with the wrap-around case by splitting blits. + * ++ * fbcon_bmove_physical_8() -- These functions fast implementations + * fbcon_clear_physical_8() -- of original fbcon_XXX fns. 
+ * fbcon_putc_physical_8() -- (font width != 8) may be added later + * +@@ -1390,6 +1409,224 @@ + } + } + ++static __inline__ void ywrap_up(struct vc_data *vc, int count) ++{ ++ struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; ++ struct fbcon_ops *ops = info->fbcon_par; ++ struct fbcon_display *p = &fb_display[vc->vc_num]; ++ ++ p->yscroll += count; ++ if (p->yscroll >= p->vrows) /* Deal with wrap */ ++ p->yscroll -= p->vrows; ++ ops->var.xoffset = 0; ++ ops->var.yoffset = p->yscroll * vc->vc_font.height; ++ ops->var.vmode |= FB_VMODE_YWRAP; ++ ops->update_start(info); ++ scrollback_max += count; ++ if (scrollback_max > scrollback_phys_max) ++ scrollback_max = scrollback_phys_max; ++ scrollback_current = 0; ++} ++ ++static __inline__ void ywrap_down(struct vc_data *vc, int count) ++{ ++ struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; ++ struct fbcon_ops *ops = info->fbcon_par; ++ struct fbcon_display *p = &fb_display[vc->vc_num]; ++ ++ p->yscroll -= count; ++ if (p->yscroll < 0) /* Deal with wrap */ ++ p->yscroll += p->vrows; ++ ops->var.xoffset = 0; ++ ops->var.yoffset = p->yscroll * vc->vc_font.height; ++ ops->var.vmode |= FB_VMODE_YWRAP; ++ ops->update_start(info); ++ scrollback_max -= count; ++ if (scrollback_max < 0) ++ scrollback_max = 0; ++ scrollback_current = 0; ++} ++ ++static __inline__ void ypan_up(struct vc_data *vc, int count) ++{ ++ struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; ++ struct fbcon_display *p = &fb_display[vc->vc_num]; ++ struct fbcon_ops *ops = info->fbcon_par; ++ ++ p->yscroll += count; ++ if (p->yscroll > p->vrows - vc->vc_rows) { ++ ops->bmove(vc, info, p->vrows - vc->vc_rows, ++ 0, 0, 0, vc->vc_rows, vc->vc_cols); ++ p->yscroll -= p->vrows - vc->vc_rows; ++ } ++ ++ ops->var.xoffset = 0; ++ ops->var.yoffset = p->yscroll * vc->vc_font.height; ++ ops->var.vmode &= ~FB_VMODE_YWRAP; ++ ops->update_start(info); ++ fbcon_clear_margins(vc, 1); ++ scrollback_max += count; ++ if (scrollback_max > 
scrollback_phys_max) ++ scrollback_max = scrollback_phys_max; ++ scrollback_current = 0; ++} ++ ++static __inline__ void ypan_up_redraw(struct vc_data *vc, int t, int count) ++{ ++ struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; ++ struct fbcon_ops *ops = info->fbcon_par; ++ struct fbcon_display *p = &fb_display[vc->vc_num]; ++ ++ p->yscroll += count; ++ ++ if (p->yscroll > p->vrows - vc->vc_rows) { ++ p->yscroll -= p->vrows - vc->vc_rows; ++ fbcon_redraw_move(vc, p, t + count, vc->vc_rows - count, t); ++ } ++ ++ ops->var.xoffset = 0; ++ ops->var.yoffset = p->yscroll * vc->vc_font.height; ++ ops->var.vmode &= ~FB_VMODE_YWRAP; ++ ops->update_start(info); ++ fbcon_clear_margins(vc, 1); ++ scrollback_max += count; ++ if (scrollback_max > scrollback_phys_max) ++ scrollback_max = scrollback_phys_max; ++ scrollback_current = 0; ++} ++ ++static __inline__ void ypan_down(struct vc_data *vc, int count) ++{ ++ struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; ++ struct fbcon_display *p = &fb_display[vc->vc_num]; ++ struct fbcon_ops *ops = info->fbcon_par; ++ ++ p->yscroll -= count; ++ if (p->yscroll < 0) { ++ ops->bmove(vc, info, 0, 0, p->vrows - vc->vc_rows, ++ 0, vc->vc_rows, vc->vc_cols); ++ p->yscroll += p->vrows - vc->vc_rows; ++ } ++ ++ ops->var.xoffset = 0; ++ ops->var.yoffset = p->yscroll * vc->vc_font.height; ++ ops->var.vmode &= ~FB_VMODE_YWRAP; ++ ops->update_start(info); ++ fbcon_clear_margins(vc, 1); ++ scrollback_max -= count; ++ if (scrollback_max < 0) ++ scrollback_max = 0; ++ scrollback_current = 0; ++} ++ ++static __inline__ void ypan_down_redraw(struct vc_data *vc, int t, int count) ++{ ++ struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; ++ struct fbcon_ops *ops = info->fbcon_par; ++ struct fbcon_display *p = &fb_display[vc->vc_num]; ++ ++ p->yscroll -= count; ++ ++ if (p->yscroll < 0) { ++ p->yscroll += p->vrows - vc->vc_rows; ++ fbcon_redraw_move(vc, p, t, vc->vc_rows - count, t + count); ++ } ++ ++ ops->var.xoffset 
= 0; ++ ops->var.yoffset = p->yscroll * vc->vc_font.height; ++ ops->var.vmode &= ~FB_VMODE_YWRAP; ++ ops->update_start(info); ++ fbcon_clear_margins(vc, 1); ++ scrollback_max -= count; ++ if (scrollback_max < 0) ++ scrollback_max = 0; ++ scrollback_current = 0; ++} ++ ++static void fbcon_redraw_move(struct vc_data *vc, struct fbcon_display *p, ++ int line, int count, int dy) ++{ ++ unsigned short *s = (unsigned short *) ++ (vc->vc_origin + vc->vc_size_row * line); ++ ++ while (count--) { ++ unsigned short *start = s; ++ unsigned short *le = advance_row(s, 1); ++ unsigned short c; ++ int x = 0; ++ unsigned short attr = 1; ++ ++ do { ++ c = scr_readw(s); ++ if (attr != (c & 0xff00)) { ++ attr = c & 0xff00; ++ if (s > start) { ++ fbcon_putcs(vc, start, s - start, ++ dy, x); ++ x += s - start; ++ start = s; ++ } ++ } ++ console_conditional_schedule(); ++ s++; ++ } while (s < le); ++ if (s > start) ++ fbcon_putcs(vc, start, s - start, dy, x); ++ console_conditional_schedule(); ++ dy++; ++ } ++} ++ ++static void fbcon_redraw_blit(struct vc_data *vc, struct fb_info *info, ++ struct fbcon_display *p, int line, int count, int ycount) ++{ ++ int offset = ycount * vc->vc_cols; ++ unsigned short *d = (unsigned short *) ++ (vc->vc_origin + vc->vc_size_row * line); ++ unsigned short *s = d + offset; ++ struct fbcon_ops *ops = info->fbcon_par; ++ ++ while (count--) { ++ unsigned short *start = s; ++ unsigned short *le = advance_row(s, 1); ++ unsigned short c; ++ int x = 0; ++ ++ do { ++ c = scr_readw(s); ++ ++ if (c == scr_readw(d)) { ++ if (s > start) { ++ ops->bmove(vc, info, line + ycount, x, ++ line, x, 1, s-start); ++ x += s - start + 1; ++ start = s + 1; ++ } else { ++ x++; ++ start++; ++ } ++ } ++ ++ scr_writew(c, d); ++ console_conditional_schedule(); ++ s++; ++ d++; ++ } while (s < le); ++ if (s > start) ++ ops->bmove(vc, info, line + ycount, x, line, x, 1, ++ s-start); ++ console_conditional_schedule(); ++ if (ycount > 0) ++ line++; ++ else { ++ line--; ++ /* NOTE: We 
subtract two lines from these pointers */ ++ s -= vc->vc_size_row; ++ d -= vc->vc_size_row; ++ } ++ } ++} ++ + static void fbcon_redraw(struct vc_data *vc, struct fbcon_display *p, + int line, int count, int offset) + { +@@ -1450,6 +1687,7 @@ + { + struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; + struct fbcon_display *p = &fb_display[vc->vc_num]; ++ int scroll_partial = info->flags & FBINFO_PARTIAL_PAN_OK; + + if (fbcon_is_inactive(vc, info)) + return true; +@@ -1466,32 +1704,249 @@ + case SM_UP: + if (count > vc->vc_rows) /* Maximum realistic size */ + count = vc->vc_rows; ++ if (logo_shown >= 0) ++ goto redraw_up; ++ switch (p->scrollmode) { ++ case SCROLL_MOVE: ++ fbcon_redraw_blit(vc, info, p, t, b - t - count, ++ count); ++ fbcon_clear(vc, b - count, 0, count, vc->vc_cols); ++ scr_memsetw((unsigned short *) (vc->vc_origin + ++ vc->vc_size_row * ++ (b - count)), ++ vc->vc_video_erase_char, ++ vc->vc_size_row * count); ++ return true; ++ ++ case SCROLL_WRAP_MOVE: ++ if (b - t - count > 3 * vc->vc_rows >> 2) { ++ if (t > 0) ++ fbcon_bmove(vc, 0, 0, count, 0, t, ++ vc->vc_cols); ++ ywrap_up(vc, count); ++ if (vc->vc_rows - b > 0) ++ fbcon_bmove(vc, b - count, 0, b, 0, ++ vc->vc_rows - b, ++ vc->vc_cols); ++ } else if (info->flags & FBINFO_READS_FAST) ++ fbcon_bmove(vc, t + count, 0, t, 0, ++ b - t - count, vc->vc_cols); ++ else ++ goto redraw_up; ++ fbcon_clear(vc, b - count, 0, count, vc->vc_cols); ++ break; ++ ++ case SCROLL_PAN_REDRAW: ++ if ((p->yscroll + count <= ++ 2 * (p->vrows - vc->vc_rows)) ++ && ((!scroll_partial && (b - t == vc->vc_rows)) ++ || (scroll_partial ++ && (b - t - count > ++ 3 * vc->vc_rows >> 2)))) { ++ if (t > 0) ++ fbcon_redraw_move(vc, p, 0, t, count); ++ ypan_up_redraw(vc, t, count); ++ if (vc->vc_rows - b > 0) ++ fbcon_redraw_move(vc, p, b, ++ vc->vc_rows - b, b); ++ } else ++ fbcon_redraw_move(vc, p, t + count, b - t - count, t); ++ fbcon_clear(vc, b - count, 0, count, vc->vc_cols); ++ break; ++ ++ case SCROLL_PAN_MOVE: 
++ if ((p->yscroll + count <= ++ 2 * (p->vrows - vc->vc_rows)) ++ && ((!scroll_partial && (b - t == vc->vc_rows)) ++ || (scroll_partial ++ && (b - t - count > ++ 3 * vc->vc_rows >> 2)))) { ++ if (t > 0) ++ fbcon_bmove(vc, 0, 0, count, 0, t, ++ vc->vc_cols); ++ ypan_up(vc, count); ++ if (vc->vc_rows - b > 0) ++ fbcon_bmove(vc, b - count, 0, b, 0, ++ vc->vc_rows - b, ++ vc->vc_cols); ++ } else if (info->flags & FBINFO_READS_FAST) ++ fbcon_bmove(vc, t + count, 0, t, 0, ++ b - t - count, vc->vc_cols); ++ else ++ goto redraw_up; ++ fbcon_clear(vc, b - count, 0, count, vc->vc_cols); ++ break; ++ ++ case SCROLL_REDRAW: ++ redraw_up: ++ fbcon_redraw(vc, p, t, b - t - count, ++ count * vc->vc_cols); ++ fbcon_clear(vc, b - count, 0, count, vc->vc_cols); ++ scr_memsetw((unsigned short *) (vc->vc_origin + ++ vc->vc_size_row * ++ (b - count)), ++ vc->vc_video_erase_char, ++ vc->vc_size_row * count); ++ return true; ++ } ++ break; +- fbcon_redraw(vc, p, t, b - t - count, +- count * vc->vc_cols); +- fbcon_clear(vc, b - count, 0, count, vc->vc_cols); +- scr_memsetw((unsigned short *) (vc->vc_origin + +- vc->vc_size_row * +- (b - count)), +- vc->vc_video_erase_char, +- vc->vc_size_row * count); +- return true; + + case SM_DOWN: + if (count > vc->vc_rows) /* Maximum realistic size */ + count = vc->vc_rows; ++ if (logo_shown >= 0) ++ goto redraw_down; ++ switch (p->scrollmode) { ++ case SCROLL_MOVE: ++ fbcon_redraw_blit(vc, info, p, b - 1, b - t - count, ++ -count); ++ fbcon_clear(vc, t, 0, count, vc->vc_cols); ++ scr_memsetw((unsigned short *) (vc->vc_origin + ++ vc->vc_size_row * ++ t), ++ vc->vc_video_erase_char, ++ vc->vc_size_row * count); ++ return true; ++ ++ case SCROLL_WRAP_MOVE: ++ if (b - t - count > 3 * vc->vc_rows >> 2) { ++ if (vc->vc_rows - b > 0) ++ fbcon_bmove(vc, b, 0, b - count, 0, ++ vc->vc_rows - b, ++ vc->vc_cols); ++ ywrap_down(vc, count); ++ if (t > 0) ++ fbcon_bmove(vc, count, 0, 0, 0, t, ++ vc->vc_cols); ++ } else if (info->flags & FBINFO_READS_FAST) ++ 
fbcon_bmove(vc, t, 0, t + count, 0, ++ b - t - count, vc->vc_cols); ++ else ++ goto redraw_down; ++ fbcon_clear(vc, t, 0, count, vc->vc_cols); ++ break; ++ ++ case SCROLL_PAN_MOVE: ++ if ((count - p->yscroll <= p->vrows - vc->vc_rows) ++ && ((!scroll_partial && (b - t == vc->vc_rows)) ++ || (scroll_partial ++ && (b - t - count > ++ 3 * vc->vc_rows >> 2)))) { ++ if (vc->vc_rows - b > 0) ++ fbcon_bmove(vc, b, 0, b - count, 0, ++ vc->vc_rows - b, ++ vc->vc_cols); ++ ypan_down(vc, count); ++ if (t > 0) ++ fbcon_bmove(vc, count, 0, 0, 0, t, ++ vc->vc_cols); ++ } else if (info->flags & FBINFO_READS_FAST) ++ fbcon_bmove(vc, t, 0, t + count, 0, ++ b - t - count, vc->vc_cols); ++ else ++ goto redraw_down; ++ fbcon_clear(vc, t, 0, count, vc->vc_cols); ++ break; ++ ++ case SCROLL_PAN_REDRAW: ++ if ((count - p->yscroll <= p->vrows - vc->vc_rows) ++ && ((!scroll_partial && (b - t == vc->vc_rows)) ++ || (scroll_partial ++ && (b - t - count > ++ 3 * vc->vc_rows >> 2)))) { ++ if (vc->vc_rows - b > 0) ++ fbcon_redraw_move(vc, p, b, vc->vc_rows - b, ++ b - count); ++ ypan_down_redraw(vc, t, count); ++ if (t > 0) ++ fbcon_redraw_move(vc, p, count, t, 0); ++ } else ++ fbcon_redraw_move(vc, p, t, b - t - count, t + count); ++ fbcon_clear(vc, t, 0, count, vc->vc_cols); ++ break; ++ ++ case SCROLL_REDRAW: ++ redraw_down: ++ fbcon_redraw(vc, p, b - 1, b - t - count, ++ -count * vc->vc_cols); ++ fbcon_clear(vc, t, 0, count, vc->vc_cols); ++ scr_memsetw((unsigned short *) (vc->vc_origin + ++ vc->vc_size_row * ++ t), ++ vc->vc_video_erase_char, ++ vc->vc_size_row * count); ++ return true; ++ } +- fbcon_redraw(vc, p, b - 1, b - t - count, +- -count * vc->vc_cols); +- fbcon_clear(vc, t, 0, count, vc->vc_cols); +- scr_memsetw((unsigned short *) (vc->vc_origin + +- vc->vc_size_row * +- t), +- vc->vc_video_erase_char, +- vc->vc_size_row * count); +- return true; + } + return false; + } + ++ ++static void fbcon_bmove(struct vc_data *vc, int sy, int sx, int dy, int dx, ++ int height, int width) ++{ 
++ struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; ++ struct fbcon_display *p = &fb_display[vc->vc_num]; ++ ++ if (fbcon_is_inactive(vc, info)) ++ return; ++ ++ if (!width || !height) ++ return; ++ ++ /* Split blits that cross physical y_wrap case. ++ * Pathological case involves 4 blits, better to use recursive ++ * code rather than unrolled case ++ * ++ * Recursive invocations don't need to erase the cursor over and ++ * over again, so we use fbcon_bmove_rec() ++ */ ++ fbcon_bmove_rec(vc, p, sy, sx, dy, dx, height, width, ++ p->vrows - p->yscroll); ++} ++ ++static void fbcon_bmove_rec(struct vc_data *vc, struct fbcon_display *p, int sy, int sx, ++ int dy, int dx, int height, int width, u_int y_break) ++{ ++ struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; ++ struct fbcon_ops *ops = info->fbcon_par; ++ u_int b; ++ ++ if (sy < y_break && sy + height > y_break) { ++ b = y_break - sy; ++ if (dy < sy) { /* Avoid trashing self */ ++ fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, ++ y_break); ++ fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, ++ height - b, width, y_break); ++ } else { ++ fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, ++ height - b, width, y_break); ++ fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, ++ y_break); ++ } ++ return; ++ } ++ ++ if (dy < y_break && dy + height > y_break) { ++ b = y_break - dy; ++ if (dy < sy) { /* Avoid trashing self */ ++ fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, ++ y_break); ++ fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, ++ height - b, width, y_break); ++ } else { ++ fbcon_bmove_rec(vc, p, sy + b, sx, dy + b, dx, ++ height - b, width, y_break); ++ fbcon_bmove_rec(vc, p, sy, sx, dy, dx, b, width, ++ y_break); ++ } ++ return; ++ } ++ ops->bmove(vc, info, real_y(p, sy), sx, real_y(p, dy), dx, ++ height, width); ++} ++ + static void updatescrollmode(struct fbcon_display *p, + struct fb_info *info, + struct vc_data *vc) +@@ -1664,7 +2119,21 @@ + + updatescrollmode(p, info, vc); + ++ switch 
(p->scrollmode) { ++ case SCROLL_WRAP_MOVE: ++ scrollback_phys_max = p->vrows - vc->vc_rows; ++ break; ++ case SCROLL_PAN_MOVE: ++ case SCROLL_PAN_REDRAW: ++ scrollback_phys_max = p->vrows - 2 * vc->vc_rows; ++ if (scrollback_phys_max < 0) ++ scrollback_phys_max = 0; ++ break; ++ default: ++ scrollback_phys_max = 0; ++ break; ++ } ++ +- scrollback_phys_max = 0; + scrollback_max = 0; + scrollback_current = 0; + +--- b/drivers/video/fbdev/core/fbcon.h ++++ a/drivers/video/fbdev/core/fbcon.h +@@ -29,6 +29,7 @@ + /* Filled in by the low-level console driver */ + const u_char *fontdata; + int userfont; /* != 0 if fontdata kmalloc()ed */ ++ u_short scrollmode; /* Scroll Method */ + u_short inverse; /* != 0 text black on white as default */ + short yscroll; /* Hardware scrolling */ + int vrows; /* number of virtual rows */ +@@ -51,6 +52,8 @@ + }; + + struct fbcon_ops { ++ void (*bmove)(struct vc_data *vc, struct fb_info *info, int sy, ++ int sx, int dy, int dx, int height, int width); + void (*clear)(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width); + void (*putcs)(struct vc_data *vc, struct fb_info *info, +@@ -149,6 +152,62 @@ + #define attr_bgcol_ec(bgshift, vc, info) attr_col_ec(bgshift, vc, info, 0) + #define attr_fgcol_ec(fgshift, vc, info) attr_col_ec(fgshift, vc, info, 1) + ++ /* ++ * Scroll Method ++ */ ++ ++/* There are several methods fbcon can use to move text around the screen: ++ * ++ * Operation Pan Wrap ++ *--------------------------------------------- ++ * SCROLL_MOVE copyarea No No ++ * SCROLL_PAN_MOVE copyarea Yes No ++ * SCROLL_WRAP_MOVE copyarea No Yes ++ * SCROLL_REDRAW imageblit No No ++ * SCROLL_PAN_REDRAW imageblit Yes No ++ * SCROLL_WRAP_REDRAW imageblit No Yes ++ * ++ * (SCROLL_WRAP_REDRAW is not implemented yet) ++ * ++ * In general, fbcon will choose the best scrolling ++ * method based on the rule below: ++ * ++ * Pan/Wrap > accel imageblit > accel copyarea > ++ * soft imageblit > (soft copyarea) ++ * ++ * 
Exception to the rule: Pan + accel copyarea is ++ * preferred over Pan + accel imageblit. ++ * ++ * The above is typical for PCI/AGP cards. Unless ++ * overridden, fbcon will never use soft copyarea. ++ * ++ * If you need to override the above rule, set the ++ * appropriate flags in fb_info->flags. For example, ++ * to prefer copyarea over imageblit, set ++ * FBINFO_READS_FAST. ++ * ++ * Other notes: ++ * + use the hardware engine to move the text ++ * (hw-accelerated copyarea() and fillrect()) ++ * + use hardware-supported panning on a large virtual screen ++ * + amifb can not only pan, but also wrap the display by N lines ++ * (i.e. visible line i = physical line (i+N) % yres). ++ * + read what's already rendered on the screen and ++ * write it in a different place (this is cfb_copyarea()) ++ * + re-render the text to the screen ++ * ++ * Whether to use wrapping or panning can only be figured out at ++ * runtime (when we know whether our font height is a multiple ++ * of the pan/wrap step) ++ * ++ */ ++ ++#define SCROLL_MOVE 0x001 ++#define SCROLL_PAN_MOVE 0x002 ++#define SCROLL_WRAP_MOVE 0x003 ++#define SCROLL_REDRAW 0x004 ++#define SCROLL_PAN_REDRAW 0x005 ++ + #ifdef CONFIG_FB_TILEBLITTING + extern void fbcon_set_tileops(struct vc_data *vc, struct fb_info *info); + #endif +--- b/drivers/video/fbdev/core/fbcon_ccw.c ++++ a/drivers/video/fbdev/core/fbcon_ccw.c +@@ -59,12 +59,31 @@ + } + } + ++ ++static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, ++ int sx, int dy, int dx, int height, int width) ++{ ++ struct fbcon_ops *ops = info->fbcon_par; ++ struct fb_copyarea area; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); ++ ++ area.sx = sy * vc->vc_font.height; ++ area.sy = vyres - ((sx + width) * vc->vc_font.width); ++ area.dx = dy * vc->vc_font.height; ++ area.dy = vyres - ((dx + width) * vc->vc_font.width); ++ area.width = height * vc->vc_font.height; ++ area.height = width * vc->vc_font.width; ++ ++ info->fbops->fb_copyarea(info, &area); 
++} ++ + static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width) + { ++ struct fbcon_ops *ops = info->fbcon_par; + struct fb_fillrect region; + int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); +- u32 vyres = info->var.yres; + + region.color = attr_bgcol_ec(bgshift,vc,info); + region.dx = sy * vc->vc_font.height; +@@ -121,7 +140,7 @@ + u32 cnt, pitch, size; + u32 attribute = get_attribute(info, scr_readw(s)); + u8 *dst, *buf = NULL; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); +- u32 vyres = info->var.yres; + + if (!ops->fontbuffer) + return; +@@ -210,7 +229,7 @@ + int attribute, use_sw = vc->vc_cursor_type & CUR_SW; + int err = 1, dx, dy; + char *src; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); +- u32 vyres = info->var.yres; + + if (!ops->fontbuffer) + return; +@@ -368,7 +387,7 @@ + { + struct fbcon_ops *ops = info->fbcon_par; + u32 yoffset; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); +- u32 vyres = info->var.yres; + int err; + + yoffset = (vyres - info->var.yres) - ops->var.xoffset; +@@ -383,6 +402,7 @@ + + void fbcon_rotate_ccw(struct fbcon_ops *ops) + { ++ ops->bmove = ccw_bmove; + ops->clear = ccw_clear; + ops->putcs = ccw_putcs; + ops->clear_margins = ccw_clear_margins; +--- b/drivers/video/fbdev/core/fbcon_cw.c ++++ a/drivers/video/fbdev/core/fbcon_cw.c +@@ -44,12 +44,31 @@ + } + } + ++ ++static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy, ++ int sx, int dy, int dx, int height, int width) ++{ ++ struct fbcon_ops *ops = info->fbcon_par; ++ struct fb_copyarea area; ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); ++ ++ area.sx = vxres - ((sy + height) * vc->vc_font.height); ++ area.sy = sx * vc->vc_font.width; ++ area.dx = vxres - ((dy + height) * vc->vc_font.height); ++ area.dy = dx * vc->vc_font.width; ++ area.width = height * vc->vc_font.height; ++ area.height = width * vc->vc_font.width; ++ ++ 
info->fbops->fb_copyarea(info, &area); ++} ++ + static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width) + { ++ struct fbcon_ops *ops = info->fbcon_par; + struct fb_fillrect region; + int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); +- u32 vxres = info->var.xres; + + region.color = attr_bgcol_ec(bgshift,vc,info); + region.dx = vxres - ((sy + height) * vc->vc_font.height); +@@ -106,7 +125,7 @@ + u32 cnt, pitch, size; + u32 attribute = get_attribute(info, scr_readw(s)); + u8 *dst, *buf = NULL; ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); +- u32 vxres = info->var.xres; + + if (!ops->fontbuffer) + return; +@@ -193,7 +212,7 @@ + int attribute, use_sw = vc->vc_cursor_type & CUR_SW; + int err = 1, dx, dy; + char *src; ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); +- u32 vxres = info->var.xres; + + if (!ops->fontbuffer) + return; +@@ -350,7 +369,7 @@ + static int cw_update_start(struct fb_info *info) + { + struct fbcon_ops *ops = info->fbcon_par; ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); +- u32 vxres = info->var.xres; + u32 xoffset; + int err; + +@@ -366,6 +385,7 @@ + + void fbcon_rotate_cw(struct fbcon_ops *ops) + { ++ ops->bmove = cw_bmove; + ops->clear = cw_clear; + ops->putcs = cw_putcs; + ops->clear_margins = cw_clear_margins; +--- b/drivers/video/fbdev/core/fbcon_rotate.h ++++ a/drivers/video/fbdev/core/fbcon_rotate.h +@@ -11,6 +11,15 @@ + #ifndef _FBCON_ROTATE_H + #define _FBCON_ROTATE_H + ++#define GETVYRES(s,i) ({ \ ++ (s == SCROLL_REDRAW || s == SCROLL_MOVE) ? \ ++ (i)->var.yres : (i)->var.yres_virtual; }) ++ ++#define GETVXRES(s,i) ({ \ ++ (s == SCROLL_REDRAW || s == SCROLL_MOVE || !(i)->fix.xpanstep) ? 
\ ++ (i)->var.xres : (i)->var.xres_virtual; }) ++ ++ + static inline int pattern_test_bit(u32 x, u32 y, u32 pitch, const char *pat) + { + u32 tmp = (y * pitch) + x, index = tmp / 8, bit = tmp % 8; +--- b/drivers/video/fbdev/core/fbcon_ud.c ++++ a/drivers/video/fbdev/core/fbcon_ud.c +@@ -44,13 +44,33 @@ + } + } + ++ ++static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, ++ int sx, int dy, int dx, int height, int width) ++{ ++ struct fbcon_ops *ops = info->fbcon_par; ++ struct fb_copyarea area; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); ++ ++ area.sy = vyres - ((sy + height) * vc->vc_font.height); ++ area.sx = vxres - ((sx + width) * vc->vc_font.width); ++ area.dy = vyres - ((dy + height) * vc->vc_font.height); ++ area.dx = vxres - ((dx + width) * vc->vc_font.width); ++ area.height = height * vc->vc_font.height; ++ area.width = width * vc->vc_font.width; ++ ++ info->fbops->fb_copyarea(info, &area); ++} ++ + static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, + int sx, int height, int width) + { ++ struct fbcon_ops *ops = info->fbcon_par; + struct fb_fillrect region; + int bgshift = (vc->vc_hi_font_mask) ? 
13 : 12; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); +- u32 vyres = info->var.yres; +- u32 vxres = info->var.xres; + + region.color = attr_bgcol_ec(bgshift,vc,info); + region.dy = vyres - ((sy + height) * vc->vc_font.height); +@@ -142,8 +162,8 @@ + u32 mod = vc->vc_font.width % 8, cnt, pitch, size; + u32 attribute = get_attribute(info, scr_readw(s)); + u8 *dst, *buf = NULL; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); +- u32 vyres = info->var.yres; +- u32 vxres = info->var.xres; + + if (!ops->fontbuffer) + return; +@@ -239,8 +259,8 @@ + int attribute, use_sw = vc->vc_cursor_type & CUR_SW; + int err = 1, dx, dy; + char *src; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); +- u32 vyres = info->var.yres; +- u32 vxres = info->var.xres; + + if (!ops->fontbuffer) + return; +@@ -390,8 +410,8 @@ + { + struct fbcon_ops *ops = info->fbcon_par; + int xoffset, yoffset; ++ u32 vyres = GETVYRES(ops->p->scrollmode, info); ++ u32 vxres = GETVXRES(ops->p->scrollmode, info); +- u32 vyres = info->var.yres; +- u32 vxres = info->var.xres; + int err; + + xoffset = vxres - info->var.xres - ops->var.xoffset; +@@ -409,6 +429,7 @@ + + void fbcon_rotate_ud(struct fbcon_ops *ops) + { ++ ops->bmove = ud_bmove; + ops->clear = ud_clear; + ops->putcs = ud_putcs; + ops->clear_margins = ud_clear_margins; +--- b/drivers/video/fbdev/core/tileblit.c ++++ a/drivers/video/fbdev/core/tileblit.c +@@ -16,6 +16,21 @@ + #include <asm/types.h> + #include "fbcon.h" + ++static void tile_bmove(struct vc_data *vc, struct fb_info *info, int sy, ++ int sx, int dy, int dx, int height, int width) ++{ ++ struct fb_tilearea area; ++ ++ area.sx = sx; ++ area.sy = sy; ++ area.dx = dx; ++ area.dy = dy; ++ area.height = height; ++ area.width = width; ++ ++ info->tileops->fb_tilecopy(info, &area); ++} ++ + static void tile_clear(struct vc_data *vc, struct 
fb_info *info, int sy, + int sx, int height, int width) + { +@@ -118,6 +133,7 @@ + struct fb_tilemap map; + struct fbcon_ops *ops = info->fbcon_par; + ++ ops->bmove = tile_bmove; + ops->clear = tile_clear; + ops->putcs = tile_putcs; + ops->clear_margins = tile_clear_margins; +--- b/drivers/video/fbdev/skeletonfb.c ++++ a/drivers/video/fbdev/skeletonfb.c +@@ -505,15 +505,15 @@ + } + + /** ++ * xxxfb_copyarea - REQUIRED function. Can use generic routines if ++ * non acclerated hardware and packed pixel based. +- * xxxfb_copyarea - OBSOLETE function. + * Copies one area of the screen to another area. +- * Will be deleted in a future version + * + * @info: frame buffer structure that represents a single frame buffer + * @area: Structure providing the data to copy the framebuffer contents + * from one region to another. + * ++ * This drawing operation copies a rectangular area from one area of the +- * This drawing operation copied a rectangular area from one area of the + * screen to another area. + */ + void xxxfb_copyarea(struct fb_info *p, const struct fb_copyarea *area) +@@ -645,9 +645,9 @@ + .fb_setcolreg = xxxfb_setcolreg, + .fb_blank = xxxfb_blank, + .fb_pan_display = xxxfb_pan_display, ++ .fb_fillrect = xxxfb_fillrect, /* Needed !!! */ ++ .fb_copyarea = xxxfb_copyarea, /* Needed !!! */ ++ .fb_imageblit = xxxfb_imageblit, /* Needed !!! */ +- .fb_fillrect = xxxfb_fillrect, /* Needed !!! */ +- .fb_copyarea = xxxfb_copyarea, /* Obsolete */ +- .fb_imageblit = xxxfb_imageblit, /* Needed !!! */ + .fb_cursor = xxxfb_cursor, /* Optional !!! */ + .fb_sync = xxxfb_sync, + .fb_ioctl = xxxfb_ioctl, +--- b/include/linux/fb.h ++++ a/include/linux/fb.h +@@ -262,7 +262,7 @@ + + /* Draws a rectangle */ + void (*fb_fillrect) (struct fb_info *info, const struct fb_fillrect *rect); ++ /* Copy data from area to another */ +- /* Copy data from area to another. Obsolete. 
*/ + void (*fb_copyarea) (struct fb_info *info, const struct fb_copyarea *region); + /* Draws a image to the display */ + void (*fb_imageblit) (struct fb_info *info, const struct fb_image *image); diff --git a/0301-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch b/0302-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch index c12688800eab..c12688800eab 100644 --- a/0301-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch +++ b/0302-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch diff --git a/0302-revert-fbcon-remove-no-op-fbcon_set_origin.patch b/0303-revert-fbcon-remove-no-op-fbcon_set_origin.patch index 6491c541e883..6491c541e883 100644 --- a/0302-revert-fbcon-remove-no-op-fbcon_set_origin.patch +++ b/0303-revert-fbcon-remove-no-op-fbcon_set_origin.patch diff --git a/0303-revert-fbcon-remove-soft-scrollback-code.patch b/0304-revert-fbcon-remove-soft-scrollback-code.patch index 4f9735447f37..4f9735447f37 100644 --- a/0303-revert-fbcon-remove-soft-scrollback-code.patch +++ b/0304-revert-fbcon-remove-soft-scrollback-code.patch diff --git a/0999-acs.gitpatch b/0999-acs.gitpatch index 401b27c13f1c..e075ec1d3974 100644 --- a/0999-acs.gitpatch +++ b/0999-acs.gitpatch @@ -1,28 +1,27 @@ diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 1396fd2..3c0ede4 100644 +index 2fba824..a797d74 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -3892,6 +3892,15 @@ - nomsi [MSI] If the PCI_MSI kernel config parameter is - enabled, this kernel boot option can be used to - disable the use of MSI interrupts system-wide. 
-+ pcie_acs_override= -+ [PCIE] Override missing PCIe ACS support for: -+ downstream -+ All downstream ports - full ACS capabilities -+ multfunction -+ All multifunction devices - multifunction ACS subset -+ id:nnnn:nnnn -+ Specfic device - full ACS capabilities -+ Specified as vid:did (vendor/device ID) in hex - noioapicquirk [APIC] Disable all boot interrupt quirks. - Safety option to keep boot IRQs enabled. This - should never be necessary. +@@ -3922,6 +3922,14 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pci_acs_override [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ Add multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 4537d1e..c4f01fe 100644 +index 003950c..d3bb542 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c -@@ -193,6 +193,106 @@ static int __init pci_apply_final_quirks(void) +@@ -193,6 +193,107 @@ static int __init pci_apply_final_quirks(void) } fs_initcall_sync(pci_apply_final_quirks); @@ -34,6 +33,7 @@ index 4537d1e..c4f01fe 100644 + unsigned short vendor; + unsigned short device; +}; ++ +static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; +static u8 max_acs_id; + @@ -129,12 +129,12 @@ index 4537d1e..c4f01fe 100644 /* * Decoding should be disabled for a PCI device during BAR sizing to avoid * conflict. 
But doing so may cause problems on host bridge and perhaps other -@@ -4949,6 +5049,8 @@ static const struct pci_dev_acs_enabled { - { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, - /* Zhaoxin Root/Downstream Ports */ - { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, -+ /* allow acs for any */ -+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, - { 0 } +@@ -4950,6 +5051,8 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, + /* Zhaoxin Root/Downstream Ports */ + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, ++ /* allow acs for any */ ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } }; @@ -12,9 +12,9 @@ pkgbase=linux-acs-manjaro pkgname=('linux-acs-manjaro' 'linux-acs-manjaro-headers') _kernelname=-ACS-MANJARO -_basekernel=5.15 -_basever=515 -pkgver=5.15.16 +_basekernel=5.16 +_basever=516 +pkgver=5.16.2 pkgrel=1 arch=('x86_64') url="https://www.kernel.org/" @@ -37,25 +37,15 @@ source=("https://www.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.x 'config' # ARCH Patches '0001-ZEN-Add-sysctl-and-CONFIG-to-disallow-unprivileged-CLONE_NEWUSER.patch' - '0002-PCI_Add_more_NVIDIA_controllers_to_the_MSI_masking_quirk.patch' - '0003-iommu_intel_do_deep_dma-unmapping_to_avoid_kernel-flooding.patch' - '0004-cpufreq_intel_pstate_ITMT_support_for_overclocked_system.patch' - '0005-Bluetooth_btintel_Fix_bdaddress_comparison_with_garbage_value.patch' - '0006-lg-laptop_Recognize_more_models.patch' + '0002-Btintel_Fix_bdaddress_comparison_with_garbage_value.patch' # MANJARO Patches '0101-i2c-nuvoton-nc677x-hwmon-driver.patch' -# '0102-iomap-iomap_bmap-should-accept-unwritten-maps.patch' - '0103-futex.patch' # https://github.com/sirlucjan/kernel-patches - '0104-revert-xhci-Add-support-for-Renesas-controller-with-memory.patch' '0105-quirk-kernel-org-bug-210681-firmware_rome_error.patch' - 
'0108-drm_i915_Add_workaround_numbers_to_GEN7_COMMON_SLICE_CHICKEN1_whitelisting.patch::https://patchwork.freedesktop.org/patch/463650/raw/' - # Lenovo + AMD - '0201-lenovo-wmi2.patch' - # other patches # Bootsplash - '0301-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch' - '0302-revert-fbcon-remove-no-op-fbcon_set_origin.patch' - '0303-revert-fbcon-remove-soft-scrollback-code.patch' + '0301-revert-garbage-collect-fbdev-scrolling-acceleration.patch' + '0302-revert-fbcon-remove-now-unusued-softback_lines-cursor-argument.patch' + '0303-revert-fbcon-remove-no-op-fbcon_set_origin.patch' + '0304-revert-fbcon-remove-soft-scrollback-code.patch' '0401-bootsplash.patch' '0402-bootsplash.patch' '0403-bootsplash.patch' @@ -69,22 +59,16 @@ source=("https://www.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.x '0411-bootsplash.patch' '0412-bootsplash.patch' '0413-bootsplash.gitpatch' + # ACS override patch '0999-acs.gitpatch') -sha256sums=('57b2cf6991910e3b67a1b3490022e8a0674b6965c74c12da1e99d138d1991ee8' - '0817171996521675b3c1130568503f08d8b1672c955cc842200a21bf5914cd95' - '93320dbe5928e51fb777a4f13dd9a7364eb150d7983073f7dc159e89a6ffa747' +sha256sums=('027d7e8988bb69ac12ee92406c3be1fe13f990b1ca2249e226225cd1573308bb' + '3a09c2f1ad410c09cf03921abeed1a6ca7c38138fb508171ee673d429d179171' + 'cb2d729cc20743014d9e3bd08facb9f5bdd19d9fa89014f415c61b4a6eb78e97' '986f8d802f37b72a54256f0ab84da83cb229388d58c0b6750f7c770818a18421' - 'e2823eff3355b7c88a3fa327ea2f84f23cbd36569e0a5f0f76599023f63a52ca' - 'ce53090a4572cd6162d22225113082f7e4df5028a1230529d170460e26dcf849' - 'ab0360eac59329eb84f028c2f402ee4a17e4b3dfacb7957355e6178d35af87b9' - '76701599bbafa49b90ccb073ef29ce2dc3731566e8fa852bd1e9e7796e184754' - 'a2a0a0542055a6a921542fbb05cedb6eb6f3d3fb0c038bfb2304bfd3931a0f71' + 'b89188b1bc3516d54965dd36def6a2af3d81379e53ff7e527bbd91f77c6f191b' '7823d7488f42bc4ed7dfae6d1014dbde679d8b862c9a3697a39ba0dae5918978' - 
'844e66a95d7df754c55ac2f1ce7e215b1e56e20ca095462d926a993d557b20e0' - 'd9330ea593829a6ef3b824db9570253280cbff7da2b4beb47cbc037824d1a29b' '5e804e1f241ce542f3f0e83d274ede6aa4b0539e510fb9376f8106e8732ce69b' - 'e8e6120035977903a7117ba215809b9b162b64a789848107513f219180baaada' - '1d58ef2991c625f6f0eb33b4cb8303932f53f1c4694e42bae24c9cd36d2ad013' + '365d4225a7db60bd064ebbc34ce0ae582a0c378ad6c4cec7960a5ae4641a6757' '2b11905b63b05b25807dd64757c779da74dd4c37e36d3f7a46485b1ee5a9d326' '94a8538251ad148f1025cc3de446ce64f73dc32b01815426fb159c722e8fa5bc' '1f18c5c10a3c63e41ecd05ad34cd9f6653ba96e9f1049ce2b7bb6da2578ae710' @@ -101,7 +85,7 @@ sha256sums=('57b2cf6991910e3b67a1b3490022e8a0674b6965c74c12da1e99d138d1991ee8' '27471eee564ca3149dd271b0817719b5565a9594dc4d884fe3dc51a5f03832bc' '60e295601e4fb33d9bf65f198c54c7eb07c0d1e91e2ad1e0dd6cd6e142cb266d' '035ea4b2a7621054f4560471f45336b981538a40172d8f17285910d4e0e0b3ef' - '6d6b327ec7c7798f628f98ab964f4457d3cf043bad2632eb8f27548478a83cc1') + '2542b5cea79ab5817ce3d30c54acd045966b9c14587bfb0b2f50d473da48a1d5') prepare() { cd "linux-${_basekernel}" @@ -237,6 +221,9 @@ package_linux-acs-manjaro-headers() { # add objtool for external module building and enabled VALIDATION_STACK option install -Dt "${_builddir}/tools/objtool" tools/objtool/objtool + # required when DEBUG_INFO_BTF_MODULES is enabled + install -Dt "${_builddir}/tools/bpf/resolve_btfids" tools/bpf/resolve_btfids/resolve_btfids + # remove unneeded architectures local _arch for _arch in "${_builddir}"/arch/*/; do @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. 
-# Linux/x86 5.15.15-1 Kernel Configuration +# Linux/x86 5.16.0-1 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 11.1.0" CONFIG_CC_IS_GCC=y @@ -121,6 +121,7 @@ CONFIG_BPF_JIT_DEFAULT_ON=y CONFIG_BPF_LSM=y # end of BPF subsystem +CONFIG_PREEMPT_BUILD=y # CONFIG_PREEMPT_NONE is not set # CONFIG_PREEMPT_VOLUNTARY is not set CONFIG_PREEMPT=y @@ -191,6 +192,7 @@ CONFIG_UCLAMP_BUCKETS_COUNT=5 CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y CONFIG_CC_HAS_INT128=y +CONFIG_CC_IMPLICIT_FALLTHROUGH="-Wimplicit-fallthrough=5" CONFIG_ARCH_SUPPORTS_INT128=y CONFIG_NUMA_BALANCING=y CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y @@ -411,6 +413,7 @@ CONFIG_NR_CPUS_RANGE_BEGIN=2 CONFIG_NR_CPUS_RANGE_END=512 CONFIG_NR_CPUS_DEFAULT=64 CONFIG_NR_CPUS=320 +CONFIG_SCHED_CLUSTER=y CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y CONFIG_SCHED_MC_PRIO=y @@ -511,6 +514,7 @@ CONFIG_LEGACY_VSYSCALL_XONLY=y # CONFIG_LEGACY_VSYSCALL_NONE is not set # CONFIG_CMDLINE_BOOL is not set CONFIG_MODIFY_LDT_SYSCALL=y +# CONFIG_STRICT_SIGALTSTACK_SIZE is not set CONFIG_HAVE_LIVEPATCH=y # CONFIG_LIVEPATCH is not set # end of Processor type and features @@ -712,6 +716,7 @@ CONFIG_KVM_AMD=m CONFIG_KVM_AMD_SEV=y CONFIG_KVM_XEN=y CONFIG_KVM_MMU_AUDIT=y +CONFIG_KVM_EXTERNAL_WRITE_TRACKING=y CONFIG_AS_AVX512=y CONFIG_AS_SHA1_NI=y CONFIG_AS_SHA256_NI=y @@ -740,6 +745,7 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_OPTPROBES=y CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE=y CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y CONFIG_HAVE_NMI=y CONFIG_TRACE_IRQFLAGS_SUPPORT=y @@ -834,6 +840,7 @@ CONFIG_ARCH_WANT_LD_ORPHAN_WARN=y CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_ARCH_HAS_ELFCORE_COMPAT=y CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH=y +CONFIG_DYNAMIC_SIGFRAME=y # # GCOV-based kernel profiling @@ -979,10 +986,10 @@ CONFIG_SPARSEMEM_VMEMMAP=y CONFIG_HAVE_FAST_GUP=y CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_ISOLATION=y +CONFIG_EXCLUSIVE_SYSTEM_RAM=y 
CONFIG_HAVE_BOOTMEM_INFO_NODE=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y CONFIG_MEMORY_HOTREMOVE=y @@ -1206,6 +1213,8 @@ CONFIG_BRIDGE_NETFILTER=m # Core Netfilter Configuration # CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_EGRESS=y +CONFIG_NETFILTER_SKIP_EGRESS=y CONFIG_NETFILTER_NETLINK=m CONFIG_NETFILTER_FAMILY_BRIDGE=y CONFIG_NETFILTER_FAMILY_ARP=y @@ -1604,10 +1613,11 @@ CONFIG_NET_DSA_TAG_DSA=m CONFIG_NET_DSA_TAG_EDSA=m CONFIG_NET_DSA_TAG_MTK=m CONFIG_NET_DSA_TAG_KSZ=m -CONFIG_NET_DSA_TAG_RTL4_A=m CONFIG_NET_DSA_TAG_OCELOT=m CONFIG_NET_DSA_TAG_OCELOT_8021Q=m CONFIG_NET_DSA_TAG_QCA=m +CONFIG_NET_DSA_TAG_RTL4_A=m +CONFIG_NET_DSA_TAG_RTL8_4=m CONFIG_NET_DSA_TAG_LAN9303=m CONFIG_NET_DSA_TAG_SJA1105=m CONFIG_NET_DSA_TAG_TRAILER=m @@ -1963,7 +1973,7 @@ CONFIG_AF_RXRPC_DEBUG=y CONFIG_RXKAD=y CONFIG_AF_KCM=m CONFIG_STREAM_PARSER=y -CONFIG_MCTP=m +# CONFIG_MCTP is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_WIRELESS_EXT=y @@ -2063,7 +2073,7 @@ CONFIG_LWTUNNEL_BPF=y CONFIG_DST_CACHE=y CONFIG_GRO_CELLS=y CONFIG_SOCK_VALIDATE_XMIT=y -CONFIG_NET_SELFTESTS=m +CONFIG_NET_SELFTESTS=y CONFIG_NET_SOCK_MSG=y CONFIG_NET_DEVLINK=y CONFIG_PAGE_POOL=y @@ -2258,6 +2268,7 @@ CONFIG_FW_CFG_SYSFS=m # CONFIG_FW_CFG_SYSFS_CMDLINE is not set CONFIG_SYSFB=y # CONFIG_SYSFB_SIMPLEFB is not set +CONFIG_CS_DSP=m CONFIG_GOOGLE_FIRMWARE=y # CONFIG_GOOGLE_SMI is not set CONFIG_GOOGLE_COREBOOT_TABLE=m @@ -2334,7 +2345,7 @@ CONFIG_MTD_BLOCK=m # CONFIG_SM_FTL is not set # CONFIG_MTD_OOPS is not set # CONFIG_MTD_SWAP is not set -CONFIG_MTD_PARTITIONED_MASTER=y +# CONFIG_MTD_PARTITIONED_MASTER is not set # # RAM/ROM/Flash chip drivers @@ -2346,7 +2357,7 @@ CONFIG_MTD_MAP_BANK_WIDTH_2=y CONFIG_MTD_MAP_BANK_WIDTH_4=y CONFIG_MTD_CFI_I1=y CONFIG_MTD_CFI_I2=y -# CONFIG_MTD_RAM is not set +CONFIG_MTD_RAM=m CONFIG_MTD_ROM=m # CONFIG_MTD_ABSENT is not set # end of 
RAM/ROM/Flash chip drivers @@ -2357,7 +2368,7 @@ CONFIG_MTD_ROM=m # CONFIG_MTD_COMPLEX_MAPPINGS is not set # CONFIG_MTD_PHYSMAP is not set # CONFIG_MTD_INTEL_VR_NOR is not set -# CONFIG_MTD_PLATRAM is not set +CONFIG_MTD_PLATRAM=m # end of Mapping drivers for chip access # @@ -2370,9 +2381,7 @@ CONFIG_MTD_ROM=m # CONFIG_MTD_SST25L is not set # CONFIG_MTD_SLRAM is not set CONFIG_MTD_PHRAM=m -CONFIG_MTD_MTDRAM=m -CONFIG_MTDRAM_TOTAL_SIZE=4096 -CONFIG_MTDRAM_ERASE_SIZE=128 +# CONFIG_MTD_MTDRAM is not set CONFIG_MTD_BLOCK2MTD=m # @@ -2473,7 +2482,6 @@ CONFIG_ZRAM_WRITEBACK=y # CONFIG_ZRAM_MEMORY_TRACKING is not set CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -CONFIG_BLK_DEV_CRYPTOLOOP=m CONFIG_BLK_DEV_DRBD=m # CONFIG_DRBD_FAULT_INJECTION is not set CONFIG_BLK_DEV_NBD=m @@ -2576,6 +2584,7 @@ CONFIG_INTEL_MEI=m CONFIG_INTEL_MEI_ME=m CONFIG_INTEL_MEI_TXE=m CONFIG_INTEL_MEI_HDCP=m +# CONFIG_INTEL_MEI_PXP is not set CONFIG_VMWARE_VMCI=m CONFIG_GENWQE=m CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 @@ -2686,6 +2695,7 @@ CONFIG_SCSI_UFS_CDNS_PLATFORM=m CONFIG_SCSI_UFS_BSG=y CONFIG_SCSI_UFS_CRYPTO=y CONFIG_SCSI_UFS_HPB=y +# CONFIG_SCSI_UFS_HWMON is not set CONFIG_SCSI_HPTIOP=m CONFIG_SCSI_BUSLOGIC=m CONFIG_SCSI_FLASHPOINT=y @@ -2903,6 +2913,7 @@ CONFIG_DM_SWITCH=m CONFIG_DM_LOG_WRITES=m CONFIG_DM_INTEGRITY=m CONFIG_DM_ZONED=m +CONFIG_DM_AUDIT=y CONFIG_TARGET_CORE=m CONFIG_TCM_IBLOCK=m CONFIG_TCM_FILEIO=m @@ -2959,6 +2970,7 @@ CONFIG_VXLAN=m CONFIG_GENEVE=m CONFIG_BAREUDP=m CONFIG_GTP=m +# CONFIG_AMT is not set CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y @@ -3079,6 +3091,9 @@ CONFIG_AMD_XGBE_HAVE_ECC=y CONFIG_NET_VENDOR_AQUANTIA=y CONFIG_AQTION=m CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ASIX=y +CONFIG_SPI_AX88796C=y +# CONFIG_SPI_AX88796C_COMPRESSION is not set CONFIG_NET_VENDOR_ATHEROS=y CONFIG_ATL2=m CONFIG_ATL1=m @@ -3189,6 +3204,7 @@ CONFIG_I40E_DCB=y CONFIG_IAVF=m CONFIG_I40EVF=m CONFIG_ICE=m +CONFIG_ICE_SWITCHDEV=y CONFIG_FM10K=m CONFIG_IGC=m 
CONFIG_NET_VENDOR_MICROSOFT=y @@ -3383,10 +3399,10 @@ CONFIG_SKFP=m # CONFIG_HIPPI is not set CONFIG_NET_SB1000=m CONFIG_PHYLINK=m -CONFIG_PHYLIB=m +CONFIG_PHYLIB=y CONFIG_SWPHY=y CONFIG_LED_TRIGGER_PHY=y -CONFIG_FIXED_PHY=m +CONFIG_FIXED_PHY=y CONFIG_SFP=m # @@ -3438,15 +3454,11 @@ CONFIG_DP83869_PHY=m CONFIG_VITESSE_PHY=m CONFIG_XILINX_GMII2RGMII=m CONFIG_MICREL_KS8995MA=m - -# -# MCTP Device Drivers -# -CONFIG_MDIO_DEVICE=m -CONFIG_MDIO_BUS=m -CONFIG_FWNODE_MDIO=m -CONFIG_ACPI_MDIO=m -CONFIG_MDIO_DEVRES=m +CONFIG_MDIO_DEVICE=y +CONFIG_MDIO_BUS=y +CONFIG_FWNODE_MDIO=y +CONFIG_ACPI_MDIO=y +CONFIG_MDIO_DEVRES=y CONFIG_MDIO_BITBANG=m CONFIG_MDIO_BCM_UNIMAC=m CONFIG_MDIO_CAVIUM=m @@ -3740,7 +3752,9 @@ CONFIG_MT7663_USB_SDIO_COMMON=m CONFIG_MT7663U=m CONFIG_MT7663S=m CONFIG_MT7915E=m +CONFIG_MT7921_COMMON=m CONFIG_MT7921E=m +CONFIG_MT7921S=m CONFIG_WLAN_VENDOR_MICROCHIP=y CONFIG_WILC1000=m CONFIG_WILC1000_SDIO=m @@ -3812,6 +3826,12 @@ CONFIG_RTW88_8723DE=m CONFIG_RTW88_8821CE=m CONFIG_RTW88_DEBUG=y CONFIG_RTW88_DEBUGFS=y +CONFIG_RTW89=m +CONFIG_RTW89_CORE=m +CONFIG_RTW89_PCI=m +CONFIG_RTW89_8852AE=m +# CONFIG_RTW89_DEBUGMSG is not set +# CONFIG_RTW89_DEBUGFS is not set CONFIG_WLAN_VENDOR_RSI=y CONFIG_RSI_91X=m CONFIG_RSI_DEBUGFS=y @@ -3955,6 +3975,7 @@ CONFIG_KEYBOARD_TWL4030=m CONFIG_KEYBOARD_XTKBD=m CONFIG_KEYBOARD_CROS_EC=m CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_KEYBOARD_CYPRESS_SF=y CONFIG_INPUT_MOUSE=y CONFIG_MOUSE_PS2=m CONFIG_MOUSE_PS2_ALPS=y @@ -4320,6 +4341,7 @@ CONFIG_HVC_DRIVER=y CONFIG_HVC_IRQ=y CONFIG_HVC_XEN=y CONFIG_HVC_XEN_FRONTEND=y +# CONFIG_RPMSG_TTY is not set CONFIG_SERIAL_DEV_BUS=y CONFIG_SERIAL_DEV_CTRL_TTYPORT=y CONFIG_PRINTER=m @@ -4333,6 +4355,7 @@ CONFIG_IPMI_PLAT_DATA=y CONFIG_IPMI_DEVICE_INTERFACE=m CONFIG_IPMI_SI=m CONFIG_IPMI_SSIF=m +# CONFIG_IPMI_IPMB is not set CONFIG_IPMI_WATCHDOG=m CONFIG_IPMI_POWEROFF=m CONFIG_IPMB_DEVICE_INTERFACE=m @@ -4592,6 +4615,10 @@ CONFIG_PINCTRL_MCP23S08_I2C=m CONFIG_PINCTRL_MCP23S08_SPI=m 
CONFIG_PINCTRL_MCP23S08=m CONFIG_PINCTRL_SX150X=y + +# +# Intel pinctrl drivers +# CONFIG_PINCTRL_BAYTRAIL=y CONFIG_PINCTRL_CHERRYVIEW=y CONFIG_PINCTRL_LYNXPOINT=y @@ -4610,6 +4637,7 @@ CONFIG_PINCTRL_LAKEFIELD=y CONFIG_PINCTRL_LEWISBURG=y CONFIG_PINCTRL_SUNRISEPOINT=y CONFIG_PINCTRL_TIGERLAKE=y +# end of Intel pinctrl drivers # # Renesas pinctrl drivers @@ -4942,6 +4970,7 @@ CONFIG_SENSORS_MAX1668=m CONFIG_SENSORS_MAX197=m CONFIG_SENSORS_MAX31722=m CONFIG_SENSORS_MAX31730=m +CONFIG_SENSORS_MAX6620=y CONFIG_SENSORS_MAX6621=m CONFIG_SENSORS_MAX6639=m CONFIG_SENSORS_MAX6642=m @@ -5337,7 +5366,6 @@ CONFIG_MFD_TPS65910=y CONFIG_MFD_TPS65912=m CONFIG_MFD_TPS65912_I2C=m CONFIG_MFD_TPS65912_SPI=m -CONFIG_MFD_TPS80031=y CONFIG_TWL4030_CORE=y CONFIG_MFD_TWL4030_AUDIO=y CONFIG_TWL6040_CORE=y @@ -5457,7 +5485,6 @@ CONFIG_REGULATOR_TPS6524X=m CONFIG_REGULATOR_TPS6586X=m CONFIG_REGULATOR_TPS65910=m CONFIG_REGULATOR_TPS65912=m -CONFIG_REGULATOR_TPS80031=m CONFIG_REGULATOR_TWL4030=m CONFIG_REGULATOR_WM831X=m CONFIG_REGULATOR_WM8350=m @@ -5497,12 +5524,15 @@ CONFIG_IR_TTUSBIR=m CONFIG_RC_LOOPBACK=m CONFIG_IR_SERIAL=m CONFIG_IR_SERIAL_TRANSMITTER=y -CONFIG_IR_SIR=m CONFIG_RC_XBOX_DVD=m CONFIG_IR_TOY=m CONFIG_CEC_CORE=y CONFIG_CEC_NOTIFIER=y CONFIG_CEC_PIN=y + +# +# CEC support +# CONFIG_MEDIA_CEC_RC=y # CONFIG_CEC_PIN_ERROR_INJ is not set CONFIG_MEDIA_CEC_SUPPORT=y @@ -5513,6 +5543,8 @@ CONFIG_CEC_SECO=m CONFIG_CEC_SECO_RC=y CONFIG_USB_PULSE8_CEC=m CONFIG_USB_RAINSHADOW_CEC=m +# end of CEC support + CONFIG_MEDIA_SUPPORT=m CONFIG_MEDIA_SUPPORT_FILTER=y CONFIG_MEDIA_SUBDRV_AUTOSELECT=y @@ -5556,10 +5588,6 @@ CONFIG_VIDEOBUF_VMALLOC=m # CONFIG_MEDIA_CONTROLLER_DVB=y CONFIG_MEDIA_CONTROLLER_REQUEST_API=y - -# -# Please notice that the enabled Media controller Request API is EXPERIMENTAL -# # end of Media controller options # @@ -5935,6 +5963,7 @@ CONFIG_VIDEO_M52790=m CONFIG_VIDEO_APTINA_PLL=m CONFIG_VIDEO_CCS_PLL=m CONFIG_VIDEO_HI556=m +# CONFIG_VIDEO_HI846 is not set 
CONFIG_VIDEO_IMX208=m CONFIG_VIDEO_IMX214=m CONFIG_VIDEO_IMX219=m @@ -5966,6 +5995,7 @@ CONFIG_VIDEO_OV9640=m CONFIG_VIDEO_OV9650=m CONFIG_VIDEO_OV9734=m CONFIG_VIDEO_OV13858=m +# CONFIG_VIDEO_OV13B10 is not set CONFIG_VIDEO_VS6624=m CONFIG_VIDEO_MT9M001=m CONFIG_VIDEO_MT9M032=m @@ -6730,6 +6760,9 @@ CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m CONFIG_SND_SOC_AMD_RENOIR=m CONFIG_SND_SOC_AMD_RENOIR_MACH=m CONFIG_SND_SOC_AMD_ACP5x=m +# CONFIG_SND_SOC_AMD_VANGOGH_MACH is not set +# CONFIG_SND_SOC_AMD_ACP6x is not set +# CONFIG_SND_SOC_AMD_ACP_COMMON is not set CONFIG_SND_ATMEL_SOC=m # CONFIG_SND_BCM63XX_I2S_WHISTLER is not set CONFIG_SND_DESIGNWARE_I2S=m @@ -6820,6 +6853,7 @@ CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m CONFIG_SND_SOC_INTEL_SOF_CS42L42_MACH=m CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m +# CONFIG_SND_SOC_INTEL_SOF_ES8336_MACH is not set CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m @@ -6911,6 +6945,8 @@ CONFIG_SND_SOC_CS35L33=m CONFIG_SND_SOC_CS35L34=m CONFIG_SND_SOC_CS35L35=m CONFIG_SND_SOC_CS35L36=m +# CONFIG_SND_SOC_CS35L41_SPI is not set +# CONFIG_SND_SOC_CS35L41_I2C is not set CONFIG_SND_SOC_CS42L42=m CONFIG_SND_SOC_CS42L51=m CONFIG_SND_SOC_CS42L51_I2C=m @@ -6951,6 +6987,7 @@ CONFIG_SND_SOC_MAX98357A=m CONFIG_SND_SOC_MAX98504=m CONFIG_SND_SOC_MAX9867=m CONFIG_SND_SOC_MAX98927=m +# CONFIG_SND_SOC_MAX98520 is not set CONFIG_SND_SOC_MAX98373=m CONFIG_SND_SOC_MAX98373_I2C=m CONFIG_SND_SOC_MAX98373_SDW=m @@ -7003,6 +7040,7 @@ CONFIG_SND_SOC_RT5677_SPI=m CONFIG_SND_SOC_RT5682=m CONFIG_SND_SOC_RT5682_I2C=m CONFIG_SND_SOC_RT5682_SDW=m +CONFIG_SND_SOC_RT5682S=m CONFIG_SND_SOC_RT700=m CONFIG_SND_SOC_RT700_SDW=m CONFIG_SND_SOC_RT711=m @@ -7011,6 +7049,7 @@ CONFIG_SND_SOC_RT711_SDCA_SDW=m CONFIG_SND_SOC_RT715=m CONFIG_SND_SOC_RT715_SDW=m CONFIG_SND_SOC_RT715_SDCA_SDW=m +# CONFIG_SND_SOC_RT9120 is not set # 
CONFIG_SND_SOC_SDW_MOCKUP is not set CONFIG_SND_SOC_SGTL5000=m CONFIG_SND_SOC_SI476X=m @@ -7094,6 +7133,7 @@ CONFIG_SND_SOC_MT6660=m CONFIG_SND_SOC_NAU8315=m CONFIG_SND_SOC_NAU8540=m CONFIG_SND_SOC_NAU8810=m +# CONFIG_SND_SOC_NAU8821 is not set CONFIG_SND_SOC_NAU8822=m CONFIG_SND_SOC_NAU8824=m CONFIG_SND_SOC_NAU8825=m @@ -7167,6 +7207,7 @@ CONFIG_HID_KYE=m CONFIG_HID_UCLOGIC=m CONFIG_HID_WALTOP=m CONFIG_HID_VIEWSONIC=m +# CONFIG_HID_XIAOMI is not set CONFIG_HID_GYRATION=m CONFIG_HID_ICADE=m CONFIG_HID_ITE=m @@ -7190,6 +7231,7 @@ CONFIG_HID_REDRAGON=m CONFIG_HID_MICROSOFT=m CONFIG_HID_MONTEREY=m CONFIG_HID_MULTITOUCH=m +# CONFIG_HID_NINTENDO is not set CONFIG_HID_NTI=m CONFIG_HID_NTRIG=m CONFIG_HID_ORTEK=m @@ -7968,7 +8010,6 @@ CONFIG_RTC_DRV_BQ32K=m CONFIG_RTC_DRV_PALMAS=m CONFIG_RTC_DRV_TPS6586X=m CONFIG_RTC_DRV_TPS65910=m -CONFIG_RTC_DRV_TPS80031=m CONFIG_RTC_DRV_RC5T583=m CONFIG_RTC_DRV_S35390A=m CONFIG_RTC_DRV_FM3130=m @@ -8141,6 +8182,7 @@ CONFIG_ACRN_HSM=m CONFIG_VIRTIO=y CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS=y CONFIG_VIRTIO_PCI_LIB=m +CONFIG_VIRTIO_PCI_LIB_LEGACY=m CONFIG_VIRTIO_MENU=y CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_PCI_LEGACY=y @@ -8161,6 +8203,7 @@ CONFIG_IFCVF=m CONFIG_MLX5_VDPA=y CONFIG_MLX5_VDPA_NET=m CONFIG_VP_VDPA=m +# CONFIG_ALIBABA_ENI_VDPA is not set CONFIG_VHOST_IOTLB=m CONFIG_VHOST_RING=m CONFIG_VHOST=m @@ -8198,6 +8241,7 @@ CONFIG_XEN_GNTDEV_DMABUF=y CONFIG_XEN_GRANT_DEV_ALLOC=m CONFIG_XEN_GRANT_DMA_ALLOC=y CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PCI_STUB=y CONFIG_XEN_PCIDEV_BACKEND=m CONFIG_XEN_PVCALLS_FRONTEND=m CONFIG_XEN_PVCALLS_BACKEND=y @@ -8228,7 +8272,6 @@ CONFIG_RTL8192E=m CONFIG_RTL8723BS=m CONFIG_R8712U=m CONFIG_R8188EU=m -CONFIG_88EU_AP_MODE=y CONFIG_RTS5208=m CONFIG_VT6655=m CONFIG_VT6656=m @@ -8345,6 +8388,7 @@ CONFIG_WMI_BMOF=m CONFIG_HUAWEI_WMI=m CONFIG_MXM_WMI=m CONFIG_PEAQ_WMI=m +CONFIG_NVIDIA_WMI_EC_BACKLIGHT=m CONFIG_XIAOMI_WMI=m CONFIG_GIGABYTE_WMI=m CONFIG_ACERHDF=m @@ -8394,6 +8438,7 @@ 
CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y CONFIG_THINKPAD_ACPI_VIDEO=y CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y CONFIG_THINKPAD_LMI=m +CONFIG_X86_PLATFORM_DRIVERS_INTEL=y CONFIG_INTEL_ATOMISP2_PDX86=y CONFIG_INTEL_ATOMISP2_LED=m CONFIG_INTEL_SAR_INT1092=m @@ -8420,6 +8465,7 @@ CONFIG_INTEL_INT0002_VGPIO=m CONFIG_INTEL_OAKTRAIL=m CONFIG_INTEL_BXTWC_PMIC_TMU=m CONFIG_INTEL_CHTDC_TI_PWRBTN=m +# CONFIG_INTEL_ISHTP_ECLITE is not set CONFIG_INTEL_MRFLD_PWRBTN=m CONFIG_INTEL_PUNIT_IPC=m CONFIG_INTEL_RST=m @@ -8429,6 +8475,7 @@ CONFIG_INTEL_UNCORE_FREQ_CONTROL=m CONFIG_MSI_LAPTOP=m CONFIG_MSI_WMI=m CONFIG_PCENGINES_APU2=m +# CONFIG_BARCO_P50_GPIO is not set CONFIG_SAMSUNG_LAPTOP=m CONFIG_SAMSUNG_Q10=m CONFIG_ACPI_TOSHIBA=m @@ -8480,6 +8527,7 @@ CONFIG_WILCO_EC_TELEMETRY=m CONFIG_MELLANOX_PLATFORM=y CONFIG_MLXREG_HOTPLUG=m CONFIG_MLXREG_IO=m +# CONFIG_MLXREG_LC is not set CONFIG_SURFACE_PLATFORMS=y CONFIG_SURFACE3_WMI=m CONFIG_SURFACE_3_BUTTON=m @@ -8499,14 +8547,6 @@ CONFIG_HAVE_CLK=y CONFIG_HAVE_CLK_PREPARE=y CONFIG_COMMON_CLK=y CONFIG_COMMON_CLK_WM831X=m - -# -# Clock driver for ARM Reference designs -# -# CONFIG_ICST is not set -# CONFIG_CLK_SP810 is not set -# end of Clock driver for ARM Reference designs - CONFIG_LMK04832=m CONFIG_COMMON_CLK_MAX9485=m CONFIG_COMMON_CLK_SI5341=m @@ -8692,6 +8732,10 @@ CONFIG_IIO_TRIGGERED_EVENT=m # CONFIG_ADIS16201=m CONFIG_ADIS16209=m +# CONFIG_ADXL313_I2C is not set +# CONFIG_ADXL313_SPI is not set +# CONFIG_ADXL355_I2C is not set +# CONFIG_ADXL355_SPI is not set CONFIG_ADXL372=m CONFIG_ADXL372_SPI=m CONFIG_ADXL372_I2C=m @@ -8840,11 +8884,13 @@ CONFIG_PMS7003=m CONFIG_SCD30_CORE=m CONFIG_SCD30_I2C=m CONFIG_SCD30_SERIAL=m +# CONFIG_SCD4X is not set CONFIG_SENSIRION_SGP30=m CONFIG_SENSIRION_SGP40=m CONFIG_SPS30=m CONFIG_SPS30_I2C=m CONFIG_SPS30_SERIAL=m +# CONFIG_SENSEAIR_SUNRISE_CO2 is not set CONFIG_VZ89X=m # end of Chemical Sensors @@ -8937,6 +8983,7 @@ CONFIG_AD9523=m # CONFIG_ADF4350=m CONFIG_ADF4371=m +# CONFIG_ADRF6780 is not set # end of 
Phase-Locked Loop (PLL) frequency synthesizers # end of Frequency Synthesizers DDS/PLL @@ -9232,6 +9279,7 @@ CONFIG_TMP117=m CONFIG_TSYS01=m CONFIG_TSYS02D=m CONFIG_MAX31856=m +# CONFIG_MAX31865 is not set # end of Temperature sensors CONFIG_NTB=m @@ -9280,7 +9328,13 @@ CONFIG_RESET_TI_SYSCON=m CONFIG_GENERIC_PHY=y CONFIG_USB_LGM_PHY=m CONFIG_PHY_CAN_TRANSCEIVER=m + +# +# PHY drivers for Broadcom platforms +# CONFIG_BCM_KONA_USB2_PHY=m +# end of PHY drivers for Broadcom platforms + CONFIG_PHY_PXA_28NM_HSIC=m CONFIG_PHY_PXA_28NM_USB2=m CONFIG_PHY_CPCAP_USB=m @@ -9680,6 +9734,7 @@ CONFIG_EROFS_FS_XATTR=y CONFIG_EROFS_FS_POSIX_ACL=y CONFIG_EROFS_FS_SECURITY=y CONFIG_EROFS_FS_ZIP=y +# CONFIG_EROFS_FS_ZIP_LZMA is not set CONFIG_VBOXSF_FS=m CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=m @@ -9838,7 +9893,6 @@ CONFIG_SECURITY_PATH=y CONFIG_LSM_MMAP_MIN_ADDR=65536 CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y CONFIG_HARDENED_USERCOPY=y -CONFIG_HARDENED_USERCOPY_FALLBACK=y CONFIG_FORTIFY_SOURCE=y # CONFIG_STATIC_USERMODEHELPER is not set CONFIG_SECURITY_SELINUX=y @@ -10228,6 +10282,7 @@ CONFIG_XZ_DEC_IA64=y CONFIG_XZ_DEC_ARM=y CONFIG_XZ_DEC_ARMTHUMB=y CONFIG_XZ_DEC_SPARC=y +# CONFIG_XZ_DEC_MICROLZMA is not set CONFIG_XZ_DEC_BCJ=y # CONFIG_XZ_DEC_TEST is not set CONFIG_DECOMPRESS_GZIP=y @@ -10586,6 +10641,8 @@ CONFIG_HIST_TRIGGERS=y # CONFIG_HIST_TRIGGERS_DEBUG is not set # CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set # CONFIG_SAMPLES is not set +CONFIG_HAVE_SAMPLE_FTRACE_DIRECT=y +CONFIG_HAVE_SAMPLE_FTRACE_DIRECT_MULTI=y CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y CONFIG_STRICT_DEVMEM=y CONFIG_IO_STRICT_DEVMEM=y @@ -10631,7 +10688,6 @@ CONFIG_RUNTIME_TESTING_MENU=y # CONFIG_LKDTM is not set # CONFIG_TEST_MIN_HEAP is not set # CONFIG_TEST_DIV64 is not set -# CONFIG_KPROBES_SANITY_TEST is not set # CONFIG_BACKTRACE_SELF_TEST is not set # CONFIG_RBTREE_TEST is not set # CONFIG_REED_SOLOMON_TEST is not set |