path: root/0103-futex.patch
Diffstat (limited to '0103-futex.patch')
-rw-r--r--  0103-futex.patch  9811
1 files changed, 0 insertions, 9811 deletions
diff --git a/0103-futex.patch b/0103-futex.patch
deleted file mode 100644
index d33f488ae054..000000000000
--- a/0103-futex.patch
+++ /dev/null
@@ -1,9811 +0,0 @@
-From 4dc2913212c08c6970f6e8971fd23b6328982f94 Mon Sep 17 00:00:00 2001
-From: Piotr Gorski <lucjan.lucjanov@gmail.com>
-Date: Mon, 1 Nov 2021 12:11:04 +0100
-Subject: [PATCH] futex: resync from gitlab.collabora.com
-
-Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
----
- Documentation/userspace-api/futex2.rst | 86 +
- Documentation/userspace-api/index.rst | 1 +
- MAINTAINERS | 3 +-
- arch/arm/tools/syscall.tbl | 1 +
- arch/arm64/include/asm/unistd.h | 2 +-
- arch/arm64/include/asm/unistd32.h | 2 +
- arch/x86/entry/syscalls/syscall_32.tbl | 1 +
- arch/x86/entry/syscalls/syscall_64.tbl | 1 +
- include/linux/syscalls.h | 7 +-
- include/uapi/asm-generic/unistd.h | 5 +-
- include/uapi/linux/futex.h | 25 +
- kernel/Makefile | 2 +-
- kernel/futex.c | 4272 -----------------
- kernel/futex/Makefile | 3 +
- kernel/futex/core.c | 1176 +++++
- kernel/futex/futex.h | 295 ++
- kernel/futex/pi.c | 1233 +++++
- kernel/futex/requeue.c | 897 ++++
- kernel/futex/syscalls.c | 396 ++
- kernel/futex/waitwake.c | 708 +++
- kernel/sys_ni.c | 3 +-
- .../selftests/futex/functional/.gitignore | 1 +
- .../selftests/futex/functional/Makefile | 3 +-
- .../futex/functional/futex_wait_timeout.c | 21 +-
- .../futex/functional/futex_wait_wouldblock.c | 41 +-
- .../selftests/futex/functional/futex_waitv.c | 237 +
- .../testing/selftests/futex/functional/run.sh | 3 +
- .../selftests/futex/include/futex2test.h | 22 +
- 28 files changed, 5163 insertions(+), 4284 deletions(-)
- create mode 100644 Documentation/userspace-api/futex2.rst
- delete mode 100644 kernel/futex.c
- create mode 100644 kernel/futex/Makefile
- create mode 100644 kernel/futex/core.c
- create mode 100644 kernel/futex/futex.h
- create mode 100644 kernel/futex/pi.c
- create mode 100644 kernel/futex/requeue.c
- create mode 100644 kernel/futex/syscalls.c
- create mode 100644 kernel/futex/waitwake.c
- create mode 100644 tools/testing/selftests/futex/functional/futex_waitv.c
- create mode 100644 tools/testing/selftests/futex/include/futex2test.h
-
-diff --git a/Documentation/userspace-api/futex2.rst b/Documentation/userspace-api/futex2.rst
-new file mode 100644
-index 000000000..7d37409df
---- /dev/null
-+++ b/Documentation/userspace-api/futex2.rst
-@@ -0,0 +1,86 @@
-+.. SPDX-License-Identifier: GPL-2.0
-+
-+======
-+futex2
-+======
-+
-+:Author: André Almeida <andrealmeid@collabora.com>
-+
-+futex, or fast user mutex, is a set of syscalls that allows userspace to create
-+performant synchronization mechanisms, such as mutexes, semaphores and
-+condition variables, in userspace. C standard libraries, like glibc, use it
-+as a means to implement higher-level interfaces such as pthreads.
-+
-+futex2 is a followup version of the initial futex syscall, designed to overcome
-+limitations of the original interface.
-+
-+User API
-+========
-+
-+``futex_waitv()``
-+-----------------
-+
-+Wait on an array of futexes, wake on any::
-+
-+ futex_waitv(struct futex_waitv *waiters, unsigned int nr_futexes,
-+ unsigned int flags, struct timespec *timeout, clockid_t clockid)
-+
-+ struct futex_waitv {
-+ __u64 val;
-+ __u64 uaddr;
-+ __u32 flags;
-+ __u32 __reserved;
-+ };
-+
-+Userspace sets an array of struct futex_waitv (up to a maximum of 128 entries),
-+using ``uaddr`` for the address to wait on, ``val`` for the expected value
-+and ``flags`` to specify the type (e.g. private) and size of the futex.
-+``__reserved`` needs to be 0, but it can be used for future extension. The
-+pointer to the first item of the array is passed as ``waiters``. An invalid
-+address for ``waiters`` or for any ``uaddr`` returns ``-EFAULT``.
-+
-+If userspace has 32-bit pointers, it should do an explicit cast to make sure
-+the upper bits are zeroed. ``uintptr_t`` does the trick and works for
-+both 32- and 64-bit pointers.
-+
-+``nr_futexes`` specifies the size of the array. Values outside the [1, 128]
-+interval make the syscall return ``-EINVAL``.
-+
-+The ``flags`` argument of the syscall needs to be 0, but it can be used for
-+future extension.
-+
-+For each entry in the ``waiters`` array, the current value at ``uaddr`` is compared
-+to ``val``. If they differ, the syscall undoes all the work done so far and
-+returns ``-EAGAIN``. If all tests and verifications succeed, the syscall waits until
-+one of the following happens:
-+
-+- The timeout expires, returning ``-ETIMEDOUT``.
-+- A signal is delivered to the sleeping task, returning ``-ERESTARTSYS``.
-+- One of the futexes in the list is woken, returning the index of the woken futex.
-+
-+An example of how to use the interface can be found at ``tools/testing/selftests/futex/functional/futex_waitv.c``.
-+
-+Timeout
-+-------
-+
-+The ``struct timespec *timeout`` argument is optional and points to an
-+absolute timeout. The type of clock being used must be specified via the
-+``clockid`` argument; ``CLOCK_MONOTONIC`` and ``CLOCK_REALTIME`` are supported.
-+This syscall accepts only 64-bit timespec structs.
-+
-+Types of futex
-+--------------
-+
-+A futex can be either private or shared. Private futexes are used by processes
-+that share the same memory space, where the virtual address of the futex is the
-+same for all of them. This allows for optimizations in the kernel. To use
-+private futexes, it's necessary to specify ``FUTEX_PRIVATE_FLAG`` in the futex
-+flags. Processes that don't share the same memory space, and therefore can
-+have different virtual addresses for the same futex (using, for instance,
-+file-backed shared memory), require different internal mechanisms to get
-+properly enqueued. This is the default behavior, and it works with both private
-+and shared futexes.
-+
-+Futexes can be of different sizes: 8, 16, 32 or 64 bits. Currently, the only
-+supported size is the 32-bit futex, and it needs to be specified using the
-+``FUTEX_32`` flag.
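
As a rough usage sketch of the interface described above (it is not part of the patch),
the following waits on a single private 32-bit futex with futex_waitv(). It assumes no
libc wrapper and calls syscall 449 (the number added by this patch) directly; the struct
and flag definitions are repeated locally in case the installed <linux/futex.h> predates
futex2, and the absolute CLOCK_MONOTONIC timeout is built with clock_gettime()::

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    /* Definitions from the uapi header added by this patch. */
    struct futex_waitv {
            uint64_t val;
            uint64_t uaddr;
            uint32_t flags;
            uint32_t __reserved;
    };
    #define FUTEX_32                2
    #define FUTEX_PRIVATE_FLAG      128
    #ifndef __NR_futex_waitv
    #define __NR_futex_waitv        449     /* syscall number added by this patch */
    #endif

    static uint32_t futex_word;             /* the 32-bit futex being waited on */

    int main(void)
    {
            struct futex_waitv waiter;
            struct timespec to;
            long ret;

            memset(&waiter, 0, sizeof(waiter));        /* __reserved must be 0 */
            waiter.uaddr = (uintptr_t)&futex_word;     /* zero-extends 32-bit pointers */
            waiter.val   = 0;                          /* expected value at uaddr */
            waiter.flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;

            /* Absolute timeout: now + 1 second on CLOCK_MONOTONIC. */
            clock_gettime(CLOCK_MONOTONIC, &to);
            to.tv_sec += 1;

            ret = syscall(__NR_futex_waitv, &waiter, 1, 0, &to, CLOCK_MONOTONIC);
            if (ret >= 0)
                    printf("woken, waiter index %ld\n", ret);
            else
                    perror("futex_waitv");  /* e.g. ETIMEDOUT if nobody wakes us */
            return 0;
    }

On success the return value is the index of the woken entry in the waiters array;
on timeout the call fails with ETIMEDOUT.
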
-diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
-index c432be070..a61eac0c7 100644
---- a/Documentation/userspace-api/index.rst
-+++ b/Documentation/userspace-api/index.rst
-@@ -28,6 +28,7 @@ place where this information is gathered.
- media/index
- sysfs-platform_profile
- vduse
-+ futex2
-
- .. only:: subproject and html
-
-diff --git a/MAINTAINERS b/MAINTAINERS
-index 3b79fd441..dd165835f 100644
---- a/MAINTAINERS
-+++ b/MAINTAINERS
-@@ -7737,6 +7737,7 @@ M: Ingo Molnar <mingo@redhat.com>
- R: Peter Zijlstra <peterz@infradead.org>
- R: Darren Hart <dvhart@infradead.org>
- R: Davidlohr Bueso <dave@stgolabs.net>
-+R: André Almeida <andrealmeid@collabora.com>
- L: linux-kernel@vger.kernel.org
- S: Maintained
- T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git locking/core
-@@ -7744,7 +7745,7 @@ F: Documentation/locking/*futex*
- F: include/asm-generic/futex.h
- F: include/linux/futex.h
- F: include/uapi/linux/futex.h
--F: kernel/futex.c
-+F: kernel/futex/*
- F: tools/perf/bench/futex*
- F: tools/testing/selftests/futex/
-
-diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
-index e842209e1..543100151 100644
---- a/arch/arm/tools/syscall.tbl
-+++ b/arch/arm/tools/syscall.tbl
-@@ -462,3 +462,4 @@
- 446 common landlock_restrict_self sys_landlock_restrict_self
- # 447 reserved for memfd_secret
- 448 common process_mrelease sys_process_mrelease
-+449 common futex_waitv sys_futex_waitv
-diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
-index 3cb206aea..6bdb5f5db 100644
---- a/arch/arm64/include/asm/unistd.h
-+++ b/arch/arm64/include/asm/unistd.h
-@@ -38,7 +38,7 @@
- #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
- #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
-
--#define __NR_compat_syscalls 449
-+#define __NR_compat_syscalls 450
- #endif
-
- #define __ARCH_WANT_SYS_CLONE
-diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
-index 844f6ae58..41ea1195e 100644
---- a/arch/arm64/include/asm/unistd32.h
-+++ b/arch/arm64/include/asm/unistd32.h
-@@ -903,6 +903,8 @@ __SYSCALL(__NR_landlock_add_rule, sys_landlock_add_rule)
- __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
- #define __NR_process_mrelease 448
- __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
-+#define __NR_futex_waitv 449
-+__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
-
- /*
- * Please add new compat syscalls above this comment and update
-diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
-index 960a021d5..7e2554369 100644
---- a/arch/x86/entry/syscalls/syscall_32.tbl
-+++ b/arch/x86/entry/syscalls/syscall_32.tbl
-@@ -453,3 +453,4 @@
- 446 i386 landlock_restrict_self sys_landlock_restrict_self
- 447 i386 memfd_secret sys_memfd_secret
- 448 i386 process_mrelease sys_process_mrelease
-+449 i386 futex_waitv sys_futex_waitv
-diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
-index 18b5500ea..fe8f8dd15 100644
---- a/arch/x86/entry/syscalls/syscall_64.tbl
-+++ b/arch/x86/entry/syscalls/syscall_64.tbl
-@@ -370,6 +370,7 @@
- 446 common landlock_restrict_self sys_landlock_restrict_self
- 447 common memfd_secret sys_memfd_secret
- 448 common process_mrelease sys_process_mrelease
-+449 common futex_waitv sys_futex_waitv
-
- #
- # Due to a historical design error, certain syscalls are numbered differently
-diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
-index 252243c77..528a478db 100644
---- a/include/linux/syscalls.h
-+++ b/include/linux/syscalls.h
-@@ -58,6 +58,7 @@ struct mq_attr;
- struct compat_stat;
- struct old_timeval32;
- struct robust_list_head;
-+struct futex_waitv;
- struct getcpu_cache;
- struct old_linux_dirent;
- struct perf_event_attr;
-@@ -610,7 +611,7 @@ asmlinkage long sys_waitid(int which, pid_t pid,
- asmlinkage long sys_set_tid_address(int __user *tidptr);
- asmlinkage long sys_unshare(unsigned long unshare_flags);
-
--/* kernel/futex.c */
-+/* kernel/futex/syscalls.c */
- asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
- const struct __kernel_timespec __user *utime,
- u32 __user *uaddr2, u32 val3);
-@@ -623,6 +624,10 @@ asmlinkage long sys_get_robust_list(int pid,
- asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
- size_t len);
-
-+asmlinkage long sys_futex_waitv(struct futex_waitv *waiters,
-+ unsigned int nr_futexes, unsigned int flags,
-+ struct __kernel_timespec __user *timeout, clockid_t clockid);
-+
- /* kernel/hrtimer.c */
- asmlinkage long sys_nanosleep(struct __kernel_timespec __user *rqtp,
- struct __kernel_timespec __user *rmtp);
-diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
-index 1c5fb86d4..4557a8b60 100644
---- a/include/uapi/asm-generic/unistd.h
-+++ b/include/uapi/asm-generic/unistd.h
-@@ -880,8 +880,11 @@ __SYSCALL(__NR_memfd_secret, sys_memfd_secret)
- #define __NR_process_mrelease 448
- __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
-
-+#define __NR_futex_waitv 449
-+__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
-+
- #undef __NR_syscalls
--#define __NR_syscalls 449
-+#define __NR_syscalls 450
-
- /*
- * 32 bit systems traditionally used different
-diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
-index 235e5b2fa..71a5df8d2 100644
---- a/include/uapi/linux/futex.h
-+++ b/include/uapi/linux/futex.h
-@@ -43,6 +43,31 @@
- #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \
- FUTEX_PRIVATE_FLAG)
-
-+/*
-+ * Flags to specify the bit length of the futex word for futex2 syscalls.
-+ * Currently, only 32 is supported.
-+ */
-+#define FUTEX_32 2
-+
-+/*
-+ * Max numbers of elements in a futex_waitv array
-+ */
-+#define FUTEX_WAITV_MAX 128
-+
-+/**
-+ * struct futex_waitv - A waiter for vectorized wait
-+ * @val: Expected value at uaddr
-+ * @uaddr: User address to wait on
-+ * @flags: Flags for this waiter
-+ * @__reserved: Reserved member to preserve data alignment. Should be 0.
-+ */
-+struct futex_waitv {
-+ __u64 val;
-+ __u64 uaddr;
-+ __u32 flags;
-+ __u32 __reserved;
-+};
-+
- /*
- * Support for robust futexes: the kernel cleans up held futexes at
- * thread exit time.
-diff --git a/kernel/Makefile b/kernel/Makefile
-index 4df609be4..3f6ab5d50 100644
---- a/kernel/Makefile
-+++ b/kernel/Makefile
-@@ -59,7 +59,7 @@ obj-$(CONFIG_FREEZER) += freezer.o
- obj-$(CONFIG_PROFILING) += profile.o
- obj-$(CONFIG_STACKTRACE) += stacktrace.o
- obj-y += time/
--obj-$(CONFIG_FUTEX) += futex.o
-+obj-$(CONFIG_FUTEX) += futex/
- obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
- obj-$(CONFIG_SMP) += smp.o
- ifneq ($(CONFIG_SMP),y)
-diff --git a/kernel/futex.c b/kernel/futex.c
-deleted file mode 100644
-index c15ad276f..000000000
---- a/kernel/futex.c
-+++ /dev/null
-@@ -1,4272 +0,0 @@
--// SPDX-License-Identifier: GPL-2.0-or-later
--/*
-- * Fast Userspace Mutexes (which I call "Futexes!").
-- * (C) Rusty Russell, IBM 2002
-- *
-- * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
-- * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
-- *
-- * Removed page pinning, fix privately mapped COW pages and other cleanups
-- * (C) Copyright 2003, 2004 Jamie Lokier
-- *
-- * Robust futex support started by Ingo Molnar
-- * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
-- * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
-- *
-- * PI-futex support started by Ingo Molnar and Thomas Gleixner
-- * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
-- * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
-- *
-- * PRIVATE futexes by Eric Dumazet
-- * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
-- *
-- * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
-- * Copyright (C) IBM Corporation, 2009
-- * Thanks to Thomas Gleixner for conceptual design and careful reviews.
-- *
-- * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
-- * enough at me, Linus for the original (flawed) idea, Matthew
-- * Kirkwood for proof-of-concept implementation.
-- *
-- * "The futexes are also cursed."
-- * "But they come in a choice of three flavours!"
-- */
--#include <linux/compat.h>
--#include <linux/jhash.h>
--#include <linux/pagemap.h>
--#include <linux/syscalls.h>
--#include <linux/freezer.h>
--#include <linux/memblock.h>
--#include <linux/fault-inject.h>
--#include <linux/time_namespace.h>
--
--#include <asm/futex.h>
--
--#include "locking/rtmutex_common.h"
--
--/*
-- * READ this before attempting to hack on futexes!
-- *
-- * Basic futex operation and ordering guarantees
-- * =============================================
-- *
-- * The waiter reads the futex value in user space and calls
-- * futex_wait(). This function computes the hash bucket and acquires
-- * the hash bucket lock. After that it reads the futex user space value
-- * again and verifies that the data has not changed. If it has not changed
-- * it enqueues itself into the hash bucket, releases the hash bucket lock
-- * and schedules.
-- *
-- * The waker side modifies the user space value of the futex and calls
-- * futex_wake(). This function computes the hash bucket and acquires the
-- * hash bucket lock. Then it looks for waiters on that futex in the hash
-- * bucket and wakes them.
-- *
-- * In futex wake up scenarios where no tasks are blocked on a futex, taking
-- * the hb spinlock can be avoided and simply return. In order for this
-- * optimization to work, ordering guarantees must exist so that the waiter
-- * being added to the list is acknowledged when the list is concurrently being
-- * checked by the waker, avoiding scenarios like the following:
-- *
-- * CPU 0 CPU 1
-- * val = *futex;
-- * sys_futex(WAIT, futex, val);
-- * futex_wait(futex, val);
-- * uval = *futex;
-- * *futex = newval;
-- * sys_futex(WAKE, futex);
-- * futex_wake(futex);
-- * if (queue_empty())
-- * return;
-- * if (uval == val)
-- * lock(hash_bucket(futex));
-- * queue();
-- * unlock(hash_bucket(futex));
-- * schedule();
-- *
-- * This would cause the waiter on CPU 0 to wait forever because it
-- * missed the transition of the user space value from val to newval
-- * and the waker did not find the waiter in the hash bucket queue.
-- *
-- * The correct serialization ensures that a waiter either observes
-- * the changed user space value before blocking or is woken by a
-- * concurrent waker:
-- *
-- * CPU 0 CPU 1
-- * val = *futex;
-- * sys_futex(WAIT, futex, val);
-- * futex_wait(futex, val);
-- *
-- * waiters++; (a)
-- * smp_mb(); (A) <-- paired with -.
-- * |
-- * lock(hash_bucket(futex)); |
-- * |
-- * uval = *futex; |
-- * | *futex = newval;
-- * | sys_futex(WAKE, futex);
-- * | futex_wake(futex);
-- * |
-- * `--------> smp_mb(); (B)
-- * if (uval == val)
-- * queue();
-- * unlock(hash_bucket(futex));
-- * schedule(); if (waiters)
-- * lock(hash_bucket(futex));
-- * else wake_waiters(futex);
-- * waiters--; (b) unlock(hash_bucket(futex));
-- *
-- * Where (A) orders the waiters increment and the futex value read through
-- * atomic operations (see hb_waiters_inc) and where (B) orders the write
-- * to futex and the waiters read (see hb_waiters_pending()).
-- *
-- * This yields the following case (where X:=waiters, Y:=futex):
-- *
-- * X = Y = 0
-- *
-- * w[X]=1 w[Y]=1
-- * MB MB
-- * r[Y]=y r[X]=x
-- *
-- * Which guarantees that x==0 && y==0 is impossible; which translates back into
-- * the guarantee that we cannot both miss the futex variable change and the
-- * enqueue.
-- *
-- * Note that a new waiter is accounted for in (a) even when it is possible that
-- * the wait call can return error, in which case we backtrack from it in (b).
-- * Refer to the comment in queue_lock().
-- *
-- * Similarly, in order to account for waiters being requeued on another
-- * address we always increment the waiters for the destination bucket before
-- * acquiring the lock. It then decrements them again after releasing it -
-- * the code that actually moves the futex(es) between hash buckets (requeue_futex)
-- * will do the additional required waiter count housekeeping. This is done for
-- * double_lock_hb() and double_unlock_hb(), respectively.
-- */
--
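
The diagram above covers the kernel side of the wait/wake protocol. For reference, a
minimal sketch (not part of futex.c or of this patch) of the two userspace halves it
refers to, using the plain futex(2) syscall through syscall(2)::

    #include <linux/futex.h>        /* FUTEX_WAIT, FUTEX_WAKE */
    #include <stdatomic.h>
    #include <stddef.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static atomic_uint futex_word;  /* the user space futex value */

    /* Waiter side: "val = *futex; sys_futex(WAIT, futex, val)" above. */
    static void wait_side(void)
    {
            unsigned int val = atomic_load(&futex_word);

            /* The kernel re-reads *futex under the hash bucket lock and only
             * sleeps if it still equals val, so the wake below is not lost. */
            syscall(SYS_futex, &futex_word, FUTEX_WAIT, val, NULL, NULL, 0);
    }

    /* Waker side: "*futex = newval; sys_futex(WAKE, futex)" above. */
    static void wake_side(void)
    {
            atomic_store(&futex_word, 1);   /* newval */
            syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
    }
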
--#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
--#define futex_cmpxchg_enabled 1
--#else
--static int __read_mostly futex_cmpxchg_enabled;
--#endif
--
--/*
-- * Futex flags used to encode options to functions and preserve them across
-- * restarts.
-- */
--#ifdef CONFIG_MMU
--# define FLAGS_SHARED 0x01
--#else
--/*
-- * NOMMU does not have per process address space. Let the compiler optimize
-- * code away.
-- */
--# define FLAGS_SHARED 0x00
--#endif
--#define FLAGS_CLOCKRT 0x02
--#define FLAGS_HAS_TIMEOUT 0x04
--
--/*
-- * Priority Inheritance state:
-- */
--struct futex_pi_state {
-- /*
-- * list of 'owned' pi_state instances - these have to be
-- * cleaned up in do_exit() if the task exits prematurely:
-- */
-- struct list_head list;
--
-- /*
-- * The PI object:
-- */
-- struct rt_mutex_base pi_mutex;
--
-- struct task_struct *owner;
-- refcount_t refcount;
--
-- union futex_key key;
--} __randomize_layout;
--
--/**
-- * struct futex_q - The hashed futex queue entry, one per waiting task
-- * @list: priority-sorted list of tasks waiting on this futex
-- * @task: the task waiting on the futex
-- * @lock_ptr: the hash bucket lock
-- * @key: the key the futex is hashed on
-- * @pi_state: optional priority inheritance state
-- * @rt_waiter: rt_waiter storage for use with requeue_pi
-- * @requeue_pi_key: the requeue_pi target futex key
-- * @bitset: bitset for the optional bitmasked wakeup
-- * @requeue_state: State field for futex_requeue_pi()
-- * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
-- *
-- * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
-- * we can wake only the relevant ones (hashed queues may be shared).
-- *
-- * A futex_q has a woken state, just like tasks have TASK_RUNNING.
-- * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
-- * The order of wakeup is always to make the first condition true, then
-- * the second.
-- *
-- * PI futexes are typically woken before they are removed from the hash list via
-- * the rt_mutex code. See unqueue_me_pi().
-- */
--struct futex_q {
-- struct plist_node list;
--
-- struct task_struct *task;
-- spinlock_t *lock_ptr;
-- union futex_key key;
-- struct futex_pi_state *pi_state;
-- struct rt_mutex_waiter *rt_waiter;
-- union futex_key *requeue_pi_key;
-- u32 bitset;
-- atomic_t requeue_state;
--#ifdef CONFIG_PREEMPT_RT
-- struct rcuwait requeue_wait;
--#endif
--} __randomize_layout;
--
--/*
-- * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
-- * underlying rtmutex. The task which is about to be requeued could have
-- * just woken up (timeout, signal). After the wake up the task has to
-- * acquire hash bucket lock, which is held by the requeue code. As a task
-- * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
-- * and the hash bucket lock blocking would collide and corrupt state.
-- *
-- * On !PREEMPT_RT this is not a problem and everything could be serialized
-- * on hash bucket lock, but aside of having the benefit of common code,
-- * this allows to avoid doing the requeue when the task is already on the
-- * way out and taking the hash bucket lock of the original uaddr1 when the
-- * requeue has been completed.
-- *
-- * The following state transitions are valid:
-- *
-- * On the waiter side:
-- * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
-- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
-- *
-- * On the requeue side:
-- * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_INPROGRESS
-- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
-- * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
-- * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
-- * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
-- *
-- * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
-- * signals that the waiter is already on the way out. It also means that
-- * the waiter is still on the 'wait' futex, i.e. uaddr1.
-- *
-- * The waiter side signals early wakeup to the requeue side either through
-- * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
-- * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
-- * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
-- * which means the wakeup is interleaving with a requeue in progress it has
-- * to wait for the requeue side to change the state. Either to DONE/LOCKED
-- * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
-- * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
-- * the requeue side when the requeue attempt failed via deadlock detection
-- * and therefore the waiter q is still on the uaddr1 futex.
-- */
--enum {
-- Q_REQUEUE_PI_NONE = 0,
-- Q_REQUEUE_PI_IGNORE,
-- Q_REQUEUE_PI_IN_PROGRESS,
-- Q_REQUEUE_PI_WAIT,
-- Q_REQUEUE_PI_DONE,
-- Q_REQUEUE_PI_LOCKED,
--};
--
--static const struct futex_q futex_q_init = {
-- /* list gets initialized in queue_me()*/
-- .key = FUTEX_KEY_INIT,
-- .bitset = FUTEX_BITSET_MATCH_ANY,
-- .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
--};
--
--/*
-- * Hash buckets are shared by all the futex_keys that hash to the same
-- * location. Each key may have multiple futex_q structures, one for each task
-- * waiting on a futex.
-- */
--struct futex_hash_bucket {
-- atomic_t waiters;
-- spinlock_t lock;
-- struct plist_head chain;
--} ____cacheline_aligned_in_smp;
--
--/*
-- * The base of the bucket array and its size are always used together
-- * (after initialization only in hash_futex()), so ensure that they
-- * reside in the same cacheline.
-- */
--static struct {
-- struct futex_hash_bucket *queues;
-- unsigned long hashsize;
--} __futex_data __read_mostly __aligned(2*sizeof(long));
--#define futex_queues (__futex_data.queues)
--#define futex_hashsize (__futex_data.hashsize)
--
--
--/*
-- * Fault injections for futexes.
-- */
--#ifdef CONFIG_FAIL_FUTEX
--
--static struct {
-- struct fault_attr attr;
--
-- bool ignore_private;
--} fail_futex = {
-- .attr = FAULT_ATTR_INITIALIZER,
-- .ignore_private = false,
--};
--
--static int __init setup_fail_futex(char *str)
--{
-- return setup_fault_attr(&fail_futex.attr, str);
--}
--__setup("fail_futex=", setup_fail_futex);
--
--static bool should_fail_futex(bool fshared)
--{
-- if (fail_futex.ignore_private && !fshared)
-- return false;
--
-- return should_fail(&fail_futex.attr, 1);
--}
--
--#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
--
--static int __init fail_futex_debugfs(void)
--{
-- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
-- struct dentry *dir;
--
-- dir = fault_create_debugfs_attr("fail_futex", NULL,
-- &fail_futex.attr);
-- if (IS_ERR(dir))
-- return PTR_ERR(dir);
--
-- debugfs_create_bool("ignore-private", mode, dir,
-- &fail_futex.ignore_private);
-- return 0;
--}
--
--late_initcall(fail_futex_debugfs);
--
--#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
--
--#else
--static inline bool should_fail_futex(bool fshared)
--{
-- return false;
--}
--#endif /* CONFIG_FAIL_FUTEX */
--
--#ifdef CONFIG_COMPAT
--static void compat_exit_robust_list(struct task_struct *curr);
--#endif
--
--/*
-- * Reflects a new waiter being added to the waitqueue.
-- */
--static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
--{
--#ifdef CONFIG_SMP
-- atomic_inc(&hb->waiters);
-- /*
-- * Full barrier (A), see the ordering comment above.
-- */
-- smp_mb__after_atomic();
--#endif
--}
--
--/*
-- * Reflects a waiter being removed from the waitqueue by wakeup
-- * paths.
-- */
--static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
--{
--#ifdef CONFIG_SMP
-- atomic_dec(&hb->waiters);
--#endif
--}
--
--static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
--{
--#ifdef CONFIG_SMP
-- /*
-- * Full barrier (B), see the ordering comment above.
-- */
-- smp_mb();
-- return atomic_read(&hb->waiters);
--#else
-- return 1;
--#endif
--}
--
--/**
-- * hash_futex - Return the hash bucket in the global hash
-- * @key: Pointer to the futex key for which the hash is calculated
-- *
-- * We hash on the keys returned from get_futex_key (see below) and return the
-- * corresponding hash bucket in the global hash.
-- */
--static struct futex_hash_bucket *hash_futex(union futex_key *key)
--{
-- u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
-- key->both.offset);
--
-- return &futex_queues[hash & (futex_hashsize - 1)];
--}
--
--
--/**
-- * match_futex - Check whether two futex keys are equal
-- * @key1: Pointer to key1
-- * @key2: Pointer to key2
-- *
-- * Return 1 if two futex_keys are equal, 0 otherwise.
-- */
--static inline int match_futex(union futex_key *key1, union futex_key *key2)
--{
-- return (key1 && key2
-- && key1->both.word == key2->both.word
-- && key1->both.ptr == key2->both.ptr
-- && key1->both.offset == key2->both.offset);
--}
--
--enum futex_access {
-- FUTEX_READ,
-- FUTEX_WRITE
--};
--
--/**
-- * futex_setup_timer - set up the sleeping hrtimer.
-- * @time: ptr to the given timeout value
-- * @timeout: the hrtimer_sleeper structure to be set up
-- * @flags: futex flags
-- * @range_ns: optional range in ns
-- *
-- * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
-- * value given
-- */
--static inline struct hrtimer_sleeper *
--futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
-- int flags, u64 range_ns)
--{
-- if (!time)
-- return NULL;
--
-- hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
-- CLOCK_REALTIME : CLOCK_MONOTONIC,
-- HRTIMER_MODE_ABS);
-- /*
-- * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
-- * effectively the same as calling hrtimer_set_expires().
-- */
-- hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
--
-- return timeout;
--}
--
--/*
-- * Generate a machine wide unique identifier for this inode.
-- *
-- * This relies on u64 not wrapping in the life-time of the machine; which with
-- * 1ns resolution means almost 585 years.
-- *
-- * This further relies on the fact that a well formed program will not unmap
-- * the file while it has a (shared) futex waiting on it. This mapping will have
-- * a file reference which pins the mount and inode.
-- *
-- * If for some reason an inode gets evicted and read back in again, it will get
-- * a new sequence number and will _NOT_ match, even though it is the exact same
-- * file.
-- *
-- * It is important that match_futex() will never have a false-positive, esp.
-- * for PI futexes that can mess up the state. The above argues that false-negatives
-- * are only possible for malformed programs.
-- */
--static u64 get_inode_sequence_number(struct inode *inode)
--{
-- static atomic64_t i_seq;
-- u64 old;
--
-- /* Does the inode already have a sequence number? */
-- old = atomic64_read(&inode->i_sequence);
-- if (likely(old))
-- return old;
--
-- for (;;) {
-- u64 new = atomic64_add_return(1, &i_seq);
-- if (WARN_ON_ONCE(!new))
-- continue;
--
-- old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
-- if (old)
-- return old;
-- return new;
-- }
--}
--
--/**
-- * get_futex_key() - Get parameters which are the keys for a futex
-- * @uaddr: virtual address of the futex
-- * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
-- * @key: address where result is stored.
-- * @rw: mapping needs to be read/write (values: FUTEX_READ,
-- * FUTEX_WRITE)
-- *
-- * Return: a negative error code or 0
-- *
-- * The key words are stored in @key on success.
-- *
-- * For shared mappings (when @fshared), the key is:
-- *
-- * ( inode->i_sequence, page->index, offset_within_page )
-- *
-- * [ also see get_inode_sequence_number() ]
-- *
-- * For private mappings (or when !@fshared), the key is:
-- *
-- * ( current->mm, address, 0 )
-- *
-- * This allows (cross process, where applicable) identification of the futex
-- * without keeping the page pinned for the duration of the FUTEX_WAIT.
-- *
-- * lock_page() might sleep, the caller should not hold a spinlock.
-- */
--static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
-- enum futex_access rw)
--{
-- unsigned long address = (unsigned long)uaddr;
-- struct mm_struct *mm = current->mm;
-- struct page *page, *tail;
-- struct address_space *mapping;
-- int err, ro = 0;
--
-- /*
-- * The futex address must be "naturally" aligned.
-- */
-- key->both.offset = address % PAGE_SIZE;
-- if (unlikely((address % sizeof(u32)) != 0))
-- return -EINVAL;
-- address -= key->both.offset;
--
-- if (unlikely(!access_ok(uaddr, sizeof(u32))))
-- return -EFAULT;
--
-- if (unlikely(should_fail_futex(fshared)))
-- return -EFAULT;
--
-- /*
-- * PROCESS_PRIVATE futexes are fast.
-- * As the mm cannot disappear under us and the 'key' only needs
-- * virtual address, we dont even have to find the underlying vma.
-- * Note : We do have to check 'uaddr' is a valid user address,
-- * but access_ok() should be faster than find_vma()
-- */
-- if (!fshared) {
-- key->private.mm = mm;
-- key->private.address = address;
-- return 0;
-- }
--
--again:
-- /* Ignore any VERIFY_READ mapping (futex common case) */
-- if (unlikely(should_fail_futex(true)))
-- return -EFAULT;
--
-- err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
-- /*
-- * If write access is not required (eg. FUTEX_WAIT), try
-- * and get read-only access.
-- */
-- if (err == -EFAULT && rw == FUTEX_READ) {
-- err = get_user_pages_fast(address, 1, 0, &page);
-- ro = 1;
-- }
-- if (err < 0)
-- return err;
-- else
-- err = 0;
--
-- /*
-- * The treatment of mapping from this point on is critical. The page
-- * lock protects many things but in this context the page lock
-- * stabilizes mapping, prevents inode freeing in the shared
-- * file-backed region case and guards against movement to swap cache.
-- *
-- * Strictly speaking the page lock is not needed in all cases being
-- * considered here and page lock forces unnecessarily serialization
-- * From this point on, mapping will be re-verified if necessary and
-- * page lock will be acquired only if it is unavoidable
-- *
-- * Mapping checks require the head page for any compound page so the
-- * head page and mapping is looked up now. For anonymous pages, it
-- * does not matter if the page splits in the future as the key is
-- * based on the address. For filesystem-backed pages, the tail is
-- * required as the index of the page determines the key. For
-- * base pages, there is no tail page and tail == page.
-- */
-- tail = page;
-- page = compound_head(page);
-- mapping = READ_ONCE(page->mapping);
--
-- /*
-- * If page->mapping is NULL, then it cannot be a PageAnon
-- * page; but it might be the ZERO_PAGE or in the gate area or
-- * in a special mapping (all cases which we are happy to fail);
-- * or it may have been a good file page when get_user_pages_fast
-- * found it, but truncated or holepunched or subjected to
-- * invalidate_complete_page2 before we got the page lock (also
-- * cases which we are happy to fail). And we hold a reference,
-- * so refcount care in invalidate_complete_page's remove_mapping
-- * prevents drop_caches from setting mapping to NULL beneath us.
-- *
-- * The case we do have to guard against is when memory pressure made
-- * shmem_writepage move it from filecache to swapcache beneath us:
-- * an unlikely race, but we do need to retry for page->mapping.
-- */
-- if (unlikely(!mapping)) {
-- int shmem_swizzled;
--
-- /*
-- * Page lock is required to identify which special case above
-- * applies. If this is really a shmem page then the page lock
-- * will prevent unexpected transitions.
-- */
-- lock_page(page);
-- shmem_swizzled = PageSwapCache(page) || page->mapping;
-- unlock_page(page);
-- put_page(page);
--
-- if (shmem_swizzled)
-- goto again;
--
-- return -EFAULT;
-- }
--
-- /*
-- * Private mappings are handled in a simple way.
-- *
-- * If the futex key is stored on an anonymous page, then the associated
-- * object is the mm which is implicitly pinned by the calling process.
-- *
-- * NOTE: When userspace waits on a MAP_SHARED mapping, even if
-- * it's a read-only handle, it's expected that futexes attach to
-- * the object not the particular process.
-- */
-- if (PageAnon(page)) {
-- /*
-- * A RO anonymous page will never change and thus doesn't make
-- * sense for futex operations.
-- */
-- if (unlikely(should_fail_futex(true)) || ro) {
-- err = -EFAULT;
-- goto out;
-- }
--
-- key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
-- key->private.mm = mm;
-- key->private.address = address;
--
-- } else {
-- struct inode *inode;
--
-- /*
-- * The associated futex object in this case is the inode and
-- * the page->mapping must be traversed. Ordinarily this should
-- * be stabilised under page lock but it's not strictly
-- * necessary in this case as we just want to pin the inode, not
-- * update the radix tree or anything like that.
-- *
-- * The RCU read lock is taken as the inode is finally freed
-- * under RCU. If the mapping still matches expectations then the
-- * mapping->host can be safely accessed as being a valid inode.
-- */
-- rcu_read_lock();
--
-- if (READ_ONCE(page->mapping) != mapping) {
-- rcu_read_unlock();
-- put_page(page);
--
-- goto again;
-- }
--
-- inode = READ_ONCE(mapping->host);
-- if (!inode) {
-- rcu_read_unlock();
-- put_page(page);
--
-- goto again;
-- }
--
-- key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-- key->shared.i_seq = get_inode_sequence_number(inode);
-- key->shared.pgoff = page_to_pgoff(tail);
-- rcu_read_unlock();
-- }
--
--out:
-- put_page(page);
-- return err;
--}
--
--/**
-- * fault_in_user_writeable() - Fault in user address and verify RW access
-- * @uaddr: pointer to faulting user space address
-- *
-- * Slow path to fixup the fault we just took in the atomic write
-- * access to @uaddr.
-- *
-- * We have no generic implementation of a non-destructive write to the
-- * user address. We know that we faulted in the atomic pagefault
-- * disabled section so we can as well avoid the #PF overhead by
-- * calling get_user_pages() right away.
-- */
--static int fault_in_user_writeable(u32 __user *uaddr)
--{
-- struct mm_struct *mm = current->mm;
-- int ret;
--
-- mmap_read_lock(mm);
-- ret = fixup_user_fault(mm, (unsigned long)uaddr,
-- FAULT_FLAG_WRITE, NULL);
-- mmap_read_unlock(mm);
--
-- return ret < 0 ? ret : 0;
--}
--
--/**
-- * futex_top_waiter() - Return the highest priority waiter on a futex
-- * @hb: the hash bucket the futex_q's reside in
-- * @key: the futex key (to distinguish it from other futex futex_q's)
-- *
-- * Must be called with the hb lock held.
-- */
--static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
-- union futex_key *key)
--{
-- struct futex_q *this;
--
-- plist_for_each_entry(this, &hb->chain, list) {
-- if (match_futex(&this->key, key))
-- return this;
-- }
-- return NULL;
--}
--
--static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
-- u32 uval, u32 newval)
--{
-- int ret;
--
-- pagefault_disable();
-- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
-- pagefault_enable();
--
-- return ret;
--}
--
--static int get_futex_value_locked(u32 *dest, u32 __user *from)
--{
-- int ret;
--
-- pagefault_disable();
-- ret = __get_user(*dest, from);
-- pagefault_enable();
--
-- return ret ? -EFAULT : 0;
--}
--
--
--/*
-- * PI code:
-- */
--static int refill_pi_state_cache(void)
--{
-- struct futex_pi_state *pi_state;
--
-- if (likely(current->pi_state_cache))
-- return 0;
--
-- pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
--
-- if (!pi_state)
-- return -ENOMEM;
--
-- INIT_LIST_HEAD(&pi_state->list);
-- /* pi_mutex gets initialized later */
-- pi_state->owner = NULL;
-- refcount_set(&pi_state->refcount, 1);
-- pi_state->key = FUTEX_KEY_INIT;
--
-- current->pi_state_cache = pi_state;
--
-- return 0;
--}
--
--static struct futex_pi_state *alloc_pi_state(void)
--{
-- struct futex_pi_state *pi_state = current->pi_state_cache;
--
-- WARN_ON(!pi_state);
-- current->pi_state_cache = NULL;
--
-- return pi_state;
--}
--
--static void pi_state_update_owner(struct futex_pi_state *pi_state,
-- struct task_struct *new_owner)
--{
-- struct task_struct *old_owner = pi_state->owner;
--
-- lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
--
-- if (old_owner) {
-- raw_spin_lock(&old_owner->pi_lock);
-- WARN_ON(list_empty(&pi_state->list));
-- list_del_init(&pi_state->list);
-- raw_spin_unlock(&old_owner->pi_lock);
-- }
--
-- if (new_owner) {
-- raw_spin_lock(&new_owner->pi_lock);
-- WARN_ON(!list_empty(&pi_state->list));
-- list_add(&pi_state->list, &new_owner->pi_state_list);
-- pi_state->owner = new_owner;
-- raw_spin_unlock(&new_owner->pi_lock);
-- }
--}
--
--static void get_pi_state(struct futex_pi_state *pi_state)
--{
-- WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
--}
--
--/*
-- * Drops a reference to the pi_state object and frees or caches it
-- * when the last reference is gone.
-- */
--static void put_pi_state(struct futex_pi_state *pi_state)
--{
-- if (!pi_state)
-- return;
--
-- if (!refcount_dec_and_test(&pi_state->refcount))
-- return;
--
-- /*
-- * If pi_state->owner is NULL, the owner is most probably dying
-- * and has cleaned up the pi_state already
-- */
-- if (pi_state->owner) {
-- unsigned long flags;
--
-- raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
-- pi_state_update_owner(pi_state, NULL);
-- rt_mutex_proxy_unlock(&pi_state->pi_mutex);
-- raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
-- }
--
-- if (current->pi_state_cache) {
-- kfree(pi_state);
-- } else {
-- /*
-- * pi_state->list is already empty.
-- * clear pi_state->owner.
-- * refcount is at 0 - put it back to 1.
-- */
-- pi_state->owner = NULL;
-- refcount_set(&pi_state->refcount, 1);
-- current->pi_state_cache = pi_state;
-- }
--}
--
--#ifdef CONFIG_FUTEX_PI
--
--/*
-- * This task is holding PI mutexes at exit time => bad.
-- * Kernel cleans up PI-state, but userspace is likely hosed.
-- * (Robust-futex cleanup is separate and might save the day for userspace.)
-- */
--static void exit_pi_state_list(struct task_struct *curr)
--{
-- struct list_head *next, *head = &curr->pi_state_list;
-- struct futex_pi_state *pi_state;
-- struct futex_hash_bucket *hb;
-- union futex_key key = FUTEX_KEY_INIT;
--
-- if (!futex_cmpxchg_enabled)
-- return;
-- /*
-- * We are a ZOMBIE and nobody can enqueue itself on
-- * pi_state_list anymore, but we have to be careful
-- * versus waiters unqueueing themselves:
-- */
-- raw_spin_lock_irq(&curr->pi_lock);
-- while (!list_empty(head)) {
-- next = head->next;
-- pi_state = list_entry(next, struct futex_pi_state, list);
-- key = pi_state->key;
-- hb = hash_futex(&key);
--
-- /*
-- * We can race against put_pi_state() removing itself from the
-- * list (a waiter going away). put_pi_state() will first
-- * decrement the reference count and then modify the list, so
-- * its possible to see the list entry but fail this reference
-- * acquire.
-- *
-- * In that case; drop the locks to let put_pi_state() make
-- * progress and retry the loop.
-- */
-- if (!refcount_inc_not_zero(&pi_state->refcount)) {
-- raw_spin_unlock_irq(&curr->pi_lock);
-- cpu_relax();
-- raw_spin_lock_irq(&curr->pi_lock);
-- continue;
-- }
-- raw_spin_unlock_irq(&curr->pi_lock);
--
-- spin_lock(&hb->lock);
-- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-- raw_spin_lock(&curr->pi_lock);
-- /*
-- * We dropped the pi-lock, so re-check whether this
-- * task still owns the PI-state:
-- */
-- if (head->next != next) {
-- /* retain curr->pi_lock for the loop invariant */
-- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-- spin_unlock(&hb->lock);
-- put_pi_state(pi_state);
-- continue;
-- }
--
-- WARN_ON(pi_state->owner != curr);
-- WARN_ON(list_empty(&pi_state->list));
-- list_del_init(&pi_state->list);
-- pi_state->owner = NULL;
--
-- raw_spin_unlock(&curr->pi_lock);
-- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-- spin_unlock(&hb->lock);
--
-- rt_mutex_futex_unlock(&pi_state->pi_mutex);
-- put_pi_state(pi_state);
--
-- raw_spin_lock_irq(&curr->pi_lock);
-- }
-- raw_spin_unlock_irq(&curr->pi_lock);
--}
--#else
--static inline void exit_pi_state_list(struct task_struct *curr) { }
--#endif
--
--/*
-- * We need to check the following states:
-- *
-- * Waiter | pi_state | pi->owner | uTID | uODIED | ?
-- *
-- * [1] NULL | --- | --- | 0 | 0/1 | Valid
-- * [2] NULL | --- | --- | >0 | 0/1 | Valid
-- *
-- * [3] Found | NULL | -- | Any | 0/1 | Invalid
-- *
-- * [4] Found | Found | NULL | 0 | 1 | Valid
-- * [5] Found | Found | NULL | >0 | 1 | Invalid
-- *
-- * [6] Found | Found | task | 0 | 1 | Valid
-- *
-- * [7] Found | Found | NULL | Any | 0 | Invalid
-- *
-- * [8] Found | Found | task | ==taskTID | 0/1 | Valid
-- * [9] Found | Found | task | 0 | 0 | Invalid
-- * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
-- *
-- * [1] Indicates that the kernel can acquire the futex atomically. We
-- * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
-- *
-- * [2] Valid, if TID does not belong to a kernel thread. If no matching
-- * thread is found then it indicates that the owner TID has died.
-- *
-- * [3] Invalid. The waiter is queued on a non PI futex
-- *
-- * [4] Valid state after exit_robust_list(), which sets the user space
-- * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
-- *
-- * [5] The user space value got manipulated between exit_robust_list()
-- * and exit_pi_state_list()
-- *
-- * [6] Valid state after exit_pi_state_list() which sets the new owner in
-- * the pi_state but cannot access the user space value.
-- *
-- * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
-- *
-- * [8] Owner and user space value match
-- *
-- * [9] There is no transient state which sets the user space TID to 0
-- * except exit_robust_list(), but this is indicated by the
-- * FUTEX_OWNER_DIED bit. See [4]
-- *
-- * [10] There is no transient state which leaves owner and user space
-- * TID out of sync. Except one error case where the kernel is denied
-- * write access to the user address, see fixup_pi_state_owner().
-- *
-- *
-- * Serialization and lifetime rules:
-- *
-- * hb->lock:
-- *
-- * hb -> futex_q, relation
-- * futex_q -> pi_state, relation
-- *
-- * (cannot be raw because hb can contain arbitrary amount
-- * of futex_q's)
-- *
-- * pi_mutex->wait_lock:
-- *
-- * {uval, pi_state}
-- *
-- * (and pi_mutex 'obviously')
-- *
-- * p->pi_lock:
-- *
-- * p->pi_state_list -> pi_state->list, relation
-- * pi_mutex->owner -> pi_state->owner, relation
-- *
-- * pi_state->refcount:
-- *
-- * pi_state lifetime
-- *
-- *
-- * Lock order:
-- *
-- * hb->lock
-- * pi_mutex->wait_lock
-- * p->pi_lock
-- *
-- */
--
--/*
-- * Validate that the existing waiter has a pi_state and sanity check
-- * the pi_state against the user space value. If correct, attach to
-- * it.
-- */
--static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
-- struct futex_pi_state *pi_state,
-- struct futex_pi_state **ps)
--{
-- pid_t pid = uval & FUTEX_TID_MASK;
-- u32 uval2;
-- int ret;
--
-- /*
-- * Userspace might have messed up non-PI and PI futexes [3]
-- */
-- if (unlikely(!pi_state))
-- return -EINVAL;
--
-- /*
-- * We get here with hb->lock held, and having found a
-- * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
-- * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
-- * which in turn means that futex_lock_pi() still has a reference on
-- * our pi_state.
-- *
-- * The waiter holding a reference on @pi_state also protects against
-- * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
-- * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
-- * free pi_state before we can take a reference ourselves.
-- */
-- WARN_ON(!refcount_read(&pi_state->refcount));
--
-- /*
-- * Now that we have a pi_state, we can acquire wait_lock
-- * and do the state validation.
-- */
-- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
--
-- /*
-- * Since {uval, pi_state} is serialized by wait_lock, and our current
-- * uval was read without holding it, it can have changed. Verify it
-- * still is what we expect it to be, otherwise retry the entire
-- * operation.
-- */
-- if (get_futex_value_locked(&uval2, uaddr))
-- goto out_efault;
--
-- if (uval != uval2)
-- goto out_eagain;
--
-- /*
-- * Handle the owner died case:
-- */
-- if (uval & FUTEX_OWNER_DIED) {
-- /*
-- * exit_pi_state_list sets owner to NULL and wakes the
-- * topmost waiter. The task which acquires the
-- * pi_state->rt_mutex will fixup owner.
-- */
-- if (!pi_state->owner) {
-- /*
-- * No pi state owner, but the user space TID
-- * is not 0. Inconsistent state. [5]
-- */
-- if (pid)
-- goto out_einval;
-- /*
-- * Take a ref on the state and return success. [4]
-- */
-- goto out_attach;
-- }
--
-- /*
-- * If TID is 0, then either the dying owner has not
-- * yet executed exit_pi_state_list() or some waiter
-- * acquired the rtmutex in the pi state, but did not
-- * yet fixup the TID in user space.
-- *
-- * Take a ref on the state and return success. [6]
-- */
-- if (!pid)
-- goto out_attach;
-- } else {
-- /*
-- * If the owner died bit is not set, then the pi_state
-- * must have an owner. [7]
-- */
-- if (!pi_state->owner)
-- goto out_einval;
-- }
--
-- /*
-- * Bail out if user space manipulated the futex value. If pi
-- * state exists then the owner TID must be the same as the
-- * user space TID. [9/10]
-- */
-- if (pid != task_pid_vnr(pi_state->owner))
-- goto out_einval;
--
--out_attach:
-- get_pi_state(pi_state);
-- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-- *ps = pi_state;
-- return 0;
--
--out_einval:
-- ret = -EINVAL;
-- goto out_error;
--
--out_eagain:
-- ret = -EAGAIN;
-- goto out_error;
--
--out_efault:
-- ret = -EFAULT;
-- goto out_error;
--
--out_error:
-- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-- return ret;
--}
--
--/**
-- * wait_for_owner_exiting - Block until the owner has exited
-- * @ret: owner's current futex lock status
-- * @exiting: Pointer to the exiting task
-- *
-- * Caller must hold a refcount on @exiting.
-- */
--static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
--{
-- if (ret != -EBUSY) {
-- WARN_ON_ONCE(exiting);
-- return;
-- }
--
-- if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
-- return;
--
-- mutex_lock(&exiting->futex_exit_mutex);
-- /*
-- * No point in doing state checking here. If the waiter got here
-- * while the task was in exec()->exec_futex_release() then it can
-- * have any FUTEX_STATE_* value when the waiter has acquired the
-- * mutex. OK, if running, EXITING or DEAD if it reached exit()
-- * already. Highly unlikely and not a problem. Just one more round
-- * through the futex maze.
-- */
-- mutex_unlock(&exiting->futex_exit_mutex);
--
-- put_task_struct(exiting);
--}
--
--static int handle_exit_race(u32 __user *uaddr, u32 uval,
-- struct task_struct *tsk)
--{
-- u32 uval2;
--
-- /*
-- * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
-- * caller that the alleged owner is busy.
-- */
-- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
-- return -EBUSY;
--
-- /*
-- * Reread the user space value to handle the following situation:
-- *
-- * CPU0 CPU1
-- *
-- * sys_exit() sys_futex()
-- * do_exit() futex_lock_pi()
-- * futex_lock_pi_atomic()
-- * exit_signals(tsk) No waiters:
-- * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
-- * mm_release(tsk) Set waiter bit
-- * exit_robust_list(tsk) { *uaddr = 0x80000PID;
-- * Set owner died attach_to_pi_owner() {
-- * *uaddr = 0xC0000000; tsk = get_task(PID);
-- * } if (!tsk->flags & PF_EXITING) {
-- * ... attach();
-- * tsk->futex_state = } else {
-- * FUTEX_STATE_DEAD; if (tsk->futex_state !=
-- * FUTEX_STATE_DEAD)
-- * return -EAGAIN;
-- * return -ESRCH; <--- FAIL
-- * }
-- *
-- * Returning ESRCH unconditionally is wrong here because the
-- * user space value has been changed by the exiting task.
-- *
-- * The same logic applies to the case where the exiting task is
-- * already gone.
-- */
-- if (get_futex_value_locked(&uval2, uaddr))
-- return -EFAULT;
--
-- /* If the user space value has changed, try again. */
-- if (uval2 != uval)
-- return -EAGAIN;
--
-- /*
-- * The exiting task did not have a robust list, the robust list was
-- * corrupted or the user space value in *uaddr is simply bogus.
-- * Give up and tell user space.
-- */
-- return -ESRCH;
--}
--
--static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
-- struct futex_pi_state **ps)
--{
-- /*
-- * No existing pi state. First waiter. [2]
-- *
-- * This creates pi_state, we have hb->lock held, this means nothing can
-- * observe this state, wait_lock is irrelevant.
-- */
-- struct futex_pi_state *pi_state = alloc_pi_state();
--
-- /*
-- * Initialize the pi_mutex in locked state and make @p
-- * the owner of it:
-- */
-- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
--
-- /* Store the key for possible exit cleanups: */
-- pi_state->key = *key;
--
-- WARN_ON(!list_empty(&pi_state->list));
-- list_add(&pi_state->list, &p->pi_state_list);
-- /*
-- * Assignment without holding pi_state->pi_mutex.wait_lock is safe
-- * because there is no concurrency as the object is not published yet.
-- */
-- pi_state->owner = p;
--
-- *ps = pi_state;
--}
--/*
-- * Lookup the task for the TID provided from user space and attach to
-- * it after doing proper sanity checks.
-- */
--static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
-- struct futex_pi_state **ps,
-- struct task_struct **exiting)
--{
-- pid_t pid = uval & FUTEX_TID_MASK;
-- struct task_struct *p;
--
-- /*
-- * We are the first waiter - try to look up the real owner and attach
-- * the new pi_state to it, but bail out when TID = 0 [1]
-- *
-- * The !pid check is paranoid. None of the call sites should end up
-- * with pid == 0, but better safe than sorry. Let the caller retry
-- */
-- if (!pid)
-- return -EAGAIN;
-- p = find_get_task_by_vpid(pid);
-- if (!p)
-- return handle_exit_race(uaddr, uval, NULL);
--
-- if (unlikely(p->flags & PF_KTHREAD)) {
-- put_task_struct(p);
-- return -EPERM;
-- }
--
-- /*
-- * We need to look at the task state to figure out, whether the
-- * task is exiting. To protect against the change of the task state
-- * in futex_exit_release(), we do this protected by p->pi_lock:
-- */
-- raw_spin_lock_irq(&p->pi_lock);
-- if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
-- /*
-- * The task is on the way out. When the futex state is
-- * FUTEX_STATE_DEAD, we know that the task has finished
-- * the cleanup:
-- */
-- int ret = handle_exit_race(uaddr, uval, p);
--
-- raw_spin_unlock_irq(&p->pi_lock);
-- /*
-- * If the owner task is between FUTEX_STATE_EXITING and
-- * FUTEX_STATE_DEAD then store the task pointer and keep
-- * the reference on the task struct. The calling code will
-- * drop all locks, wait for the task to reach
-- * FUTEX_STATE_DEAD and then drop the refcount. This is
-- * required to prevent a live lock when the current task
-- * preempted the exiting task between the two states.
-- */
-- if (ret == -EBUSY)
-- *exiting = p;
-- else
-- put_task_struct(p);
-- return ret;
-- }
--
-- __attach_to_pi_owner(p, key, ps);
-- raw_spin_unlock_irq(&p->pi_lock);
--
-- put_task_struct(p);
--
-- return 0;
--}
--
--static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
--{
-- int err;
-- u32 curval;
--
-- if (unlikely(should_fail_futex(true)))
-- return -EFAULT;
--
-- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
-- if (unlikely(err))
-- return err;
--
-- /* If user space value changed, let the caller retry */
-- return curval != uval ? -EAGAIN : 0;
--}
--
--/**
-- * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
-- * @uaddr: the pi futex user address
-- * @hb: the pi futex hash bucket
-- * @key: the futex key associated with uaddr and hb
-- * @ps: the pi_state pointer where we store the result of the
-- * lookup
-- * @task: the task to perform the atomic lock work for. This will
-- * be "current" except in the case of requeue pi.
-- * @exiting: Pointer to store the task pointer of the owner task
-- * which is in the middle of exiting
-- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
-- *
-- * Return:
-- * - 0 - ready to wait;
-- * - 1 - acquired the lock;
-- * - <0 - error
-- *
-- * The hb->lock must be held by the caller.
-- *
-- * @exiting is only set when the return value is -EBUSY. If so, this holds
-- * a refcount on the exiting task on return and the caller needs to drop it
-- * after waiting for the exit to complete.
-- */
--static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
-- union futex_key *key,
-- struct futex_pi_state **ps,
-- struct task_struct *task,
-- struct task_struct **exiting,
-- int set_waiters)
--{
-- u32 uval, newval, vpid = task_pid_vnr(task);
-- struct futex_q *top_waiter;
-- int ret;
--
-- /*
-- * Read the user space value first so we can validate a few
-- * things before proceeding further.
-- */
-- if (get_futex_value_locked(&uval, uaddr))
-- return -EFAULT;
--
-- if (unlikely(should_fail_futex(true)))
-- return -EFAULT;
--
-- /*
-- * Detect deadlocks.
-- */
-- if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
-- return -EDEADLK;
--
-- if ((unlikely(should_fail_futex(true))))
-- return -EDEADLK;
--
-- /*
-- * Lookup existing state first. If it exists, try to attach to
-- * its pi_state.
-- */
-- top_waiter = futex_top_waiter(hb, key);
-- if (top_waiter)
-- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
--
-- /*
-- * No waiter and user TID is 0. We are here because the
-- * waiters or the owner died bit is set or called from
-- * requeue_cmp_pi or for whatever reason something took the
-- * syscall.
-- */
-- if (!(uval & FUTEX_TID_MASK)) {
-- /*
-- * We take over the futex. No other waiters and the user space
-- * TID is 0. We preserve the owner died bit.
-- */
-- newval = uval & FUTEX_OWNER_DIED;
-- newval |= vpid;
--
-- /* The futex requeue_pi code can enforce the waiters bit */
-- if (set_waiters)
-- newval |= FUTEX_WAITERS;
--
-- ret = lock_pi_update_atomic(uaddr, uval, newval);
-- if (ret)
-- return ret;
--
-- /*
-- * If the waiter bit was requested the caller also needs PI
-- * state attached to the new owner of the user space futex.
-- *
-- * @task is guaranteed to be alive and it cannot be exiting
-- * because it is either sleeping or waiting in
-- * futex_requeue_pi_wakeup_sync().
-- *
-- * No need to do the full attach_to_pi_owner() exercise
-- * because @task is known and valid.
-- */
-- if (set_waiters) {
-- raw_spin_lock_irq(&task->pi_lock);
-- __attach_to_pi_owner(task, key, ps);
-- raw_spin_unlock_irq(&task->pi_lock);
-- }
-- return 1;
-- }
--
-- /*
--	 * First waiter. Set the waiters bit before attaching ourselves to
-- * the owner. If owner tries to unlock, it will be forced into
-- * the kernel and blocked on hb->lock.
-- */
-- newval = uval | FUTEX_WAITERS;
-- ret = lock_pi_update_atomic(uaddr, uval, newval);
-- if (ret)
-- return ret;
-- /*
-- * If the update of the user space value succeeded, we try to
-- * attach to the owner. If that fails, no harm done, we only
-- * set the FUTEX_WAITERS bit in the user space variable.
-- */
-- return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
--}
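For context, a minimal sketch of the userspace fast path that futex_lock_pi_atomic() backs up: the uncontended case is a pure userspace 0 -> TID compare-and-exchange, and only on failure does the caller fall into FUTEX_LOCK_PI, where the kernel retries the transition under the hash-bucket lock. The helper name pi_lock() is hypothetical; this is a sketch, not the glibc implementation.

    /* Hypothetical PI-futex lock fast path (sketch, not glibc code). */
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdatomic.h>
    #include <stdint.h>

    static long pi_lock(_Atomic uint32_t *futex)
    {
            uint32_t tid = (uint32_t)syscall(SYS_gettid);
            uint32_t expected = 0;

            /* Uncontended: 0 -> TID entirely in userspace. */
            if (atomic_compare_exchange_strong(futex, &expected, tid))
                    return 0;

            /* Contended: the kernel performs the atomic work described above. */
            return syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }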
--
--/**
-- * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
-- * @q: The futex_q to unqueue
-- *
-- * The q->lock_ptr must not be NULL and must be held by the caller.
-- */
--static void __unqueue_futex(struct futex_q *q)
--{
-- struct futex_hash_bucket *hb;
--
-- if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
-- return;
-- lockdep_assert_held(q->lock_ptr);
--
-- hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
-- plist_del(&q->list, &hb->chain);
-- hb_waiters_dec(hb);
--}
--
--/*
-- * The hash bucket lock must be held when this is called.
-- * Afterwards, the futex_q must not be accessed. Callers
-- * must ensure to later call wake_up_q() for the actual
-- * wakeups to occur.
-- */
--static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
--{
-- struct task_struct *p = q->task;
--
-- if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
-- return;
--
-- get_task_struct(p);
-- __unqueue_futex(q);
-- /*
-- * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
-- * is written, without taking any locks. This is possible in the event
-- * of a spurious wakeup, for example. A memory barrier is required here
-- * to prevent the following store to lock_ptr from getting ahead of the
-- * plist_del in __unqueue_futex().
-- */
-- smp_store_release(&q->lock_ptr, NULL);
--
-- /*
--	 * Queue the task for later wakeup, after we've released
-- * the hb->lock.
-- */
-- wake_q_add_safe(wake_q, p);
--}
--
--/*
-- * Caller must hold a reference on @pi_state.
-- */
--static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
--{
-- struct rt_mutex_waiter *top_waiter;
-- struct task_struct *new_owner;
-- bool postunlock = false;
-- DEFINE_RT_WAKE_Q(wqh);
-- u32 curval, newval;
-- int ret = 0;
--
-- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
-- if (WARN_ON_ONCE(!top_waiter)) {
-- /*
-- * As per the comment in futex_unlock_pi() this should not happen.
-- *
-- * When this happens, give up our locks and try again, giving
-- * the futex_lock_pi() instance time to complete, either by
-- * waiting on the rtmutex or removing itself from the futex
-- * queue.
-- */
-- ret = -EAGAIN;
-- goto out_unlock;
-- }
--
-- new_owner = top_waiter->task;
--
-- /*
-- * We pass it to the next owner. The WAITERS bit is always kept
-- * enabled while there is PI state around. We cleanup the owner
-- * died bit, because we are the owner.
-- */
-- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
--
-- if (unlikely(should_fail_futex(true))) {
-- ret = -EFAULT;
-- goto out_unlock;
-- }
--
-- ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
-- if (!ret && (curval != uval)) {
-- /*
--		 * If an unconditional UNLOCK_PI operation (user space did not
-- * try the TID->0 transition) raced with a waiter setting the
-- * FUTEX_WAITERS flag between get_user() and locking the hash
-- * bucket lock, retry the operation.
-- */
-- if ((FUTEX_TID_MASK & curval) == uval)
-- ret = -EAGAIN;
-- else
-- ret = -EINVAL;
-- }
--
-- if (!ret) {
-- /*
-- * This is a point of no return; once we modified the uval
-- * there is no going back and subsequent operations must
-- * not fail.
-- */
-- pi_state_update_owner(pi_state, new_owner);
-- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
-- }
--
--out_unlock:
-- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
--
-- if (postunlock)
-- rt_mutex_postunlock(&wqh);
--
-- return ret;
--}
--
--/*
-- * Express the locking dependencies for lockdep:
-- */
--static inline void
--double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
--{
-- if (hb1 <= hb2) {
-- spin_lock(&hb1->lock);
-- if (hb1 < hb2)
-- spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
-- } else { /* hb1 > hb2 */
-- spin_lock(&hb2->lock);
-- spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
-- }
--}
--
--static inline void
--double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
--{
-- spin_unlock(&hb1->lock);
-- if (hb1 != hb2)
-- spin_unlock(&hb2->lock);
--}
--
--/*
-- * Wake up waiters matching bitset queued on this futex (uaddr).
-- */
--static int
--futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
--{
-- struct futex_hash_bucket *hb;
-- struct futex_q *this, *next;
-- union futex_key key = FUTEX_KEY_INIT;
-- int ret;
-- DEFINE_WAKE_Q(wake_q);
--
-- if (!bitset)
-- return -EINVAL;
--
-- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
-- if (unlikely(ret != 0))
-- return ret;
--
-- hb = hash_futex(&key);
--
-- /* Make sure we really have tasks to wakeup */
-- if (!hb_waiters_pending(hb))
-- return ret;
--
-- spin_lock(&hb->lock);
--
-- plist_for_each_entry_safe(this, next, &hb->chain, list) {
-- if (match_futex (&this->key, &key)) {
-- if (this->pi_state || this->rt_waiter) {
-- ret = -EINVAL;
-- break;
-- }
--
-- /* Check if one of the bits is set in both bitsets */
-- if (!(this->bitset & bitset))
-- continue;
--
-- mark_wake_futex(&wake_q, this);
-- if (++ret >= nr_wake)
-- break;
-- }
-- }
--
-- spin_unlock(&hb->lock);
-- wake_up_q(&wake_q);
-- return ret;
--}
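On the userspace side, this typically pairs with an unlock that only enters the kernel when waiters may exist. A minimal sketch using the well-known three-state convention (0 unlocked, 1 locked, 2 locked with possible waiters); simple_unlock() is a hypothetical name.

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdatomic.h>
    #include <stdint.h>

    static void simple_unlock(_Atomic uint32_t *futex)
    {
            /* 2 means "locked, waiters possible": only then call into the kernel. */
            if (atomic_exchange(futex, 0) == 2)
                    syscall(SYS_futex, futex, FUTEX_WAKE, 1, NULL, NULL, 0);
    }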
--
--static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
--{
-- unsigned int op = (encoded_op & 0x70000000) >> 28;
-- unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
-- int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
-- int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
-- int oldval, ret;
--
-- if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
-- if (oparg < 0 || oparg > 31) {
-- char comm[sizeof(current->comm)];
-- /*
-- * kill this print and return -EINVAL when userspace
-- * is sane again
-- */
-- pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
-- get_task_comm(comm, current), oparg);
-- oparg &= 31;
-- }
-- oparg = 1 << oparg;
-- }
--
-- pagefault_disable();
-- ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
-- pagefault_enable();
-- if (ret)
-- return ret;
--
-- switch (cmp) {
-- case FUTEX_OP_CMP_EQ:
-- return oldval == cmparg;
-- case FUTEX_OP_CMP_NE:
-- return oldval != cmparg;
-- case FUTEX_OP_CMP_LT:
-- return oldval < cmparg;
-- case FUTEX_OP_CMP_GE:
-- return oldval >= cmparg;
-- case FUTEX_OP_CMP_LE:
-- return oldval <= cmparg;
-- case FUTEX_OP_CMP_GT:
-- return oldval > cmparg;
-- default:
-- return -ENOSYS;
-- }
--}
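The encoded_op layout decoded above is what the FUTEX_OP() helper in <linux/futex.h> produces. A hedged sketch of a matching FUTEX_WAKE_OP call (nr_wake2 is passed through the timeout argument slot of the syscall); wake_two() is a hypothetical name.

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    static long wake_two(uint32_t *uaddr1, uint32_t *uaddr2)
    {
            /* Atomically *uaddr2 += 1; wake one waiter on uaddr1; and wake one
             * waiter on uaddr2 if the old *uaddr2 was greater than 0. */
            unsigned int op = FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0);

            return syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, 1,
                           (void *)(unsigned long)1 /* nr_wake2 */, uaddr2, op);
    }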
--
--/*
-- * Wake up all waiters hashed on the physical page that is mapped
-- * to this virtual address:
-- */
--static int
--futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
-- int nr_wake, int nr_wake2, int op)
--{
-- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
-- struct futex_hash_bucket *hb1, *hb2;
-- struct futex_q *this, *next;
-- int ret, op_ret;
-- DEFINE_WAKE_Q(wake_q);
--
--retry:
-- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
-- if (unlikely(ret != 0))
-- return ret;
-- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
-- if (unlikely(ret != 0))
-- return ret;
--
-- hb1 = hash_futex(&key1);
-- hb2 = hash_futex(&key2);
--
--retry_private:
-- double_lock_hb(hb1, hb2);
-- op_ret = futex_atomic_op_inuser(op, uaddr2);
-- if (unlikely(op_ret < 0)) {
-- double_unlock_hb(hb1, hb2);
--
-- if (!IS_ENABLED(CONFIG_MMU) ||
-- unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
-- /*
-- * we don't get EFAULT from MMU faults if we don't have
-- * an MMU, but we might get them from range checking
-- */
-- ret = op_ret;
-- return ret;
-- }
--
-- if (op_ret == -EFAULT) {
-- ret = fault_in_user_writeable(uaddr2);
-- if (ret)
-- return ret;
-- }
--
-- cond_resched();
-- if (!(flags & FLAGS_SHARED))
-- goto retry_private;
-- goto retry;
-- }
--
-- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
-- if (match_futex (&this->key, &key1)) {
-- if (this->pi_state || this->rt_waiter) {
-- ret = -EINVAL;
-- goto out_unlock;
-- }
-- mark_wake_futex(&wake_q, this);
-- if (++ret >= nr_wake)
-- break;
-- }
-- }
--
-- if (op_ret > 0) {
-- op_ret = 0;
-- plist_for_each_entry_safe(this, next, &hb2->chain, list) {
-- if (match_futex (&this->key, &key2)) {
-- if (this->pi_state || this->rt_waiter) {
-- ret = -EINVAL;
-- goto out_unlock;
-- }
-- mark_wake_futex(&wake_q, this);
-- if (++op_ret >= nr_wake2)
-- break;
-- }
-- }
-- ret += op_ret;
-- }
--
--out_unlock:
-- double_unlock_hb(hb1, hb2);
-- wake_up_q(&wake_q);
-- return ret;
--}
--
--/**
-- * requeue_futex() - Requeue a futex_q from one hb to another
-- * @q: the futex_q to requeue
-- * @hb1: the source hash_bucket
-- * @hb2: the target hash_bucket
-- * @key2: the new key for the requeued futex_q
-- */
--static inline
--void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
-- struct futex_hash_bucket *hb2, union futex_key *key2)
--{
--
-- /*
-- * If key1 and key2 hash to the same bucket, no need to
-- * requeue.
-- */
-- if (likely(&hb1->chain != &hb2->chain)) {
-- plist_del(&q->list, &hb1->chain);
-- hb_waiters_dec(hb1);
-- hb_waiters_inc(hb2);
-- plist_add(&q->list, &hb2->chain);
-- q->lock_ptr = &hb2->lock;
-- }
-- q->key = *key2;
--}
--
--static inline bool futex_requeue_pi_prepare(struct futex_q *q,
-- struct futex_pi_state *pi_state)
--{
-- int old, new;
--
-- /*
-- * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
-- * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
-- * ignore the waiter.
-- */
-- old = atomic_read_acquire(&q->requeue_state);
-- do {
-- if (old == Q_REQUEUE_PI_IGNORE)
-- return false;
--
-- /*
-- * futex_proxy_trylock_atomic() might have set it to
--		 * IN_PROGRESS and an interleaved early wake to WAIT.
-- *
-- * It was considered to have an extra state for that
-- * trylock, but that would just add more conditionals
-- * all over the place for a dubious value.
-- */
-- if (old != Q_REQUEUE_PI_NONE)
-- break;
--
-- new = Q_REQUEUE_PI_IN_PROGRESS;
-- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
--
-- q->pi_state = pi_state;
-- return true;
--}
--
--static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
--{
-- int old, new;
--
-- old = atomic_read_acquire(&q->requeue_state);
-- do {
-- if (old == Q_REQUEUE_PI_IGNORE)
-- return;
--
-- if (locked >= 0) {
-- /* Requeue succeeded. Set DONE or LOCKED */
-- WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
-- old != Q_REQUEUE_PI_WAIT);
-- new = Q_REQUEUE_PI_DONE + locked;
-- } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
-- /* Deadlock, no early wakeup interleave */
-- new = Q_REQUEUE_PI_NONE;
-- } else {
-- /* Deadlock, early wakeup interleave. */
-- WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
-- new = Q_REQUEUE_PI_IGNORE;
-- }
-- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
--
--#ifdef CONFIG_PREEMPT_RT
-- /* If the waiter interleaved with the requeue let it know */
-- if (unlikely(old == Q_REQUEUE_PI_WAIT))
-- rcuwait_wake_up(&q->requeue_wait);
--#endif
--}
--
--static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
--{
-- int old, new;
--
-- old = atomic_read_acquire(&q->requeue_state);
-- do {
-- /* Is requeue done already? */
-- if (old >= Q_REQUEUE_PI_DONE)
-- return old;
--
-- /*
-- * If not done, then tell the requeue code to either ignore
-- * the waiter or to wake it up once the requeue is done.
-- */
-- new = Q_REQUEUE_PI_WAIT;
-- if (old == Q_REQUEUE_PI_NONE)
-- new = Q_REQUEUE_PI_IGNORE;
-- } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
--
-- /* If the requeue was in progress, wait for it to complete */
-- if (old == Q_REQUEUE_PI_IN_PROGRESS) {
--#ifdef CONFIG_PREEMPT_RT
-- rcuwait_wait_event(&q->requeue_wait,
-- atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
-- TASK_UNINTERRUPTIBLE);
--#else
-- (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
--#endif
-- }
--
-- /*
-- * Requeue is now either prohibited or complete. Reread state
-- * because during the wait above it might have changed. Nothing
-- * will modify q->requeue_state after this point.
-- */
-- return atomic_read(&q->requeue_state);
--}
--
--/**
-- * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
-- * @q: the futex_q
-- * @key: the key of the requeue target futex
-- * @hb: the hash_bucket of the requeue target futex
-- *
-- * During futex_requeue, with requeue_pi=1, it is possible to acquire the
-- * target futex if it is uncontended or via a lock steal.
-- *
-- * 1) Set @q::key to the requeue target futex key so the waiter can detect
-- * the wakeup on the right futex.
-- *
-- * 2) Dequeue @q from the hash bucket.
-- *
-- * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
-- * acquisition.
-- *
-- * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
-- * the waiter has to fixup the pi state.
-- *
-- * 5) Complete the requeue state so the waiter can make progress. After
-- * this point the waiter task can return from the syscall immediately in
-- * case that the pi state does not have to be fixed up.
-- *
-- * 6) Wake the waiter task.
-- *
-- * Must be called with both q->lock_ptr and hb->lock held.
-- */
--static inline
--void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
-- struct futex_hash_bucket *hb)
--{
-- q->key = *key;
--
-- __unqueue_futex(q);
--
-- WARN_ON(!q->rt_waiter);
-- q->rt_waiter = NULL;
--
-- q->lock_ptr = &hb->lock;
--
-- /* Signal locked state to the waiter */
-- futex_requeue_pi_complete(q, 1);
-- wake_up_state(q->task, TASK_NORMAL);
--}
--
--/**
-- * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
-- * @pifutex: the user address of the to futex
-- * @hb1: the from futex hash bucket, must be locked by the caller
-- * @hb2: the to futex hash bucket, must be locked by the caller
-- * @key1: the from futex key
-- * @key2: the to futex key
-- * @ps: address to store the pi_state pointer
-- * @exiting: Pointer to store the task pointer of the owner task
-- * which is in the middle of exiting
-- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
-- *
-- * Try and get the lock on behalf of the top waiter if we can do it atomically.
-- * Wake the top waiter if we succeed. If the caller specified set_waiters,
-- * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
-- * hb1 and hb2 must be held by the caller.
-- *
-- * @exiting is only set when the return value is -EBUSY. If so, this holds
-- * a refcount on the exiting task on return and the caller needs to drop it
-- * after waiting for the exit to complete.
-- *
-- * Return:
-- * - 0 - failed to acquire the lock atomically;
-- * - >0 - acquired the lock, return value is vpid of the top_waiter
-- * - <0 - error
-- */
--static int
--futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
-- struct futex_hash_bucket *hb2, union futex_key *key1,
-- union futex_key *key2, struct futex_pi_state **ps,
-- struct task_struct **exiting, int set_waiters)
--{
-- struct futex_q *top_waiter = NULL;
-- u32 curval;
-- int ret;
--
-- if (get_futex_value_locked(&curval, pifutex))
-- return -EFAULT;
--
-- if (unlikely(should_fail_futex(true)))
-- return -EFAULT;
--
-- /*
-- * Find the top_waiter and determine if there are additional waiters.
-- * If the caller intends to requeue more than 1 waiter to pifutex,
-- * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
-- * as we have means to handle the possible fault. If not, don't set
-- * the bit unnecessarily as it will force the subsequent unlock to enter
-- * the kernel.
-- */
-- top_waiter = futex_top_waiter(hb1, key1);
--
-- /* There are no waiters, nothing for us to do. */
-- if (!top_waiter)
-- return 0;
--
-- /*
-- * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
-- * and waiting on the 'waitqueue' futex which is always !PI.
-- */
-- if (!top_waiter->rt_waiter || top_waiter->pi_state)
-- return -EINVAL;
--
-- /* Ensure we requeue to the expected futex. */
-- if (!match_futex(top_waiter->requeue_pi_key, key2))
-- return -EINVAL;
--
-- /* Ensure that this does not race against an early wakeup */
-- if (!futex_requeue_pi_prepare(top_waiter, NULL))
-- return -EAGAIN;
--
-- /*
-- * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
-- * in the contended case or if @set_waiters is true.
-- *
-- * In the contended case PI state is attached to the lock owner. If
-- * the user space lock can be acquired then PI state is attached to
-- * the new owner (@top_waiter->task) when @set_waiters is true.
-- */
-- ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
-- exiting, set_waiters);
-- if (ret == 1) {
-- /*
-- * Lock was acquired in user space and PI state was
-- * attached to @top_waiter->task. That means state is fully
-- * consistent and the waiter can return to user space
-- * immediately after the wakeup.
-- */
-- requeue_pi_wake_futex(top_waiter, key2, hb2);
-- } else if (ret < 0) {
-- /* Rewind top_waiter::requeue_state */
-- futex_requeue_pi_complete(top_waiter, ret);
-- } else {
-- /*
-- * futex_lock_pi_atomic() did not acquire the user space
-- * futex, but managed to establish the proxy lock and pi
-- * state. top_waiter::requeue_state cannot be fixed up here
-- * because the waiter is not enqueued on the rtmutex
-- * yet. This is handled at the callsite depending on the
-- * result of rt_mutex_start_proxy_lock() which is
-- * guaranteed to be reached with this function returning 0.
-- */
-- }
-- return ret;
--}
--
--/**
-- * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
-- * @uaddr1: source futex user address
-- * @flags: futex flags (FLAGS_SHARED, etc.)
-- * @uaddr2: target futex user address
-- * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
-- * @nr_requeue: number of waiters to requeue (0-INT_MAX)
-- * @cmpval: @uaddr1 expected value (or %NULL)
-- * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
-- * pi futex (pi to pi requeue is not supported)
-- *
-- * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
-- * uaddr2 atomically on behalf of the top waiter.
-- *
-- * Return:
-- * - >=0 - on success, the number of tasks requeued or woken;
-- * - <0 - on error
-- */
--static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
-- u32 __user *uaddr2, int nr_wake, int nr_requeue,
-- u32 *cmpval, int requeue_pi)
--{
-- union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
-- int task_count = 0, ret;
-- struct futex_pi_state *pi_state = NULL;
-- struct futex_hash_bucket *hb1, *hb2;
-- struct futex_q *this, *next;
-- DEFINE_WAKE_Q(wake_q);
--
-- if (nr_wake < 0 || nr_requeue < 0)
-- return -EINVAL;
--
-- /*
-- * When PI not supported: return -ENOSYS if requeue_pi is true,
-- * consequently the compiler knows requeue_pi is always false past
-- * this point which will optimize away all the conditional code
-- * further down.
-- */
-- if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
-- return -ENOSYS;
--
-- if (requeue_pi) {
-- /*
-- * Requeue PI only works on two distinct uaddrs. This
-- * check is only valid for private futexes. See below.
-- */
-- if (uaddr1 == uaddr2)
-- return -EINVAL;
--
-- /*
-- * futex_requeue() allows the caller to define the number
-- * of waiters to wake up via the @nr_wake argument. With
-- * REQUEUE_PI, waking up more than one waiter is creating
-- * more problems than it solves. Waking up a waiter makes
-- * only sense if the PI futex @uaddr2 is uncontended as
-- * this allows the requeue code to acquire the futex
-- * @uaddr2 before waking the waiter. The waiter can then
-- * return to user space without further action. A secondary
-- * wakeup would just make the futex_wait_requeue_pi()
-- * handling more complex, because that code would have to
-- * look up pi_state and do more or less all the handling
-- * which the requeue code has to do for the to be requeued
-- * waiters. So restrict the number of waiters to wake to
-- * one, and only wake it up when the PI futex is
-- * uncontended. Otherwise requeue it and let the unlock of
-- * the PI futex handle the wakeup.
-- *
-- * All REQUEUE_PI users, e.g. pthread_cond_signal() and
-- * pthread_cond_broadcast() must use nr_wake=1.
-- */
-- if (nr_wake != 1)
-- return -EINVAL;
--
-- /*
-- * requeue_pi requires a pi_state, try to allocate it now
-- * without any locks in case it fails.
-- */
-- if (refill_pi_state_cache())
-- return -ENOMEM;
-- }
--
--retry:
-- ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
-- if (unlikely(ret != 0))
-- return ret;
-- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
-- requeue_pi ? FUTEX_WRITE : FUTEX_READ);
-- if (unlikely(ret != 0))
-- return ret;
--
-- /*
-- * The check above which compares uaddrs is not sufficient for
-- * shared futexes. We need to compare the keys:
-- */
-- if (requeue_pi && match_futex(&key1, &key2))
-- return -EINVAL;
--
-- hb1 = hash_futex(&key1);
-- hb2 = hash_futex(&key2);
--
--retry_private:
-- hb_waiters_inc(hb2);
-- double_lock_hb(hb1, hb2);
--
-- if (likely(cmpval != NULL)) {
-- u32 curval;
--
-- ret = get_futex_value_locked(&curval, uaddr1);
--
-- if (unlikely(ret)) {
-- double_unlock_hb(hb1, hb2);
-- hb_waiters_dec(hb2);
--
-- ret = get_user(curval, uaddr1);
-- if (ret)
-- return ret;
--
-- if (!(flags & FLAGS_SHARED))
-- goto retry_private;
--
-- goto retry;
-- }
-- if (curval != *cmpval) {
-- ret = -EAGAIN;
-- goto out_unlock;
-- }
-- }
--
-- if (requeue_pi) {
-- struct task_struct *exiting = NULL;
--
-- /*
-- * Attempt to acquire uaddr2 and wake the top waiter. If we
-- * intend to requeue waiters, force setting the FUTEX_WAITERS
-- * bit. We force this here where we are able to easily handle
--		 * faults rather than in the requeue loop below.
-- *
-- * Updates topwaiter::requeue_state if a top waiter exists.
-- */
-- ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
-- &key2, &pi_state,
-- &exiting, nr_requeue);
--
-- /*
-- * At this point the top_waiter has either taken uaddr2 or
-- * is waiting on it. In both cases pi_state has been
--		 * established and an initial refcount has been taken on it.
--		 * In case of an error there's nothing.
-- *
-- * The top waiter's requeue_state is up to date:
-- *
-- * - If the lock was acquired atomically (ret == 1), then
-- * the state is Q_REQUEUE_PI_LOCKED.
-- *
-- * The top waiter has been dequeued and woken up and can
-- * return to user space immediately. The kernel/user
-- * space state is consistent. In case that there must be
-- * more waiters requeued the WAITERS bit in the user
-- * space futex is set so the top waiter task has to go
-- * into the syscall slowpath to unlock the futex. This
-- * will block until this requeue operation has been
-- * completed and the hash bucket locks have been
-- * dropped.
-- *
-- * - If the trylock failed with an error (ret < 0) then
-- * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
-- * happened", or Q_REQUEUE_PI_IGNORE when there was an
-- * interleaved early wakeup.
-- *
-- * - If the trylock did not succeed (ret == 0) then the
-- * state is either Q_REQUEUE_PI_IN_PROGRESS or
-- * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
-- * This will be cleaned up in the loop below, which
-- * cannot fail because futex_proxy_trylock_atomic() did
-- * the same sanity checks for requeue_pi as the loop
-- * below does.
-- */
-- switch (ret) {
-- case 0:
-- /* We hold a reference on the pi state. */
-- break;
--
-- case 1:
-- /*
-- * futex_proxy_trylock_atomic() acquired the user space
-- * futex. Adjust task_count.
-- */
-- task_count++;
-- ret = 0;
-- break;
--
-- /*
-- * If the above failed, then pi_state is NULL and
-- * waiter::requeue_state is correct.
-- */
-- case -EFAULT:
-- double_unlock_hb(hb1, hb2);
-- hb_waiters_dec(hb2);
-- ret = fault_in_user_writeable(uaddr2);
-- if (!ret)
-- goto retry;
-- return ret;
-- case -EBUSY:
-- case -EAGAIN:
-- /*
-- * Two reasons for this:
-- * - EBUSY: Owner is exiting and we just wait for the
-- * exit to complete.
-- * - EAGAIN: The user space value changed.
-- */
-- double_unlock_hb(hb1, hb2);
-- hb_waiters_dec(hb2);
-- /*
-- * Handle the case where the owner is in the middle of
-- * exiting. Wait for the exit to complete otherwise
-- * this task might loop forever, aka. live lock.
-- */
-- wait_for_owner_exiting(ret, exiting);
-- cond_resched();
-- goto retry;
-- default:
-- goto out_unlock;
-- }
-- }
--
-- plist_for_each_entry_safe(this, next, &hb1->chain, list) {
-- if (task_count - nr_wake >= nr_requeue)
-- break;
--
-- if (!match_futex(&this->key, &key1))
-- continue;
--
-- /*
-- * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
-- * be paired with each other and no other futex ops.
-- *
-- * We should never be requeueing a futex_q with a pi_state,
-- * which is awaiting a futex_unlock_pi().
-- */
-- if ((requeue_pi && !this->rt_waiter) ||
-- (!requeue_pi && this->rt_waiter) ||
-- this->pi_state) {
-- ret = -EINVAL;
-- break;
-- }
--
-- /* Plain futexes just wake or requeue and are done */
-- if (!requeue_pi) {
-- if (++task_count <= nr_wake)
-- mark_wake_futex(&wake_q, this);
-- else
-- requeue_futex(this, hb1, hb2, &key2);
-- continue;
-- }
--
-- /* Ensure we requeue to the expected futex for requeue_pi. */
-- if (!match_futex(this->requeue_pi_key, &key2)) {
-- ret = -EINVAL;
-- break;
-- }
--
-- /*
-- * Requeue nr_requeue waiters and possibly one more in the case
-- * of requeue_pi if we couldn't acquire the lock atomically.
-- *
-- * Prepare the waiter to take the rt_mutex. Take a refcount
-- * on the pi_state and store the pointer in the futex_q
-- * object of the waiter.
-- */
-- get_pi_state(pi_state);
--
-- /* Don't requeue when the waiter is already on the way out. */
-- if (!futex_requeue_pi_prepare(this, pi_state)) {
-- /*
-- * Early woken waiter signaled that it is on the
-- * way out. Drop the pi_state reference and try the
-- * next waiter. @this->pi_state is still NULL.
-- */
-- put_pi_state(pi_state);
-- continue;
-- }
--
-- ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
-- this->rt_waiter,
-- this->task);
--
-- if (ret == 1) {
-- /*
-- * We got the lock. We do neither drop the refcount
-- * on pi_state nor clear this->pi_state because the
-- * waiter needs the pi_state for cleaning up the
-- * user space value. It will drop the refcount
-- * after doing so. this::requeue_state is updated
-- * in the wakeup as well.
-- */
-- requeue_pi_wake_futex(this, &key2, hb2);
-- task_count++;
-- } else if (!ret) {
-- /* Waiter is queued, move it to hb2 */
-- requeue_futex(this, hb1, hb2, &key2);
-- futex_requeue_pi_complete(this, 0);
-- task_count++;
-- } else {
-- /*
-- * rt_mutex_start_proxy_lock() detected a potential
-- * deadlock when we tried to queue that waiter.
-- * Drop the pi_state reference which we took above
-- * and remove the pointer to the state from the
-- * waiters futex_q object.
-- */
-- this->pi_state = NULL;
-- put_pi_state(pi_state);
-- futex_requeue_pi_complete(this, ret);
-- /*
-- * We stop queueing more waiters and let user space
-- * deal with the mess.
-- */
-- break;
-- }
-- }
--
-- /*
-- * We took an extra initial reference to the pi_state in
-- * futex_proxy_trylock_atomic(). We need to drop it here again.
-- */
-- put_pi_state(pi_state);
--
--out_unlock:
-- double_unlock_hb(hb1, hb2);
-- wake_up_q(&wake_q);
-- hb_waiters_dec(hb2);
-- return ret ? ret : task_count;
--}
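From userspace, the non-PI path is reached via FUTEX_CMP_REQUEUE: nr_requeue travels in the timeout argument slot and val3 carries the expected value that is compared against *uaddr1 above. A hedged sketch of the classic "wake one, requeue the rest" pattern; the helper name is hypothetical.

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <limits.h>
    #include <stdint.h>

    static long wake_one_requeue_rest(uint32_t *uaddr1, uint32_t *uaddr2,
                                      uint32_t expected)
    {
            return syscall(SYS_futex, uaddr1, FUTEX_CMP_REQUEUE,
                           1,                                  /* nr_wake */
                           (void *)(unsigned long)INT_MAX,     /* nr_requeue */
                           uaddr2, expected);                  /* val3 = cmpval */
    }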
--
--/* The key must be already stored in q->key. */
--static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
-- __acquires(&hb->lock)
--{
-- struct futex_hash_bucket *hb;
--
-- hb = hash_futex(&q->key);
--
-- /*
-- * Increment the counter before taking the lock so that
--	 * a potential waker won't miss a task that is about to sleep and is
-- * waiting for the spinlock. This is safe as all queue_lock()
-- * users end up calling queue_me(). Similarly, for housekeeping,
-- * decrement the counter at queue_unlock() when some error has
-- * occurred and we don't end up adding the task to the list.
-- */
-- hb_waiters_inc(hb); /* implies smp_mb(); (A) */
--
-- q->lock_ptr = &hb->lock;
--
-- spin_lock(&hb->lock);
-- return hb;
--}
--
--static inline void
--queue_unlock(struct futex_hash_bucket *hb)
-- __releases(&hb->lock)
--{
-- spin_unlock(&hb->lock);
-- hb_waiters_dec(hb);
--}
--
--static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
--{
-- int prio;
--
-- /*
-- * The priority used to register this element is
-- * - either the real thread-priority for the real-time threads
-- * (i.e. threads with a priority lower than MAX_RT_PRIO)
-- * - or MAX_RT_PRIO for non-RT threads.
-- * Thus, all RT-threads are woken first in priority order, and
-- * the others are woken last, in FIFO order.
-- */
-- prio = min(current->normal_prio, MAX_RT_PRIO);
--
-- plist_node_init(&q->list, prio);
-- plist_add(&q->list, &hb->chain);
-- q->task = current;
--}
--
--/**
-- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
-- * @q: The futex_q to enqueue
-- * @hb: The destination hash bucket
-- *
-- * The hb->lock must be held by the caller, and is released here. A call to
-- * queue_me() is typically paired with exactly one call to unqueue_me(). The
-- * exceptions involve the PI related operations, which may use unqueue_me_pi()
-- * or nothing if the unqueue is done as part of the wake process and the unqueue
-- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
-- * an example).
-- */
--static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
-- __releases(&hb->lock)
--{
-- __queue_me(q, hb);
-- spin_unlock(&hb->lock);
--}
--
--/**
-- * unqueue_me() - Remove the futex_q from its futex_hash_bucket
-- * @q: The futex_q to unqueue
-- *
-- * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
-- * be paired with exactly one earlier call to queue_me().
-- *
-- * Return:
-- * - 1 - if the futex_q was still queued (and we removed it);
-- * - 0 - if the futex_q was already removed by the waking thread
-- */
--static int unqueue_me(struct futex_q *q)
--{
-- spinlock_t *lock_ptr;
-- int ret = 0;
--
-- /* In the common case we don't take the spinlock, which is nice. */
--retry:
-- /*
-- * q->lock_ptr can change between this read and the following spin_lock.
-- * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
-- * optimizing lock_ptr out of the logic below.
-- */
-- lock_ptr = READ_ONCE(q->lock_ptr);
-- if (lock_ptr != NULL) {
-- spin_lock(lock_ptr);
-- /*
-- * q->lock_ptr can change between reading it and
-- * spin_lock(), causing us to take the wrong lock. This
-- * corrects the race condition.
-- *
-- * Reasoning goes like this: if we have the wrong lock,
-- * q->lock_ptr must have changed (maybe several times)
-- * between reading it and the spin_lock(). It can
-- * change again after the spin_lock() but only if it was
-- * already changed before the spin_lock(). It cannot,
-- * however, change back to the original value. Therefore
-- * we can detect whether we acquired the correct lock.
-- */
-- if (unlikely(lock_ptr != q->lock_ptr)) {
-- spin_unlock(lock_ptr);
-- goto retry;
-- }
-- __unqueue_futex(q);
--
-- BUG_ON(q->pi_state);
--
-- spin_unlock(lock_ptr);
-- ret = 1;
-- }
--
-- return ret;
--}
--
--/*
-- * PI futexes can not be requeued and must remove themselves from the
-- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
-- */
--static void unqueue_me_pi(struct futex_q *q)
--{
-- __unqueue_futex(q);
--
-- BUG_ON(!q->pi_state);
-- put_pi_state(q->pi_state);
-- q->pi_state = NULL;
--}
--
--static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-- struct task_struct *argowner)
--{
-- struct futex_pi_state *pi_state = q->pi_state;
-- struct task_struct *oldowner, *newowner;
-- u32 uval, curval, newval, newtid;
-- int err = 0;
--
-- oldowner = pi_state->owner;
--
-- /*
-- * We are here because either:
-- *
-- * - we stole the lock and pi_state->owner needs updating to reflect
-- * that (@argowner == current),
-- *
-- * or:
-- *
-- * - someone stole our lock and we need to fix things to point to the
-- * new owner (@argowner == NULL).
-- *
-- * Either way, we have to replace the TID in the user space variable.
-- * This must be atomic as we have to preserve the owner died bit here.
-- *
-- * Note: We write the user space value _before_ changing the pi_state
-- * because we can fault here. Imagine swapped out pages or a fork
-- * that marked all the anonymous memory readonly for cow.
-- *
-- * Modifying pi_state _before_ the user space value would leave the
-- * pi_state in an inconsistent state when we fault here, because we
-- * need to drop the locks to handle the fault. This might be observed
--	 * in the PID checks when attaching to PI state.
-- */
--retry:
-- if (!argowner) {
-- if (oldowner != current) {
-- /*
-- * We raced against a concurrent self; things are
-- * already fixed up. Nothing to do.
-- */
-- return 0;
-- }
--
-- if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
-- /* We got the lock. pi_state is correct. Tell caller. */
-- return 1;
-- }
--
-- /*
-- * The trylock just failed, so either there is an owner or
-- * there is a higher priority waiter than this one.
-- */
-- newowner = rt_mutex_owner(&pi_state->pi_mutex);
-- /*
-- * If the higher priority waiter has not yet taken over the
-- * rtmutex then newowner is NULL. We can't return here with
-- * that state because it's inconsistent vs. the user space
-- * state. So drop the locks and try again. It's a valid
-- * situation and not any different from the other retry
-- * conditions.
-- */
-- if (unlikely(!newowner)) {
-- err = -EAGAIN;
-- goto handle_err;
-- }
-- } else {
-- WARN_ON_ONCE(argowner != current);
-- if (oldowner == current) {
-- /*
-- * We raced against a concurrent self; things are
-- * already fixed up. Nothing to do.
-- */
-- return 1;
-- }
-- newowner = argowner;
-- }
--
-- newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
-- /* Owner died? */
-- if (!pi_state->owner)
-- newtid |= FUTEX_OWNER_DIED;
--
-- err = get_futex_value_locked(&uval, uaddr);
-- if (err)
-- goto handle_err;
--
-- for (;;) {
-- newval = (uval & FUTEX_OWNER_DIED) | newtid;
--
-- err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);
-- if (err)
-- goto handle_err;
--
-- if (curval == uval)
-- break;
-- uval = curval;
-- }
--
-- /*
-- * We fixed up user space. Now we need to fix the pi_state
-- * itself.
-- */
-- pi_state_update_owner(pi_state, newowner);
--
-- return argowner == current;
--
-- /*
-- * In order to reschedule or handle a page fault, we need to drop the
-- * locks here. In the case of a fault, this gives the other task
-- * (either the highest priority waiter itself or the task which stole
-- * the rtmutex) the chance to try the fixup of the pi_state. So once we
-- * are back from handling the fault we need to check the pi_state after
-- * reacquiring the locks and before trying to do another fixup. When
-- * the fixup has been done already we simply return.
-- *
-- * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
-- * drop hb->lock since the caller owns the hb -> futex_q relation.
-- * Dropping the pi_mutex->wait_lock requires the state revalidate.
-- */
--handle_err:
-- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-- spin_unlock(q->lock_ptr);
--
-- switch (err) {
-- case -EFAULT:
-- err = fault_in_user_writeable(uaddr);
-- break;
--
-- case -EAGAIN:
-- cond_resched();
-- err = 0;
-- break;
--
-- default:
-- WARN_ON_ONCE(1);
-- break;
-- }
--
-- spin_lock(q->lock_ptr);
-- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
--
-- /*
-- * Check if someone else fixed it for us:
-- */
-- if (pi_state->owner != oldowner)
-- return argowner == current;
--
--	/* Retry if err was -EAGAIN or the fault-in succeeded */
-- if (!err)
-- goto retry;
--
-- /*
-- * fault_in_user_writeable() failed so user state is immutable. At
-- * best we can make the kernel state consistent but user state will
-- * be most likely hosed and any subsequent unlock operation will be
-- * rejected due to PI futex rule [10].
-- *
-- * Ensure that the rtmutex owner is also the pi_state owner despite
-- * the user space value claiming something different. There is no
-- * point in unlocking the rtmutex if current is the owner as it
-- * would need to wait until the next waiter has taken the rtmutex
-- * to guarantee consistent state. Keep it simple. Userspace asked
--	 * for this wrecked state.
-- *
-- * The rtmutex has an owner - either current or some other
-- * task. See the EAGAIN loop above.
-- */
-- pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
--
-- return err;
--}
--
--static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-- struct task_struct *argowner)
--{
-- struct futex_pi_state *pi_state = q->pi_state;
-- int ret;
--
-- lockdep_assert_held(q->lock_ptr);
--
-- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-- ret = __fixup_pi_state_owner(uaddr, q, argowner);
-- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-- return ret;
--}
--
--static long futex_wait_restart(struct restart_block *restart);
--
--/**
-- * fixup_owner() - Post lock pi_state and corner case management
-- * @uaddr: user address of the futex
-- * @q: futex_q (contains pi_state and access to the rt_mutex)
-- * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
-- *
-- * After attempting to lock an rt_mutex, this function is called to cleanup
-- * the pi_state owner as well as handle race conditions that may allow us to
-- * acquire the lock. Must be called with the hb lock held.
-- *
-- * Return:
-- * - 1 - success, lock taken;
-- * - 0 - success, lock not taken;
-- * - <0 - on error (-EFAULT)
-- */
--static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
--{
-- if (locked) {
-- /*
-- * Got the lock. We might not be the anticipated owner if we
-- * did a lock-steal - fix up the PI-state in that case:
-- *
-- * Speculative pi_state->owner read (we don't hold wait_lock);
-- * since we own the lock pi_state->owner == current is the
-- * stable state, anything else needs more attention.
-- */
-- if (q->pi_state->owner != current)
-- return fixup_pi_state_owner(uaddr, q, current);
-- return 1;
-- }
--
-- /*
-- * If we didn't get the lock; check if anybody stole it from us. In
-- * that case, we need to fix up the uval to point to them instead of
-- * us, otherwise bad things happen. [10]
-- *
-- * Another speculative read; pi_state->owner == current is unstable
-- * but needs our attention.
-- */
-- if (q->pi_state->owner == current)
-- return fixup_pi_state_owner(uaddr, q, NULL);
--
-- /*
-- * Paranoia check. If we did not take the lock, then we should not be
-- * the owner of the rt_mutex. Warn and establish consistent state.
-- */
-- if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
-- return fixup_pi_state_owner(uaddr, q, current);
--
-- return 0;
--}
--
--/**
-- * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
-- * @hb: the futex hash bucket, must be locked by the caller
-- * @q: the futex_q to queue up on
-- * @timeout: the prepared hrtimer_sleeper, or null for no timeout
-- */
--static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
-- struct hrtimer_sleeper *timeout)
--{
-- /*
-- * The task state is guaranteed to be set before another task can
-- * wake it. set_current_state() is implemented using smp_store_mb() and
-- * queue_me() calls spin_unlock() upon completion, both serializing
-- * access to the hash list and forcing another memory barrier.
-- */
-- set_current_state(TASK_INTERRUPTIBLE);
-- queue_me(q, hb);
--
-- /* Arm the timer */
-- if (timeout)
-- hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
--
-- /*
-- * If we have been removed from the hash list, then another task
-- * has tried to wake us, and we can skip the call to schedule().
-- */
-- if (likely(!plist_node_empty(&q->list))) {
-- /*
-- * If the timer has already expired, current will already be
-- * flagged for rescheduling. Only call schedule if there
-- * is no timeout, or if it has yet to expire.
-- */
-- if (!timeout || timeout->task)
-- freezable_schedule();
-- }
-- __set_current_state(TASK_RUNNING);
--}
--
--/**
-- * futex_wait_setup() - Prepare to wait on a futex
-- * @uaddr: the futex userspace address
-- * @val: the expected value
-- * @flags: futex flags (FLAGS_SHARED, etc.)
-- * @q: the associated futex_q
-- * @hb: storage for hash_bucket pointer to be returned to caller
-- *
-- * Setup the futex_q and locate the hash_bucket. Get the futex value and
-- * compare it with the expected value. Handle atomic faults internally.
-- * Return with the hb lock held on success, and unlocked on failure.
-- *
-- * Return:
-- * - 0 - uaddr contains val and hb has been locked;
-- * - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
-- */
--static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
-- struct futex_q *q, struct futex_hash_bucket **hb)
--{
-- u32 uval;
-- int ret;
--
-- /*
-- * Access the page AFTER the hash-bucket is locked.
-- * Order is important:
-- *
-- * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
-- * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
-- *
-- * The basic logical guarantee of a futex is that it blocks ONLY
-- * if cond(var) is known to be true at the time of blocking, for
-- * any cond. If we locked the hash-bucket after testing *uaddr, that
-- * would open a race condition where we could block indefinitely with
-- * cond(var) false, which would violate the guarantee.
-- *
-- * On the other hand, we insert q and release the hash-bucket only
-- * after testing *uaddr. This guarantees that futex_wait() will NOT
-- * absorb a wakeup if *uaddr does not match the desired values
-- * while the syscall executes.
-- */
--retry:
-- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
-- if (unlikely(ret != 0))
-- return ret;
--
--retry_private:
-- *hb = queue_lock(q);
--
-- ret = get_futex_value_locked(&uval, uaddr);
--
-- if (ret) {
-- queue_unlock(*hb);
--
-- ret = get_user(uval, uaddr);
-- if (ret)
-- return ret;
--
-- if (!(flags & FLAGS_SHARED))
-- goto retry_private;
--
-- goto retry;
-- }
--
-- if (uval != val) {
-- queue_unlock(*hb);
-- ret = -EWOULDBLOCK;
-- }
--
-- return ret;
--}
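The waiter/waker ordering described in the comment above corresponds to the usual userspace pattern: read the variable, decide to block, and let FUTEX_WAIT re-check it under the hash-bucket lock. A minimal sketch follows; the helper name is hypothetical, and note that EWOULDBLOCK and EAGAIN share a value on Linux.

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <errno.h>
    #include <stdatomic.h>
    #include <stdint.h>

    /* Block while *futex still holds val. EAGAIN means the value changed
     * before we slept (the -EWOULDBLOCK case above); EINTR means a signal. */
    static void wait_while_equal(_Atomic uint32_t *futex, uint32_t val)
    {
            while (atomic_load(futex) == val) {
                    if (syscall(SYS_futex, futex, FUTEX_WAIT, val,
                                NULL, NULL, 0) == -1 &&
                        errno != EAGAIN && errno != EINTR)
                            break;  /* e.g. EFAULT: give up in this sketch */
            }
    }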
--
--static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
-- ktime_t *abs_time, u32 bitset)
--{
-- struct hrtimer_sleeper timeout, *to;
-- struct restart_block *restart;
-- struct futex_hash_bucket *hb;
-- struct futex_q q = futex_q_init;
-- int ret;
--
-- if (!bitset)
-- return -EINVAL;
-- q.bitset = bitset;
--
-- to = futex_setup_timer(abs_time, &timeout, flags,
-- current->timer_slack_ns);
--retry:
-- /*
-- * Prepare to wait on uaddr. On success, it holds hb->lock and q
-- * is initialized.
-- */
-- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
-- if (ret)
-- goto out;
--
-- /* queue_me and wait for wakeup, timeout, or a signal. */
-- futex_wait_queue_me(hb, &q, to);
--
-- /* If we were woken (and unqueued), we succeeded, whatever. */
-- ret = 0;
-- if (!unqueue_me(&q))
-- goto out;
-- ret = -ETIMEDOUT;
-- if (to && !to->task)
-- goto out;
--
-- /*
-- * We expect signal_pending(current), but we might be the
-- * victim of a spurious wakeup as well.
-- */
-- if (!signal_pending(current))
-- goto retry;
--
-- ret = -ERESTARTSYS;
-- if (!abs_time)
-- goto out;
--
-- restart = &current->restart_block;
-- restart->futex.uaddr = uaddr;
-- restart->futex.val = val;
-- restart->futex.time = *abs_time;
-- restart->futex.bitset = bitset;
-- restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
--
-- ret = set_restart_fn(restart, futex_wait_restart);
--
--out:
-- if (to) {
-- hrtimer_cancel(&to->timer);
-- destroy_hrtimer_on_stack(&to->timer);
-- }
-- return ret;
--}
--
--
--static long futex_wait_restart(struct restart_block *restart)
--{
-- u32 __user *uaddr = restart->futex.uaddr;
-- ktime_t t, *tp = NULL;
--
-- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
-- t = restart->futex.time;
-- tp = &t;
-- }
-- restart->fn = do_no_restart_syscall;
--
-- return (long)futex_wait(uaddr, restart->futex.flags,
-- restart->futex.val, tp, restart->futex.bitset);
--}
--
--
--/*
-- * Userspace tried a 0 -> TID atomic transition of the futex value
-- * and failed. The kernel side here does the whole locking operation:
-- * if there are waiters then it will block as a consequence of relying
-- * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
-- * a 0 value of the futex too.).
-- *
-- * Also serves as the futex trylock_pi() path, with the corresponding semantics.
-- */
--static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
-- ktime_t *time, int trylock)
--{
-- struct hrtimer_sleeper timeout, *to;
-- struct task_struct *exiting = NULL;
-- struct rt_mutex_waiter rt_waiter;
-- struct futex_hash_bucket *hb;
-- struct futex_q q = futex_q_init;
-- int res, ret;
--
-- if (!IS_ENABLED(CONFIG_FUTEX_PI))
-- return -ENOSYS;
--
-- if (refill_pi_state_cache())
-- return -ENOMEM;
--
-- to = futex_setup_timer(time, &timeout, flags, 0);
--
--retry:
-- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
-- if (unlikely(ret != 0))
-- goto out;
--
--retry_private:
-- hb = queue_lock(&q);
--
-- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
-- &exiting, 0);
-- if (unlikely(ret)) {
-- /*
-- * Atomic work succeeded and we got the lock,
-- * or failed. Either way, we do _not_ block.
-- */
-- switch (ret) {
-- case 1:
-- /* We got the lock. */
-- ret = 0;
-- goto out_unlock_put_key;
-- case -EFAULT:
-- goto uaddr_faulted;
-- case -EBUSY:
-- case -EAGAIN:
-- /*
-- * Two reasons for this:
-- * - EBUSY: Task is exiting and we just wait for the
-- * exit to complete.
-- * - EAGAIN: The user space value changed.
-- */
-- queue_unlock(hb);
-- /*
-- * Handle the case where the owner is in the middle of
-- * exiting. Wait for the exit to complete otherwise
-- * this task might loop forever, aka. live lock.
-- */
-- wait_for_owner_exiting(ret, exiting);
-- cond_resched();
-- goto retry;
-- default:
-- goto out_unlock_put_key;
-- }
-- }
--
-- WARN_ON(!q.pi_state);
--
-- /*
-- * Only actually queue now that the atomic ops are done:
-- */
-- __queue_me(&q, hb);
--
-- if (trylock) {
-- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
-- /* Fixup the trylock return value: */
-- ret = ret ? 0 : -EWOULDBLOCK;
-- goto no_block;
-- }
--
-- rt_mutex_init_waiter(&rt_waiter);
--
-- /*
-- * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
-- * hold it while doing rt_mutex_start_proxy(), because then it will
--	 * include hb->lock in the blocking chain, even though we'll not in
-- * fact hold it while blocking. This will lead it to report -EDEADLK
-- * and BUG when futex_unlock_pi() interleaves with this.
-- *
-- * Therefore acquire wait_lock while holding hb->lock, but drop the
-- * latter before calling __rt_mutex_start_proxy_lock(). This
-- * interleaves with futex_unlock_pi() -- which does a similar lock
-- * handoff -- such that the latter can observe the futex_q::pi_state
-- * before __rt_mutex_start_proxy_lock() is done.
-- */
-- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
-- spin_unlock(q.lock_ptr);
-- /*
-- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
-- * such that futex_unlock_pi() is guaranteed to observe the waiter when
-- * it sees the futex_q::pi_state.
-- */
-- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
-- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
--
-- if (ret) {
-- if (ret == 1)
-- ret = 0;
-- goto cleanup;
-- }
--
-- if (unlikely(to))
-- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
--
-- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
--
--cleanup:
-- spin_lock(q.lock_ptr);
-- /*
-- * If we failed to acquire the lock (deadlock/signal/timeout), we must
-- * first acquire the hb->lock before removing the lock from the
-- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
-- * lists consistent.
-- *
-- * In particular; it is important that futex_unlock_pi() can not
-- * observe this inconsistency.
-- */
-- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
-- ret = 0;
--
--no_block:
-- /*
-- * Fixup the pi_state owner and possibly acquire the lock if we
-- * haven't already.
-- */
-- res = fixup_owner(uaddr, &q, !ret);
-- /*
-- * If fixup_owner() returned an error, propagate that. If it acquired
-- * the lock, clear our -ETIMEDOUT or -EINTR.
-- */
-- if (res)
-- ret = (res < 0) ? res : 0;
--
-- unqueue_me_pi(&q);
-- spin_unlock(q.lock_ptr);
-- goto out;
--
--out_unlock_put_key:
-- queue_unlock(hb);
--
--out:
-- if (to) {
-- hrtimer_cancel(&to->timer);
-- destroy_hrtimer_on_stack(&to->timer);
-- }
-- return ret != -EINTR ? ret : -ERESTARTNOINTR;
--
--uaddr_faulted:
-- queue_unlock(hb);
--
-- ret = fault_in_user_writeable(uaddr);
-- if (ret)
-- goto out;
--
-- if (!(flags & FLAGS_SHARED))
-- goto retry_private;
--
-- goto retry;
--}
--
--/*
-- * Userspace attempted a TID -> 0 atomic transition, and failed.
-- * This is the in-kernel slowpath: we look up the PI state (if any),
-- * and do the rt-mutex unlock.
-- */
--static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
--{
-- u32 curval, uval, vpid = task_pid_vnr(current);
-- union futex_key key = FUTEX_KEY_INIT;
-- struct futex_hash_bucket *hb;
-- struct futex_q *top_waiter;
-- int ret;
--
-- if (!IS_ENABLED(CONFIG_FUTEX_PI))
-- return -ENOSYS;
--
--retry:
-- if (get_user(uval, uaddr))
-- return -EFAULT;
-- /*
-- * We release only a lock we actually own:
-- */
-- if ((uval & FUTEX_TID_MASK) != vpid)
-- return -EPERM;
--
-- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
-- if (ret)
-- return ret;
--
-- hb = hash_futex(&key);
-- spin_lock(&hb->lock);
--
-- /*
-- * Check waiters first. We do not trust user space values at
-- * all and we at least want to know if user space fiddled
-- * with the futex value instead of blindly unlocking.
-- */
-- top_waiter = futex_top_waiter(hb, &key);
-- if (top_waiter) {
-- struct futex_pi_state *pi_state = top_waiter->pi_state;
--
-- ret = -EINVAL;
-- if (!pi_state)
-- goto out_unlock;
--
-- /*
-- * If current does not own the pi_state then the futex is
-- * inconsistent and user space fiddled with the futex value.
-- */
-- if (pi_state->owner != current)
-- goto out_unlock;
--
-- get_pi_state(pi_state);
-- /*
-- * By taking wait_lock while still holding hb->lock, we ensure
-- * there is no point where we hold neither; and therefore
-- * wake_futex_pi() must observe a state consistent with what we
-- * observed.
-- *
-- * In particular; this forces __rt_mutex_start_proxy() to
-- * complete such that we're guaranteed to observe the
-- * rt_waiter. Also see the WARN in wake_futex_pi().
-- */
-- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-- spin_unlock(&hb->lock);
--
-- /* drops pi_state->pi_mutex.wait_lock */
-- ret = wake_futex_pi(uaddr, uval, pi_state);
--
-- put_pi_state(pi_state);
--
-- /*
-- * Success, we're done! No tricky corner cases.
-- */
-- if (!ret)
-- return ret;
-- /*
-- * The atomic access to the futex value generated a
-- * pagefault, so retry the user-access and the wakeup:
-- */
-- if (ret == -EFAULT)
-- goto pi_faulted;
-- /*
--		 * An unconditional UNLOCK_PI op raced against a waiter
-- * setting the FUTEX_WAITERS bit. Try again.
-- */
-- if (ret == -EAGAIN)
-- goto pi_retry;
-- /*
-- * wake_futex_pi has detected invalid state. Tell user
-- * space.
-- */
-- return ret;
-- }
--
-- /*
-- * We have no kernel internal state, i.e. no waiters in the
-- * kernel. Waiters which are about to queue themselves are stuck
--	 * on hb->lock. So we can safely ignore them. We preserve neither
--	 * the WAITERS bit nor the OWNER_DIED one. We are the
-- * owner.
-- */
-- if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {
-- spin_unlock(&hb->lock);
-- switch (ret) {
-- case -EFAULT:
-- goto pi_faulted;
--
-- case -EAGAIN:
-- goto pi_retry;
--
-- default:
-- WARN_ON_ONCE(1);
-- return ret;
-- }
-- }
--
-- /*
-- * If uval has changed, let user space handle it.
-- */
-- ret = (curval == uval) ? 0 : -EAGAIN;
--
--out_unlock:
-- spin_unlock(&hb->lock);
-- return ret;
--
--pi_retry:
-- cond_resched();
-- goto retry;
--
--pi_faulted:
--
-- ret = fault_in_user_writeable(uaddr);
-- if (!ret)
-- goto retry;
--
-- return ret;
--}
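The TID -> 0 transition mentioned above is attempted in userspace first; only when it fails (typically because FUTEX_WAITERS is set) does the caller take this slowpath. A hypothetical sketch, assuming the futex word holds the owner's TID:

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdatomic.h>
    #include <stdint.h>

    static void pi_unlock(_Atomic uint32_t *futex)
    {
            uint32_t expected = (uint32_t)syscall(SYS_gettid);

            /* Uncontended: TID -> 0 in userspace. Any extra bit (e.g.
             * FUTEX_WAITERS) makes the cmpxchg fail and forces the slowpath. */
            if (!atomic_compare_exchange_strong(futex, &expected, 0))
                    syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }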
--
--/**
-- * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
-- * @hb: the hash_bucket the futex_q was originally enqueued on
-- * @q: the futex_q woken while waiting to be requeued
-- * @timeout: the timeout associated with the wait (NULL if none)
-- *
-- * Determine the cause for the early wakeup.
-- *
-- * Return:
-- * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
-- */
--static inline
--int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
-- struct futex_q *q,
-- struct hrtimer_sleeper *timeout)
--{
-- int ret;
--
-- /*
-- * With the hb lock held, we avoid races while we process the wakeup.
-- * We only need to hold hb (and not hb2) to ensure atomicity as the
-- * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
-- * It can't be requeued from uaddr2 to something else since we don't
-- * support a PI aware source futex for requeue.
-- */
-- WARN_ON_ONCE(&hb->lock != q->lock_ptr);
--
-- /*
-- * We were woken prior to requeue by a timeout or a signal.
-- * Unqueue the futex_q and determine which it was.
-- */
-- plist_del(&q->list, &hb->chain);
-- hb_waiters_dec(hb);
--
-- /* Handle spurious wakeups gracefully */
-- ret = -EWOULDBLOCK;
-- if (timeout && !timeout->task)
-- ret = -ETIMEDOUT;
-- else if (signal_pending(current))
-- ret = -ERESTARTNOINTR;
-- return ret;
--}
--
--/**
-- * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
-- * @uaddr: the futex we initially wait on (non-pi)
-- * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
-- * the same type, no requeueing from private to shared, etc.
-- * @val: the expected value of uaddr
-- * @abs_time: absolute timeout
-- * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
-- * @uaddr2: the pi futex we will take prior to returning to user-space
-- *
-- * The caller will wait on uaddr and will be requeued by futex_requeue() to
-- * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
-- * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
-- * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
-- * without one, the pi logic would not know which task to boost/deboost, if
-- * there was a need to.
-- *
-- * We call schedule in futex_wait_queue_me() when we enqueue and return there
-- * via the following--
-- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
-- * 2) wakeup on uaddr2 after a requeue
-- * 3) signal
-- * 4) timeout
-- *
-- * If 3, cleanup and return -ERESTARTNOINTR.
-- *
-- * If 2, we may then block on trying to take the rt_mutex and return via:
-- * 5) successful lock
-- * 6) signal
-- * 7) timeout
-- * 8) other lock acquisition failure
-- *
-- * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
-- *
-- * If 4 or 7, we cleanup and return with -ETIMEDOUT.
-- *
-- * Return:
-- * - 0 - On success;
-- * - <0 - On error
-- */
--static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
-- u32 val, ktime_t *abs_time, u32 bitset,
-- u32 __user *uaddr2)
--{
-- struct hrtimer_sleeper timeout, *to;
-- struct rt_mutex_waiter rt_waiter;
-- struct futex_hash_bucket *hb;
-- union futex_key key2 = FUTEX_KEY_INIT;
-- struct futex_q q = futex_q_init;
-- struct rt_mutex_base *pi_mutex;
-- int res, ret;
--
-- if (!IS_ENABLED(CONFIG_FUTEX_PI))
-- return -ENOSYS;
--
-- if (uaddr == uaddr2)
-- return -EINVAL;
--
-- if (!bitset)
-- return -EINVAL;
--
-- to = futex_setup_timer(abs_time, &timeout, flags,
-- current->timer_slack_ns);
--
-- /*
-- * The waiter is allocated on our stack, manipulated by the requeue
-- * code while we sleep on uaddr.
-- */
-- rt_mutex_init_waiter(&rt_waiter);
--
-- ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
-- if (unlikely(ret != 0))
-- goto out;
--
-- q.bitset = bitset;
-- q.rt_waiter = &rt_waiter;
-- q.requeue_pi_key = &key2;
--
-- /*
-- * Prepare to wait on uaddr. On success, it holds hb->lock and q
-- * is initialized.
-- */
-- ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
-- if (ret)
-- goto out;
--
-- /*
-- * The check above which compares uaddrs is not sufficient for
-- * shared futexes. We need to compare the keys:
-- */
-- if (match_futex(&q.key, &key2)) {
-- queue_unlock(hb);
-- ret = -EINVAL;
-- goto out;
-- }
--
-- /* Queue the futex_q, drop the hb lock, wait for wakeup. */
-- futex_wait_queue_me(hb, &q, to);
--
-- switch (futex_requeue_pi_wakeup_sync(&q)) {
-- case Q_REQUEUE_PI_IGNORE:
-- /* The waiter is still on uaddr1 */
-- spin_lock(&hb->lock);
-- ret = handle_early_requeue_pi_wakeup(hb, &q, to);
-- spin_unlock(&hb->lock);
-- break;
--
-- case Q_REQUEUE_PI_LOCKED:
-- /* The requeue acquired the lock */
-- if (q.pi_state && (q.pi_state->owner != current)) {
-- spin_lock(q.lock_ptr);
-- ret = fixup_owner(uaddr2, &q, true);
-- /*
-- * Drop the reference to the pi state which the
-- * requeue_pi() code acquired for us.
-- */
-- put_pi_state(q.pi_state);
-- spin_unlock(q.lock_ptr);
-- /*
-- * Adjust the return value. It's either -EFAULT or
-- * success (1) but the caller expects 0 for success.
-- */
-- ret = ret < 0 ? ret : 0;
-- }
-- break;
--
-- case Q_REQUEUE_PI_DONE:
-- /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
-- pi_mutex = &q.pi_state->pi_mutex;
-- ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
--
-- /* Current is no longer pi_blocked_on */
-- spin_lock(q.lock_ptr);
-- if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
-- ret = 0;
--
-- debug_rt_mutex_free_waiter(&rt_waiter);
-- /*
-- * Fixup the pi_state owner and possibly acquire the lock if we
-- * haven't already.
-- */
-- res = fixup_owner(uaddr2, &q, !ret);
-- /*
-- * If fixup_owner() returned an error, propagate that. If it
-- * acquired the lock, clear -ETIMEDOUT or -EINTR.
-- */
-- if (res)
-- ret = (res < 0) ? res : 0;
--
-- unqueue_me_pi(&q);
-- spin_unlock(q.lock_ptr);
--
-- if (ret == -EINTR) {
-- /*
-- * We've already been requeued, but cannot restart
-- * by calling futex_lock_pi() directly. We could
-- * restart this syscall, but it would detect that
-- * the user space "val" changed and return
-- * -EWOULDBLOCK. Save the overhead of the restart
-- * and return -EWOULDBLOCK directly.
-- */
-- ret = -EWOULDBLOCK;
-- }
-- break;
-- default:
-- BUG();
-- }
--
--out:
-- if (to) {
-- hrtimer_cancel(&to->timer);
-- destroy_hrtimer_on_stack(&to->timer);
-- }
-- return ret;
--}
--
--/*
-- * Support for robust futexes: the kernel cleans up held futexes at
-- * thread exit time.
-- *
-- * Implementation: user-space maintains a per-thread list of locks it
-- * is holding. Upon do_exit(), the kernel carefully walks this list,
-- * and marks all locks that are owned by this thread with the
-- * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
-- * always manipulated with the lock held, so the list is private and
-- * per-thread. Userspace also maintains a per-thread 'list_op_pending'
-- * field, to allow the kernel to clean up if the thread dies after
-- * acquiring the lock, but just before it could have added itself to
-- * the list. There can only be one such pending lock.
-- */
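
As an aside (illustrative only, not taken from this patch): the comment above describes the userspace half of the protocol. A minimal sketch of how a runtime could register an empty robust list for the current thread follows; in practice glibc already registers one per thread for robust pthread mutexes, so the snippet is purely didactic.

/* Illustrative sketch only; not part of this patch. It assumes glibc has
 * not already registered a robust list for the thread (glibc normally
 * does so itself for robust pthread mutexes). */
#define _GNU_SOURCE
#include <linux/futex.h>        /* struct robust_list_head */
#include <sys/syscall.h>
#include <unistd.h>

static struct robust_list_head robust_head = {
        .list            = { &robust_head.list },  /* empty circular list */
        .futex_offset    = 0,   /* offset of the futex word in an entry */
        .list_op_pending = NULL,
};

static int register_robust_list(void)
{
        /* The kernel only accepts the one header size it knows about. */
        return syscall(SYS_set_robust_list, &robust_head,
                       sizeof(robust_head)) ? -1 : 0;
}
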
--
--/**
-- * sys_set_robust_list() - Set the robust-futex list head of a task
-- * @head: pointer to the list-head
-- * @len: length of the list-head, as userspace expects
-- */
--SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
-- size_t, len)
--{
-- if (!futex_cmpxchg_enabled)
-- return -ENOSYS;
-- /*
-- * The kernel knows only one size for now:
-- */
-- if (unlikely(len != sizeof(*head)))
-- return -EINVAL;
--
-- current->robust_list = head;
--
-- return 0;
--}
--
--/**
-- * sys_get_robust_list() - Get the robust-futex list head of a task
-- * @pid: pid of the process [zero for current task]
-- * @head_ptr: pointer to a list-head pointer, the kernel fills it in
-- * @len_ptr: pointer to a length field, the kernel fills in the header size
-- */
--SYSCALL_DEFINE3(get_robust_list, int, pid,
-- struct robust_list_head __user * __user *, head_ptr,
-- size_t __user *, len_ptr)
--{
-- struct robust_list_head __user *head;
-- unsigned long ret;
-- struct task_struct *p;
--
-- if (!futex_cmpxchg_enabled)
-- return -ENOSYS;
--
-- rcu_read_lock();
--
-- ret = -ESRCH;
-- if (!pid)
-- p = current;
-- else {
-- p = find_task_by_vpid(pid);
-- if (!p)
-- goto err_unlock;
-- }
--
-- ret = -EPERM;
-- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
-- goto err_unlock;
--
-- head = p->robust_list;
-- rcu_read_unlock();
--
-- if (put_user(sizeof(*head), len_ptr))
-- return -EFAULT;
-- return put_user(head, head_ptr);
--
--err_unlock:
-- rcu_read_unlock();
--
-- return ret;
--}
--
--/* Constants for the pending_op argument of handle_futex_death */
--#define HANDLE_DEATH_PENDING true
--#define HANDLE_DEATH_LIST false
--
--/*
-- * Process a futex-list entry, check whether it's owned by the
-- * dying task, and do notification if so:
-- */
--static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
-- bool pi, bool pending_op)
--{
-- u32 uval, nval, mval;
-- int err;
--
-- /* Futex address must be 32bit aligned */
-- if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
-- return -1;
--
--retry:
-- if (get_user(uval, uaddr))
-- return -1;
--
-- /*
-- * Special case for regular (non PI) futexes. The unlock path in
-- * user space has two race scenarios:
-- *
-- * 1. The unlock path releases the user space futex value and
-- * before it can execute the futex() syscall to wake up
-- * waiters it is killed.
-- *
-- * 2. A woken up waiter is killed before it can acquire the
-- * futex in user space.
-- *
-- * In both cases the TID validation below prevents a wakeup of
-- * potential waiters which can cause these waiters to block
-- * forever.
-- *
-- * In both cases the following conditions are met:
-- *
-- * 1) task->robust_list->list_op_pending != NULL
-- * @pending_op == true
-- * 2) User space futex value == 0
-- * 3) Regular futex: @pi == false
-- *
-- * If these conditions are met, it is safe to attempt waking up a
-- * potential waiter without touching the user space futex value and
-- * trying to set the OWNER_DIED bit. The user space futex value is
-- * uncontended and the rest of the user space mutex state is
-- * consistent, so a woken waiter will just take over the
-- * uncontended futex. Setting the OWNER_DIED bit would create
-- * inconsistent state and malfunction of the user space owner died
-- * handling.
-- */
-- if (pending_op && !pi && !uval) {
-- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
-- return 0;
-- }
--
-- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
-- return 0;
--
-- /*
-- * Ok, this dying thread is truly holding a futex
-- * of interest. Set the OWNER_DIED bit atomically
-- * via cmpxchg, and if the value had FUTEX_WAITERS
-- * set, wake up a waiter (if any). (We have to do a
-- * futex_wake() even if OWNER_DIED is already set -
-- * to handle the rare but possible case of recursive
-- * thread-death.) The rest of the cleanup is done in
-- * userspace.
-- */
-- mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
--
-- /*
-- * We are not holding a lock here, but we want to have
-- * the pagefault_disable/enable() protection because
-- * we want to handle the fault gracefully. If the
-- * access fails we try to fault in the futex with R/W
-- * verification via get_user_pages. get_user() above
-- * does not guarantee R/W access. If that fails we
-- * give up and leave the futex locked.
-- */
-- if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) {
-- switch (err) {
-- case -EFAULT:
-- if (fault_in_user_writeable(uaddr))
-- return -1;
-- goto retry;
--
-- case -EAGAIN:
-- cond_resched();
-- goto retry;
--
-- default:
-- WARN_ON_ONCE(1);
-- return err;
-- }
-- }
--
-- if (nval != uval)
-- goto retry;
--
-- /*
-- * Wake robust non-PI futexes here. The wakeup of
-- * PI futexes happens in exit_pi_state():
-- */
-- if (!pi && (uval & FUTEX_WAITERS))
-- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
--
-- return 0;
--}
--
--/*
-- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
-- */
--static inline int fetch_robust_entry(struct robust_list __user **entry,
-- struct robust_list __user * __user *head,
-- unsigned int *pi)
--{
-- unsigned long uentry;
--
-- if (get_user(uentry, (unsigned long __user *)head))
-- return -EFAULT;
--
-- *entry = (void __user *)(uentry & ~1UL);
-- *pi = uentry & 1;
--
-- return 0;
--}
--
--/*
-- * Walk curr->robust_list (very carefully, it's a userspace list!)
-- * and mark any locks found there dead, and notify any waiters.
-- *
-- * We silently return on any sign of a list-walking problem.
-- */
--static void exit_robust_list(struct task_struct *curr)
--{
-- struct robust_list_head __user *head = curr->robust_list;
-- struct robust_list __user *entry, *next_entry, *pending;
-- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-- unsigned int next_pi;
-- unsigned long futex_offset;
-- int rc;
--
-- if (!futex_cmpxchg_enabled)
-- return;
--
-- /*
-- * Fetch the list head (which was registered earlier, via
-- * sys_set_robust_list()):
-- */
-- if (fetch_robust_entry(&entry, &head->list.next, &pi))
-- return;
-- /*
-- * Fetch the relative futex offset:
-- */
-- if (get_user(futex_offset, &head->futex_offset))
-- return;
-- /*
-- * Fetch any possibly pending lock-add first, and handle it
-- * if it exists:
-- */
-- if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
-- return;
--
-- next_entry = NULL; /* avoid warning with gcc */
-- while (entry != &head->list) {
-- /*
-- * Fetch the next entry in the list before calling
-- * handle_futex_death:
-- */
-- rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
-- /*
-- * A pending lock might already be on the list, so
-- * don't process it twice:
-- */
-- if (entry != pending) {
-- if (handle_futex_death((void __user *)entry + futex_offset,
-- curr, pi, HANDLE_DEATH_LIST))
-- return;
-- }
-- if (rc)
-- return;
-- entry = next_entry;
-- pi = next_pi;
-- /*
-- * Avoid excessively long or circular lists:
-- */
-- if (!--limit)
-- break;
--
-- cond_resched();
-- }
--
-- if (pending) {
-- handle_futex_death((void __user *)pending + futex_offset,
-- curr, pip, HANDLE_DEATH_PENDING);
-- }
--}
--
--static void futex_cleanup(struct task_struct *tsk)
--{
-- if (unlikely(tsk->robust_list)) {
-- exit_robust_list(tsk);
-- tsk->robust_list = NULL;
-- }
--
--#ifdef CONFIG_COMPAT
-- if (unlikely(tsk->compat_robust_list)) {
-- compat_exit_robust_list(tsk);
-- tsk->compat_robust_list = NULL;
-- }
--#endif
--
-- if (unlikely(!list_empty(&tsk->pi_state_list)))
-- exit_pi_state_list(tsk);
--}
--
--/**
-- * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
-- * @tsk: task to set the state on
-- *
-- * Set the futex exit state of the task locklessly. The futex waiter code
-- * observes that state when a task is exiting and loops until the task has
-- * actually finished the futex cleanup. The worst case for this is that the
-- * waiter runs through the wait loop until the state becomes visible.
-- *
-- * This is called from the recursive fault handling path in do_exit().
-- *
-- * This is best effort. Either the futex exit code has run already or
-- * not. If the OWNER_DIED bit has been set on the futex then the waiter can
-- * take it over. If not, the problem is pushed back to user space. If the
-- * futex exit code did not run yet, then an already queued waiter might
-- * block forever, but there is nothing which can be done about that.
-- */
--void futex_exit_recursive(struct task_struct *tsk)
--{
-- /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
-- if (tsk->futex_state == FUTEX_STATE_EXITING)
-- mutex_unlock(&tsk->futex_exit_mutex);
-- tsk->futex_state = FUTEX_STATE_DEAD;
--}
--
--static void futex_cleanup_begin(struct task_struct *tsk)
--{
-- /*
-- * Prevent various race issues against a concurrent incoming waiter
-- * including live locks by forcing the waiter to block on
-- * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
-- * attach_to_pi_owner().
-- */
-- mutex_lock(&tsk->futex_exit_mutex);
--
-- /*
-- * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
-- *
-- * This ensures that all subsequent checks of tsk->futex_state in
-- * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
-- * tsk->pi_lock held.
-- *
-- * It guarantees also that a pi_state which was queued right before
-- * the state change under tsk->pi_lock by a concurrent waiter must
-- * be observed in exit_pi_state_list().
-- */
-- raw_spin_lock_irq(&tsk->pi_lock);
-- tsk->futex_state = FUTEX_STATE_EXITING;
-- raw_spin_unlock_irq(&tsk->pi_lock);
--}
--
--static void futex_cleanup_end(struct task_struct *tsk, int state)
--{
-- /*
-- * Lockless store. The only side effect is that an observer might
-- * take another loop until it becomes visible.
-- */
-- tsk->futex_state = state;
-- /*
-- * Drop the exit protection. This unblocks waiters which observed
-- * FUTEX_STATE_EXITING to reevaluate the state.
-- */
-- mutex_unlock(&tsk->futex_exit_mutex);
--}
--
--void futex_exec_release(struct task_struct *tsk)
--{
-- /*
-- * The state handling is done for consistency, but in the case of
-- * exec() there is no way to prevent further damage as the PID stays
-- * the same. But for the unlikely and arguably buggy case that a
-- * futex is held on exec(), this provides at least as much state
-- * consistency protection as is possible.
-- */
-- futex_cleanup_begin(tsk);
-- futex_cleanup(tsk);
-- /*
-- * Reset the state to FUTEX_STATE_OK. The task is alive and about to
-- * exec a new binary.
-- */
-- futex_cleanup_end(tsk, FUTEX_STATE_OK);
--}
--
--void futex_exit_release(struct task_struct *tsk)
--{
-- futex_cleanup_begin(tsk);
-- futex_cleanup(tsk);
-- futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
--}
--
--long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
-- u32 __user *uaddr2, u32 val2, u32 val3)
--{
-- int cmd = op & FUTEX_CMD_MASK;
-- unsigned int flags = 0;
--
-- if (!(op & FUTEX_PRIVATE_FLAG))
-- flags |= FLAGS_SHARED;
--
-- if (op & FUTEX_CLOCK_REALTIME) {
-- flags |= FLAGS_CLOCKRT;
-- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
-- cmd != FUTEX_LOCK_PI2)
-- return -ENOSYS;
-- }
--
-- switch (cmd) {
-- case FUTEX_LOCK_PI:
-- case FUTEX_LOCK_PI2:
-- case FUTEX_UNLOCK_PI:
-- case FUTEX_TRYLOCK_PI:
-- case FUTEX_WAIT_REQUEUE_PI:
-- case FUTEX_CMP_REQUEUE_PI:
-- if (!futex_cmpxchg_enabled)
-- return -ENOSYS;
-- }
--
-- switch (cmd) {
-- case FUTEX_WAIT:
-- val3 = FUTEX_BITSET_MATCH_ANY;
-- fallthrough;
-- case FUTEX_WAIT_BITSET:
-- return futex_wait(uaddr, flags, val, timeout, val3);
-- case FUTEX_WAKE:
-- val3 = FUTEX_BITSET_MATCH_ANY;
-- fallthrough;
-- case FUTEX_WAKE_BITSET:
-- return futex_wake(uaddr, flags, val, val3);
-- case FUTEX_REQUEUE:
-- return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
-- case FUTEX_CMP_REQUEUE:
-- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
-- case FUTEX_WAKE_OP:
-- return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
-- case FUTEX_LOCK_PI:
-- flags |= FLAGS_CLOCKRT;
-- fallthrough;
-- case FUTEX_LOCK_PI2:
-- return futex_lock_pi(uaddr, flags, timeout, 0);
-- case FUTEX_UNLOCK_PI:
-- return futex_unlock_pi(uaddr, flags);
-- case FUTEX_TRYLOCK_PI:
-- return futex_lock_pi(uaddr, flags, NULL, 1);
-- case FUTEX_WAIT_REQUEUE_PI:
-- val3 = FUTEX_BITSET_MATCH_ANY;
-- return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
-- uaddr2);
-- case FUTEX_CMP_REQUEUE_PI:
-- return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
-- }
-- return -ENOSYS;
--}
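
do_futex() above is the dispatcher behind the multiplexed futex() syscall. For orientation, a hedged userspace sketch of the classic wait/wake pairing that ends up in futex_wait()/futex_wake(); the names and the minimal error handling are illustrative, not part of the patch.

/* Illustrative sketch only; not part of this patch. Minimal wait/wake on a
 * process-private futex word; error handling trimmed to the essentials. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>
#include <errno.h>

static _Atomic unsigned int futex_word;         /* 0 = sleep, 1 = proceed */

static void wait_until_set(void)
{
        /* Sleep only while the word still holds the expected value 0. */
        while (atomic_load(&futex_word) == 0) {
                long ret = syscall(SYS_futex, &futex_word,
                                   FUTEX_WAIT_PRIVATE, 0, NULL, NULL, 0);
                if (ret == -1 && errno != EAGAIN && errno != EINTR)
                        break;          /* unexpected error: give up */
        }
}

static void set_and_wake(void)
{
        atomic_store(&futex_word, 1);
        /* Wake at most one task blocked on the word. */
        syscall(SYS_futex, &futex_word, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
}
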
--
--static __always_inline bool futex_cmd_has_timeout(u32 cmd)
--{
-- switch (cmd) {
-- case FUTEX_WAIT:
-- case FUTEX_LOCK_PI:
-- case FUTEX_LOCK_PI2:
-- case FUTEX_WAIT_BITSET:
-- case FUTEX_WAIT_REQUEUE_PI:
-- return true;
-- }
-- return false;
--}
--
--static __always_inline int
--futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
--{
-- if (!timespec64_valid(ts))
-- return -EINVAL;
--
-- *t = timespec64_to_ktime(*ts);
-- if (cmd == FUTEX_WAIT)
-- *t = ktime_add_safe(ktime_get(), *t);
-- else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
-- *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
-- return 0;
--}
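
futex_init_timeout() above is why FUTEX_WAIT takes a relative timeout while the other timed commands take an absolute one. A small illustrative sketch of the userspace view, assuming a futex word named 'word' whose current value is 0:

/* Illustrative sketch only; not part of this patch. 'word' is an assumed
 * futex word whose current value is 0. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>

static unsigned int word;

static void timed_waits(void)
{
        /* FUTEX_WAIT: the timeout is relative (here, 100ms from now). */
        struct timespec rel = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
        syscall(SYS_futex, &word, FUTEX_WAIT_PRIVATE, 0, &rel, NULL, 0);

        /* FUTEX_WAIT_BITSET: the timeout is an absolute CLOCK_MONOTONIC
         * deadline (CLOCK_REALTIME if FUTEX_CLOCK_REALTIME is or'ed in). */
        struct timespec dl;
        clock_gettime(CLOCK_MONOTONIC, &dl);
        dl.tv_sec += 1;
        syscall(SYS_futex, &word, FUTEX_WAIT_BITSET_PRIVATE, 0, &dl, NULL,
                FUTEX_BITSET_MATCH_ANY);
}
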
--
--SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
-- const struct __kernel_timespec __user *, utime,
-- u32 __user *, uaddr2, u32, val3)
--{
-- int ret, cmd = op & FUTEX_CMD_MASK;
-- ktime_t t, *tp = NULL;
-- struct timespec64 ts;
--
-- if (utime && futex_cmd_has_timeout(cmd)) {
-- if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
-- return -EFAULT;
-- if (get_timespec64(&ts, utime))
-- return -EFAULT;
-- ret = futex_init_timeout(cmd, op, &ts, &t);
-- if (ret)
-- return ret;
-- tp = &t;
-- }
--
-- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
--}
--
--#ifdef CONFIG_COMPAT
--/*
-- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
-- */
--static inline int
--compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-- compat_uptr_t __user *head, unsigned int *pi)
--{
-- if (get_user(*uentry, head))
-- return -EFAULT;
--
-- *entry = compat_ptr((*uentry) & ~1);
-- *pi = (unsigned int)(*uentry) & 1;
--
-- return 0;
--}
--
--static void __user *futex_uaddr(struct robust_list __user *entry,
-- compat_long_t futex_offset)
--{
-- compat_uptr_t base = ptr_to_compat(entry);
-- void __user *uaddr = compat_ptr(base + futex_offset);
--
-- return uaddr;
--}
--
--/*
-- * Walk curr->robust_list (very carefully, it's a userspace list!)
-- * and mark any locks found there dead, and notify any waiters.
-- *
-- * We silently return on any sign of a list-walking problem.
-- */
--static void compat_exit_robust_list(struct task_struct *curr)
--{
-- struct compat_robust_list_head __user *head = curr->compat_robust_list;
-- struct robust_list __user *entry, *next_entry, *pending;
-- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-- unsigned int next_pi;
-- compat_uptr_t uentry, next_uentry, upending;
-- compat_long_t futex_offset;
-- int rc;
--
-- if (!futex_cmpxchg_enabled)
-- return;
--
-- /*
-- * Fetch the list head (which was registered earlier, via
-- * sys_set_robust_list()):
-- */
-- if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
-- return;
-- /*
-- * Fetch the relative futex offset:
-- */
-- if (get_user(futex_offset, &head->futex_offset))
-- return;
-- /*
-- * Fetch any possibly pending lock-add first, and handle it
-- * if it exists:
-- */
-- if (compat_fetch_robust_entry(&upending, &pending,
-- &head->list_op_pending, &pip))
-- return;
--
-- next_entry = NULL; /* avoid warning with gcc */
-- while (entry != (struct robust_list __user *) &head->list) {
-- /*
-- * Fetch the next entry in the list before calling
-- * handle_futex_death:
-- */
-- rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
-- (compat_uptr_t __user *)&entry->next, &next_pi);
-- /*
-- * A pending lock might already be on the list, so
-- * don't process it twice:
-- */
-- if (entry != pending) {
-- void __user *uaddr = futex_uaddr(entry, futex_offset);
--
-- if (handle_futex_death(uaddr, curr, pi,
-- HANDLE_DEATH_LIST))
-- return;
-- }
-- if (rc)
-- return;
-- uentry = next_uentry;
-- entry = next_entry;
-- pi = next_pi;
-- /*
-- * Avoid excessively long or circular lists:
-- */
-- if (!--limit)
-- break;
--
-- cond_resched();
-- }
-- if (pending) {
-- void __user *uaddr = futex_uaddr(pending, futex_offset);
--
-- handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
-- }
--}
--
--COMPAT_SYSCALL_DEFINE2(set_robust_list,
-- struct compat_robust_list_head __user *, head,
-- compat_size_t, len)
--{
-- if (!futex_cmpxchg_enabled)
-- return -ENOSYS;
--
-- if (unlikely(len != sizeof(*head)))
-- return -EINVAL;
--
-- current->compat_robust_list = head;
--
-- return 0;
--}
--
--COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
-- compat_uptr_t __user *, head_ptr,
-- compat_size_t __user *, len_ptr)
--{
-- struct compat_robust_list_head __user *head;
-- unsigned long ret;
-- struct task_struct *p;
--
-- if (!futex_cmpxchg_enabled)
-- return -ENOSYS;
--
-- rcu_read_lock();
--
-- ret = -ESRCH;
-- if (!pid)
-- p = current;
-- else {
-- p = find_task_by_vpid(pid);
-- if (!p)
-- goto err_unlock;
-- }
--
-- ret = -EPERM;
-- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
-- goto err_unlock;
--
-- head = p->compat_robust_list;
-- rcu_read_unlock();
--
-- if (put_user(sizeof(*head), len_ptr))
-- return -EFAULT;
-- return put_user(ptr_to_compat(head), head_ptr);
--
--err_unlock:
-- rcu_read_unlock();
--
-- return ret;
--}
--#endif /* CONFIG_COMPAT */
--
--#ifdef CONFIG_COMPAT_32BIT_TIME
--SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
-- const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
-- u32, val3)
--{
-- int ret, cmd = op & FUTEX_CMD_MASK;
-- ktime_t t, *tp = NULL;
-- struct timespec64 ts;
--
-- if (utime && futex_cmd_has_timeout(cmd)) {
-- if (get_old_timespec32(&ts, utime))
-- return -EFAULT;
-- ret = futex_init_timeout(cmd, op, &ts, &t);
-- if (ret)
-- return ret;
-- tp = &t;
-- }
--
-- return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
--}
--#endif /* CONFIG_COMPAT_32BIT_TIME */
--
--static void __init futex_detect_cmpxchg(void)
--{
--#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-- u32 curval;
--
-- /*
-- * This will fail and we want it. Some arch implementations do
-- * runtime detection of the futex_atomic_cmpxchg_inatomic()
-- * functionality. We want to know that before we call in any
-- * of the complex code paths. Also we want to prevent
-- * registration of robust lists in that case. NULL is
-- * guaranteed to fault and we get -EFAULT on a functional
-- * implementation; the non-functional ones will return
-- * -ENOSYS.
-- */
-- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
-- futex_cmpxchg_enabled = 1;
--#endif
--}
--
--static int __init futex_init(void)
--{
-- unsigned int futex_shift;
-- unsigned long i;
--
--#if CONFIG_BASE_SMALL
-- futex_hashsize = 16;
--#else
-- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
--#endif
--
-- futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
-- futex_hashsize, 0,
-- futex_hashsize < 256 ? HASH_SMALL : 0,
-- &futex_shift, NULL,
-- futex_hashsize, futex_hashsize);
-- futex_hashsize = 1UL << futex_shift;
--
-- futex_detect_cmpxchg();
--
-- for (i = 0; i < futex_hashsize; i++) {
-- atomic_set(&futex_queues[i].waiters, 0);
-- plist_head_init(&futex_queues[i].chain);
-- spin_lock_init(&futex_queues[i].lock);
-- }
--
-- return 0;
--}
--core_initcall(futex_init);
-diff --git a/kernel/futex/Makefile b/kernel/futex/Makefile
-new file mode 100644
-index 000000000..b77188d1f
---- /dev/null
-+++ b/kernel/futex/Makefile
-@@ -0,0 +1,3 @@
-+# SPDX-License-Identifier: GPL-2.0
-+
-+obj-y += core.o syscalls.o pi.o requeue.o waitwake.o
-diff --git a/kernel/futex/core.c b/kernel/futex/core.c
-new file mode 100644
-index 000000000..25d8a88b3
---- /dev/null
-+++ b/kernel/futex/core.c
-@@ -0,0 +1,1176 @@
-+// SPDX-License-Identifier: GPL-2.0-or-later
-+/*
-+ * Fast Userspace Mutexes (which I call "Futexes!").
-+ * (C) Rusty Russell, IBM 2002
-+ *
-+ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
-+ * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
-+ *
-+ * Removed page pinning, fix privately mapped COW pages and other cleanups
-+ * (C) Copyright 2003, 2004 Jamie Lokier
-+ *
-+ * Robust futex support started by Ingo Molnar
-+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
-+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
-+ *
-+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
-+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
-+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
-+ *
-+ * PRIVATE futexes by Eric Dumazet
-+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
-+ *
-+ * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
-+ * Copyright (C) IBM Corporation, 2009
-+ * Thanks to Thomas Gleixner for conceptual design and careful reviews.
-+ *
-+ * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
-+ * enough at me, Linus for the original (flawed) idea, Matthew
-+ * Kirkwood for proof-of-concept implementation.
-+ *
-+ * "The futexes are also cursed."
-+ * "But they come in a choice of three flavours!"
-+ */
-+#include <linux/compat.h>
-+#include <linux/jhash.h>
-+#include <linux/pagemap.h>
-+#include <linux/memblock.h>
-+#include <linux/fault-inject.h>
-+#include <linux/slab.h>
-+
-+#include "futex.h"
-+#include "../locking/rtmutex_common.h"
-+
-+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-+int __read_mostly futex_cmpxchg_enabled;
-+#endif
-+
-+
-+/*
-+ * The base of the bucket array and its size are always used together
-+ * (after initialization only in futex_hash()), so ensure that they
-+ * reside in the same cacheline.
-+ */
-+static struct {
-+ struct futex_hash_bucket *queues;
-+ unsigned long hashsize;
-+} __futex_data __read_mostly __aligned(2*sizeof(long));
-+#define futex_queues (__futex_data.queues)
-+#define futex_hashsize (__futex_data.hashsize)
-+
-+
-+/*
-+ * Fault injections for futexes.
-+ */
-+#ifdef CONFIG_FAIL_FUTEX
-+
-+static struct {
-+ struct fault_attr attr;
-+
-+ bool ignore_private;
-+} fail_futex = {
-+ .attr = FAULT_ATTR_INITIALIZER,
-+ .ignore_private = false,
-+};
-+
-+static int __init setup_fail_futex(char *str)
-+{
-+ return setup_fault_attr(&fail_futex.attr, str);
-+}
-+__setup("fail_futex=", setup_fail_futex);
-+
-+bool should_fail_futex(bool fshared)
-+{
-+ if (fail_futex.ignore_private && !fshared)
-+ return false;
-+
-+ return should_fail(&fail_futex.attr, 1);
-+}
-+
-+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-+
-+static int __init fail_futex_debugfs(void)
-+{
-+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
-+ struct dentry *dir;
-+
-+ dir = fault_create_debugfs_attr("fail_futex", NULL,
-+ &fail_futex.attr);
-+ if (IS_ERR(dir))
-+ return PTR_ERR(dir);
-+
-+ debugfs_create_bool("ignore-private", mode, dir,
-+ &fail_futex.ignore_private);
-+ return 0;
-+}
-+
-+late_initcall(fail_futex_debugfs);
-+
-+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-+
-+#endif /* CONFIG_FAIL_FUTEX */
-+
-+/**
-+ * futex_hash - Return the hash bucket in the global hash
-+ * @key: Pointer to the futex key for which the hash is calculated
-+ *
-+ * We hash on the keys returned from get_futex_key (see below) and return the
-+ * corresponding hash bucket in the global hash.
-+ */
-+struct futex_hash_bucket *futex_hash(union futex_key *key)
-+{
-+ u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
-+ key->both.offset);
-+
-+ return &futex_queues[hash & (futex_hashsize - 1)];
-+}
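
Because futex_hashsize is always a power of two (see futex_init() below), bucket selection is a mask rather than a modulo. A tiny illustrative sketch with hypothetical names:

/* Illustrative sketch with hypothetical names; not part of this patch. */
static inline unsigned long example_bucket_index(unsigned int hash,
                                                 unsigned long hashsize)
{
        /* hashsize is 1UL << futex_shift, so hashsize - 1 is a full mask. */
        return hash & (hashsize - 1);
}
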
-+
-+
-+/**
-+ * futex_setup_timer - set up the sleeping hrtimer.
-+ * @time: ptr to the given timeout value
-+ * @timeout: the hrtimer_sleeper structure to be set up
-+ * @flags: futex flags
-+ * @range_ns: optional range in ns
-+ *
-+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
-+ * value given
-+ */
-+struct hrtimer_sleeper *
-+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
-+ int flags, u64 range_ns)
-+{
-+ if (!time)
-+ return NULL;
-+
-+ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
-+ CLOCK_REALTIME : CLOCK_MONOTONIC,
-+ HRTIMER_MODE_ABS);
-+ /*
-+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
-+ * effectively the same as calling hrtimer_set_expires().
-+ */
-+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
-+
-+ return timeout;
-+}
-+
-+/*
-+ * Generate a machine wide unique identifier for this inode.
-+ *
-+ * This relies on u64 not wrapping in the lifetime of the machine, which with
-+ * 1ns resolution means almost 585 years.
-+ *
-+ * This further relies on the fact that a well formed program will not unmap
-+ * the file while it has a (shared) futex waiting on it. This mapping will have
-+ * a file reference which pins the mount and inode.
-+ *
-+ * If for some reason an inode gets evicted and read back in again, it will get
-+ * a new sequence number and will _NOT_ match, even though it is the exact same
-+ * file.
-+ *
-+ * It is important that futex_match() will never have a false-positive, esp.
-+ * for PI futexes that can mess up the state. The above argues that false-negatives
-+ * are only possible for malformed programs.
-+ */
-+static u64 get_inode_sequence_number(struct inode *inode)
-+{
-+ static atomic64_t i_seq;
-+ u64 old;
-+
-+ /* Does the inode already have a sequence number? */
-+ old = atomic64_read(&inode->i_sequence);
-+ if (likely(old))
-+ return old;
-+
-+ for (;;) {
-+ u64 new = atomic64_add_return(1, &i_seq);
-+ if (WARN_ON_ONCE(!new))
-+ continue;
-+
-+ old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
-+ if (old)
-+ return old;
-+ return new;
-+ }
-+}
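
The same assign-once pattern can be written with C11 atomics; the sketch below uses hypothetical names and only illustrates why concurrent callers always agree on the first value stored.

/* Illustrative sketch with hypothetical names; not part of this patch. */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t next_seq;       /* global counter; 0 is never used */

static uint64_t object_sequence_number(_Atomic uint64_t *slot)
{
        uint64_t old = atomic_load(slot);

        if (old)                        /* already assigned: everyone agrees */
                return old;

        for (;;) {
                uint64_t seq = atomic_fetch_add(&next_seq, 1) + 1;
                uint64_t expected = 0;

                if (!seq)               /* skip 0, it means "unassigned" */
                        continue;
                /* One caller wins; the losers return the winner's value. */
                if (atomic_compare_exchange_strong(slot, &expected, seq))
                        return seq;
                return expected;
        }
}
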
-+
-+/**
-+ * get_futex_key() - Get parameters which are the keys for a futex
-+ * @uaddr: virtual address of the futex
-+ * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
-+ * @key: address where result is stored.
-+ * @rw: mapping needs to be read/write (values: FUTEX_READ,
-+ * FUTEX_WRITE)
-+ *
-+ * Return: a negative error code or 0
-+ *
-+ * The key words are stored in @key on success.
-+ *
-+ * For shared mappings (when @fshared), the key is:
-+ *
-+ * ( inode->i_sequence, page->index, offset_within_page )
-+ *
-+ * [ also see get_inode_sequence_number() ]
-+ *
-+ * For private mappings (or when !@fshared), the key is:
-+ *
-+ * ( current->mm, address, 0 )
-+ *
-+ * This allows (cross process, where applicable) identification of the futex
-+ * without keeping the page pinned for the duration of the FUTEX_WAIT.
-+ *
-+ * lock_page() might sleep, the caller should not hold a spinlock.
-+ */
-+int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
-+ enum futex_access rw)
-+{
-+ unsigned long address = (unsigned long)uaddr;
-+ struct mm_struct *mm = current->mm;
-+ struct page *page, *tail;
-+ struct address_space *mapping;
-+ int err, ro = 0;
-+
-+ /*
-+ * The futex address must be "naturally" aligned.
-+ */
-+ key->both.offset = address % PAGE_SIZE;
-+ if (unlikely((address % sizeof(u32)) != 0))
-+ return -EINVAL;
-+ address -= key->both.offset;
-+
-+ if (unlikely(!access_ok(uaddr, sizeof(u32))))
-+ return -EFAULT;
-+
-+ if (unlikely(should_fail_futex(fshared)))
-+ return -EFAULT;
-+
-+ /*
-+ * PROCESS_PRIVATE futexes are fast.
-+ * As the mm cannot disappear under us and the 'key' only needs
-+ * virtual address, we don't even have to find the underlying vma.
-+ * Note: We do have to check that 'uaddr' is a valid user address,
-+ * but access_ok() should be faster than find_vma().
-+ */
-+ if (!fshared) {
-+ key->private.mm = mm;
-+ key->private.address = address;
-+ return 0;
-+ }
-+
-+again:
-+ /* Ignore any VERIFY_READ mapping (futex common case) */
-+ if (unlikely(should_fail_futex(true)))
-+ return -EFAULT;
-+
-+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
-+ /*
-+ * If write access is not required (eg. FUTEX_WAIT), try
-+ * and get read-only access.
-+ */
-+ if (err == -EFAULT && rw == FUTEX_READ) {
-+ err = get_user_pages_fast(address, 1, 0, &page);
-+ ro = 1;
-+ }
-+ if (err < 0)
-+ return err;
-+ else
-+ err = 0;
-+
-+ /*
-+ * The treatment of mapping from this point on is critical. The page
-+ * lock protects many things but in this context the page lock
-+ * stabilizes mapping, prevents inode freeing in the shared
-+ * file-backed region case and guards against movement to swap cache.
-+ *
-+ * Strictly speaking the page lock is not needed in all cases being
-+ * considered here and taking it forces unnecessary serialization.
-+ * From this point on, mapping will be re-verified if necessary and
-+ * the page lock will be acquired only if it is unavoidable.
-+ *
-+ * Mapping checks require the head page for any compound page so the
-+ * head page and mapping is looked up now. For anonymous pages, it
-+ * does not matter if the page splits in the future as the key is
-+ * based on the address. For filesystem-backed pages, the tail is
-+ * required as the index of the page determines the key. For
-+ * base pages, there is no tail page and tail == page.
-+ */
-+ tail = page;
-+ page = compound_head(page);
-+ mapping = READ_ONCE(page->mapping);
-+
-+ /*
-+ * If page->mapping is NULL, then it cannot be a PageAnon
-+ * page; but it might be the ZERO_PAGE or in the gate area or
-+ * in a special mapping (all cases which we are happy to fail);
-+ * or it may have been a good file page when get_user_pages_fast
-+ * found it, but truncated or holepunched or subjected to
-+ * invalidate_complete_page2 before we got the page lock (also
-+ * cases which we are happy to fail). And we hold a reference,
-+ * so refcount care in invalidate_complete_page's remove_mapping
-+ * prevents drop_caches from setting mapping to NULL beneath us.
-+ *
-+ * The case we do have to guard against is when memory pressure made
-+ * shmem_writepage move it from filecache to swapcache beneath us:
-+ * an unlikely race, but we do need to retry for page->mapping.
-+ */
-+ if (unlikely(!mapping)) {
-+ int shmem_swizzled;
-+
-+ /*
-+ * Page lock is required to identify which special case above
-+ * applies. If this is really a shmem page then the page lock
-+ * will prevent unexpected transitions.
-+ */
-+ lock_page(page);
-+ shmem_swizzled = PageSwapCache(page) || page->mapping;
-+ unlock_page(page);
-+ put_page(page);
-+
-+ if (shmem_swizzled)
-+ goto again;
-+
-+ return -EFAULT;
-+ }
-+
-+ /*
-+ * Private mappings are handled in a simple way.
-+ *
-+ * If the futex key is stored on an anonymous page, then the associated
-+ * object is the mm which is implicitly pinned by the calling process.
-+ *
-+ * NOTE: When userspace waits on a MAP_SHARED mapping, even if
-+ * it's a read-only handle, it's expected that futexes attach to
-+ * the object not the particular process.
-+ */
-+ if (PageAnon(page)) {
-+ /*
-+ * A RO anonymous page will never change and thus doesn't make
-+ * sense for futex operations.
-+ */
-+ if (unlikely(should_fail_futex(true)) || ro) {
-+ err = -EFAULT;
-+ goto out;
-+ }
-+
-+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
-+ key->private.mm = mm;
-+ key->private.address = address;
-+
-+ } else {
-+ struct inode *inode;
-+
-+ /*
-+ * The associated futex object in this case is the inode and
-+ * the page->mapping must be traversed. Ordinarily this should
-+ * be stabilised under page lock but it's not strictly
-+ * necessary in this case as we just want to pin the inode, not
-+ * update the radix tree or anything like that.
-+ *
-+ * The RCU read lock is taken as the inode is finally freed
-+ * under RCU. If the mapping still matches expectations then the
-+ * mapping->host can be safely accessed as being a valid inode.
-+ */
-+ rcu_read_lock();
-+
-+ if (READ_ONCE(page->mapping) != mapping) {
-+ rcu_read_unlock();
-+ put_page(page);
-+
-+ goto again;
-+ }
-+
-+ inode = READ_ONCE(mapping->host);
-+ if (!inode) {
-+ rcu_read_unlock();
-+ put_page(page);
-+
-+ goto again;
-+ }
-+
-+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-+ key->shared.i_seq = get_inode_sequence_number(inode);
-+ key->shared.pgoff = page_to_pgoff(tail);
-+ rcu_read_unlock();
-+ }
-+
-+out:
-+ put_page(page);
-+ return err;
-+}
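
From userspace this distinction shows up as the *_PRIVATE operations versus the default shared ones. An illustrative sketch (hypothetical helper names, minimal error handling), not part of the patch:

/* Illustrative sketch only; not part of this patch. A futex word in private
 * memory can use the *_PRIVATE ops (mm-based key, no page walk); a word in a
 * MAP_SHARED mapping must use the shared ops so every process derives the
 * same (inode, pgoff, offset) key. */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <unistd.h>

static unsigned int private_word;       /* waiters in this process only */

static unsigned int *map_shared_word(void)
{
        /* Hypothetical shared anonymous mapping, inherited across fork(). */
        void *p = mmap(NULL, sizeof(unsigned int), PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        return p == MAP_FAILED ? NULL : p;
}

static void wake_examples(unsigned int *shared_word)
{
        /* Private key: (current->mm, address, 0) */
        syscall(SYS_futex, &private_word, FUTEX_WAKE_PRIVATE, 1, NULL, NULL, 0);
        /* Shared key: (inode->i_sequence, page->index, offset_within_page) */
        syscall(SYS_futex, shared_word, FUTEX_WAKE, 1, NULL, NULL, 0);
}
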
-+
-+/**
-+ * fault_in_user_writeable() - Fault in user address and verify RW access
-+ * @uaddr: pointer to faulting user space address
-+ *
-+ * Slow path to fixup the fault we just took in the atomic write
-+ * access to @uaddr.
-+ *
-+ * We have no generic implementation of a non-destructive write to the
-+ * user address. We know that we faulted in the atomic pagefault
-+ * disabled section so we can as well avoid the #PF overhead by
-+ * calling get_user_pages() right away.
-+ */
-+int fault_in_user_writeable(u32 __user *uaddr)
-+{
-+ struct mm_struct *mm = current->mm;
-+ int ret;
-+
-+ mmap_read_lock(mm);
-+ ret = fixup_user_fault(mm, (unsigned long)uaddr,
-+ FAULT_FLAG_WRITE, NULL);
-+ mmap_read_unlock(mm);
-+
-+ return ret < 0 ? ret : 0;
-+}
-+
-+/**
-+ * futex_top_waiter() - Return the highest priority waiter on a futex
-+ * @hb: the hash bucket the futex_q's reside in
-+ * @key: the futex key (to distinguish it from other futex futex_q's)
-+ *
-+ * Must be called with the hb lock held.
-+ */
-+struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
-+{
-+ struct futex_q *this;
-+
-+ plist_for_each_entry(this, &hb->chain, list) {
-+ if (futex_match(&this->key, key))
-+ return this;
-+ }
-+ return NULL;
-+}
-+
-+int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
-+{
-+ int ret;
-+
-+ pagefault_disable();
-+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
-+ pagefault_enable();
-+
-+ return ret;
-+}
-+
-+int futex_get_value_locked(u32 *dest, u32 __user *from)
-+{
-+ int ret;
-+
-+ pagefault_disable();
-+ ret = __get_user(*dest, from);
-+ pagefault_enable();
-+
-+ return ret ? -EFAULT : 0;
-+}
-+
-+/**
-+ * wait_for_owner_exiting - Block until the owner has exited
-+ * @ret: owner's current futex lock status
-+ * @exiting: Pointer to the exiting task
-+ *
-+ * Caller must hold a refcount on @exiting.
-+ */
-+void wait_for_owner_exiting(int ret, struct task_struct *exiting)
-+{
-+ if (ret != -EBUSY) {
-+ WARN_ON_ONCE(exiting);
-+ return;
-+ }
-+
-+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
-+ return;
-+
-+ mutex_lock(&exiting->futex_exit_mutex);
-+ /*
-+ * No point in doing state checking here. If the waiter got here
-+ * while the task was in exec()->futex_exec_release() then it can
-+ * have any FUTEX_STATE_* value when the waiter has acquired the
-+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
-+ * mutex: OK if still running, EXITING or DEAD if it has reached
-+ * exit() already. Highly unlikely and not a problem. Just one more round
-+ */
-+ mutex_unlock(&exiting->futex_exit_mutex);
-+
-+ put_task_struct(exiting);
-+}
-+
-+/**
-+ * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
-+ * @q: The futex_q to unqueue
-+ *
-+ * The q->lock_ptr must not be NULL and must be held by the caller.
-+ */
-+void __futex_unqueue(struct futex_q *q)
-+{
-+ struct futex_hash_bucket *hb;
-+
-+ if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
-+ return;
-+ lockdep_assert_held(q->lock_ptr);
-+
-+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
-+ plist_del(&q->list, &hb->chain);
-+ futex_hb_waiters_dec(hb);
-+}
-+
-+/* The key must be already stored in q->key. */
-+struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
-+ __acquires(&hb->lock)
-+{
-+ struct futex_hash_bucket *hb;
-+
-+ hb = futex_hash(&q->key);
-+
-+ /*
-+ * Increment the counter before taking the lock so that
-+ * a potential waker won't miss a to-be-slept task that is
-+ * waiting for the spinlock. This is safe as all futex_q_lock()
-+ * users end up calling futex_queue(). Similarly, for housekeeping,
-+ * decrement the counter at futex_q_unlock() when some error has
-+ * occurred and we don't end up adding the task to the list.
-+ */
-+ futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */
-+
-+ q->lock_ptr = &hb->lock;
-+
-+ spin_lock(&hb->lock);
-+ return hb;
-+}
-+
-+void futex_q_unlock(struct futex_hash_bucket *hb)
-+ __releases(&hb->lock)
-+{
-+ spin_unlock(&hb->lock);
-+ futex_hb_waiters_dec(hb);
-+}
-+
-+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
-+{
-+ int prio;
-+
-+ /*
-+ * The priority used to register this element is
-+ * - either the real thread-priority for the real-time threads
-+ * (i.e. threads with a priority lower than MAX_RT_PRIO)
-+ * - or MAX_RT_PRIO for non-RT threads.
-+ * Thus, all RT-threads are woken first in priority order, and
-+ * the others are woken last, in FIFO order.
-+ */
-+ prio = min(current->normal_prio, MAX_RT_PRIO);
-+
-+ plist_node_init(&q->list, prio);
-+ plist_add(&q->list, &hb->chain);
-+ q->task = current;
-+}
-+
-+/**
-+ * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
-+ * @q: The futex_q to unqueue
-+ *
-+ * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
-+ * be paired with exactly one earlier call to futex_queue().
-+ *
-+ * Return:
-+ * - 1 - if the futex_q was still queued (and we unqueued it);
-+ * - 0 - if the futex_q was already removed by the waking thread
-+ */
-+int futex_unqueue(struct futex_q *q)
-+{
-+ spinlock_t *lock_ptr;
-+ int ret = 0;
-+
-+ /* In the common case we don't take the spinlock, which is nice. */
-+retry:
-+ /*
-+ * q->lock_ptr can change between this read and the following spin_lock.
-+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
-+ * optimizing lock_ptr out of the logic below.
-+ */
-+ lock_ptr = READ_ONCE(q->lock_ptr);
-+ if (lock_ptr != NULL) {
-+ spin_lock(lock_ptr);
-+ /*
-+ * q->lock_ptr can change between reading it and
-+ * spin_lock(), causing us to take the wrong lock. This
-+ * corrects the race condition.
-+ *
-+ * Reasoning goes like this: if we have the wrong lock,
-+ * q->lock_ptr must have changed (maybe several times)
-+ * between reading it and the spin_lock(). It can
-+ * change again after the spin_lock() but only if it was
-+ * already changed before the spin_lock(). It cannot,
-+ * however, change back to the original value. Therefore
-+ * we can detect whether we acquired the correct lock.
-+ */
-+ if (unlikely(lock_ptr != q->lock_ptr)) {
-+ spin_unlock(lock_ptr);
-+ goto retry;
-+ }
-+ __futex_unqueue(q);
-+
-+ BUG_ON(q->pi_state);
-+
-+ spin_unlock(lock_ptr);
-+ ret = 1;
-+ }
-+
-+ return ret;
-+}
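
The retry loop above is a general "load the lock pointer, take it, re-check it" pattern. A hedged pthreads sketch of the same idea, with a hypothetical node structure:

/* Illustrative sketch with a hypothetical node structure; not part of this
 * patch. Same idea: load the lock pointer, take it, then re-check it. */
#include <pthread.h>
#include <stdatomic.h>

struct node {
        pthread_mutex_t *_Atomic lock_ptr;      /* may be switched concurrently */
        /* ... list linkage protected by *lock_ptr ... */
};

static void unqueue_node(struct node *n)
{
        for (;;) {
                pthread_mutex_t *lock = atomic_load(&n->lock_ptr);

                if (!lock)
                        return;         /* already removed by a waker */
                pthread_mutex_lock(lock);
                if (lock != atomic_load(&n->lock_ptr)) {
                        /* Moved while we waited for the lock; retry with the new one. */
                        pthread_mutex_unlock(lock);
                        continue;
                }
                /* ... safely unlink n under *lock here ... */
                pthread_mutex_unlock(lock);
                return;
        }
}
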
-+
-+/*
-+ * PI futexes can not be requeued and must remove themselves from the
-+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
-+ */
-+void futex_unqueue_pi(struct futex_q *q)
-+{
-+ __futex_unqueue(q);
-+
-+ BUG_ON(!q->pi_state);
-+ put_pi_state(q->pi_state);
-+ q->pi_state = NULL;
-+}
-+
-+/* Constants for the pending_op argument of handle_futex_death */
-+#define HANDLE_DEATH_PENDING true
-+#define HANDLE_DEATH_LIST false
-+
-+/*
-+ * Process a futex-list entry, check whether it's owned by the
-+ * dying task, and do notification if so:
-+ */
-+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
-+ bool pi, bool pending_op)
-+{
-+ u32 uval, nval, mval;
-+ int err;
-+
-+ /* Futex address must be 32bit aligned */
-+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
-+ return -1;
-+
-+retry:
-+ if (get_user(uval, uaddr))
-+ return -1;
-+
-+ /*
-+ * Special case for regular (non PI) futexes. The unlock path in
-+ * user space has two race scenarios:
-+ *
-+ * 1. The unlock path releases the user space futex value and
-+ * before it can execute the futex() syscall to wake up
-+ * waiters it is killed.
-+ *
-+ * 2. A woken up waiter is killed before it can acquire the
-+ * futex in user space.
-+ *
-+ * In both cases the TID validation below prevents a wakeup of
-+ * potential waiters which can cause these waiters to block
-+ * forever.
-+ *
-+ * In both cases the following conditions are met:
-+ *
-+ * 1) task->robust_list->list_op_pending != NULL
-+ * @pending_op == true
-+ * 2) User space futex value == 0
-+ * 3) Regular futex: @pi == false
-+ *
-+ * If these conditions are met, it is safe to attempt waking up a
-+ * potential waiter without touching the user space futex value and
-+ * trying to set the OWNER_DIED bit. The user space futex value is
-+ * uncontended and the rest of the user space mutex state is
-+ * consistent, so a woken waiter will just take over the
-+ * uncontended futex. Setting the OWNER_DIED bit would create
-+ * inconsistent state and malfunction of the user space owner died
-+ * handling.
-+ */
-+ if (pending_op && !pi && !uval) {
-+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
-+ return 0;
-+ }
-+
-+ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
-+ return 0;
-+
-+ /*
-+ * Ok, this dying thread is truly holding a futex
-+ * of interest. Set the OWNER_DIED bit atomically
-+ * via cmpxchg, and if the value had FUTEX_WAITERS
-+ * set, wake up a waiter (if any). (We have to do a
-+ * futex_wake() even if OWNER_DIED is already set -
-+ * to handle the rare but possible case of recursive
-+ * thread-death.) The rest of the cleanup is done in
-+ * userspace.
-+ */
-+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-+
-+ /*
-+ * We are not holding a lock here, but we want to have
-+ * the pagefault_disable/enable() protection because
-+ * we want to handle the fault gracefully. If the
-+ * access fails we try to fault in the futex with R/W
-+ * verification via get_user_pages. get_user() above
-+ * does not guarantee R/W access. If that fails we
-+ * give up and leave the futex locked.
-+ */
-+ if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
-+ switch (err) {
-+ case -EFAULT:
-+ if (fault_in_user_writeable(uaddr))
-+ return -1;
-+ goto retry;
-+
-+ case -EAGAIN:
-+ cond_resched();
-+ goto retry;
-+
-+ default:
-+ WARN_ON_ONCE(1);
-+ return err;
-+ }
-+ }
-+
-+ if (nval != uval)
-+ goto retry;
-+
-+ /*
-+ * Wake robust non-PI futexes here. The wakeup of
-+ * PI futexes happens in exit_pi_state():
-+ */
-+ if (!pi && (uval & FUTEX_WAITERS))
-+ futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
-+ */
-+static inline int fetch_robust_entry(struct robust_list __user **entry,
-+ struct robust_list __user * __user *head,
-+ unsigned int *pi)
-+{
-+ unsigned long uentry;
-+
-+ if (get_user(uentry, (unsigned long __user *)head))
-+ return -EFAULT;
-+
-+ *entry = (void __user *)(uentry & ~1UL);
-+ *pi = uentry & 1;
-+
-+ return 0;
-+}
-+
-+/*
-+ * Walk curr->robust_list (very carefully, it's a userspace list!)
-+ * and mark any locks found there dead, and notify any waiters.
-+ *
-+ * We silently return on any sign of a list-walking problem.
-+ */
-+static void exit_robust_list(struct task_struct *curr)
-+{
-+ struct robust_list_head __user *head = curr->robust_list;
-+ struct robust_list __user *entry, *next_entry, *pending;
-+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-+ unsigned int next_pi;
-+ unsigned long futex_offset;
-+ int rc;
-+
-+ if (!futex_cmpxchg_enabled)
-+ return;
-+
-+ /*
-+ * Fetch the list head (which was registered earlier, via
-+ * sys_set_robust_list()):
-+ */
-+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
-+ return;
-+ /*
-+ * Fetch the relative futex offset:
-+ */
-+ if (get_user(futex_offset, &head->futex_offset))
-+ return;
-+ /*
-+ * Fetch any possibly pending lock-add first, and handle it
-+ * if it exists:
-+ */
-+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
-+ return;
-+
-+ next_entry = NULL; /* avoid warning with gcc */
-+ while (entry != &head->list) {
-+ /*
-+ * Fetch the next entry in the list before calling
-+ * handle_futex_death:
-+ */
-+ rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
-+ /*
-+ * A pending lock might already be on the list, so
-+ * don't process it twice:
-+ */
-+ if (entry != pending) {
-+ if (handle_futex_death((void __user *)entry + futex_offset,
-+ curr, pi, HANDLE_DEATH_LIST))
-+ return;
-+ }
-+ if (rc)
-+ return;
-+ entry = next_entry;
-+ pi = next_pi;
-+ /*
-+ * Avoid excessively long or circular lists:
-+ */
-+ if (!--limit)
-+ break;
-+
-+ cond_resched();
-+ }
-+
-+ if (pending) {
-+ handle_futex_death((void __user *)pending + futex_offset,
-+ curr, pip, HANDLE_DEATH_PENDING);
-+ }
-+}
-+
-+#ifdef CONFIG_COMPAT
-+static void __user *futex_uaddr(struct robust_list __user *entry,
-+ compat_long_t futex_offset)
-+{
-+ compat_uptr_t base = ptr_to_compat(entry);
-+ void __user *uaddr = compat_ptr(base + futex_offset);
-+
-+ return uaddr;
-+}
-+
-+/*
-+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
-+ */
-+static inline int
-+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
-+ compat_uptr_t __user *head, unsigned int *pi)
-+{
-+ if (get_user(*uentry, head))
-+ return -EFAULT;
-+
-+ *entry = compat_ptr((*uentry) & ~1);
-+ *pi = (unsigned int)(*uentry) & 1;
-+
-+ return 0;
-+}
-+
-+/*
-+ * Walk curr->robust_list (very carefully, it's a userspace list!)
-+ * and mark any locks found there dead, and notify any waiters.
-+ *
-+ * We silently return on any sign of a list-walking problem.
-+ */
-+static void compat_exit_robust_list(struct task_struct *curr)
-+{
-+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
-+ struct robust_list __user *entry, *next_entry, *pending;
-+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
-+ unsigned int next_pi;
-+ compat_uptr_t uentry, next_uentry, upending;
-+ compat_long_t futex_offset;
-+ int rc;
-+
-+ if (!futex_cmpxchg_enabled)
-+ return;
-+
-+ /*
-+ * Fetch the list head (which was registered earlier, via
-+ * sys_set_robust_list()):
-+ */
-+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
-+ return;
-+ /*
-+ * Fetch the relative futex offset:
-+ */
-+ if (get_user(futex_offset, &head->futex_offset))
-+ return;
-+ /*
-+ * Fetch any possibly pending lock-add first, and handle it
-+ * if it exists:
-+ */
-+ if (compat_fetch_robust_entry(&upending, &pending,
-+ &head->list_op_pending, &pip))
-+ return;
-+
-+ next_entry = NULL; /* avoid warning with gcc */
-+ while (entry != (struct robust_list __user *) &head->list) {
-+ /*
-+ * Fetch the next entry in the list before calling
-+ * handle_futex_death:
-+ */
-+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
-+ (compat_uptr_t __user *)&entry->next, &next_pi);
-+ /*
-+ * A pending lock might already be on the list, so
-+ * don't process it twice:
-+ */
-+ if (entry != pending) {
-+ void __user *uaddr = futex_uaddr(entry, futex_offset);
-+
-+ if (handle_futex_death(uaddr, curr, pi,
-+ HANDLE_DEATH_LIST))
-+ return;
-+ }
-+ if (rc)
-+ return;
-+ uentry = next_uentry;
-+ entry = next_entry;
-+ pi = next_pi;
-+ /*
-+ * Avoid excessively long or circular lists:
-+ */
-+ if (!--limit)
-+ break;
-+
-+ cond_resched();
-+ }
-+ if (pending) {
-+ void __user *uaddr = futex_uaddr(pending, futex_offset);
-+
-+ handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
-+ }
-+}
-+#endif
-+
-+#ifdef CONFIG_FUTEX_PI
-+
-+/*
-+ * This task is holding PI mutexes at exit time => bad.
-+ * Kernel cleans up PI-state, but userspace is likely hosed.
-+ * (Robust-futex cleanup is separate and might save the day for userspace.)
-+ */
-+static void exit_pi_state_list(struct task_struct *curr)
-+{
-+ struct list_head *next, *head = &curr->pi_state_list;
-+ struct futex_pi_state *pi_state;
-+ struct futex_hash_bucket *hb;
-+ union futex_key key = FUTEX_KEY_INIT;
-+
-+ if (!futex_cmpxchg_enabled)
-+ return;
-+ /*
-+ * We are a ZOMBIE and nobody can enqueue itself on
-+ * pi_state_list anymore, but we have to be careful
-+ * versus waiters unqueueing themselves:
-+ */
-+ raw_spin_lock_irq(&curr->pi_lock);
-+ while (!list_empty(head)) {
-+ next = head->next;
-+ pi_state = list_entry(next, struct futex_pi_state, list);
-+ key = pi_state->key;
-+ hb = futex_hash(&key);
-+
-+ /*
-+ * We can race against put_pi_state() removing itself from the
-+ * list (a waiter going away). put_pi_state() will first
-+ * decrement the reference count and then modify the list, so
-+ * it's possible to see the list entry but fail this reference
-+ * acquire.
-+ *
-+ * In that case; drop the locks to let put_pi_state() make
-+ * progress and retry the loop.
-+ */
-+ if (!refcount_inc_not_zero(&pi_state->refcount)) {
-+ raw_spin_unlock_irq(&curr->pi_lock);
-+ cpu_relax();
-+ raw_spin_lock_irq(&curr->pi_lock);
-+ continue;
-+ }
-+ raw_spin_unlock_irq(&curr->pi_lock);
-+
-+ spin_lock(&hb->lock);
-+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-+ raw_spin_lock(&curr->pi_lock);
-+ /*
-+ * We dropped the pi-lock, so re-check whether this
-+ * task still owns the PI-state:
-+ */
-+ if (head->next != next) {
-+ /* retain curr->pi_lock for the loop invariant */
-+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-+ spin_unlock(&hb->lock);
-+ put_pi_state(pi_state);
-+ continue;
-+ }
-+
-+ WARN_ON(pi_state->owner != curr);
-+ WARN_ON(list_empty(&pi_state->list));
-+ list_del_init(&pi_state->list);
-+ pi_state->owner = NULL;
-+
-+ raw_spin_unlock(&curr->pi_lock);
-+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-+ spin_unlock(&hb->lock);
-+
-+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
-+ put_pi_state(pi_state);
-+
-+ raw_spin_lock_irq(&curr->pi_lock);
-+ }
-+ raw_spin_unlock_irq(&curr->pi_lock);
-+}
-+#else
-+static inline void exit_pi_state_list(struct task_struct *curr) { }
-+#endif
-+
-+static void futex_cleanup(struct task_struct *tsk)
-+{
-+ if (unlikely(tsk->robust_list)) {
-+ exit_robust_list(tsk);
-+ tsk->robust_list = NULL;
-+ }
-+
-+#ifdef CONFIG_COMPAT
-+ if (unlikely(tsk->compat_robust_list)) {
-+ compat_exit_robust_list(tsk);
-+ tsk->compat_robust_list = NULL;
-+ }
-+#endif
-+
-+ if (unlikely(!list_empty(&tsk->pi_state_list)))
-+ exit_pi_state_list(tsk);
-+}
-+
-+/**
-+ * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
-+ * @tsk: task to set the state on
-+ *
-+ * Set the futex exit state of the task locklessly. The futex waiter code
-+ * observes that state when a task is exiting and loops until the task has
-+ * actually finished the futex cleanup. The worst case for this is that the
-+ * waiter runs through the wait loop until the state becomes visible.
-+ *
-+ * This is called from the recursive fault handling path in do_exit().
-+ *
-+ * This is best effort. Either the futex exit code has run already or
-+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
-+ * take it over. If not, the problem is pushed back to user space. If the
-+ * futex exit code did not run yet, then an already queued waiter might
-+ * block forever, but there is nothing which can be done about that.
-+ */
-+void futex_exit_recursive(struct task_struct *tsk)
-+{
-+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
-+ if (tsk->futex_state == FUTEX_STATE_EXITING)
-+ mutex_unlock(&tsk->futex_exit_mutex);
-+ tsk->futex_state = FUTEX_STATE_DEAD;
-+}
-+
-+static void futex_cleanup_begin(struct task_struct *tsk)
-+{
-+ /*
-+ * Prevent various race issues against a concurrent incoming waiter
-+ * including live locks by forcing the waiter to block on
-+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
-+ * attach_to_pi_owner().
-+ */
-+ mutex_lock(&tsk->futex_exit_mutex);
-+
-+ /*
-+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
-+ *
-+ * This ensures that all subsequent checks of tsk->futex_state in
-+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
-+ * tsk->pi_lock held.
-+ *
-+ * It guarantees also that a pi_state which was queued right before
-+ * the state change under tsk->pi_lock by a concurrent waiter must
-+ * be observed in exit_pi_state_list().
-+ */
-+ raw_spin_lock_irq(&tsk->pi_lock);
-+ tsk->futex_state = FUTEX_STATE_EXITING;
-+ raw_spin_unlock_irq(&tsk->pi_lock);
-+}
-+
-+static void futex_cleanup_end(struct task_struct *tsk, int state)
-+{
-+ /*
-+ * Lockless store. The only side effect is that an observer might
-+ * take another loop until it becomes visible.
-+ */
-+ tsk->futex_state = state;
-+ /*
-+ * Drop the exit protection. This unblocks waiters which observed
-+ * FUTEX_STATE_EXITING to reevaluate the state.
-+ */
-+ mutex_unlock(&tsk->futex_exit_mutex);
-+}
-+
-+void futex_exec_release(struct task_struct *tsk)
-+{
-+ /*
-+ * The state handling is done for consistency, but in the case of
-+ * exec() there is no way to prevent further damage as the PID stays
-+ * the same. But for the unlikely and arguably buggy case that a
-+ * futex is held on exec(), this provides at least as much state
-+ * consistency protection as is possible.
-+ */
-+ futex_cleanup_begin(tsk);
-+ futex_cleanup(tsk);
-+ /*
-+ * Reset the state to FUTEX_STATE_OK. The task is alive and about to
-+ * exec a new binary.
-+ */
-+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
-+}
-+
-+void futex_exit_release(struct task_struct *tsk)
-+{
-+ futex_cleanup_begin(tsk);
-+ futex_cleanup(tsk);
-+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
-+}
-+
-+static void __init futex_detect_cmpxchg(void)
-+{
-+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-+ u32 curval;
-+
-+ /*
-+ * This will fail and we want it. Some arch implementations do
-+ * runtime detection of the futex_atomic_cmpxchg_inatomic()
-+ * functionality. We want to know that before we call in any
-+ * of the complex code paths. Also we want to prevent
-+ * registration of robust lists in that case. NULL is
-+ * guaranteed to fault and we get -EFAULT on functional
-+ * implementation, the non-functional ones will return
-+ * -ENOSYS.
-+ */
-+ if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT)
-+ futex_cmpxchg_enabled = 1;
-+#endif
-+}
-+
-+static int __init futex_init(void)
-+{
-+ unsigned int futex_shift;
-+ unsigned long i;
-+
-+#if CONFIG_BASE_SMALL
-+ futex_hashsize = 16;
-+#else
-+ futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
-+#endif
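-+	/*
-+	 * Example: with 256 slots per possible CPU, an 8-CPU machine asks for
-+	 * roundup_pow_of_two(256 * 8) = 2048 hash buckets; the final size is
-+	 * then derived from the futex_shift returned below.
-+	 */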
-+
-+ futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
-+ futex_hashsize, 0,
-+ futex_hashsize < 256 ? HASH_SMALL : 0,
-+ &futex_shift, NULL,
-+ futex_hashsize, futex_hashsize);
-+ futex_hashsize = 1UL << futex_shift;
-+
-+ futex_detect_cmpxchg();
-+
-+ for (i = 0; i < futex_hashsize; i++) {
-+ atomic_set(&futex_queues[i].waiters, 0);
-+ plist_head_init(&futex_queues[i].chain);
-+ spin_lock_init(&futex_queues[i].lock);
-+ }
-+
-+ return 0;
-+}
-+core_initcall(futex_init);
-diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
-new file mode 100644
-index 000000000..948fcf317
---- /dev/null
-+++ b/kernel/futex/futex.h
-@@ -0,0 +1,295 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+#ifndef _FUTEX_H
-+#define _FUTEX_H
-+
-+#include <linux/futex.h>
-+#include <linux/sched/wake_q.h>
-+
-+#include <asm/futex.h>
-+
-+/*
-+ * Futex flags used to encode options to functions and preserve them across
-+ * restarts.
-+ */
-+#ifdef CONFIG_MMU
-+# define FLAGS_SHARED 0x01
-+#else
-+/*
-+ * NOMMU does not have a per-process address space. Let the compiler optimize
-+ * code away.
-+ */
-+# define FLAGS_SHARED 0x00
-+#endif
-+#define FLAGS_CLOCKRT 0x02
-+#define FLAGS_HAS_TIMEOUT 0x04
-+
-+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
-+#define futex_cmpxchg_enabled 1
-+#else
-+extern int __read_mostly futex_cmpxchg_enabled;
-+#endif
-+
-+#ifdef CONFIG_FAIL_FUTEX
-+extern bool should_fail_futex(bool fshared);
-+#else
-+static inline bool should_fail_futex(bool fshared)
-+{
-+ return false;
-+}
-+#endif
-+
-+/*
-+ * Hash buckets are shared by all the futex_keys that hash to the same
-+ * location. Each key may have multiple futex_q structures, one for each task
-+ * waiting on a futex.
-+ */
-+struct futex_hash_bucket {
-+ atomic_t waiters;
-+ spinlock_t lock;
-+ struct plist_head chain;
-+} ____cacheline_aligned_in_smp;
-+
-+/*
-+ * Priority Inheritance state:
-+ */
-+struct futex_pi_state {
-+ /*
-+ * list of 'owned' pi_state instances - these have to be
-+ * cleaned up in do_exit() if the task exits prematurely:
-+ */
-+ struct list_head list;
-+
-+ /*
-+ * The PI object:
-+ */
-+ struct rt_mutex_base pi_mutex;
-+
-+ struct task_struct *owner;
-+ refcount_t refcount;
-+
-+ union futex_key key;
-+} __randomize_layout;
-+
-+/**
-+ * struct futex_q - The hashed futex queue entry, one per waiting task
-+ * @list: priority-sorted list of tasks waiting on this futex
-+ * @task: the task waiting on the futex
-+ * @lock_ptr: the hash bucket lock
-+ * @key: the key the futex is hashed on
-+ * @pi_state: optional priority inheritance state
-+ * @rt_waiter: rt_waiter storage for use with requeue_pi
-+ * @requeue_pi_key: the requeue_pi target futex key
-+ * @bitset: bitset for the optional bitmasked wakeup
-+ * @requeue_state: State field for futex_requeue_pi()
-+ * @requeue_wait: RCU wait for futex_requeue_pi() (RT only)
-+ *
-+ * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
-+ * we can wake only the relevant ones (hashed queues may be shared).
-+ *
-+ * A futex_q has a woken state, just like tasks have TASK_RUNNING.
-+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
-+ * The order of wakeup is always to make the first condition true, then
-+ * the second.
-+ *
-+ * PI futexes are typically woken before they are removed from the hash list via
-+ * the rt_mutex code. See futex_unqueue_pi().
-+ */
-+struct futex_q {
-+ struct plist_node list;
-+
-+ struct task_struct *task;
-+ spinlock_t *lock_ptr;
-+ union futex_key key;
-+ struct futex_pi_state *pi_state;
-+ struct rt_mutex_waiter *rt_waiter;
-+ union futex_key *requeue_pi_key;
-+ u32 bitset;
-+ atomic_t requeue_state;
-+#ifdef CONFIG_PREEMPT_RT
-+ struct rcuwait requeue_wait;
-+#endif
-+} __randomize_layout;
-+
-+extern const struct futex_q futex_q_init;
-+
-+enum futex_access {
-+ FUTEX_READ,
-+ FUTEX_WRITE
-+};
-+
-+extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
-+ enum futex_access rw);
-+
-+extern struct hrtimer_sleeper *
-+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
-+ int flags, u64 range_ns);
-+
-+extern struct futex_hash_bucket *futex_hash(union futex_key *key);
-+
-+/**
-+ * futex_match - Check whether two futex keys are equal
-+ * @key1: Pointer to key1
-+ * @key2: Pointer to key2
-+ *
-+ * Return 1 if two futex_keys are equal, 0 otherwise.
-+ */
-+static inline int futex_match(union futex_key *key1, union futex_key *key2)
-+{
-+ return (key1 && key2
-+ && key1->both.word == key2->both.word
-+ && key1->both.ptr == key2->both.ptr
-+ && key1->both.offset == key2->both.offset);
-+}
-+
-+extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
-+ struct futex_q *q, struct futex_hash_bucket **hb);
-+extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
-+ struct hrtimer_sleeper *timeout);
-+extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);
-+
-+extern int fault_in_user_writeable(u32 __user *uaddr);
-+extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval);
-+extern int futex_get_value_locked(u32 *dest, u32 __user *from);
-+extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);
-+
-+extern void __futex_unqueue(struct futex_q *q);
-+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
-+extern int futex_unqueue(struct futex_q *q);
-+
-+/**
-+ * futex_queue() - Enqueue the futex_q on the futex_hash_bucket
-+ * @q: The futex_q to enqueue
-+ * @hb: The destination hash bucket
-+ *
-+ * The hb->lock must be held by the caller, and is released here. A call to
-+ * futex_queue() is typically paired with exactly one call to futex_unqueue(). The
-+ * exceptions involve the PI related operations, which may use futex_unqueue_pi()
-+ * or nothing if the unqueue is done as part of the wake process and the unqueue
-+ * state is implicit in the state of the woken task (see futex_wait_requeue_pi() for
-+ * an example).
-+ */
-+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
-+ __releases(&hb->lock)
-+{
-+ __futex_queue(q, hb);
-+ spin_unlock(&hb->lock);
-+}
-+
-+extern void futex_unqueue_pi(struct futex_q *q);
-+
-+extern void wait_for_owner_exiting(int ret, struct task_struct *exiting);
-+
-+/*
-+ * Reflects a new waiter being added to the waitqueue.
-+ */
-+static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb)
-+{
-+#ifdef CONFIG_SMP
-+ atomic_inc(&hb->waiters);
-+ /*
-+ * Full barrier (A), see the ordering comment above.
-+ */
-+ smp_mb__after_atomic();
-+#endif
-+}
-+
-+/*
-+ * Reflects a waiter being removed from the waitqueue by wakeup
-+ * paths.
-+ */
-+static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb)
-+{
-+#ifdef CONFIG_SMP
-+ atomic_dec(&hb->waiters);
-+#endif
-+}
-+
-+static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
-+{
-+#ifdef CONFIG_SMP
-+ /*
-+ * Full barrier (B), see the ordering comment above.
-+ */
-+ smp_mb();
-+ return atomic_read(&hb->waiters);
-+#else
-+ return 1;
-+#endif
-+}
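-+
-+/*
-+ * A minimal sketch of the intended pairing, assuming the wake side checks
-+ * the waiter count before taking hb->lock:
-+ *
-+ *	if (!futex_hb_waiters_pending(hb))
-+ *		return;
-+ *	spin_lock(&hb->lock);
-+ *
-+ * Barrier (A) in futex_hb_waiters_inc() pairs with barrier (B) in
-+ * futex_hb_waiters_pending(): either the waker observes the incremented
-+ * waiter count or the waiter observes the updated futex value.
-+ */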
-+
-+extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q);
-+extern void futex_q_unlock(struct futex_hash_bucket *hb);
-+
-+
-+extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
-+ union futex_key *key,
-+ struct futex_pi_state **ps,
-+ struct task_struct *task,
-+ struct task_struct **exiting,
-+ int set_waiters);
-+
-+extern int refill_pi_state_cache(void);
-+extern void get_pi_state(struct futex_pi_state *pi_state);
-+extern void put_pi_state(struct futex_pi_state *pi_state);
-+extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked);
-+
-+/*
-+ * Express the locking dependencies for lockdep:
-+ */
-+static inline void
-+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-+{
-+ if (hb1 > hb2)
-+ swap(hb1, hb2);
-+
-+ spin_lock(&hb1->lock);
-+ if (hb1 != hb2)
-+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
-+}
-+
-+static inline void
-+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
-+{
-+ spin_unlock(&hb1->lock);
-+ if (hb1 != hb2)
-+ spin_unlock(&hb2->lock);
-+}
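-+
-+/*
-+ * Typical pairing, as done by futex_requeue():
-+ *
-+ *	double_lock_hb(hb1, hb2);
-+ *	... operate on both hash bucket chains ...
-+ *	double_unlock_hb(hb1, hb2);
-+ *
-+ * The hb1 > hb2 swap in double_lock_hb() keeps the acquisition order stable
-+ * regardless of argument order, which is what lockdep expects.
-+ */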
-+
-+/* syscalls */
-+
-+extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
-+ val, ktime_t *abs_time, u32 bitset, u32 __user
-+ *uaddr2);
-+
-+extern int futex_requeue(u32 __user *uaddr1, unsigned int flags,
-+ u32 __user *uaddr2, int nr_wake, int nr_requeue,
-+ u32 *cmpval, int requeue_pi);
-+
-+extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
-+ ktime_t *abs_time, u32 bitset);
-+
-+/**
-+ * struct futex_vector - Auxiliary struct for futex_waitv()
-+ * @w: Userspace provided data
-+ * @q: Kernel side data
-+ *
-+ * Struct used to build an array with all data needed for futex_waitv()
-+ */
-+struct futex_vector {
-+ struct futex_waitv w;
-+ struct futex_q q;
-+};
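-+
-+/*
-+ * A minimal sketch, assuming the futex_waitv() entry code fills one entry
-+ * per user-supplied waiter before calling futex_wait_multiple():
-+ *
-+ *	vs[i].w = <struct futex_waitv copied from userspace>;
-+ *	vs[i].q = futex_q_init;
-+ */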
-+
-+extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
-+ struct hrtimer_sleeper *to);
-+
-+extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset);
-+
-+extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
-+ u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);
-+
-+extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);
-+
-+extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);
-+
-+#endif /* _FUTEX_H */
-diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
-new file mode 100644
-index 000000000..183b28c32
---- /dev/null
-+++ b/kernel/futex/pi.c
-@@ -0,0 +1,1233 @@
-+// SPDX-License-Identifier: GPL-2.0-or-later
-+
-+#include <linux/slab.h>
-+#include <linux/sched/task.h>
-+
-+#include "futex.h"
-+#include "../locking/rtmutex_common.h"
-+
-+/*
-+ * PI code:
-+ */
-+int refill_pi_state_cache(void)
-+{
-+ struct futex_pi_state *pi_state;
-+
-+ if (likely(current->pi_state_cache))
-+ return 0;
-+
-+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
-+
-+ if (!pi_state)
-+ return -ENOMEM;
-+
-+ INIT_LIST_HEAD(&pi_state->list);
-+ /* pi_mutex gets initialized later */
-+ pi_state->owner = NULL;
-+ refcount_set(&pi_state->refcount, 1);
-+ pi_state->key = FUTEX_KEY_INIT;
-+
-+ current->pi_state_cache = pi_state;
-+
-+ return 0;
-+}
-+
-+static struct futex_pi_state *alloc_pi_state(void)
-+{
-+ struct futex_pi_state *pi_state = current->pi_state_cache;
-+
-+ WARN_ON(!pi_state);
-+ current->pi_state_cache = NULL;
-+
-+ return pi_state;
-+}
-+
-+static void pi_state_update_owner(struct futex_pi_state *pi_state,
-+ struct task_struct *new_owner)
-+{
-+ struct task_struct *old_owner = pi_state->owner;
-+
-+ lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
-+
-+ if (old_owner) {
-+ raw_spin_lock(&old_owner->pi_lock);
-+ WARN_ON(list_empty(&pi_state->list));
-+ list_del_init(&pi_state->list);
-+ raw_spin_unlock(&old_owner->pi_lock);
-+ }
-+
-+ if (new_owner) {
-+ raw_spin_lock(&new_owner->pi_lock);
-+ WARN_ON(!list_empty(&pi_state->list));
-+ list_add(&pi_state->list, &new_owner->pi_state_list);
-+ pi_state->owner = new_owner;
-+ raw_spin_unlock(&new_owner->pi_lock);
-+ }
-+}
-+
-+void get_pi_state(struct futex_pi_state *pi_state)
-+{
-+ WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
-+}
-+
-+/*
-+ * Drops a reference to the pi_state object and frees or caches it
-+ * when the last reference is gone.
-+ */
-+void put_pi_state(struct futex_pi_state *pi_state)
-+{
-+ if (!pi_state)
-+ return;
-+
-+ if (!refcount_dec_and_test(&pi_state->refcount))
-+ return;
-+
-+ /*
-+ * If pi_state->owner is NULL, the owner is most probably dying
-+ * and has cleaned up the pi_state already
-+ */
-+ if (pi_state->owner) {
-+ unsigned long flags;
-+
-+ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
-+ pi_state_update_owner(pi_state, NULL);
-+ rt_mutex_proxy_unlock(&pi_state->pi_mutex);
-+ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
-+ }
-+
-+ if (current->pi_state_cache) {
-+ kfree(pi_state);
-+ } else {
-+ /*
-+ * pi_state->list is already empty.
-+ * clear pi_state->owner.
-+ * refcount is at 0 - put it back to 1.
-+ */
-+ pi_state->owner = NULL;
-+ refcount_set(&pi_state->refcount, 1);
-+ current->pi_state_cache = pi_state;
-+ }
-+}
-+
-+/*
-+ * We need to check the following states:
-+ *
-+ * Waiter | pi_state | pi->owner | uTID | uODIED | ?
-+ *
-+ * [1] NULL | --- | --- | 0 | 0/1 | Valid
-+ * [2] NULL | --- | --- | >0 | 0/1 | Valid
-+ *
-+ * [3] Found | NULL | -- | Any | 0/1 | Invalid
-+ *
-+ * [4] Found | Found | NULL | 0 | 1 | Valid
-+ * [5] Found | Found | NULL | >0 | 1 | Invalid
-+ *
-+ * [6] Found | Found | task | 0 | 1 | Valid
-+ *
-+ * [7] Found | Found | NULL | Any | 0 | Invalid
-+ *
-+ * [8] Found | Found | task | ==taskTID | 0/1 | Valid
-+ * [9] Found | Found | task | 0 | 0 | Invalid
-+ * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
-+ *
-+ * [1] Indicates that the kernel can acquire the futex atomically. We
-+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
-+ *
-+ * [2] Valid, if TID does not belong to a kernel thread. If no matching
-+ * thread is found then it indicates that the owner TID has died.
-+ *
-+ * [3] Invalid. The waiter is queued on a non PI futex
-+ *
-+ * [4] Valid state after exit_robust_list(), which sets the user space
-+ * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
-+ *
-+ * [5] The user space value got manipulated between exit_robust_list()
-+ * and exit_pi_state_list()
-+ *
-+ * [6] Valid state after exit_pi_state_list() which sets the new owner in
-+ * the pi_state but cannot access the user space value.
-+ *
-+ * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
-+ *
-+ * [8] Owner and user space value match
-+ *
-+ * [9] There is no transient state which sets the user space TID to 0
-+ * except exit_robust_list(), but this is indicated by the
-+ * FUTEX_OWNER_DIED bit. See [4]
-+ *
-+ * [10] There is no transient state which leaves owner and user space
-+ * TID out of sync. Except one error case where the kernel is denied
-+ * write access to the user address, see fixup_pi_state_owner().
-+ *
-+ *
-+ * Serialization and lifetime rules:
-+ *
-+ * hb->lock:
-+ *
-+ * hb -> futex_q, relation
-+ * futex_q -> pi_state, relation
-+ *
-+ * (cannot be raw because hb can contain an arbitrary number
-+ * of futex_q's)
-+ *
-+ * pi_mutex->wait_lock:
-+ *
-+ * {uval, pi_state}
-+ *
-+ * (and pi_mutex 'obviously')
-+ *
-+ * p->pi_lock:
-+ *
-+ * p->pi_state_list -> pi_state->list, relation
-+ * pi_mutex->owner -> pi_state->owner, relation
-+ *
-+ * pi_state->refcount:
-+ *
-+ * pi_state lifetime
-+ *
-+ *
-+ * Lock order:
-+ *
-+ * hb->lock
-+ * pi_mutex->wait_lock
-+ * p->pi_lock
-+ *
-+ */
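-+
-+/*
-+ * For example, futex_unlock_pi() below follows exactly this order: it takes
-+ * hb->lock, then pi_mutex->wait_lock, and pi_state_update_owner() nests the
-+ * owner's ->pi_lock inside wait_lock.
-+ */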
-+
-+/*
-+ * Validate that the existing waiter has a pi_state and sanity check
-+ * the pi_state against the user space value. If correct, attach to
-+ * it.
-+ */
-+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
-+ struct futex_pi_state *pi_state,
-+ struct futex_pi_state **ps)
-+{
-+ pid_t pid = uval & FUTEX_TID_MASK;
-+ u32 uval2;
-+ int ret;
-+
-+ /*
-+ * Userspace might have messed up non-PI and PI futexes [3]
-+ */
-+ if (unlikely(!pi_state))
-+ return -EINVAL;
-+
-+ /*
-+ * We get here with hb->lock held, and having found a
-+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
-+ * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
-+ * which in turn means that futex_lock_pi() still has a reference on
-+ * our pi_state.
-+ *
-+ * The waiter holding a reference on @pi_state also protects against
-+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
-+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
-+ * free pi_state before we can take a reference ourselves.
-+ */
-+ WARN_ON(!refcount_read(&pi_state->refcount));
-+
-+ /*
-+ * Now that we have a pi_state, we can acquire wait_lock
-+ * and do the state validation.
-+ */
-+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-+
-+ /*
-+ * Since {uval, pi_state} is serialized by wait_lock, and our current
-+ * uval was read without holding it, it can have changed. Verify it
-+ * still is what we expect it to be, otherwise retry the entire
-+ * operation.
-+ */
-+ if (futex_get_value_locked(&uval2, uaddr))
-+ goto out_efault;
-+
-+ if (uval != uval2)
-+ goto out_eagain;
-+
-+ /*
-+ * Handle the owner died case:
-+ */
-+ if (uval & FUTEX_OWNER_DIED) {
-+ /*
-+ * exit_pi_state_list sets owner to NULL and wakes the
-+ * topmost waiter. The task which acquires the
-+ * pi_state->rt_mutex will fixup owner.
-+ */
-+ if (!pi_state->owner) {
-+ /*
-+ * No pi state owner, but the user space TID
-+ * is not 0. Inconsistent state. [5]
-+ */
-+ if (pid)
-+ goto out_einval;
-+ /*
-+ * Take a ref on the state and return success. [4]
-+ */
-+ goto out_attach;
-+ }
-+
-+ /*
-+ * If TID is 0, then either the dying owner has not
-+ * yet executed exit_pi_state_list() or some waiter
-+ * acquired the rtmutex in the pi state, but did not
-+ * yet fixup the TID in user space.
-+ *
-+ * Take a ref on the state and return success. [6]
-+ */
-+ if (!pid)
-+ goto out_attach;
-+ } else {
-+ /*
-+ * If the owner died bit is not set, then the pi_state
-+ * must have an owner. [7]
-+ */
-+ if (!pi_state->owner)
-+ goto out_einval;
-+ }
-+
-+ /*
-+ * Bail out if user space manipulated the futex value. If pi
-+ * state exists then the owner TID must be the same as the
-+ * user space TID. [9/10]
-+ */
-+ if (pid != task_pid_vnr(pi_state->owner))
-+ goto out_einval;
-+
-+out_attach:
-+ get_pi_state(pi_state);
-+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-+ *ps = pi_state;
-+ return 0;
-+
-+out_einval:
-+ ret = -EINVAL;
-+ goto out_error;
-+
-+out_eagain:
-+ ret = -EAGAIN;
-+ goto out_error;
-+
-+out_efault:
-+ ret = -EFAULT;
-+ goto out_error;
-+
-+out_error:
-+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-+ return ret;
-+}
-+
-+static int handle_exit_race(u32 __user *uaddr, u32 uval,
-+ struct task_struct *tsk)
-+{
-+ u32 uval2;
-+
-+ /*
-+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
-+ * caller that the alleged owner is busy.
-+ */
-+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
-+ return -EBUSY;
-+
-+ /*
-+ * Reread the user space value to handle the following situation:
-+ *
-+ * CPU0 CPU1
-+ *
-+ * sys_exit() sys_futex()
-+ * do_exit() futex_lock_pi()
-+ * futex_lock_pi_atomic()
-+ * exit_signals(tsk) No waiters:
-+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
-+ * mm_release(tsk) Set waiter bit
-+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
-+ * Set owner died attach_to_pi_owner() {
-+ * *uaddr = 0xC0000000; tsk = get_task(PID);
-+ * } if (!tsk->flags & PF_EXITING) {
-+ * ... attach();
-+ * tsk->futex_state = } else {
-+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
-+ * FUTEX_STATE_DEAD)
-+ * return -EAGAIN;
-+ * return -ESRCH; <--- FAIL
-+ * }
-+ *
-+ * Returning ESRCH unconditionally is wrong here because the
-+ * user space value has been changed by the exiting task.
-+ *
-+ * The same logic applies to the case where the exiting task is
-+ * already gone.
-+ */
-+ if (futex_get_value_locked(&uval2, uaddr))
-+ return -EFAULT;
-+
-+ /* If the user space value has changed, try again. */
-+ if (uval2 != uval)
-+ return -EAGAIN;
-+
-+ /*
-+ * The exiting task did not have a robust list, the robust list was
-+ * corrupted or the user space value in *uaddr is simply bogus.
-+ * Give up and tell user space.
-+ */
-+ return -ESRCH;
-+}
-+
-+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
-+ struct futex_pi_state **ps)
-+{
-+ /*
-+ * No existing pi state. First waiter. [2]
-+ *
-+	 * This creates pi_state; we have hb->lock held, which means nothing can
-+	 * observe this state and wait_lock is irrelevant.
-+ */
-+ struct futex_pi_state *pi_state = alloc_pi_state();
-+
-+ /*
-+ * Initialize the pi_mutex in locked state and make @p
-+ * the owner of it:
-+ */
-+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-+
-+ /* Store the key for possible exit cleanups: */
-+ pi_state->key = *key;
-+
-+ WARN_ON(!list_empty(&pi_state->list));
-+ list_add(&pi_state->list, &p->pi_state_list);
-+ /*
-+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
-+ * because there is no concurrency as the object is not published yet.
-+ */
-+ pi_state->owner = p;
-+
-+ *ps = pi_state;
-+}
-+/*
-+ * Lookup the task for the TID provided from user space and attach to
-+ * it after doing proper sanity checks.
-+ */
-+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
-+ struct futex_pi_state **ps,
-+ struct task_struct **exiting)
-+{
-+ pid_t pid = uval & FUTEX_TID_MASK;
-+ struct task_struct *p;
-+
-+ /*
-+ * We are the first waiter - try to look up the real owner and attach
-+ * the new pi_state to it, but bail out when TID = 0 [1]
-+ *
-+ * The !pid check is paranoid. None of the call sites should end up
-+ * with pid == 0, but better safe than sorry. Let the caller retry
-+ */
-+ if (!pid)
-+ return -EAGAIN;
-+ p = find_get_task_by_vpid(pid);
-+ if (!p)
-+ return handle_exit_race(uaddr, uval, NULL);
-+
-+ if (unlikely(p->flags & PF_KTHREAD)) {
-+ put_task_struct(p);
-+ return -EPERM;
-+ }
-+
-+ /*
-+	 * We need to look at the task state to figure out whether the
-+ * task is exiting. To protect against the change of the task state
-+ * in futex_exit_release(), we do this protected by p->pi_lock:
-+ */
-+ raw_spin_lock_irq(&p->pi_lock);
-+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
-+ /*
-+ * The task is on the way out. When the futex state is
-+ * FUTEX_STATE_DEAD, we know that the task has finished
-+ * the cleanup:
-+ */
-+ int ret = handle_exit_race(uaddr, uval, p);
-+
-+ raw_spin_unlock_irq(&p->pi_lock);
-+ /*
-+ * If the owner task is between FUTEX_STATE_EXITING and
-+ * FUTEX_STATE_DEAD then store the task pointer and keep
-+ * the reference on the task struct. The calling code will
-+ * drop all locks, wait for the task to reach
-+ * FUTEX_STATE_DEAD and then drop the refcount. This is
-+ * required to prevent a live lock when the current task
-+ * preempted the exiting task between the two states.
-+ */
-+ if (ret == -EBUSY)
-+ *exiting = p;
-+ else
-+ put_task_struct(p);
-+ return ret;
-+ }
-+
-+ __attach_to_pi_owner(p, key, ps);
-+ raw_spin_unlock_irq(&p->pi_lock);
-+
-+ put_task_struct(p);
-+
-+ return 0;
-+}
-+
-+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
-+{
-+ int err;
-+ u32 curval;
-+
-+ if (unlikely(should_fail_futex(true)))
-+ return -EFAULT;
-+
-+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
-+ if (unlikely(err))
-+ return err;
-+
-+ /* If user space value changed, let the caller retry */
-+ return curval != uval ? -EAGAIN : 0;
-+}
-+
-+/**
-+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
-+ * @uaddr: the pi futex user address
-+ * @hb: the pi futex hash bucket
-+ * @key: the futex key associated with uaddr and hb
-+ * @ps: the pi_state pointer where we store the result of the
-+ * lookup
-+ * @task: the task to perform the atomic lock work for. This will
-+ * be "current" except in the case of requeue pi.
-+ * @exiting: Pointer to store the task pointer of the owner task
-+ * which is in the middle of exiting
-+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
-+ *
-+ * Return:
-+ * - 0 - ready to wait;
-+ * - 1 - acquired the lock;
-+ * - <0 - error
-+ *
-+ * The hb->lock must be held by the caller.
-+ *
-+ * @exiting is only set when the return value is -EBUSY. If so, this holds
-+ * a refcount on the exiting task on return and the caller needs to drop it
-+ * after waiting for the exit to complete.
-+ */
-+int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
-+ union futex_key *key,
-+ struct futex_pi_state **ps,
-+ struct task_struct *task,
-+ struct task_struct **exiting,
-+ int set_waiters)
-+{
-+ u32 uval, newval, vpid = task_pid_vnr(task);
-+ struct futex_q *top_waiter;
-+ int ret;
-+
-+ /*
-+ * Read the user space value first so we can validate a few
-+ * things before proceeding further.
-+ */
-+ if (futex_get_value_locked(&uval, uaddr))
-+ return -EFAULT;
-+
-+ if (unlikely(should_fail_futex(true)))
-+ return -EFAULT;
-+
-+ /*
-+ * Detect deadlocks.
-+ */
-+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
-+ return -EDEADLK;
-+
-+ if ((unlikely(should_fail_futex(true))))
-+ return -EDEADLK;
-+
-+ /*
-+ * Lookup existing state first. If it exists, try to attach to
-+ * its pi_state.
-+ */
-+ top_waiter = futex_top_waiter(hb, key);
-+ if (top_waiter)
-+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
-+
-+ /*
-+	 * No waiter and the user TID is 0. We are here because the
-+	 * waiters bit or the owner died bit is set, because this was
-+	 * called from requeue_cmp_pi, or because for whatever reason
-+	 * something took the syscall.
-+ */
-+ if (!(uval & FUTEX_TID_MASK)) {
-+ /*
-+ * We take over the futex. No other waiters and the user space
-+ * TID is 0. We preserve the owner died bit.
-+ */
-+ newval = uval & FUTEX_OWNER_DIED;
-+ newval |= vpid;
-+
-+ /* The futex requeue_pi code can enforce the waiters bit */
-+ if (set_waiters)
-+ newval |= FUTEX_WAITERS;
-+
-+ ret = lock_pi_update_atomic(uaddr, uval, newval);
-+ if (ret)
-+ return ret;
-+
-+ /*
-+ * If the waiter bit was requested the caller also needs PI
-+ * state attached to the new owner of the user space futex.
-+ *
-+ * @task is guaranteed to be alive and it cannot be exiting
-+ * because it is either sleeping or waiting in
-+ * futex_requeue_pi_wakeup_sync().
-+ *
-+ * No need to do the full attach_to_pi_owner() exercise
-+ * because @task is known and valid.
-+ */
-+ if (set_waiters) {
-+ raw_spin_lock_irq(&task->pi_lock);
-+ __attach_to_pi_owner(task, key, ps);
-+ raw_spin_unlock_irq(&task->pi_lock);
-+ }
-+ return 1;
-+ }
-+
-+ /*
-+ * First waiter. Set the waiters bit before attaching ourself to
-+	 * the owner. If the owner tries to unlock, it will be forced into
-+ * the kernel and blocked on hb->lock.
-+ */
-+ newval = uval | FUTEX_WAITERS;
-+ ret = lock_pi_update_atomic(uaddr, uval, newval);
-+ if (ret)
-+ return ret;
-+ /*
-+ * If the update of the user space value succeeded, we try to
-+ * attach to the owner. If that fails, no harm done, we only
-+ * set the FUTEX_WAITERS bit in the user space variable.
-+ */
-+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
-+}
-+
-+/*
-+ * Caller must hold a reference on @pi_state.
-+ */
-+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
-+{
-+ struct rt_mutex_waiter *top_waiter;
-+ struct task_struct *new_owner;
-+ bool postunlock = false;
-+ DEFINE_RT_WAKE_Q(wqh);
-+ u32 curval, newval;
-+ int ret = 0;
-+
-+ top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
-+ if (WARN_ON_ONCE(!top_waiter)) {
-+ /*
-+ * As per the comment in futex_unlock_pi() this should not happen.
-+ *
-+ * When this happens, give up our locks and try again, giving
-+ * the futex_lock_pi() instance time to complete, either by
-+ * waiting on the rtmutex or removing itself from the futex
-+ * queue.
-+ */
-+ ret = -EAGAIN;
-+ goto out_unlock;
-+ }
-+
-+ new_owner = top_waiter->task;
-+
-+ /*
-+ * We pass it to the next owner. The WAITERS bit is always kept
-+	 * enabled while there is PI state around. We clean up the owner
-+ * died bit, because we are the owner.
-+ */
-+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
-+
-+ if (unlikely(should_fail_futex(true))) {
-+ ret = -EFAULT;
-+ goto out_unlock;
-+ }
-+
-+ ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
-+ if (!ret && (curval != uval)) {
-+ /*
-+		 * If an unconditional UNLOCK_PI operation (user space did not
-+ * try the TID->0 transition) raced with a waiter setting the
-+ * FUTEX_WAITERS flag between get_user() and locking the hash
-+ * bucket lock, retry the operation.
-+ */
-+ if ((FUTEX_TID_MASK & curval) == uval)
-+ ret = -EAGAIN;
-+ else
-+ ret = -EINVAL;
-+ }
-+
-+ if (!ret) {
-+ /*
-+ * This is a point of no return; once we modified the uval
-+ * there is no going back and subsequent operations must
-+ * not fail.
-+ */
-+ pi_state_update_owner(pi_state, new_owner);
-+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
-+ }
-+
-+out_unlock:
-+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-+
-+ if (postunlock)
-+ rt_mutex_postunlock(&wqh);
-+
-+ return ret;
-+}
-+
-+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-+ struct task_struct *argowner)
-+{
-+ struct futex_pi_state *pi_state = q->pi_state;
-+ struct task_struct *oldowner, *newowner;
-+ u32 uval, curval, newval, newtid;
-+ int err = 0;
-+
-+ oldowner = pi_state->owner;
-+
-+ /*
-+ * We are here because either:
-+ *
-+ * - we stole the lock and pi_state->owner needs updating to reflect
-+ * that (@argowner == current),
-+ *
-+ * or:
-+ *
-+ * - someone stole our lock and we need to fix things to point to the
-+ * new owner (@argowner == NULL).
-+ *
-+ * Either way, we have to replace the TID in the user space variable.
-+ * This must be atomic as we have to preserve the owner died bit here.
-+ *
-+ * Note: We write the user space value _before_ changing the pi_state
-+ * because we can fault here. Imagine swapped out pages or a fork
-+ * that marked all the anonymous memory readonly for cow.
-+ *
-+ * Modifying pi_state _before_ the user space value would leave the
-+ * pi_state in an inconsistent state when we fault here, because we
-+ * need to drop the locks to handle the fault. This might be observed
-+	 * in the PID checks when attaching to PI state.
-+ */
-+retry:
-+ if (!argowner) {
-+ if (oldowner != current) {
-+ /*
-+ * We raced against a concurrent self; things are
-+ * already fixed up. Nothing to do.
-+ */
-+ return 0;
-+ }
-+
-+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
-+ /* We got the lock. pi_state is correct. Tell caller. */
-+ return 1;
-+ }
-+
-+ /*
-+ * The trylock just failed, so either there is an owner or
-+ * there is a higher priority waiter than this one.
-+ */
-+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
-+ /*
-+ * If the higher priority waiter has not yet taken over the
-+ * rtmutex then newowner is NULL. We can't return here with
-+ * that state because it's inconsistent vs. the user space
-+ * state. So drop the locks and try again. It's a valid
-+ * situation and not any different from the other retry
-+ * conditions.
-+ */
-+ if (unlikely(!newowner)) {
-+ err = -EAGAIN;
-+ goto handle_err;
-+ }
-+ } else {
-+ WARN_ON_ONCE(argowner != current);
-+ if (oldowner == current) {
-+ /*
-+ * We raced against a concurrent self; things are
-+ * already fixed up. Nothing to do.
-+ */
-+ return 1;
-+ }
-+ newowner = argowner;
-+ }
-+
-+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
-+ /* Owner died? */
-+ if (!pi_state->owner)
-+ newtid |= FUTEX_OWNER_DIED;
-+
-+ err = futex_get_value_locked(&uval, uaddr);
-+ if (err)
-+ goto handle_err;
-+
-+ for (;;) {
-+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
-+
-+ err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
-+ if (err)
-+ goto handle_err;
-+
-+ if (curval == uval)
-+ break;
-+ uval = curval;
-+ }
-+
-+ /*
-+ * We fixed up user space. Now we need to fix the pi_state
-+ * itself.
-+ */
-+ pi_state_update_owner(pi_state, newowner);
-+
-+ return argowner == current;
-+
-+ /*
-+ * In order to reschedule or handle a page fault, we need to drop the
-+ * locks here. In the case of a fault, this gives the other task
-+ * (either the highest priority waiter itself or the task which stole
-+ * the rtmutex) the chance to try the fixup of the pi_state. So once we
-+ * are back from handling the fault we need to check the pi_state after
-+ * reacquiring the locks and before trying to do another fixup. When
-+ * the fixup has been done already we simply return.
-+ *
-+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
-+ * drop hb->lock since the caller owns the hb -> futex_q relation.
-+	 * Dropping the pi_mutex->wait_lock requires revalidating the state.
-+ */
-+handle_err:
-+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-+ spin_unlock(q->lock_ptr);
-+
-+ switch (err) {
-+ case -EFAULT:
-+ err = fault_in_user_writeable(uaddr);
-+ break;
-+
-+ case -EAGAIN:
-+ cond_resched();
-+ err = 0;
-+ break;
-+
-+ default:
-+ WARN_ON_ONCE(1);
-+ break;
-+ }
-+
-+ spin_lock(q->lock_ptr);
-+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-+
-+ /*
-+ * Check if someone else fixed it for us:
-+ */
-+ if (pi_state->owner != oldowner)
-+ return argowner == current;
-+
-+ /* Retry if err was -EAGAIN or the fault in succeeded */
-+ if (!err)
-+ goto retry;
-+
-+ /*
-+	 * fault_in_user_writeable() failed, so user state is immutable. At
-+	 * best we can make the kernel state consistent but user state will
-+	 * most likely be hosed and any subsequent unlock operation will be
-+ * rejected due to PI futex rule [10].
-+ *
-+ * Ensure that the rtmutex owner is also the pi_state owner despite
-+ * the user space value claiming something different. There is no
-+ * point in unlocking the rtmutex if current is the owner as it
-+ * would need to wait until the next waiter has taken the rtmutex
-+ * to guarantee consistent state. Keep it simple. Userspace asked
-+	 * for this wrecked state.
-+ *
-+ * The rtmutex has an owner - either current or some other
-+ * task. See the EAGAIN loop above.
-+ */
-+ pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
-+
-+ return err;
-+}
-+
-+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-+ struct task_struct *argowner)
-+{
-+ struct futex_pi_state *pi_state = q->pi_state;
-+ int ret;
-+
-+ lockdep_assert_held(q->lock_ptr);
-+
-+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-+ ret = __fixup_pi_state_owner(uaddr, q, argowner);
-+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-+ return ret;
-+}
-+
-+/**
-+ * fixup_pi_owner() - Post lock pi_state and corner case management
-+ * @uaddr: user address of the futex
-+ * @q: futex_q (contains pi_state and access to the rt_mutex)
-+ * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
-+ *
-+ * After attempting to lock an rt_mutex, this function is called to cleanup
-+ * the pi_state owner as well as handle race conditions that may allow us to
-+ * acquire the lock. Must be called with the hb lock held.
-+ *
-+ * Return:
-+ * - 1 - success, lock taken;
-+ * - 0 - success, lock not taken;
-+ * - <0 - on error (-EFAULT)
-+ */
-+int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
-+{
-+ if (locked) {
-+ /*
-+ * Got the lock. We might not be the anticipated owner if we
-+ * did a lock-steal - fix up the PI-state in that case:
-+ *
-+ * Speculative pi_state->owner read (we don't hold wait_lock);
-+ * since we own the lock pi_state->owner == current is the
-+ * stable state, anything else needs more attention.
-+ */
-+ if (q->pi_state->owner != current)
-+ return fixup_pi_state_owner(uaddr, q, current);
-+ return 1;
-+ }
-+
-+ /*
-+ * If we didn't get the lock; check if anybody stole it from us. In
-+ * that case, we need to fix up the uval to point to them instead of
-+ * us, otherwise bad things happen. [10]
-+ *
-+ * Another speculative read; pi_state->owner == current is unstable
-+ * but needs our attention.
-+ */
-+ if (q->pi_state->owner == current)
-+ return fixup_pi_state_owner(uaddr, q, NULL);
-+
-+ /*
-+ * Paranoia check. If we did not take the lock, then we should not be
-+ * the owner of the rt_mutex. Warn and establish consistent state.
-+ */
-+ if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
-+ return fixup_pi_state_owner(uaddr, q, current);
-+
-+ return 0;
-+}
-+
-+/*
-+ * Userspace tried a 0 -> TID atomic transition of the futex value
-+ * and failed. The kernel side here does the whole locking operation:
-+ * if there are waiters then it will block as a consequence of relying
-+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
-+ * a 0 value of the futex too.)
-+ *
-+ * Also serves as the futex trylock_pi() operation, with the corresponding
-+ * semantics.
-+ */
-+int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
-+{
-+ struct hrtimer_sleeper timeout, *to;
-+ struct task_struct *exiting = NULL;
-+ struct rt_mutex_waiter rt_waiter;
-+ struct futex_hash_bucket *hb;
-+ struct futex_q q = futex_q_init;
-+ int res, ret;
-+
-+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
-+ return -ENOSYS;
-+
-+ if (refill_pi_state_cache())
-+ return -ENOMEM;
-+
-+ to = futex_setup_timer(time, &timeout, flags, 0);
-+
-+retry:
-+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
-+ if (unlikely(ret != 0))
-+ goto out;
-+
-+retry_private:
-+ hb = futex_q_lock(&q);
-+
-+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
-+ &exiting, 0);
-+ if (unlikely(ret)) {
-+ /*
-+ * Atomic work succeeded and we got the lock,
-+ * or failed. Either way, we do _not_ block.
-+ */
-+ switch (ret) {
-+ case 1:
-+ /* We got the lock. */
-+ ret = 0;
-+ goto out_unlock_put_key;
-+ case -EFAULT:
-+ goto uaddr_faulted;
-+ case -EBUSY:
-+ case -EAGAIN:
-+ /*
-+ * Two reasons for this:
-+ * - EBUSY: Task is exiting and we just wait for the
-+ * exit to complete.
-+ * - EAGAIN: The user space value changed.
-+ */
-+ futex_q_unlock(hb);
-+ /*
-+ * Handle the case where the owner is in the middle of
-+			 * exiting. Wait for the exit to complete, otherwise
-+ * this task might loop forever, aka. live lock.
-+ */
-+ wait_for_owner_exiting(ret, exiting);
-+ cond_resched();
-+ goto retry;
-+ default:
-+ goto out_unlock_put_key;
-+ }
-+ }
-+
-+ WARN_ON(!q.pi_state);
-+
-+ /*
-+ * Only actually queue now that the atomic ops are done:
-+ */
-+ __futex_queue(&q, hb);
-+
-+ if (trylock) {
-+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
-+ /* Fixup the trylock return value: */
-+ ret = ret ? 0 : -EWOULDBLOCK;
-+ goto no_block;
-+ }
-+
-+ rt_mutex_init_waiter(&rt_waiter);
-+
-+ /*
-+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
-+ * hold it while doing rt_mutex_start_proxy(), because then it will
-+	 * include hb->lock in the blocking chain, even though we'll not in
-+ * fact hold it while blocking. This will lead it to report -EDEADLK
-+ * and BUG when futex_unlock_pi() interleaves with this.
-+ *
-+ * Therefore acquire wait_lock while holding hb->lock, but drop the
-+ * latter before calling __rt_mutex_start_proxy_lock(). This
-+ * interleaves with futex_unlock_pi() -- which does a similar lock
-+ * handoff -- such that the latter can observe the futex_q::pi_state
-+ * before __rt_mutex_start_proxy_lock() is done.
-+ */
-+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
-+ spin_unlock(q.lock_ptr);
-+ /*
-+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
-+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
-+ * it sees the futex_q::pi_state.
-+ */
-+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
-+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
-+
-+ if (ret) {
-+ if (ret == 1)
-+ ret = 0;
-+ goto cleanup;
-+ }
-+
-+ if (unlikely(to))
-+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
-+
-+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
-+
-+cleanup:
-+ spin_lock(q.lock_ptr);
-+ /*
-+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
-+ * first acquire the hb->lock before removing the lock from the
-+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
-+ * lists consistent.
-+ *
-+ * In particular; it is important that futex_unlock_pi() can not
-+ * observe this inconsistency.
-+ */
-+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
-+ ret = 0;
-+
-+no_block:
-+ /*
-+ * Fixup the pi_state owner and possibly acquire the lock if we
-+ * haven't already.
-+ */
-+ res = fixup_pi_owner(uaddr, &q, !ret);
-+ /*
-+ * If fixup_pi_owner() returned an error, propagate that. If it acquired
-+ * the lock, clear our -ETIMEDOUT or -EINTR.
-+ */
-+ if (res)
-+ ret = (res < 0) ? res : 0;
-+
-+ futex_unqueue_pi(&q);
-+ spin_unlock(q.lock_ptr);
-+ goto out;
-+
-+out_unlock_put_key:
-+ futex_q_unlock(hb);
-+
-+out:
-+ if (to) {
-+ hrtimer_cancel(&to->timer);
-+ destroy_hrtimer_on_stack(&to->timer);
-+ }
-+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
-+
-+uaddr_faulted:
-+ futex_q_unlock(hb);
-+
-+ ret = fault_in_user_writeable(uaddr);
-+ if (ret)
-+ goto out;
-+
-+ if (!(flags & FLAGS_SHARED))
-+ goto retry_private;
-+
-+ goto retry;
-+}
-+
-+/*
-+ * Userspace attempted a TID -> 0 atomic transition, and failed.
-+ * This is the in-kernel slowpath: we look up the PI state (if any),
-+ * and do the rt-mutex unlock.
-+ */
-+int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
-+{
-+ u32 curval, uval, vpid = task_pid_vnr(current);
-+ union futex_key key = FUTEX_KEY_INIT;
-+ struct futex_hash_bucket *hb;
-+ struct futex_q *top_waiter;
-+ int ret;
-+
-+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
-+ return -ENOSYS;
-+
-+retry:
-+ if (get_user(uval, uaddr))
-+ return -EFAULT;
-+ /*
-+ * We release only a lock we actually own:
-+ */
-+ if ((uval & FUTEX_TID_MASK) != vpid)
-+ return -EPERM;
-+
-+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
-+ if (ret)
-+ return ret;
-+
-+ hb = futex_hash(&key);
-+ spin_lock(&hb->lock);
-+
-+ /*
-+ * Check waiters first. We do not trust user space values at
-+ * all and we at least want to know if user space fiddled
-+ * with the futex value instead of blindly unlocking.
-+ */
-+ top_waiter = futex_top_waiter(hb, &key);
-+ if (top_waiter) {
-+ struct futex_pi_state *pi_state = top_waiter->pi_state;
-+
-+ ret = -EINVAL;
-+ if (!pi_state)
-+ goto out_unlock;
-+
-+ /*
-+ * If current does not own the pi_state then the futex is
-+ * inconsistent and user space fiddled with the futex value.
-+ */
-+ if (pi_state->owner != current)
-+ goto out_unlock;
-+
-+ get_pi_state(pi_state);
-+ /*
-+ * By taking wait_lock while still holding hb->lock, we ensure
-+ * there is no point where we hold neither; and therefore
-+		 * wake_futex_pi() must observe a state consistent with what we
-+ * observed.
-+ *
-+ * In particular; this forces __rt_mutex_start_proxy() to
-+ * complete such that we're guaranteed to observe the
-+ * rt_waiter. Also see the WARN in wake_futex_pi().
-+ */
-+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
-+ spin_unlock(&hb->lock);
-+
-+ /* drops pi_state->pi_mutex.wait_lock */
-+ ret = wake_futex_pi(uaddr, uval, pi_state);
-+
-+ put_pi_state(pi_state);
-+
-+ /*
-+ * Success, we're done! No tricky corner cases.
-+ */
-+ if (!ret)
-+ return ret;
-+ /*
-+ * The atomic access to the futex value generated a
-+ * pagefault, so retry the user-access and the wakeup:
-+ */
-+ if (ret == -EFAULT)
-+ goto pi_faulted;
-+ /*
-+		 * An unconditional UNLOCK_PI op raced against a waiter
-+ * setting the FUTEX_WAITERS bit. Try again.
-+ */
-+ if (ret == -EAGAIN)
-+ goto pi_retry;
-+ /*
-+ * wake_futex_pi has detected invalid state. Tell user
-+ * space.
-+ */
-+ return ret;
-+ }
-+
-+ /*
-+ * We have no kernel internal state, i.e. no waiters in the
-+ * kernel. Waiters which are about to queue themselves are stuck
-+	 * on hb->lock. So we can safely ignore them. We preserve neither
-+	 * the WAITERS bit nor the OWNER_DIED one. We are the
-+ * owner.
-+ */
-+ if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
-+ spin_unlock(&hb->lock);
-+ switch (ret) {
-+ case -EFAULT:
-+ goto pi_faulted;
-+
-+ case -EAGAIN:
-+ goto pi_retry;
-+
-+ default:
-+ WARN_ON_ONCE(1);
-+ return ret;
-+ }
-+ }
-+
-+ /*
-+ * If uval has changed, let user space handle it.
-+ */
-+ ret = (curval == uval) ? 0 : -EAGAIN;
-+
-+out_unlock:
-+ spin_unlock(&hb->lock);
-+ return ret;
-+
-+pi_retry:
-+ cond_resched();
-+ goto retry;
-+
-+pi_faulted:
-+
-+ ret = fault_in_user_writeable(uaddr);
-+ if (!ret)
-+ goto retry;
-+
-+ return ret;
-+}
-+
-diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c
-new file mode 100644
-index 000000000..cba8b1a6a
---- /dev/null
-+++ b/kernel/futex/requeue.c
-@@ -0,0 +1,897 @@
-+// SPDX-License-Identifier: GPL-2.0-or-later
-+
-+#include <linux/sched/signal.h>
-+
-+#include "futex.h"
-+#include "../locking/rtmutex_common.h"
-+
-+/*
-+ * On PREEMPT_RT, the hash bucket lock is a 'sleeping' spinlock with an
-+ * underlying rtmutex. The task which is about to be requeued could have
-+ * just woken up (timeout, signal). After the wake up the task has to
-+ * acquire hash bucket lock, which is held by the requeue code. As a task
-+ * can only be blocked on _ONE_ rtmutex at a time, the proxy lock blocking
-+ * and the hash bucket lock blocking would collide and corrupt state.
-+ *
-+ * On !PREEMPT_RT this is not a problem and everything could be serialized
-+ * on the hash bucket lock, but aside from the benefit of common code,
-+ * this allows avoiding both the requeue when the task is already on the
-+ * way out and the taking of the original uaddr1 hash bucket lock once
-+ * the requeue has been completed.
-+ *
-+ * The following state transitions are valid:
-+ *
-+ * On the waiter side:
-+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IGNORE
-+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_WAIT
-+ *
-+ * On the requeue side:
-+ * Q_REQUEUE_PI_NONE -> Q_REQUEUE_PI_IN_PROGRESS
-+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_DONE/LOCKED
-+ * Q_REQUEUE_PI_IN_PROGRESS -> Q_REQUEUE_PI_NONE (requeue failed)
-+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_DONE/LOCKED
-+ * Q_REQUEUE_PI_WAIT -> Q_REQUEUE_PI_IGNORE (requeue failed)
-+ *
-+ * The requeue side ignores a waiter with state Q_REQUEUE_PI_IGNORE as this
-+ * signals that the waiter is already on the way out. It also means that
-+ * the waiter is still on the 'wait' futex, i.e. uaddr1.
-+ *
-+ * The waiter side signals early wakeup to the requeue side either through
-+ * setting state to Q_REQUEUE_PI_IGNORE or to Q_REQUEUE_PI_WAIT depending
-+ * on the current state. In case of Q_REQUEUE_PI_IGNORE it can immediately
-+ * proceed to take the hash bucket lock of uaddr1. If it set state to WAIT,
-+ * which means the wakeup is interleaving with a requeue in progress, it has
-+ * to wait for the requeue side to change the state. Either to DONE/LOCKED
-+ * or to IGNORE. DONE/LOCKED means the waiter q is now on the uaddr2 futex
-+ * and either blocked (DONE) or has acquired it (LOCKED). IGNORE is set by
-+ * the requeue side when the requeue attempt failed via deadlock detection
-+ * and therefore the waiter q is still on the uaddr1 futex.
-+ */
-+enum {
-+ Q_REQUEUE_PI_NONE = 0,
-+ Q_REQUEUE_PI_IGNORE,
-+ Q_REQUEUE_PI_IN_PROGRESS,
-+ Q_REQUEUE_PI_WAIT,
-+ Q_REQUEUE_PI_DONE,
-+ Q_REQUEUE_PI_LOCKED,
-+};
-+
-+const struct futex_q futex_q_init = {
-+	/* list gets initialized in futex_queue() */
-+ .key = FUTEX_KEY_INIT,
-+ .bitset = FUTEX_BITSET_MATCH_ANY,
-+ .requeue_state = ATOMIC_INIT(Q_REQUEUE_PI_NONE),
-+};
-+
-+/**
-+ * requeue_futex() - Requeue a futex_q from one hb to another
-+ * @q: the futex_q to requeue
-+ * @hb1: the source hash_bucket
-+ * @hb2: the target hash_bucket
-+ * @key2: the new key for the requeued futex_q
-+ */
-+static inline
-+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
-+ struct futex_hash_bucket *hb2, union futex_key *key2)
-+{
-+
-+ /*
-+ * If key1 and key2 hash to the same bucket, no need to
-+ * requeue.
-+ */
-+ if (likely(&hb1->chain != &hb2->chain)) {
-+ plist_del(&q->list, &hb1->chain);
-+ futex_hb_waiters_dec(hb1);
-+ futex_hb_waiters_inc(hb2);
-+ plist_add(&q->list, &hb2->chain);
-+ q->lock_ptr = &hb2->lock;
-+ }
-+ q->key = *key2;
-+}
-+
-+static inline bool futex_requeue_pi_prepare(struct futex_q *q,
-+ struct futex_pi_state *pi_state)
-+{
-+ int old, new;
-+
-+ /*
-+ * Set state to Q_REQUEUE_PI_IN_PROGRESS unless an early wakeup has
-+ * already set Q_REQUEUE_PI_IGNORE to signal that requeue should
-+ * ignore the waiter.
-+ */
-+ old = atomic_read_acquire(&q->requeue_state);
-+ do {
-+ if (old == Q_REQUEUE_PI_IGNORE)
-+ return false;
-+
-+ /*
-+ * futex_proxy_trylock_atomic() might have set it to
-+		 * IN_PROGRESS and an interleaved early wake to WAIT.
-+ *
-+ * It was considered to have an extra state for that
-+ * trylock, but that would just add more conditionals
-+ * all over the place for a dubious value.
-+ */
-+ if (old != Q_REQUEUE_PI_NONE)
-+ break;
-+
-+ new = Q_REQUEUE_PI_IN_PROGRESS;
-+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
-+
-+ q->pi_state = pi_state;
-+ return true;
-+}
-+
-+static inline void futex_requeue_pi_complete(struct futex_q *q, int locked)
-+{
-+ int old, new;
-+
-+ old = atomic_read_acquire(&q->requeue_state);
-+ do {
-+ if (old == Q_REQUEUE_PI_IGNORE)
-+ return;
-+
-+ if (locked >= 0) {
-+ /* Requeue succeeded. Set DONE or LOCKED */
-+ WARN_ON_ONCE(old != Q_REQUEUE_PI_IN_PROGRESS &&
-+ old != Q_REQUEUE_PI_WAIT);
-+ new = Q_REQUEUE_PI_DONE + locked;
-+ } else if (old == Q_REQUEUE_PI_IN_PROGRESS) {
-+ /* Deadlock, no early wakeup interleave */
-+ new = Q_REQUEUE_PI_NONE;
-+ } else {
-+ /* Deadlock, early wakeup interleave. */
-+ WARN_ON_ONCE(old != Q_REQUEUE_PI_WAIT);
-+ new = Q_REQUEUE_PI_IGNORE;
-+ }
-+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
-+
-+#ifdef CONFIG_PREEMPT_RT
-+ /* If the waiter interleaved with the requeue let it know */
-+ if (unlikely(old == Q_REQUEUE_PI_WAIT))
-+ rcuwait_wake_up(&q->requeue_wait);
-+#endif
-+}
-+
-+static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
-+{
-+ int old, new;
-+
-+ old = atomic_read_acquire(&q->requeue_state);
-+ do {
-+ /* Is requeue done already? */
-+ if (old >= Q_REQUEUE_PI_DONE)
-+ return old;
-+
-+ /*
-+ * If not done, then tell the requeue code to either ignore
-+ * the waiter or to wake it up once the requeue is done.
-+ */
-+ new = Q_REQUEUE_PI_WAIT;
-+ if (old == Q_REQUEUE_PI_NONE)
-+ new = Q_REQUEUE_PI_IGNORE;
-+ } while (!atomic_try_cmpxchg(&q->requeue_state, &old, new));
-+
-+ /* If the requeue was in progress, wait for it to complete */
-+ if (old == Q_REQUEUE_PI_IN_PROGRESS) {
-+#ifdef CONFIG_PREEMPT_RT
-+ rcuwait_wait_event(&q->requeue_wait,
-+ atomic_read(&q->requeue_state) != Q_REQUEUE_PI_WAIT,
-+ TASK_UNINTERRUPTIBLE);
-+#else
-+ (void)atomic_cond_read_relaxed(&q->requeue_state, VAL != Q_REQUEUE_PI_WAIT);
-+#endif
-+ }
-+
-+ /*
-+ * Requeue is now either prohibited or complete. Reread state
-+ * because during the wait above it might have changed. Nothing
-+ * will modify q->requeue_state after this point.
-+ */
-+ return atomic_read(&q->requeue_state);
-+}
-+
-+/**
-+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
-+ * @q: the futex_q
-+ * @key: the key of the requeue target futex
-+ * @hb: the hash_bucket of the requeue target futex
-+ *
-+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
-+ * target futex if it is uncontended or via a lock steal.
-+ *
-+ * 1) Set @q::key to the requeue target futex key so the waiter can detect
-+ * the wakeup on the right futex.
-+ *
-+ * 2) Dequeue @q from the hash bucket.
-+ *
-+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
-+ * acquisition.
-+ *
-+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
-+ * the waiter has to fixup the pi state.
-+ *
-+ * 5) Complete the requeue state so the waiter can make progress. After
-+ * this point the waiter task can return from the syscall immediately in
-+ * case that the pi state does not have to be fixed up.
-+ *
-+ * 6) Wake the waiter task.
-+ *
-+ * Must be called with both q->lock_ptr and hb->lock held.
-+ */
-+static inline
-+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
-+ struct futex_hash_bucket *hb)
-+{
-+ q->key = *key;
-+
-+ __futex_unqueue(q);
-+
-+ WARN_ON(!q->rt_waiter);
-+ q->rt_waiter = NULL;
-+
-+ q->lock_ptr = &hb->lock;
-+
-+ /* Signal locked state to the waiter */
-+ futex_requeue_pi_complete(q, 1);
-+ wake_up_state(q->task, TASK_NORMAL);
-+}
-+
-+/**
-+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
-+ * @pifutex: the user address of the to futex
-+ * @hb1: the from futex hash bucket, must be locked by the caller
-+ * @hb2: the to futex hash bucket, must be locked by the caller
-+ * @key1: the from futex key
-+ * @key2: the to futex key
-+ * @ps: address to store the pi_state pointer
-+ * @exiting: Pointer to store the task pointer of the owner task
-+ * which is in the middle of exiting
-+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
-+ *
-+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
-+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
-+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
-+ * hb1 and hb2 must be held by the caller.
-+ *
-+ * @exiting is only set when the return value is -EBUSY. If so, this holds
-+ * a refcount on the exiting task on return and the caller needs to drop it
-+ * after waiting for the exit to complete.
-+ *
-+ * Return:
-+ * - 0 - failed to acquire the lock atomically;
-+ * - >0 - acquired the lock, return value is vpid of the top_waiter
-+ * - <0 - error
-+ */
-+static int
-+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
-+ struct futex_hash_bucket *hb2, union futex_key *key1,
-+ union futex_key *key2, struct futex_pi_state **ps,
-+ struct task_struct **exiting, int set_waiters)
-+{
-+ struct futex_q *top_waiter = NULL;
-+ u32 curval;
-+ int ret;
-+
-+ if (futex_get_value_locked(&curval, pifutex))
-+ return -EFAULT;
-+
-+ if (unlikely(should_fail_futex(true)))
-+ return -EFAULT;
-+
-+ /*
-+ * Find the top_waiter and determine if there are additional waiters.
-+ * If the caller intends to requeue more than 1 waiter to pifutex,
-+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
-+ * as we have means to handle the possible fault. If not, don't set
-+ * the bit unnecessarily as it will force the subsequent unlock to enter
-+ * the kernel.
-+ */
-+ top_waiter = futex_top_waiter(hb1, key1);
-+
-+ /* There are no waiters, nothing for us to do. */
-+ if (!top_waiter)
-+ return 0;
-+
-+ /*
-+ * Ensure that this is a waiter sitting in futex_wait_requeue_pi()
-+ * and waiting on the 'waitqueue' futex which is always !PI.
-+ */
-+ if (!top_waiter->rt_waiter || top_waiter->pi_state)
-+ return -EINVAL;
-+
-+ /* Ensure we requeue to the expected futex. */
-+ if (!futex_match(top_waiter->requeue_pi_key, key2))
-+ return -EINVAL;
-+
-+ /* Ensure that this does not race against an early wakeup */
-+ if (!futex_requeue_pi_prepare(top_waiter, NULL))
-+ return -EAGAIN;
-+
-+ /*
-+ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
-+ * in the contended case or if @set_waiters is true.
-+ *
-+ * In the contended case PI state is attached to the lock owner. If
-+ * the user space lock can be acquired then PI state is attached to
-+ * the new owner (@top_waiter->task) when @set_waiters is true.
-+ */
-+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
-+ exiting, set_waiters);
-+ if (ret == 1) {
-+ /*
-+ * Lock was acquired in user space and PI state was
-+ * attached to @top_waiter->task. That means state is fully
-+ * consistent and the waiter can return to user space
-+ * immediately after the wakeup.
-+ */
-+ requeue_pi_wake_futex(top_waiter, key2, hb2);
-+ } else if (ret < 0) {
-+ /* Rewind top_waiter::requeue_state */
-+ futex_requeue_pi_complete(top_waiter, ret);
-+ } else {
-+ /*
-+ * futex_lock_pi_atomic() did not acquire the user space
-+ * futex, but managed to establish the proxy lock and pi
-+ * state. top_waiter::requeue_state cannot be fixed up here
-+ * because the waiter is not enqueued on the rtmutex
-+ * yet. This is handled at the callsite depending on the
-+ * result of rt_mutex_start_proxy_lock() which is
-+ * guaranteed to be reached with this function returning 0.
-+ */
-+ }
-+ return ret;
-+}
-+
-+/**
-+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
-+ * @uaddr1: source futex user address
-+ * @flags: futex flags (FLAGS_SHARED, etc.)
-+ * @uaddr2: target futex user address
-+ * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
-+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
-+ * @cmpval: @uaddr1 expected value (or %NULL)
-+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
-+ * pi futex (pi to pi requeue is not supported)
-+ *
-+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
-+ * uaddr2 atomically on behalf of the top waiter.
-+ *
-+ * Return:
-+ * - >=0 - on success, the number of tasks requeued or woken;
-+ * - <0 - on error
-+ */
-+int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
-+ int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi)
-+{
-+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
-+ int task_count = 0, ret;
-+ struct futex_pi_state *pi_state = NULL;
-+ struct futex_hash_bucket *hb1, *hb2;
-+ struct futex_q *this, *next;
-+ DEFINE_WAKE_Q(wake_q);
-+
-+ if (nr_wake < 0 || nr_requeue < 0)
-+ return -EINVAL;
-+
-+ /*
-+ * When PI not supported: return -ENOSYS if requeue_pi is true,
-+ * consequently the compiler knows requeue_pi is always false past
-+ * this point which will optimize away all the conditional code
-+ * further down.
-+ */
-+ if (!IS_ENABLED(CONFIG_FUTEX_PI) && requeue_pi)
-+ return -ENOSYS;
-+
-+ if (requeue_pi) {
-+ /*
-+ * Requeue PI only works on two distinct uaddrs. This
-+ * check is only valid for private futexes. See below.
-+ */
-+ if (uaddr1 == uaddr2)
-+ return -EINVAL;
-+
-+ /*
-+ * futex_requeue() allows the caller to define the number
-+ * of waiters to wake up via the @nr_wake argument. With
-+ * REQUEUE_PI, waking up more than one waiter is creating
-+ * more problems than it solves. Waking up a waiter makes
-+ * only sense if the PI futex @uaddr2 is uncontended as
-+ * this allows the requeue code to acquire the futex
-+ * @uaddr2 before waking the waiter. The waiter can then
-+ * return to user space without further action. A secondary
-+ * wakeup would just make the futex_wait_requeue_pi()
-+ * handling more complex, because that code would have to
-+ * look up pi_state and do more or less all the handling
-+		 * which the requeue code has to do for the to-be-requeued
-+ * waiters. So restrict the number of waiters to wake to
-+ * one, and only wake it up when the PI futex is
-+ * uncontended. Otherwise requeue it and let the unlock of
-+ * the PI futex handle the wakeup.
-+ *
-+ * All REQUEUE_PI users, e.g. pthread_cond_signal() and
-+ * pthread_cond_broadcast() must use nr_wake=1.
-+ */
-+ if (nr_wake != 1)
-+ return -EINVAL;
-+
-+ /*
-+ * requeue_pi requires a pi_state, try to allocate it now
-+ * without any locks in case it fails.
-+ */
-+ if (refill_pi_state_cache())
-+ return -ENOMEM;
-+ }
-+
-+retry:
-+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
-+ if (unlikely(ret != 0))
-+ return ret;
-+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
-+ requeue_pi ? FUTEX_WRITE : FUTEX_READ);
-+ if (unlikely(ret != 0))
-+ return ret;
-+
-+ /*
-+ * The check above which compares uaddrs is not sufficient for
-+ * shared futexes. We need to compare the keys:
-+ */
-+ if (requeue_pi && futex_match(&key1, &key2))
-+ return -EINVAL;
-+
-+ hb1 = futex_hash(&key1);
-+ hb2 = futex_hash(&key2);
-+
-+retry_private:
-+ futex_hb_waiters_inc(hb2);
-+ double_lock_hb(hb1, hb2);
-+
-+ if (likely(cmpval != NULL)) {
-+ u32 curval;
-+
-+ ret = futex_get_value_locked(&curval, uaddr1);
-+
-+ if (unlikely(ret)) {
-+ double_unlock_hb(hb1, hb2);
-+ futex_hb_waiters_dec(hb2);
-+
-+ ret = get_user(curval, uaddr1);
-+ if (ret)
-+ return ret;
-+
-+ if (!(flags & FLAGS_SHARED))
-+ goto retry_private;
-+
-+ goto retry;
-+ }
-+ if (curval != *cmpval) {
-+ ret = -EAGAIN;
-+ goto out_unlock;
-+ }
-+ }
-+
-+ if (requeue_pi) {
-+ struct task_struct *exiting = NULL;
-+
-+ /*
-+ * Attempt to acquire uaddr2 and wake the top waiter. If we
-+ * intend to requeue waiters, force setting the FUTEX_WAITERS
-+ * bit. We force this here where we are able to easily handle
-+		 * faults rather than in the requeue loop below.
-+ *
-+ * Updates topwaiter::requeue_state if a top waiter exists.
-+ */
-+ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
-+ &key2, &pi_state,
-+ &exiting, nr_requeue);
-+
-+ /*
-+ * At this point the top_waiter has either taken uaddr2 or
-+ * is waiting on it. In both cases pi_state has been
-+ * established and an initial refcount on it. In case of an
-+ * error there's nothing.
-+ *
-+ * The top waiter's requeue_state is up to date:
-+ *
-+ * - If the lock was acquired atomically (ret == 1), then
-+ * the state is Q_REQUEUE_PI_LOCKED.
-+ *
-+ * The top waiter has been dequeued and woken up and can
-+ * return to user space immediately. The kernel/user
-+ * space state is consistent. In case that there must be
-+ * more waiters requeued the WAITERS bit in the user
-+ * space futex is set so the top waiter task has to go
-+ * into the syscall slowpath to unlock the futex. This
-+ * will block until this requeue operation has been
-+ * completed and the hash bucket locks have been
-+ * dropped.
-+ *
-+ * - If the trylock failed with an error (ret < 0) then
-+ * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
-+ * happened", or Q_REQUEUE_PI_IGNORE when there was an
-+ * interleaved early wakeup.
-+ *
-+ * - If the trylock did not succeed (ret == 0) then the
-+ * state is either Q_REQUEUE_PI_IN_PROGRESS or
-+ * Q_REQUEUE_PI_WAIT if an early wakeup interleaved.
-+ * This will be cleaned up in the loop below, which
-+ * cannot fail because futex_proxy_trylock_atomic() did
-+ * the same sanity checks for requeue_pi as the loop
-+ * below does.
-+ */
-+ switch (ret) {
-+ case 0:
-+ /* We hold a reference on the pi state. */
-+ break;
-+
-+ case 1:
-+ /*
-+ * futex_proxy_trylock_atomic() acquired the user space
-+ * futex. Adjust task_count.
-+ */
-+ task_count++;
-+ ret = 0;
-+ break;
-+
-+ /*
-+ * If the above failed, then pi_state is NULL and
-+ * waiter::requeue_state is correct.
-+ */
-+ case -EFAULT:
-+ double_unlock_hb(hb1, hb2);
-+ futex_hb_waiters_dec(hb2);
-+ ret = fault_in_user_writeable(uaddr2);
-+ if (!ret)
-+ goto retry;
-+ return ret;
-+ case -EBUSY:
-+ case -EAGAIN:
-+ /*
-+ * Two reasons for this:
-+ * - EBUSY: Owner is exiting and we just wait for the
-+ * exit to complete.
-+ * - EAGAIN: The user space value changed.
-+ */
-+ double_unlock_hb(hb1, hb2);
-+ futex_hb_waiters_dec(hb2);
-+ /*
-+ * Handle the case where the owner is in the middle of
-+ * exiting. Wait for the exit to complete otherwise
-+ * this task might loop forever, aka. live lock.
-+ */
-+ wait_for_owner_exiting(ret, exiting);
-+ cond_resched();
-+ goto retry;
-+ default:
-+ goto out_unlock;
-+ }
-+ }
-+
-+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
-+ if (task_count - nr_wake >= nr_requeue)
-+ break;
-+
-+ if (!futex_match(&this->key, &key1))
-+ continue;
-+
-+ /*
-+ * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
-+ * be paired with each other and no other futex ops.
-+ *
-+ * We should never be requeueing a futex_q with a pi_state,
-+ * which is awaiting a futex_unlock_pi().
-+ */
-+ if ((requeue_pi && !this->rt_waiter) ||
-+ (!requeue_pi && this->rt_waiter) ||
-+ this->pi_state) {
-+ ret = -EINVAL;
-+ break;
-+ }
-+
-+ /* Plain futexes just wake or requeue and are done */
-+ if (!requeue_pi) {
-+ if (++task_count <= nr_wake)
-+ futex_wake_mark(&wake_q, this);
-+ else
-+ requeue_futex(this, hb1, hb2, &key2);
-+ continue;
-+ }
-+
-+ /* Ensure we requeue to the expected futex for requeue_pi. */
-+ if (!futex_match(this->requeue_pi_key, &key2)) {
-+ ret = -EINVAL;
-+ break;
-+ }
-+
-+ /*
-+ * Requeue nr_requeue waiters and possibly one more in the case
-+ * of requeue_pi if we couldn't acquire the lock atomically.
-+ *
-+ * Prepare the waiter to take the rt_mutex. Take a refcount
-+ * on the pi_state and store the pointer in the futex_q
-+ * object of the waiter.
-+ */
-+ get_pi_state(pi_state);
-+
-+ /* Don't requeue when the waiter is already on the way out. */
-+ if (!futex_requeue_pi_prepare(this, pi_state)) {
-+ /*
-+ * Early woken waiter signaled that it is on the
-+ * way out. Drop the pi_state reference and try the
-+ * next waiter. @this->pi_state is still NULL.
-+ */
-+ put_pi_state(pi_state);
-+ continue;
-+ }
-+
-+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
-+ this->rt_waiter,
-+ this->task);
-+
-+ if (ret == 1) {
-+ /*
-+ * We got the lock. We do neither drop the refcount
-+ * on pi_state nor clear this->pi_state because the
-+ * waiter needs the pi_state for cleaning up the
-+ * user space value. It will drop the refcount
-+ * after doing so. this::requeue_state is updated
-+ * in the wakeup as well.
-+ */
-+ requeue_pi_wake_futex(this, &key2, hb2);
-+ task_count++;
-+ } else if (!ret) {
-+ /* Waiter is queued, move it to hb2 */
-+ requeue_futex(this, hb1, hb2, &key2);
-+ futex_requeue_pi_complete(this, 0);
-+ task_count++;
-+ } else {
-+ /*
-+ * rt_mutex_start_proxy_lock() detected a potential
-+ * deadlock when we tried to queue that waiter.
-+ * Drop the pi_state reference which we took above
-+ * and remove the pointer to the state from the
-+ * waiters futex_q object.
-+ */
-+ this->pi_state = NULL;
-+ put_pi_state(pi_state);
-+ futex_requeue_pi_complete(this, ret);
-+ /*
-+ * We stop queueing more waiters and let user space
-+ * deal with the mess.
-+ */
-+ break;
-+ }
-+ }
-+
-+ /*
-+ * We took an extra initial reference to the pi_state in
-+ * futex_proxy_trylock_atomic(). We need to drop it here again.
-+ */
-+ put_pi_state(pi_state);
-+
-+out_unlock:
-+ double_unlock_hb(hb1, hb2);
-+ wake_up_q(&wake_q);
-+ futex_hb_waiters_dec(hb2);
-+ return ret ? ret : task_count;
-+}
-+
-+/**
-+ * handle_early_requeue_pi_wakeup() - Handle early wakeup on the initial futex
-+ * @hb:		the hash_bucket futex_q was originally enqueued on
-+ * @q: the futex_q woken while waiting to be requeued
-+ * @timeout: the timeout associated with the wait (NULL if none)
-+ *
-+ * Determine the cause for the early wakeup.
-+ *
-+ * Return:
-+ * -EWOULDBLOCK or -ETIMEDOUT or -ERESTARTNOINTR
-+ */
-+static inline
-+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
-+ struct futex_q *q,
-+ struct hrtimer_sleeper *timeout)
-+{
-+ int ret;
-+
-+ /*
-+ * With the hb lock held, we avoid races while we process the wakeup.
-+ * We only need to hold hb (and not hb2) to ensure atomicity as the
-+ * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
-+ * It can't be requeued from uaddr2 to something else since we don't
-+ * support a PI aware source futex for requeue.
-+ */
-+ WARN_ON_ONCE(&hb->lock != q->lock_ptr);
-+
-+ /*
-+ * We were woken prior to requeue by a timeout or a signal.
-+ * Unqueue the futex_q and determine which it was.
-+ */
-+ plist_del(&q->list, &hb->chain);
-+ futex_hb_waiters_dec(hb);
-+
-+ /* Handle spurious wakeups gracefully */
-+ ret = -EWOULDBLOCK;
-+ if (timeout && !timeout->task)
-+ ret = -ETIMEDOUT;
-+ else if (signal_pending(current))
-+ ret = -ERESTARTNOINTR;
-+ return ret;
-+}
-+
-+/**
-+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
-+ * @uaddr: the futex we initially wait on (non-pi)
-+ * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
-+ * the same type, no requeueing from private to shared, etc.
-+ * @val: the expected value of uaddr
-+ * @abs_time: absolute timeout
-+ * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
-+ * @uaddr2: the pi futex we will take prior to returning to user-space
-+ *
-+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
-+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
-+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
-+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
-+ * without one, the pi logic would not know which task to boost/deboost, if
-+ * there was a need to.
-+ *
-+ * We call schedule in futex_wait_queue() when we enqueue and return there
-+ * via the following--
-+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
-+ * 2) wakeup on uaddr2 after a requeue
-+ * 3) signal
-+ * 4) timeout
-+ *
-+ * If 3, cleanup and return -ERESTARTNOINTR.
-+ *
-+ * If 2, we may then block on trying to take the rt_mutex and return via:
-+ * 5) successful lock
-+ * 6) signal
-+ * 7) timeout
-+ * 8) other lock acquisition failure
-+ *
-+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
-+ *
-+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
-+ *
-+ * Return:
-+ * - 0 - On success;
-+ * - <0 - On error
-+ */
-+int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
-+ u32 val, ktime_t *abs_time, u32 bitset,
-+ u32 __user *uaddr2)
-+{
-+ struct hrtimer_sleeper timeout, *to;
-+ struct rt_mutex_waiter rt_waiter;
-+ struct futex_hash_bucket *hb;
-+ union futex_key key2 = FUTEX_KEY_INIT;
-+ struct futex_q q = futex_q_init;
-+ struct rt_mutex_base *pi_mutex;
-+ int res, ret;
-+
-+ if (!IS_ENABLED(CONFIG_FUTEX_PI))
-+ return -ENOSYS;
-+
-+ if (uaddr == uaddr2)
-+ return -EINVAL;
-+
-+ if (!bitset)
-+ return -EINVAL;
-+
-+ to = futex_setup_timer(abs_time, &timeout, flags,
-+ current->timer_slack_ns);
-+
-+ /*
-+ * The waiter is allocated on our stack, manipulated by the requeue
-+ * code while we sleep on uaddr.
-+ */
-+ rt_mutex_init_waiter(&rt_waiter);
-+
-+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
-+ if (unlikely(ret != 0))
-+ goto out;
-+
-+ q.bitset = bitset;
-+ q.rt_waiter = &rt_waiter;
-+ q.requeue_pi_key = &key2;
-+
-+ /*
-+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
-+ * is initialized.
-+ */
-+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
-+ if (ret)
-+ goto out;
-+
-+ /*
-+ * The check above which compares uaddrs is not sufficient for
-+ * shared futexes. We need to compare the keys:
-+ */
-+ if (futex_match(&q.key, &key2)) {
-+ futex_q_unlock(hb);
-+ ret = -EINVAL;
-+ goto out;
-+ }
-+
-+ /* Queue the futex_q, drop the hb lock, wait for wakeup. */
-+ futex_wait_queue(hb, &q, to);
-+
-+ switch (futex_requeue_pi_wakeup_sync(&q)) {
-+ case Q_REQUEUE_PI_IGNORE:
-+ /* The waiter is still on uaddr1 */
-+ spin_lock(&hb->lock);
-+ ret = handle_early_requeue_pi_wakeup(hb, &q, to);
-+ spin_unlock(&hb->lock);
-+ break;
-+
-+ case Q_REQUEUE_PI_LOCKED:
-+ /* The requeue acquired the lock */
-+ if (q.pi_state && (q.pi_state->owner != current)) {
-+ spin_lock(q.lock_ptr);
-+ ret = fixup_pi_owner(uaddr2, &q, true);
-+ /*
-+ * Drop the reference to the pi state which the
-+ * requeue_pi() code acquired for us.
-+ */
-+ put_pi_state(q.pi_state);
-+ spin_unlock(q.lock_ptr);
-+ /*
-+ * Adjust the return value. It's either -EFAULT or
-+ * success (1) but the caller expects 0 for success.
-+ */
-+ ret = ret < 0 ? ret : 0;
-+ }
-+ break;
-+
-+ case Q_REQUEUE_PI_DONE:
-+ /* Requeue completed. Current is 'pi_blocked_on' the rtmutex */
-+ pi_mutex = &q.pi_state->pi_mutex;
-+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
-+
-+		/* Current is no longer pi_blocked_on */
-+ spin_lock(q.lock_ptr);
-+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
-+ ret = 0;
-+
-+ debug_rt_mutex_free_waiter(&rt_waiter);
-+ /*
-+ * Fixup the pi_state owner and possibly acquire the lock if we
-+ * haven't already.
-+ */
-+ res = fixup_pi_owner(uaddr2, &q, !ret);
-+ /*
-+ * If fixup_pi_owner() returned an error, propagate that. If it
-+ * acquired the lock, clear -ETIMEDOUT or -EINTR.
-+ */
-+ if (res)
-+ ret = (res < 0) ? res : 0;
-+
-+ futex_unqueue_pi(&q);
-+ spin_unlock(q.lock_ptr);
-+
-+ if (ret == -EINTR) {
-+ /*
-+ * We've already been requeued, but cannot restart
-+ * by calling futex_lock_pi() directly. We could
-+ * restart this syscall, but it would detect that
-+ * the user space "val" changed and return
-+ * -EWOULDBLOCK. Save the overhead of the restart
-+ * and return -EWOULDBLOCK directly.
-+ */
-+ ret = -EWOULDBLOCK;
-+ }
-+ break;
-+ default:
-+ BUG();
-+ }
-+
-+out:
-+ if (to) {
-+ hrtimer_cancel(&to->timer);
-+ destroy_hrtimer_on_stack(&to->timer);
-+ }
-+ return ret;
-+}
-+
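As a reading aid for the requeue_pi path implemented above, the following is a minimal userspace sketch of how a condition-variable style construct could drive it: FUTEX_WAIT_REQUEUE_PI blocks on the plain futex and names the PI futex it expects to be moved to, while FUTEX_CMP_REQUEUE_PI wakes at most one waiter (nr_wake must be 1) and requeues the rest. The raw_futex(), cond_wait() and cond_broadcast() names are illustrative only, not glibc or kernel APIs.

    #include <limits.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Illustrative wrapper; the argument layout mirrors do_futex() further down. */
    static long raw_futex(uint32_t *uaddr, int op, uint32_t val,
                          void *timeout_or_val2, uint32_t *uaddr2, uint32_t val3)
    {
            return syscall(SYS_futex, uaddr, op, val, timeout_or_val2, uaddr2, val3);
    }

    /* Waiter: block on the plain futex @cond and let the kernel requeue us onto
     * the PI futex @lock; on success the lock is held on return. */
    static long cond_wait(uint32_t *cond, uint32_t *lock, uint32_t seen)
    {
            return raw_futex(cond, FUTEX_WAIT_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
                             seen, NULL, lock, 0);
    }

    /* Broadcaster: wake at most one waiter (nr_wake must be 1) and requeue the
     * rest onto @lock; nr_requeue travels in the timeout slot of the syscall. */
    static long cond_broadcast(uint32_t *cond, uint32_t *lock, uint32_t expected)
    {
            return raw_futex(cond, FUTEX_CMP_REQUEUE_PI | FUTEX_PRIVATE_FLAG,
                             1, (void *)(unsigned long)INT_MAX, lock, expected);
    }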
-diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
-new file mode 100644
-index 000000000..368e9c17f
---- /dev/null
-+++ b/kernel/futex/syscalls.c
-@@ -0,0 +1,396 @@
-+// SPDX-License-Identifier: GPL-2.0-or-later
-+
-+#include <linux/compat.h>
-+#include <linux/syscalls.h>
-+#include <linux/time_namespace.h>
-+
-+#include "futex.h"
-+
-+/*
-+ * Support for robust futexes: the kernel cleans up held futexes at
-+ * thread exit time.
-+ *
-+ * Implementation: user-space maintains a per-thread list of locks it
-+ * is holding. Upon do_exit(), the kernel carefully walks this list,
-+ * and marks all locks that are owned by this thread with the
-+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
-+ * always manipulated with the lock held, so the list is private and
-+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
-+ * field, to allow the kernel to clean up if the thread dies after
-+ * acquiring the lock, but just before it could have added itself to
-+ * the list. There can only be one such pending lock.
-+ */
-+
-+/**
-+ * sys_set_robust_list() - Set the robust-futex list head of a task
-+ * @head: pointer to the list-head
-+ * @len: length of the list-head, as userspace expects
-+ */
-+SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
-+ size_t, len)
-+{
-+ if (!futex_cmpxchg_enabled)
-+ return -ENOSYS;
-+ /*
-+ * The kernel knows only one size for now:
-+ */
-+ if (unlikely(len != sizeof(*head)))
-+ return -EINVAL;
-+
-+ current->robust_list = head;
-+
-+ return 0;
-+}
-+
-+/**
-+ * sys_get_robust_list() - Get the robust-futex list head of a task
-+ * @pid: pid of the process [zero for current task]
-+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
-+ * @len_ptr: pointer to a length field, the kernel fills in the header size
-+ */
-+SYSCALL_DEFINE3(get_robust_list, int, pid,
-+ struct robust_list_head __user * __user *, head_ptr,
-+ size_t __user *, len_ptr)
-+{
-+ struct robust_list_head __user *head;
-+ unsigned long ret;
-+ struct task_struct *p;
-+
-+ if (!futex_cmpxchg_enabled)
-+ return -ENOSYS;
-+
-+ rcu_read_lock();
-+
-+ ret = -ESRCH;
-+ if (!pid)
-+ p = current;
-+ else {
-+ p = find_task_by_vpid(pid);
-+ if (!p)
-+ goto err_unlock;
-+ }
-+
-+ ret = -EPERM;
-+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
-+ goto err_unlock;
-+
-+ head = p->robust_list;
-+ rcu_read_unlock();
-+
-+ if (put_user(sizeof(*head), len_ptr))
-+ return -EFAULT;
-+ return put_user(head, head_ptr);
-+
-+err_unlock:
-+ rcu_read_unlock();
-+
-+ return ret;
-+}
-+
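As a rough illustration of the robust-futex registration described in the comment above, the sketch below registers a robust_list_head via set_robust_list(). In practice the C library performs this registration for every thread; the process-wide head, the zero futex_offset and the register_robust_list() name are assumptions made for the example.

    #include <linux/futex.h>
    #include <stddef.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* One head for illustration; real implementations keep one per thread and
     * set futex_offset to the lock word's offset inside their mutex object. */
    static struct robust_list_head robust_head = {
            .list            = { .next = &robust_head.list }, /* empty circular list */
            .futex_offset    = 0,
            .list_op_pending = NULL,
    };

    static long register_robust_list(void)
    {
            return syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
    }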
-+long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
-+ u32 __user *uaddr2, u32 val2, u32 val3)
-+{
-+ int cmd = op & FUTEX_CMD_MASK;
-+ unsigned int flags = 0;
-+
-+ if (!(op & FUTEX_PRIVATE_FLAG))
-+ flags |= FLAGS_SHARED;
-+
-+ if (op & FUTEX_CLOCK_REALTIME) {
-+ flags |= FLAGS_CLOCKRT;
-+ if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI &&
-+ cmd != FUTEX_LOCK_PI2)
-+ return -ENOSYS;
-+ }
-+
-+ switch (cmd) {
-+ case FUTEX_LOCK_PI:
-+ case FUTEX_LOCK_PI2:
-+ case FUTEX_UNLOCK_PI:
-+ case FUTEX_TRYLOCK_PI:
-+ case FUTEX_WAIT_REQUEUE_PI:
-+ case FUTEX_CMP_REQUEUE_PI:
-+ if (!futex_cmpxchg_enabled)
-+ return -ENOSYS;
-+ }
-+
-+ switch (cmd) {
-+ case FUTEX_WAIT:
-+ val3 = FUTEX_BITSET_MATCH_ANY;
-+ fallthrough;
-+ case FUTEX_WAIT_BITSET:
-+ return futex_wait(uaddr, flags, val, timeout, val3);
-+ case FUTEX_WAKE:
-+ val3 = FUTEX_BITSET_MATCH_ANY;
-+ fallthrough;
-+ case FUTEX_WAKE_BITSET:
-+ return futex_wake(uaddr, flags, val, val3);
-+ case FUTEX_REQUEUE:
-+ return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
-+ case FUTEX_CMP_REQUEUE:
-+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
-+ case FUTEX_WAKE_OP:
-+ return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
-+ case FUTEX_LOCK_PI:
-+ flags |= FLAGS_CLOCKRT;
-+ fallthrough;
-+ case FUTEX_LOCK_PI2:
-+ return futex_lock_pi(uaddr, flags, timeout, 0);
-+ case FUTEX_UNLOCK_PI:
-+ return futex_unlock_pi(uaddr, flags);
-+ case FUTEX_TRYLOCK_PI:
-+ return futex_lock_pi(uaddr, flags, NULL, 1);
-+ case FUTEX_WAIT_REQUEUE_PI:
-+ val3 = FUTEX_BITSET_MATCH_ANY;
-+ return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
-+ uaddr2);
-+ case FUTEX_CMP_REQUEUE_PI:
-+ return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
-+ }
-+ return -ENOSYS;
-+}
-+
-+static __always_inline bool futex_cmd_has_timeout(u32 cmd)
-+{
-+ switch (cmd) {
-+ case FUTEX_WAIT:
-+ case FUTEX_LOCK_PI:
-+ case FUTEX_LOCK_PI2:
-+ case FUTEX_WAIT_BITSET:
-+ case FUTEX_WAIT_REQUEUE_PI:
-+ return true;
-+ }
-+ return false;
-+}
-+
-+static __always_inline int
-+futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
-+{
-+ if (!timespec64_valid(ts))
-+ return -EINVAL;
-+
-+ *t = timespec64_to_ktime(*ts);
-+ if (cmd == FUTEX_WAIT)
-+ *t = ktime_add_safe(ktime_get(), *t);
-+ else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
-+ *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
-+ return 0;
-+}
-+
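The timeout handling in futex_init_timeout() above is easiest to see from the caller's side: FUTEX_WAIT takes a timespec relative to now, while the other timed commands take absolute timeouts (CLOCK_MONOTONIC by default, CLOCK_REALTIME when FUTEX_CLOCK_REALTIME is set). A hedged userspace sketch, where the futex word fw and its expected value val are placeholders:

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    static void wait_examples(uint32_t *fw, uint32_t val)
    {
            /* FUTEX_WAIT: the timespec is relative to now. */
            struct timespec rel = { .tv_sec = 1, .tv_nsec = 0 };
            syscall(SYS_futex, fw, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val, &rel, NULL, 0);

            /* FUTEX_WAIT_BITSET: the timespec is an absolute CLOCK_MONOTONIC time
             * (or CLOCK_REALTIME if FUTEX_CLOCK_REALTIME is or'ed into the op). */
            struct timespec abs;
            clock_gettime(CLOCK_MONOTONIC, &abs);
            abs.tv_sec += 1;
            syscall(SYS_futex, fw, FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG, val, &abs,
                    NULL, FUTEX_BITSET_MATCH_ANY);
    }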
-+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
-+ const struct __kernel_timespec __user *, utime,
-+ u32 __user *, uaddr2, u32, val3)
-+{
-+ int ret, cmd = op & FUTEX_CMD_MASK;
-+ ktime_t t, *tp = NULL;
-+ struct timespec64 ts;
-+
-+ if (utime && futex_cmd_has_timeout(cmd)) {
-+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
-+ return -EFAULT;
-+ if (get_timespec64(&ts, utime))
-+ return -EFAULT;
-+ ret = futex_init_timeout(cmd, op, &ts, &t);
-+ if (ret)
-+ return ret;
-+ tp = &t;
-+ }
-+
-+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
-+}
-+
-+/* Mask of available flags for each futex in futex_waitv list */
-+#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG)
-+
-+/**
-+ * futex_parse_waitv - Parse a waitv array from userspace
-+ * @futexv: Kernel side list of waiters to be filled
-+ * @uwaitv: Userspace list to be parsed
-+ * @nr_futexes: Length of futexv
-+ *
-+ * Return: Error code on failure, 0 on success
-+ */
-+static int futex_parse_waitv(struct futex_vector *futexv,
-+ struct futex_waitv __user *uwaitv,
-+ unsigned int nr_futexes)
-+{
-+ struct futex_waitv aux;
-+ unsigned int i;
-+
-+ for (i = 0; i < nr_futexes; i++) {
-+ if (copy_from_user(&aux, &uwaitv[i], sizeof(aux)))
-+ return -EFAULT;
-+
-+ if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved)
-+ return -EINVAL;
-+
-+ if (!(aux.flags & FUTEX_32))
-+ return -EINVAL;
-+
-+ futexv[i].w.flags = aux.flags;
-+ futexv[i].w.val = aux.val;
-+ futexv[i].w.uaddr = aux.uaddr;
-+ futexv[i].q = futex_q_init;
-+ }
-+
-+ return 0;
-+}
-+
-+/**
-+ * sys_futex_waitv - Wait on a list of futexes
-+ * @waiters: List of futexes to wait on
-+ * @nr_futexes: Length of futexv
-+ * @flags: Flag for timeout (monotonic/realtime)
-+ * @timeout: Optional absolute timeout.
-+ * @clockid: Clock to be used for the timeout, realtime or monotonic.
-+ *
-+ * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
-+ * if a futex_wake() is performed at any uaddr. The syscall returns immediately
-+ * if any waiter has *uaddr != val. *timeout is an optional timeout value for the
-+ * operation. Each waiter has individual flags. The `flags` argument for the
-+ * syscall should be used solely for specifying the timeout as realtime, if
-+ * needed. Flags for private futexes, sizes, etc. should be used on the
-+ * individual flags of each waiter.
-+ *
-+ * Returns the array index of one of the woken futexes. There is no information
-+ * about how many futexes were woken, nor which particular one it was (e.g.
-+ * whether it was the first one woken or the one with the smallest index).
-+ */
-+
-+SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
-+ unsigned int, nr_futexes, unsigned int, flags,
-+ struct __kernel_timespec __user *, timeout, clockid_t, clockid)
-+{
-+ struct hrtimer_sleeper to;
-+ struct futex_vector *futexv;
-+ struct timespec64 ts;
-+ ktime_t time;
-+ int ret;
-+
-+ /* This syscall supports no flags for now */
-+ if (flags)
-+ return -EINVAL;
-+
-+ if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
-+ return -EINVAL;
-+
-+ if (timeout) {
-+ int flag_clkid = 0, flag_init = 0;
-+
-+ if (clockid == CLOCK_REALTIME) {
-+ flag_clkid = FLAGS_CLOCKRT;
-+ flag_init = FUTEX_CLOCK_REALTIME;
-+ }
-+
-+ if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
-+ return -EINVAL;
-+
-+ if (get_timespec64(&ts, timeout))
-+ return -EFAULT;
-+
-+ /*
-+ * Since there's no opcode for futex_waitv, use
-+ * FUTEX_WAIT_BITSET that uses absolute timeout as well
-+ */
-+ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time);
-+ if (ret)
-+ return ret;
-+
-+ futex_setup_timer(&time, &to, flag_clkid, 0);
-+ }
-+
-+ futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
-+ if (!futexv)
-+ return -ENOMEM;
-+
-+ ret = futex_parse_waitv(futexv, waiters, nr_futexes);
-+ if (!ret)
-+ ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL);
-+
-+ if (timeout) {
-+ hrtimer_cancel(&to.timer);
-+ destroy_hrtimer_on_stack(&to.timer);
-+ }
-+
-+ kfree(futexv);
-+ return ret;
-+}
-+
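A minimal single-entry call into the syscall defined above might look as follows. It assumes headers that already carry this patch's uapi additions (__NR_futex_waitv, FUTEX_32, struct futex_waitv); word and wait_one() are illustrative names.

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    static uint32_t word;

    static long wait_one(void)
    {
            struct futex_waitv w = {
                    .val        = 0,                 /* expected *uaddr; mismatch -> EWOULDBLOCK */
                    .uaddr      = (uintptr_t)&word,
                    .flags      = FUTEX_32,          /* mandatory, see futex_parse_waitv() */
                    .__reserved = 0,
            };
            struct timespec to;

            clock_gettime(CLOCK_MONOTONIC, &to);     /* the timeout is absolute */
            to.tv_sec += 1;

            /* The syscall-level flags argument must be 0; the clock is picked via clockid. */
            return syscall(__NR_futex_waitv, &w, 1, 0, &to, CLOCK_MONOTONIC);
    }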
-+#ifdef CONFIG_COMPAT
-+COMPAT_SYSCALL_DEFINE2(set_robust_list,
-+ struct compat_robust_list_head __user *, head,
-+ compat_size_t, len)
-+{
-+ if (!futex_cmpxchg_enabled)
-+ return -ENOSYS;
-+
-+ if (unlikely(len != sizeof(*head)))
-+ return -EINVAL;
-+
-+ current->compat_robust_list = head;
-+
-+ return 0;
-+}
-+
-+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
-+ compat_uptr_t __user *, head_ptr,
-+ compat_size_t __user *, len_ptr)
-+{
-+ struct compat_robust_list_head __user *head;
-+ unsigned long ret;
-+ struct task_struct *p;
-+
-+ if (!futex_cmpxchg_enabled)
-+ return -ENOSYS;
-+
-+ rcu_read_lock();
-+
-+ ret = -ESRCH;
-+ if (!pid)
-+ p = current;
-+ else {
-+ p = find_task_by_vpid(pid);
-+ if (!p)
-+ goto err_unlock;
-+ }
-+
-+ ret = -EPERM;
-+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
-+ goto err_unlock;
-+
-+ head = p->compat_robust_list;
-+ rcu_read_unlock();
-+
-+ if (put_user(sizeof(*head), len_ptr))
-+ return -EFAULT;
-+ return put_user(ptr_to_compat(head), head_ptr);
-+
-+err_unlock:
-+ rcu_read_unlock();
-+
-+ return ret;
-+}
-+#endif /* CONFIG_COMPAT */
-+
-+#ifdef CONFIG_COMPAT_32BIT_TIME
-+SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
-+ const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
-+ u32, val3)
-+{
-+ int ret, cmd = op & FUTEX_CMD_MASK;
-+ ktime_t t, *tp = NULL;
-+ struct timespec64 ts;
-+
-+ if (utime && futex_cmd_has_timeout(cmd)) {
-+ if (get_old_timespec32(&ts, utime))
-+ return -EFAULT;
-+ ret = futex_init_timeout(cmd, op, &ts, &t);
-+ if (ret)
-+ return ret;
-+ tp = &t;
-+ }
-+
-+ return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
-+}
-+#endif /* CONFIG_COMPAT_32BIT_TIME */
-+
-diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
-new file mode 100644
-index 000000000..b45597aab
---- /dev/null
-+++ b/kernel/futex/waitwake.c
-@@ -0,0 +1,708 @@
-+// SPDX-License-Identifier: GPL-2.0-or-later
-+
-+#include <linux/sched/task.h>
-+#include <linux/sched/signal.h>
-+#include <linux/freezer.h>
-+
-+#include "futex.h"
-+
-+/*
-+ * READ this before attempting to hack on futexes!
-+ *
-+ * Basic futex operation and ordering guarantees
-+ * =============================================
-+ *
-+ * The waiter reads the futex value in user space and calls
-+ * futex_wait(). This function computes the hash bucket and acquires
-+ * the hash bucket lock. After that it reads the futex user space value
-+ * again and verifies that the data has not changed. If it has not changed
-+ * it enqueues itself into the hash bucket, releases the hash bucket lock
-+ * and schedules.
-+ *
-+ * The waker side modifies the user space value of the futex and calls
-+ * futex_wake(). This function computes the hash bucket and acquires the
-+ * hash bucket lock. Then it looks for waiters on that futex in the hash
-+ * bucket and wakes them.
-+ *
-+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
-+ * the hb spinlock can be avoided and the wake path can simply return. In order for this
-+ * optimization to work, ordering guarantees must exist so that the waiter
-+ * being added to the list is acknowledged when the list is concurrently being
-+ * checked by the waker, avoiding scenarios like the following:
-+ *
-+ * CPU 0 CPU 1
-+ * val = *futex;
-+ * sys_futex(WAIT, futex, val);
-+ * futex_wait(futex, val);
-+ * uval = *futex;
-+ * *futex = newval;
-+ * sys_futex(WAKE, futex);
-+ * futex_wake(futex);
-+ * if (queue_empty())
-+ * return;
-+ * if (uval == val)
-+ * lock(hash_bucket(futex));
-+ * queue();
-+ * unlock(hash_bucket(futex));
-+ * schedule();
-+ *
-+ * This would cause the waiter on CPU 0 to wait forever because it
-+ * missed the transition of the user space value from val to newval
-+ * and the waker did not find the waiter in the hash bucket queue.
-+ *
-+ * The correct serialization ensures that a waiter either observes
-+ * the changed user space value before blocking or is woken by a
-+ * concurrent waker:
-+ *
-+ * CPU 0 CPU 1
-+ * val = *futex;
-+ * sys_futex(WAIT, futex, val);
-+ * futex_wait(futex, val);
-+ *
-+ * waiters++; (a)
-+ * smp_mb(); (A) <-- paired with -.
-+ * |
-+ * lock(hash_bucket(futex)); |
-+ * |
-+ * uval = *futex; |
-+ * | *futex = newval;
-+ * | sys_futex(WAKE, futex);
-+ * | futex_wake(futex);
-+ * |
-+ * `--------> smp_mb(); (B)
-+ * if (uval == val)
-+ * queue();
-+ * unlock(hash_bucket(futex));
-+ * schedule(); if (waiters)
-+ * lock(hash_bucket(futex));
-+ * else wake_waiters(futex);
-+ * waiters--; (b) unlock(hash_bucket(futex));
-+ *
-+ * Where (A) orders the waiters increment and the futex value read through
-+ * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write
-+ * to futex and the waiters read (see futex_hb_waiters_pending()).
-+ *
-+ * This yields the following case (where X:=waiters, Y:=futex):
-+ *
-+ * X = Y = 0
-+ *
-+ * w[X]=1 w[Y]=1
-+ * MB MB
-+ * r[Y]=y r[X]=x
-+ *
-+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
-+ * the guarantee that we cannot both miss the futex variable change and the
-+ * enqueue.
-+ *
-+ * Note that a new waiter is accounted for in (a) even when it is possible that
-+ * the wait call can return an error, in which case we backtrack from it in (b).
-+ * Refer to the comment in futex_q_lock().
-+ *
-+ * Similarly, in order to account for waiters being requeued on another
-+ * address we always increment the waiters for the destination bucket before
-+ * acquiring the lock. It then decrements them again after releasing it -
-+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
-+ * will do the additional required waiter count housekeeping. This is done for
-+ * double_lock_hb() and double_unlock_hb(), respectively.
-+ */
-+
-+/*
-+ * The hash bucket lock must be held when this is called.
-+ * Afterwards, the futex_q must not be accessed. Callers
-+ * must ensure to later call wake_up_q() for the actual
-+ * wakeups to occur.
-+ */
-+void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
-+{
-+ struct task_struct *p = q->task;
-+
-+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
-+ return;
-+
-+ get_task_struct(p);
-+ __futex_unqueue(q);
-+ /*
-+ * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
-+ * is written, without taking any locks. This is possible in the event
-+ * of a spurious wakeup, for example. A memory barrier is required here
-+ * to prevent the following store to lock_ptr from getting ahead of the
-+ * plist_del in __futex_unqueue().
-+ */
-+ smp_store_release(&q->lock_ptr, NULL);
-+
-+ /*
-+ * Queue the task for later wakeup for after we've released
-+ * the hb->lock.
-+ */
-+ wake_q_add_safe(wake_q, p);
-+}
-+
-+/*
-+ * Wake up waiters matching bitset queued on this futex (uaddr).
-+ */
-+int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
-+{
-+ struct futex_hash_bucket *hb;
-+ struct futex_q *this, *next;
-+ union futex_key key = FUTEX_KEY_INIT;
-+ int ret;
-+ DEFINE_WAKE_Q(wake_q);
-+
-+ if (!bitset)
-+ return -EINVAL;
-+
-+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
-+ if (unlikely(ret != 0))
-+ return ret;
-+
-+ hb = futex_hash(&key);
-+
-+ /* Make sure we really have tasks to wakeup */
-+ if (!futex_hb_waiters_pending(hb))
-+ return ret;
-+
-+ spin_lock(&hb->lock);
-+
-+ plist_for_each_entry_safe(this, next, &hb->chain, list) {
-+ if (futex_match (&this->key, &key)) {
-+ if (this->pi_state || this->rt_waiter) {
-+ ret = -EINVAL;
-+ break;
-+ }
-+
-+ /* Check if one of the bits is set in both bitsets */
-+ if (!(this->bitset & bitset))
-+ continue;
-+
-+ futex_wake_mark(&wake_q, this);
-+ if (++ret >= nr_wake)
-+ break;
-+ }
-+ }
-+
-+ spin_unlock(&hb->lock);
-+ wake_up_q(&wake_q);
-+ return ret;
-+}
-+
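The waker contract that futex_wake() relies on is the one spelled out at the top of this file: the futex word must change before the wake is issued, otherwise a concurrent waiter could re-check the old value and block forever. A minimal sketch, with flag and post_event() as illustrative names:

    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static _Atomic uint32_t flag;

    static void post_event(void)
    {
            atomic_store(&flag, 1);                        /* change the value first... */
            syscall(SYS_futex, &flag, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
                    1, NULL, NULL, 0);                     /* ...then wake one waiter */
    }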
-+static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
-+{
-+ unsigned int op = (encoded_op & 0x70000000) >> 28;
-+ unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
-+ int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
-+ int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
-+ int oldval, ret;
-+
-+ if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
-+ if (oparg < 0 || oparg > 31) {
-+ char comm[sizeof(current->comm)];
-+ /*
-+ * kill this print and return -EINVAL when userspace
-+ * is sane again
-+ */
-+ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
-+ get_task_comm(comm, current), oparg);
-+ oparg &= 31;
-+ }
-+ oparg = 1 << oparg;
-+ }
-+
-+ pagefault_disable();
-+ ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
-+ pagefault_enable();
-+ if (ret)
-+ return ret;
-+
-+ switch (cmp) {
-+ case FUTEX_OP_CMP_EQ:
-+ return oldval == cmparg;
-+ case FUTEX_OP_CMP_NE:
-+ return oldval != cmparg;
-+ case FUTEX_OP_CMP_LT:
-+ return oldval < cmparg;
-+ case FUTEX_OP_CMP_GE:
-+ return oldval >= cmparg;
-+ case FUTEX_OP_CMP_LE:
-+ return oldval <= cmparg;
-+ case FUTEX_OP_CMP_GT:
-+ return oldval > cmparg;
-+ default:
-+ return -ENOSYS;
-+ }
-+}
-+
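The bitfield decoded by futex_atomic_op_inuser() above is normally built with the FUTEX_OP() macro from the uapi header. A hedged example of a FUTEX_WAKE_OP call that adds 1 to *uaddr2 and conditionally wakes its waiters; wake_op_example() is an illustrative name:

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static long wake_op_example(uint32_t *uaddr1, uint32_t *uaddr2)
    {
            /* Atomically "*uaddr2 += 1"; remember whether the old value was > 0. */
            unsigned int op = FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GT, 0);

            /* Wake up to one waiter on uaddr1 unconditionally, and up to one waiter
             * on uaddr2 if the comparison held; nr_wake2 rides in the timeout slot. */
            return syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG,
                           1, (void *)(unsigned long)1, uaddr2, op);
    }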
-+/*
-+ * Wake up all waiters hashed on the physical page that is mapped
-+ * to this virtual address:
-+ */
-+int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
-+ int nr_wake, int nr_wake2, int op)
-+{
-+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
-+ struct futex_hash_bucket *hb1, *hb2;
-+ struct futex_q *this, *next;
-+ int ret, op_ret;
-+ DEFINE_WAKE_Q(wake_q);
-+
-+retry:
-+ ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
-+ if (unlikely(ret != 0))
-+ return ret;
-+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
-+ if (unlikely(ret != 0))
-+ return ret;
-+
-+ hb1 = futex_hash(&key1);
-+ hb2 = futex_hash(&key2);
-+
-+retry_private:
-+ double_lock_hb(hb1, hb2);
-+ op_ret = futex_atomic_op_inuser(op, uaddr2);
-+ if (unlikely(op_ret < 0)) {
-+ double_unlock_hb(hb1, hb2);
-+
-+ if (!IS_ENABLED(CONFIG_MMU) ||
-+ unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
-+ /*
-+ * we don't get EFAULT from MMU faults if we don't have
-+ * an MMU, but we might get them from range checking
-+ */
-+ ret = op_ret;
-+ return ret;
-+ }
-+
-+ if (op_ret == -EFAULT) {
-+ ret = fault_in_user_writeable(uaddr2);
-+ if (ret)
-+ return ret;
-+ }
-+
-+ cond_resched();
-+ if (!(flags & FLAGS_SHARED))
-+ goto retry_private;
-+ goto retry;
-+ }
-+
-+ plist_for_each_entry_safe(this, next, &hb1->chain, list) {
-+ if (futex_match (&this->key, &key1)) {
-+ if (this->pi_state || this->rt_waiter) {
-+ ret = -EINVAL;
-+ goto out_unlock;
-+ }
-+ futex_wake_mark(&wake_q, this);
-+ if (++ret >= nr_wake)
-+ break;
-+ }
-+ }
-+
-+ if (op_ret > 0) {
-+ op_ret = 0;
-+ plist_for_each_entry_safe(this, next, &hb2->chain, list) {
-+ if (futex_match (&this->key, &key2)) {
-+ if (this->pi_state || this->rt_waiter) {
-+ ret = -EINVAL;
-+ goto out_unlock;
-+ }
-+ futex_wake_mark(&wake_q, this);
-+ if (++op_ret >= nr_wake2)
-+ break;
-+ }
-+ }
-+ ret += op_ret;
-+ }
-+
-+out_unlock:
-+ double_unlock_hb(hb1, hb2);
-+ wake_up_q(&wake_q);
-+ return ret;
-+}
-+
-+static long futex_wait_restart(struct restart_block *restart);
-+
-+/**
-+ * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal
-+ * @hb: the futex hash bucket, must be locked by the caller
-+ * @q: the futex_q to queue up on
-+ * @timeout: the prepared hrtimer_sleeper, or null for no timeout
-+ */
-+void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
-+ struct hrtimer_sleeper *timeout)
-+{
-+ /*
-+ * The task state is guaranteed to be set before another task can
-+ * wake it. set_current_state() is implemented using smp_store_mb() and
-+ * futex_queue() calls spin_unlock() upon completion, both serializing
-+ * access to the hash list and forcing another memory barrier.
-+ */
-+ set_current_state(TASK_INTERRUPTIBLE);
-+ futex_queue(q, hb);
-+
-+ /* Arm the timer */
-+ if (timeout)
-+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
-+
-+ /*
-+ * If we have been removed from the hash list, then another task
-+ * has tried to wake us, and we can skip the call to schedule().
-+ */
-+ if (likely(!plist_node_empty(&q->list))) {
-+ /*
-+ * If the timer has already expired, current will already be
-+ * flagged for rescheduling. Only call schedule if there
-+ * is no timeout, or if it has yet to expire.
-+ */
-+ if (!timeout || timeout->task)
-+ freezable_schedule();
-+ }
-+ __set_current_state(TASK_RUNNING);
-+}
-+
-+/**
-+ * unqueue_multiple - Remove various futexes from their hash bucket
-+ * @v: The list of futexes to unqueue
-+ * @count: Number of futexes in the list
-+ *
-+ * Helper to unqueue a list of futexes. This can't fail.
-+ *
-+ * Return:
-+ * - >=0 - Index of the last futex that was awoken;
-+ * - -1 - No futex was awoken
-+ */
-+static int unqueue_multiple(struct futex_vector *v, int count)
-+{
-+ int ret = -1, i;
-+
-+ for (i = 0; i < count; i++) {
-+ if (!futex_unqueue(&v[i].q))
-+ ret = i;
-+ }
-+
-+ return ret;
-+}
-+
-+/**
-+ * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
-+ * @vs: The futex list to wait on
-+ * @count: The size of the list
-+ * @awaken: Index of the last awoken futex, if any. Used to notify the
-+ * caller that it can return this index to userspace (return parameter)
-+ *
-+ * Prepare multiple futexes in a single step and enqueue them. This may fail if
-+ * the futex list is invalid or if any futex was already awoken. On success the
-+ * task is ready for interruptible sleep.
-+ *
-+ * Return:
-+ * - 1 - One of the futexes was woken by another thread
-+ * - 0 - Success
-+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
-+ */
-+static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *awaken)
-+{
-+ struct futex_hash_bucket *hb;
-+ bool retry = false;
-+ int ret, i;
-+ u32 uval;
-+
-+ /*
-+ * Enqueuing multiple futexes is tricky, because we need to enqueue
-+ * each futex in the list before dealing with the next one to avoid
-+ * deadlocking on the hash bucket. But, before enqueuing, we need to
-+ * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
-+	 * absorb any wake events, which cannot be done before the
-+	 * get_futex_key of the next key, because it calls get_user_pages,
-+	 * which can sleep. Thus, we fetch the list of futex keys in two
-+	 * steps: first we pin all the keys' backing memory, and only
-+	 * then we read each futex word and queue the corresponding futex.
-+	 *
-+	 * Private futexes don't need to recalculate the hash on retry, so skip
-+	 * get_futex_key() when retrying.
-+ */
-+retry:
-+ for (i = 0; i < count; i++) {
-+ if ((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry)
-+ continue;
-+
-+ ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
-+ !(vs[i].w.flags & FUTEX_PRIVATE_FLAG),
-+ &vs[i].q.key, FUTEX_READ);
-+
-+ if (unlikely(ret))
-+ return ret;
-+ }
-+
-+ set_current_state(TASK_INTERRUPTIBLE);
-+
-+ for (i = 0; i < count; i++) {
-+ u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
-+ struct futex_q *q = &vs[i].q;
-+ u32 val = (u32)vs[i].w.val;
-+
-+ hb = futex_q_lock(q);
-+ ret = futex_get_value_locked(&uval, uaddr);
-+
-+ if (!ret && uval == val) {
-+ /*
-+ * The bucket lock can't be held while dealing with the
-+ * next futex. Queue each futex at this moment so hb can
-+ * be unlocked.
-+ */
-+ futex_queue(q, hb);
-+ continue;
-+ }
-+
-+ futex_q_unlock(hb);
-+ __set_current_state(TASK_RUNNING);
-+
-+ /*
-+ * Even if something went wrong, if we find out that a futex
-+		 * was woken, we don't return an error and return this index to
-+ * userspace
-+ */
-+ *awaken = unqueue_multiple(vs, i);
-+ if (*awaken >= 0)
-+ return 1;
-+
-+ if (ret) {
-+ /*
-+ * If we need to handle a page fault, we need to do so
-+ * without any lock and any enqueued futex (otherwise
-+ * we could lose some wakeup). So we do it here, after
-+ * undoing all the work done so far. In success, we
-+ * retry all the work.
-+ */
-+ if (get_user(uval, uaddr))
-+ return -EFAULT;
-+
-+ retry = true;
-+ goto retry;
-+ }
-+
-+ if (uval != val)
-+ return -EWOULDBLOCK;
-+ }
-+
-+ return 0;
-+}
-+
-+/**
-+ * futex_sleep_multiple - Check sleeping conditions and sleep
-+ * @vs: List of futexes to wait for
-+ * @count: Length of vs
-+ * @to: Timeout
-+ *
-+ * Sleep if and only if the timeout hasn't expired and no futex on the list has
-+ * been awaken.
-+ */
-+static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
-+ struct hrtimer_sleeper *to)
-+{
-+ if (to && !to->task)
-+ return;
-+
-+ for (; count; count--, vs++) {
-+ if (!READ_ONCE(vs->q.lock_ptr))
-+ return;
-+ }
-+
-+ freezable_schedule();
-+}
-+
-+/**
-+ * futex_wait_multiple - Prepare to wait on and enqueue several futexes
-+ * @vs: The list of futexes to wait on
-+ * @count: The number of objects
-+ * @to: Timeout before giving up and returning to userspace
-+ *
-+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
-+ * sleeps on a group of futexes and returns on the first futex that is
-+ * woken, or after the timeout has elapsed.
-+ *
-+ * Return:
-+ * - >=0 - Index hint of a futex that was woken
-+ * - <0 - On error
-+ */
-+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
-+ struct hrtimer_sleeper *to)
-+{
-+ int ret, hint = 0;
-+
-+ if (to)
-+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
-+
-+ while (1) {
-+ ret = futex_wait_multiple_setup(vs, count, &hint);
-+ if (ret) {
-+ if (ret > 0) {
-+				/* A futex was woken during setup */
-+ ret = hint;
-+ }
-+ return ret;
-+ }
-+
-+ futex_sleep_multiple(vs, count, to);
-+
-+ __set_current_state(TASK_RUNNING);
-+
-+ ret = unqueue_multiple(vs, count);
-+ if (ret >= 0)
-+ return ret;
-+
-+ if (to && !to->task)
-+ return -ETIMEDOUT;
-+ else if (signal_pending(current))
-+ return -ERESTARTSYS;
-+ /*
-+ * The final case is a spurious wakeup, for
-+ * which just retry.
-+ */
-+ }
-+}
-+
-+/**
-+ * futex_wait_setup() - Prepare to wait on a futex
-+ * @uaddr: the futex userspace address
-+ * @val: the expected value
-+ * @flags: futex flags (FLAGS_SHARED, etc.)
-+ * @q: the associated futex_q
-+ * @hb: storage for hash_bucket pointer to be returned to caller
-+ *
-+ * Setup the futex_q and locate the hash_bucket. Get the futex value and
-+ * compare it with the expected value. Handle atomic faults internally.
-+ * Return with the hb lock held on success, and unlocked on failure.
-+ *
-+ * Return:
-+ * - 0 - uaddr contains val and hb has been locked;
-+ * - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
-+ */
-+int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
-+ struct futex_q *q, struct futex_hash_bucket **hb)
-+{
-+ u32 uval;
-+ int ret;
-+
-+ /*
-+ * Access the page AFTER the hash-bucket is locked.
-+ * Order is important:
-+ *
-+ * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
-+ * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); }
-+ *
-+ * The basic logical guarantee of a futex is that it blocks ONLY
-+ * if cond(var) is known to be true at the time of blocking, for
-+ * any cond. If we locked the hash-bucket after testing *uaddr, that
-+ * would open a race condition where we could block indefinitely with
-+ * cond(var) false, which would violate the guarantee.
-+ *
-+ * On the other hand, we insert q and release the hash-bucket only
-+ * after testing *uaddr. This guarantees that futex_wait() will NOT
-+ * absorb a wakeup if *uaddr does not match the desired values
-+ * while the syscall executes.
-+ */
-+retry:
-+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ);
-+ if (unlikely(ret != 0))
-+ return ret;
-+
-+retry_private:
-+ *hb = futex_q_lock(q);
-+
-+ ret = futex_get_value_locked(&uval, uaddr);
-+
-+ if (ret) {
-+ futex_q_unlock(*hb);
-+
-+ ret = get_user(uval, uaddr);
-+ if (ret)
-+ return ret;
-+
-+ if (!(flags & FLAGS_SHARED))
-+ goto retry_private;
-+
-+ goto retry;
-+ }
-+
-+ if (uval != val) {
-+ futex_q_unlock(*hb);
-+ ret = -EWOULDBLOCK;
-+ }
-+
-+ return ret;
-+}
-+
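The ordering argument in futex_wait_setup() assumes the canonical userspace waiter loop: read the word and only sleep while it still holds the not-yet-signalled value, re-checking after every return. A minimal sketch with illustrative names, the counterpart of the waker sketch earlier in this file:

    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static _Atomic uint32_t flag;

    static void wait_for_event(void)
    {
            uint32_t val;

            while ((val = atomic_load(&flag)) == 0) {
                    /* Sleeps only if *uaddr still equals val; a concurrent change
                     * makes the call return with EAGAIN and we simply re-check. */
                    syscall(SYS_futex, &flag, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
                            val, NULL, NULL, 0);
            }
    }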
-+int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
-+{
-+ struct hrtimer_sleeper timeout, *to;
-+ struct restart_block *restart;
-+ struct futex_hash_bucket *hb;
-+ struct futex_q q = futex_q_init;
-+ int ret;
-+
-+ if (!bitset)
-+ return -EINVAL;
-+ q.bitset = bitset;
-+
-+ to = futex_setup_timer(abs_time, &timeout, flags,
-+ current->timer_slack_ns);
-+retry:
-+ /*
-+ * Prepare to wait on uaddr. On success, it holds hb->lock and q
-+ * is initialized.
-+ */
-+ ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
-+ if (ret)
-+ goto out;
-+
-+ /* futex_queue and wait for wakeup, timeout, or a signal. */
-+ futex_wait_queue(hb, &q, to);
-+
-+ /* If we were woken (and unqueued), we succeeded, whatever. */
-+ ret = 0;
-+ if (!futex_unqueue(&q))
-+ goto out;
-+ ret = -ETIMEDOUT;
-+ if (to && !to->task)
-+ goto out;
-+
-+ /*
-+ * We expect signal_pending(current), but we might be the
-+ * victim of a spurious wakeup as well.
-+ */
-+ if (!signal_pending(current))
-+ goto retry;
-+
-+ ret = -ERESTARTSYS;
-+ if (!abs_time)
-+ goto out;
-+
-+ restart = &current->restart_block;
-+ restart->futex.uaddr = uaddr;
-+ restart->futex.val = val;
-+ restart->futex.time = *abs_time;
-+ restart->futex.bitset = bitset;
-+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
-+
-+ ret = set_restart_fn(restart, futex_wait_restart);
-+
-+out:
-+ if (to) {
-+ hrtimer_cancel(&to->timer);
-+ destroy_hrtimer_on_stack(&to->timer);
-+ }
-+ return ret;
-+}
-+
-+static long futex_wait_restart(struct restart_block *restart)
-+{
-+ u32 __user *uaddr = restart->futex.uaddr;
-+ ktime_t t, *tp = NULL;
-+
-+ if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
-+ t = restart->futex.time;
-+ tp = &t;
-+ }
-+ restart->fn = do_no_restart_syscall;
-+
-+ return (long)futex_wait(uaddr, restart->futex.flags,
-+ restart->futex.val, tp, restart->futex.bitset);
-+}
-+
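From userspace, the return paths of futex_wait() above surface as plain errno values: 0 on wakeup, ETIMEDOUT on timeout, EINTR when a signal cut the wait short (the in-kernel -ERESTARTSYS and the restart block are otherwise invisible to the caller). A hedged sketch of a timed wait that treats a value mismatch as success; timed_wait() is an illustrative name:

    #include <errno.h>
    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    static int timed_wait(_Atomic uint32_t *word, uint32_t expected,
                          const struct timespec *rel_timeout)
    {
            long ret = syscall(SYS_futex, word, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
                               expected, rel_timeout, NULL, 0);

            if (ret == 0)
                    return 0;          /* woken up (possibly spuriously) */
            if (errno == EAGAIN)
                    return 0;          /* value already changed; nothing to wait for */
            return -errno;             /* -ETIMEDOUT, -EINTR, -EFAULT, ... */
    }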
-diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
-index f43d89d92..d1944258c 100644
---- a/kernel/sys_ni.c
-+++ b/kernel/sys_ni.c
-@@ -143,13 +143,14 @@ COND_SYSCALL(capset);
- /* __ARCH_WANT_SYS_CLONE3 */
- COND_SYSCALL(clone3);
-
--/* kernel/futex.c */
-+/* kernel/futex/syscalls.c */
- COND_SYSCALL(futex);
- COND_SYSCALL(futex_time32);
- COND_SYSCALL(set_robust_list);
- COND_SYSCALL_COMPAT(set_robust_list);
- COND_SYSCALL(get_robust_list);
- COND_SYSCALL_COMPAT(get_robust_list);
-+COND_SYSCALL(futex_waitv);
-
- /* kernel/hrtimer.c */
-
-diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore
-index 0e78b49d0..fbcbdb696 100644
---- a/tools/testing/selftests/futex/functional/.gitignore
-+++ b/tools/testing/selftests/futex/functional/.gitignore
-@@ -8,3 +8,4 @@ futex_wait_uninitialized_heap
- futex_wait_wouldblock
- futex_wait
- futex_requeue
-+futex_waitv
-diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
-index bd1fec59e..5cc38de9d 100644
---- a/tools/testing/selftests/futex/functional/Makefile
-+++ b/tools/testing/selftests/futex/functional/Makefile
-@@ -17,7 +17,8 @@ TEST_GEN_FILES := \
- futex_wait_uninitialized_heap \
- futex_wait_private_mapped_file \
- futex_wait \
-- futex_requeue
-+ futex_requeue \
-+ futex_waitv
-
- TEST_PROGS := run.sh
-
-diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
-index 1f8f6daaf..3651ce17b 100644
---- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c
-+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
-@@ -17,6 +17,7 @@
-
- #include <pthread.h>
- #include "futextest.h"
-+#include "futex2test.h"
- #include "logging.h"
-
- #define TEST_NAME "futex-wait-timeout"
-@@ -96,6 +97,12 @@ int main(int argc, char *argv[])
- struct timespec to;
- pthread_t thread;
- int c;
-+ struct futex_waitv waitv = {
-+ .uaddr = (uintptr_t)&f1,
-+ .val = f1,
-+ .flags = FUTEX_32,
-+ .__reserved = 0
-+ };
-
- while ((c = getopt(argc, argv, "cht:v:")) != -1) {
- switch (c) {
-@@ -118,7 +125,7 @@ int main(int argc, char *argv[])
- }
-
- ksft_print_header();
-- ksft_set_plan(7);
-+ ksft_set_plan(9);
- ksft_print_msg("%s: Block on a futex and wait for timeout\n",
- basename(argv[0]));
- ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns);
-@@ -175,6 +182,18 @@ int main(int argc, char *argv[])
- res = futex_lock_pi(&futex_pi, NULL, 0, FUTEX_CLOCK_REALTIME);
- test_timeout(res, &ret, "futex_lock_pi invalid timeout flag", ENOSYS);
-
-+ /* futex_waitv with CLOCK_MONOTONIC */
-+ if (futex_get_abs_timeout(CLOCK_MONOTONIC, &to, timeout_ns))
-+ return RET_FAIL;
-+ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC);
-+ test_timeout(res, &ret, "futex_waitv monotonic", ETIMEDOUT);
-+
-+ /* futex_waitv with CLOCK_REALTIME */
-+ if (futex_get_abs_timeout(CLOCK_REALTIME, &to, timeout_ns))
-+ return RET_FAIL;
-+ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_REALTIME);
-+ test_timeout(res, &ret, "futex_waitv realtime", ETIMEDOUT);
-+
- ksft_print_cnts();
- return ret;
- }
-diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
-index 0ae390ff8..7d7a6a06c 100644
---- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
-+++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
-@@ -22,6 +22,7 @@
- #include <string.h>
- #include <time.h>
- #include "futextest.h"
-+#include "futex2test.h"
- #include "logging.h"
-
- #define TEST_NAME "futex-wait-wouldblock"
-@@ -42,6 +43,12 @@ int main(int argc, char *argv[])
- futex_t f1 = FUTEX_INITIALIZER;
- int res, ret = RET_PASS;
- int c;
-+ struct futex_waitv waitv = {
-+ .uaddr = (uintptr_t)&f1,
-+ .val = f1+1,
-+ .flags = FUTEX_32,
-+ .__reserved = 0
-+ };
-
- while ((c = getopt(argc, argv, "cht:v:")) != -1) {
- switch (c) {
-@@ -61,18 +68,44 @@ int main(int argc, char *argv[])
- }
-
- ksft_print_header();
-- ksft_set_plan(1);
-+ ksft_set_plan(2);
- ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n",
- basename(argv[0]));
-
- info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1);
- res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG);
- if (!res || errno != EWOULDBLOCK) {
-- fail("futex_wait returned: %d %s\n",
-- res ? errno : res, res ? strerror(errno) : "");
-+ ksft_test_result_fail("futex_wait returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
- ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_wait\n");
- }
-
-- print_result(TEST_NAME, ret);
-+ if (clock_gettime(CLOCK_MONOTONIC, &to)) {
-+ error("clock_gettime failed\n", errno);
-+ return errno;
-+ }
-+
-+ to.tv_nsec += timeout_ns;
-+
-+ if (to.tv_nsec >= 1000000000) {
-+ to.tv_sec++;
-+ to.tv_nsec -= 1000000000;
-+ }
-+
-+ info("Calling futex_waitv on f1: %u @ %p with val=%u\n", f1, &f1, f1+1);
-+ res = futex_waitv(&waitv, 1, 0, &to, CLOCK_MONOTONIC);
-+ if (!res || errno != EWOULDBLOCK) {
-+		ksft_test_result_fail("futex_waitv returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
-+ ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_waitv\n");
-+ }
-+
-+ ksft_print_cnts();
- return ret;
- }
-diff --git a/tools/testing/selftests/futex/functional/futex_waitv.c b/tools/testing/selftests/futex/functional/futex_waitv.c
-new file mode 100644
-index 000000000..a94337f67
---- /dev/null
-+++ b/tools/testing/selftests/futex/functional/futex_waitv.c
-@@ -0,0 +1,237 @@
-+// SPDX-License-Identifier: GPL-2.0-or-later
-+/*
-+ * futex_waitv() test by André Almeida <andrealmeid@collabora.com>
-+ *
-+ * Copyright 2021 Collabora Ltd.
-+ */
-+
-+#include <errno.h>
-+#include <error.h>
-+#include <getopt.h>
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <time.h>
-+#include <pthread.h>
-+#include <stdint.h>
-+#include <sys/shm.h>
-+#include "futextest.h"
-+#include "futex2test.h"
-+#include "logging.h"
-+
-+#define TEST_NAME "futex-wait"
-+#define WAKE_WAIT_US 10000
-+#define NR_FUTEXES 30
-+static struct futex_waitv waitv[NR_FUTEXES];
-+u_int32_t futexes[NR_FUTEXES] = {0};
-+
-+void usage(char *prog)
-+{
-+ printf("Usage: %s\n", prog);
-+ printf(" -c Use color\n");
-+ printf(" -h Display this help message\n");
-+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
-+ VQUIET, VCRITICAL, VINFO);
-+}
-+
-+void *waiterfn(void *arg)
-+{
-+ struct timespec to;
-+ int res;
-+
-+ /* setting absolute timeout for futex2 */
-+ if (clock_gettime(CLOCK_MONOTONIC, &to))
-+ error("gettime64 failed\n", errno);
-+
-+ to.tv_sec++;
-+
-+ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC);
-+ if (res < 0) {
-+ ksft_test_result_fail("futex_waitv returned: %d %s\n",
-+ errno, strerror(errno));
-+ } else if (res != NR_FUTEXES - 1) {
-+ ksft_test_result_fail("futex_waitv returned: %d, expecting %d\n",
-+ res, NR_FUTEXES - 1);
-+ }
-+
-+ return NULL;
-+}
-+
-+int main(int argc, char *argv[])
-+{
-+ pthread_t waiter;
-+ int res, ret = RET_PASS;
-+ struct timespec to;
-+ int c, i;
-+
-+ while ((c = getopt(argc, argv, "cht:v:")) != -1) {
-+ switch (c) {
-+ case 'c':
-+ log_color(1);
-+ break;
-+ case 'h':
-+ usage(basename(argv[0]));
-+ exit(0);
-+ case 'v':
-+ log_verbosity(atoi(optarg));
-+ break;
-+ default:
-+ usage(basename(argv[0]));
-+ exit(1);
-+ }
-+ }
-+
-+ ksft_print_header();
-+ ksft_set_plan(7);
-+ ksft_print_msg("%s: Test FUTEX_WAITV\n",
-+ basename(argv[0]));
-+
-+ for (i = 0; i < NR_FUTEXES; i++) {
-+ waitv[i].uaddr = (uintptr_t)&futexes[i];
-+ waitv[i].flags = FUTEX_32 | FUTEX_PRIVATE_FLAG;
-+ waitv[i].val = 0;
-+ waitv[i].__reserved = 0;
-+ }
-+
-+ /* Private waitv */
-+ if (pthread_create(&waiter, NULL, waiterfn, NULL))
-+ error("pthread_create failed\n", errno);
-+
-+ usleep(WAKE_WAIT_US);
-+
-+ res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, FUTEX_PRIVATE_FLAG);
-+ if (res != 1) {
-+ ksft_test_result_fail("futex_wake private returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
-+ ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_waitv private\n");
-+ }
-+
-+ /* Shared waitv */
-+ for (i = 0; i < NR_FUTEXES; i++) {
-+ int shm_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666);
-+
-+ if (shm_id < 0) {
-+ perror("shmget");
-+ exit(1);
-+ }
-+
-+ unsigned int *shared_data = shmat(shm_id, NULL, 0);
-+
-+ *shared_data = 0;
-+ waitv[i].uaddr = (uintptr_t)shared_data;
-+ waitv[i].flags = FUTEX_32;
-+ waitv[i].val = 0;
-+ waitv[i].__reserved = 0;
-+ }
-+
-+ if (pthread_create(&waiter, NULL, waiterfn, NULL))
-+ error("pthread_create failed\n", errno);
-+
-+ usleep(WAKE_WAIT_US);
-+
-+ res = futex_wake(u64_to_ptr(waitv[NR_FUTEXES - 1].uaddr), 1, 0);
-+ if (res != 1) {
-+ ksft_test_result_fail("futex_wake shared returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
-+ ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_waitv shared\n");
-+ }
-+
-+ for (i = 0; i < NR_FUTEXES; i++)
-+ shmdt(u64_to_ptr(waitv[i].uaddr));
-+
-+ /* Testing a waiter without FUTEX_32 flag */
-+ waitv[0].flags = FUTEX_PRIVATE_FLAG;
-+
-+ if (clock_gettime(CLOCK_MONOTONIC, &to))
-+ error("gettime64 failed\n", errno);
-+
-+ to.tv_sec++;
-+
-+ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC);
-+ if (res == EINVAL) {
-+ ksft_test_result_fail("futex_waitv private returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
-+ ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_waitv without FUTEX_32\n");
-+ }
-+
-+ /* Testing a waiter with an unaligned address */
-+ waitv[0].flags = FUTEX_PRIVATE_FLAG | FUTEX_32;
-+ waitv[0].uaddr = 1;
-+
-+ if (clock_gettime(CLOCK_MONOTONIC, &to))
-+ error("gettime64 failed\n", errno);
-+
-+ to.tv_sec++;
-+
-+ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC);
-+ if (res == EINVAL) {
-+		ksft_test_result_fail("futex_waitv private returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
-+ ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_waitv with an unaligned address\n");
-+ }
-+
-+ /* Testing a NULL address for waiters.uaddr */
-+ waitv[0].uaddr = 0x00000000;
-+
-+ if (clock_gettime(CLOCK_MONOTONIC, &to))
-+ error("gettime64 failed\n", errno);
-+
-+ to.tv_sec++;
-+
-+ res = futex_waitv(waitv, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC);
-+ if (res == EINVAL) {
-+ ksft_test_result_fail("futex_waitv private returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
-+ ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_waitv NULL address in waitv.uaddr\n");
-+ }
-+
-+ /* Testing a NULL address for *waiters */
-+ if (clock_gettime(CLOCK_MONOTONIC, &to))
-+ error("gettime64 failed\n", errno);
-+
-+ to.tv_sec++;
-+
-+ res = futex_waitv(NULL, NR_FUTEXES, 0, &to, CLOCK_MONOTONIC);
-+ if (res == EINVAL) {
-+ ksft_test_result_fail("futex_waitv private returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
-+ ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_waitv NULL address in *waiters\n");
-+ }
-+
-+ /* Testing an invalid clockid */
-+ if (clock_gettime(CLOCK_MONOTONIC, &to))
-+ error("gettime64 failed\n", errno);
-+
-+ to.tv_sec++;
-+
-+ res = futex_waitv(NULL, NR_FUTEXES, 0, &to, CLOCK_TAI);
-+ if (res == EINVAL) {
-+ ksft_test_result_fail("futex_waitv private returned: %d %s\n",
-+ res ? errno : res,
-+ res ? strerror(errno) : "");
-+ ret = RET_FAIL;
-+ } else {
-+ ksft_test_result_pass("futex_waitv invalid clockid\n");
-+ }
-+
-+ ksft_print_cnts();
-+ return ret;
-+}
-diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh
-index 11a9d6229..5ccd599da 100755
---- a/tools/testing/selftests/futex/functional/run.sh
-+++ b/tools/testing/selftests/futex/functional/run.sh
-@@ -79,3 +79,6 @@ echo
-
- echo
- ./futex_requeue $COLOR
-+
-+echo
-+./futex_waitv $COLOR
-diff --git a/tools/testing/selftests/futex/include/futex2test.h b/tools/testing/selftests/futex/include/futex2test.h
-new file mode 100644
-index 000000000..9d305520e
---- /dev/null
-+++ b/tools/testing/selftests/futex/include/futex2test.h
-@@ -0,0 +1,22 @@
-+/* SPDX-License-Identifier: GPL-2.0-or-later */
-+/*
-+ * Futex2 library addons for futex tests
-+ *
-+ * Copyright 2021 Collabora Ltd.
-+ */
-+#include <stdint.h>
-+
-+#define u64_to_ptr(x) ((void *)(uintptr_t)(x))
-+
-+/**
-+ * futex_waitv - Wait at multiple futexes, wake on any
-+ * @waiters: Array of waiters
-+ * @nr_waiters: Length of waiters array
-+ * @flags: Operation flags
-+ * @timo: Optional timeout for operation
-+ */
-+static inline int futex_waitv(volatile struct futex_waitv *waiters, unsigned long nr_waiters,
-+ unsigned long flags, struct timespec *timo, clockid_t clockid)
-+{
-+ return syscall(__NR_futex_waitv, waiters, nr_waiters, flags, timo, clockid);
-+}
---
-2.33.1.711.g9d530dc002
-
-