1 // SPDX-License-Identifier: GPL-2.0-or-later
3 #include <linux/slab.h>
4 #include <linux/sched/task.h>
7 #include "../locking/rtmutex_common.h"
/*
 * refill_pi_state_cache - ensure current->pi_state_cache holds a
 * pre-allocated futex_pi_state.
 *
 * Allocation uses GFP_KERNEL (may sleep), so callers run this before
 * entering the atomic futex paths, which later consume the cached
 * object via alloc_pi_state() without allocating.
 *
 * NOTE(review): this extraction is elided -- braces, the early return
 * and the kzalloc() failure check are not visible here; the visible
 * statements are kept verbatim.
 */
int refill_pi_state_cache(void)
	struct futex_pi_state *pi_state;

	/* Fast path: cache already primed by an earlier call. */
	if (likely(current->pi_state_cache))

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	/* A fresh pi_state starts life with a single reference. */
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;
/*
 * alloc_pi_state - hand out the pre-allocated pi_state from the cache.
 *
 * Must be paired with a prior successful refill_pi_state_cache() on the
 * same task.  The cache slot is cleared so the returned object has a
 * single owner (the caller).
 */
static struct futex_pi_state *alloc_pi_state(void)
	struct futex_pi_state *pi_state = current->pi_state_cache;

	current->pi_state_cache = NULL;
/*
 * pi_state_update_owner - rechain @pi_state from its old owner to
 * @new_owner.
 *
 * Unlinks pi_state->list from the old owner's pi_state_list, links it
 * onto @new_owner's list and publishes pi_state->owner.  Each per-task
 * pi_lock is taken around its list manipulation to serialize against
 * exit_pi_state_list() and the PI chain walk.
 *
 * pi_mutex.wait_lock must be held to stabilize pi_state->owner.
 *
 * NOTE(review): elided view -- the NULL guards around the old/new owner
 * branches are not visible here; confirm against the full source.
 */
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	/* Unhook from the previous owner's pi_state_list. */
	raw_spin_lock(&old_owner->pi_lock);
	WARN_ON(list_empty(&pi_state->list));
	list_del_init(&pi_state->list);
	raw_spin_unlock(&old_owner->pi_lock);

	/* Hook onto the new owner's list and publish the owner pointer. */
	raw_spin_lock(&new_owner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &new_owner->pi_state_list);
	pi_state->owner = new_owner;
	raw_spin_unlock(&new_owner->pi_lock);
/*
 * get_pi_state - acquire an additional reference on @pi_state.
 *
 * The caller must already hold a reference; a failing inc_not_zero
 * means the object was concurrently freed, which is a bug -- hence the
 * WARN rather than error handling.
 */
void get_pi_state(struct futex_pi_state *pi_state)
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
/*
 * put_pi_state - drops a reference to the pi_state object and frees or
 * caches it when the last reference is gone.
 *
 * On the final put: if an owner is still attached, the pi_state is
 * detached from it and the proxy-locked rt_mutex is released under
 * pi_mutex.wait_lock.  The object is then recycled into
 * current->pi_state_cache (the free path for an already-primed cache is
 * elided from this view).
 */
void put_pi_state(struct futex_pi_state *pi_state)
	if (!refcount_dec_and_test(&pi_state->refcount))

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);

	if (current->pi_state_cache) {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
113 * We need to check the following states:
115 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
117 * [1] NULL | --- | --- | 0 | 0/1 | Valid
118 * [2] NULL | --- | --- | >0 | 0/1 | Valid
120 * [3] Found | NULL | -- | Any | 0/1 | Invalid
122 * [4] Found | Found | NULL | 0 | 1 | Valid
123 * [5] Found | Found | NULL | >0 | 1 | Invalid
125 * [6] Found | Found | task | 0 | 1 | Valid
127 * [7] Found | Found | NULL | Any | 0 | Invalid
129 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
130 * [9] Found | Found | task | 0 | 0 | Invalid
131 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
133 * [1] Indicates that the kernel can acquire the futex atomically. We
134 * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
136 * [2] Valid, if TID does not belong to a kernel thread. If no matching
137 * thread is found then it indicates that the owner TID has died.
139 * [3] Invalid. The waiter is queued on a non PI futex
141 * [4] Valid state after exit_robust_list(), which sets the user space
142 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
144 * [5] The user space value got manipulated between exit_robust_list()
145 * and exit_pi_state_list()
147 * [6] Valid state after exit_pi_state_list() which sets the new owner in
148 * the pi_state but cannot access the user space value.
150 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
152 * [8] Owner and user space value match
154 * [9] There is no transient state which sets the user space TID to 0
155 * except exit_robust_list(), but this is indicated by the
156 * FUTEX_OWNER_DIED bit. See [4]
158 * [10] There is no transient state which leaves owner and user space
159 * TID out of sync. Except one error case where the kernel is denied
160 * write access to the user address, see fixup_pi_state_owner().
163 * Serialization and lifetime rules:
167 * hb -> futex_q, relation
168 * futex_q -> pi_state, relation
170 * (cannot be raw because hb can contain arbitrary amount
173 * pi_mutex->wait_lock:
177 * (and pi_mutex 'obviously')
181 * p->pi_state_list -> pi_state->list, relation
182 * pi_mutex->owner -> pi_state->owner, relation
184 * pi_state->refcount:
192 * pi_mutex->wait_lock
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it and return it via @ps.
 *
 * Implements the state table above: rejects a missing pi_state [3],
 * inconsistent OWNER_DIED combinations [5]/[7] and owner-TID mismatches
 * [9]/[10]; accepts and takes a reference for [4], [6] and [8].
 *
 * NOTE(review): elided view -- the error returns / labels between the
 * checks are not visible; the visible statements are kept verbatim.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
	pid_t pid = uval & FUTEX_TID_MASK;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			/*
			 * Take a ref on the state and return success. [4]
			 */

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */

	/*
	 * If the owner died bit is not set, then the pi_state
	 * must have an owner. [7]
	 */
	if (!pi_state->owner)

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))

	/* Success path: take a reference and publish via @ps. */
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	/* Error path (elided labels): drop wait_lock before returning. */
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
/*
 * handle_exit_race - decide how to treat an (allegedly) exiting owner.
 *
 * Returns -EBUSY while the owner has not yet reached FUTEX_STATE_DEAD
 * (caller must wait and retry); otherwise re-reads the futex word to
 * distinguish a race with exit_robust_list() (value changed -> retry,
 * elided here) from genuinely bogus user state (-ESRCH, elided).
 */
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *				     ...
	 *   tsk->futex_state =		     } else {
	 *	FUTEX_STATE_DEAD;	      if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				       ...
	 *				       return -ESRCH; <--- FAIL
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))

	/* If the user space value has changed, try again. */

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
/*
 * __attach_to_pi_owner - create a fresh pi_state owned by @p for @key
 * and return it via @ps.
 *
 * Consumes the task's cached pi_state (alloc_pi_state()), initializes
 * the rt_mutex in proxy-locked state with @p as owner, records @key for
 * exit-time cleanup and links the pi_state onto @p's pi_state_list.
 * Caller holds hb->lock (and, per call sites, p->pi_lock), so the not
 * yet published object needs no wait_lock protection.
 */
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);

	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 *
 * Rejects TID 0 and kernel threads (elided returns), handles an exiting
 * owner via handle_exit_race(), and otherwise creates/attaches a new
 * pi_state via __attach_to_pi_owner() under p->pi_lock.  On -EBUSY the
 * exiting task's reference is handed back through @exiting.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	p = find_get_task_by_vpid(pid);
	/* No such task: treat as a possible exit race (tsk == NULL). */
		return handle_exit_race(uaddr, uval, NULL);

	/* A PI futex can never be owned by a kernel thread. */
	if (unlikely(p->flags & PF_KTHREAD)) {

	/*
	 * We need to look at the task state to figure out, whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);
/*
 * lock_pi_update_atomic - cmpxchg the futex word from @uval to @newval.
 *
 * Returns 0 on success, -EAGAIN when the user space value changed under
 * us (caller retries); fault-injection and cmpxchg error handling are
 * elided from this view.
 */
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
	if (unlikely(should_fail_futex(true)))

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error (elided in this view)
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 *
 * NOTE(review): elided view -- the @set_waiters parameter line and
 * several returns/branch bodies are not visible; code kept verbatim.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))

	if (unlikely(should_fail_futex(true)))

	/* Deadlock check: we already own this futex in user space. */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))

	if ((unlikely(should_fail_futex(true))))

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;

		/* The futex requeue_pi code can enforce the waiters bit */
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);

	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
/*
 * wake_futex_pi - hand the futex over to the top rt_mutex waiter.
 *
 * Writes the new owner's TID (plus FUTEX_WAITERS) into the user space
 * word via cmpxchg, rechains the pi_state to the new owner and unlocks
 * the rt_mutex, waking the new owner after dropping wait_lock.
 *
 * Caller must hold a reference on @pi_state and pi_mutex.wait_lock;
 * wait_lock is dropped before returning.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)

	/*
	 * This is a point of no return; once we modified the uval
	 * there is no going back and subsequent operations must
	 * not fail.
	 */
	pi_state_update_owner(pi_state, new_owner);
	postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);

	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	/* Wake the new owner outside of wait_lock. */
		rt_mutex_postunlock(&wqh);
/*
 * __fixup_pi_state_owner - make pi_state->owner and the user space TID
 * agree with the actual rt_mutex owner after a lock steal or hand-off.
 *
 * Returns 1 when @argowner (== current) ended up owning the lock,
 * 0 otherwise; error returns are elided from this view.  Runs with
 * q->lock_ptr and pi_mutex.wait_lock held; both are dropped and retaken
 * around the user space fault handling.
 */
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
	if (oldowner != current) {
		/*
		 * We raced against a concurrent self; things are
		 * already fixed up. Nothing to do.
		 */

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {

		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died: preserve the OWNER_DIED semantics in the new TID. */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);

	newval = (uval & FUTEX_OWNER_DIED) | newtid;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	err = fault_in_user_writeable(uaddr);

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault in succeeded */

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
/*
 * fixup_pi_state_owner - locked wrapper around __fixup_pi_state_owner().
 *
 * Takes and releases pi_mutex.wait_lock around the fixup; the caller
 * must hold q->lock_ptr (asserted below).
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
	struct futex_pi_state *pi_state = q->pi_state;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
	/*
	 * Got the lock. We might not be the anticipated owner if we
	 * did a lock-steal - fix up the PI-state in that case:
	 *
	 * Speculative pi_state->owner read (we don't hold wait_lock);
	 * since we own the lock pi_state->owner == current is the
	 * stable state, anything else needs more attention.
	 */
	if (q->pi_state->owner != current)
		return fixup_pi_state_owner(uaddr, q, current);

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);
/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as futex trylock_pi()'ing, and due semantics.
 *
 * NOTE(review): elided view -- labels (retry/cleanup/uaddr_faulted and
 * friends), gotos and several branch bodies are not visible; the
 * visible statements are kept verbatim.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))

	/* Pre-allocate pi_state while we may still sleep. */
	if (refill_pi_state_cache())

	to = futex_setup_timer(time, &timeout, flags, 0);

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))

	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
			/* We got the lock. */
			goto out_unlock_put_key;
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			goto out_unlock_put_key;

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	/* Trylock mode: single attempt, never block on the rtmutex. */
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even through we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

	spin_lock(q.lock_ptr);
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * first acquire the hb->lock before removing the lock from the
	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
	 * lists consistent.
	 *
	 * In particular; it is important that futex_unlock_pi() can not
	 * observe this inconsistency.
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))

	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);

		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	/* A pending signal restarts the syscall after signal handling. */
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

	/* Fault path (elided label): make the page writable and retry. */
	ret = fault_in_user_writeable(uaddr);

	if (!(flags & FLAGS_SHARED))
/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 *
 * NOTE(review): elided view -- the function continues past the visible
 * end (retry/fault labels) and several returns are missing; visible
 * statements are kept verbatim.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))

	if (get_user(uval, uaddr))
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);

	hb = futex_hash(&key);
	spin_lock(&hb->lock);

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
		struct futex_pi_state *pi_state = top_waiter->pi_state;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)

		get_pi_state(pi_state);
		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and therefore
		 * wake_futex_pi() must observe a state consistent with what we
		 * observed.
		 *
		 * In particular; this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter. Also see the WARN in wake_futex_pi().
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We do neither
	 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
	 * owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

	spin_unlock(&hb->lock);

	/* Fault path (elided label): fix up the mapping and retry. */
	ret = fault_in_user_writeable(uaddr);