Merge patch series "bpf, riscv: use BPF prog pack allocator in BPF JIT"
[platform/kernel/linux-rpi.git] / kernel / futex / pi.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2
3 #include <linux/slab.h>
4 #include <linux/sched/task.h>
5
6 #include "futex.h"
7 #include "../locking/rtmutex_common.h"
8
9 /*
10  * PI code:
11  */
12 int refill_pi_state_cache(void)
13 {
14         struct futex_pi_state *pi_state;
15
16         if (likely(current->pi_state_cache))
17                 return 0;
18
19         pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
20
21         if (!pi_state)
22                 return -ENOMEM;
23
24         INIT_LIST_HEAD(&pi_state->list);
25         /* pi_mutex gets initialized later */
26         pi_state->owner = NULL;
27         refcount_set(&pi_state->refcount, 1);
28         pi_state->key = FUTEX_KEY_INIT;
29
30         current->pi_state_cache = pi_state;
31
32         return 0;
33 }
34
35 static struct futex_pi_state *alloc_pi_state(void)
36 {
37         struct futex_pi_state *pi_state = current->pi_state_cache;
38
39         WARN_ON(!pi_state);
40         current->pi_state_cache = NULL;
41
42         return pi_state;
43 }
44
45 static void pi_state_update_owner(struct futex_pi_state *pi_state,
46                                   struct task_struct *new_owner)
47 {
48         struct task_struct *old_owner = pi_state->owner;
49
50         lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
51
52         if (old_owner) {
53                 raw_spin_lock(&old_owner->pi_lock);
54                 WARN_ON(list_empty(&pi_state->list));
55                 list_del_init(&pi_state->list);
56                 raw_spin_unlock(&old_owner->pi_lock);
57         }
58
59         if (new_owner) {
60                 raw_spin_lock(&new_owner->pi_lock);
61                 WARN_ON(!list_empty(&pi_state->list));
62                 list_add(&pi_state->list, &new_owner->pi_state_list);
63                 pi_state->owner = new_owner;
64                 raw_spin_unlock(&new_owner->pi_lock);
65         }
66 }
67
68 void get_pi_state(struct futex_pi_state *pi_state)
69 {
70         WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
71 }
72
73 /*
74  * Drops a reference to the pi_state object and frees or caches it
75  * when the last reference is gone.
76  */
77 void put_pi_state(struct futex_pi_state *pi_state)
78 {
79         if (!pi_state)
80                 return;
81
82         if (!refcount_dec_and_test(&pi_state->refcount))
83                 return;
84
85         /*
86          * If pi_state->owner is NULL, the owner is most probably dying
87          * and has cleaned up the pi_state already
88          */
89         if (pi_state->owner) {
90                 unsigned long flags;
91
92                 raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
93                 pi_state_update_owner(pi_state, NULL);
94                 rt_mutex_proxy_unlock(&pi_state->pi_mutex);
95                 raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
96         }
97
98         if (current->pi_state_cache) {
99                 kfree(pi_state);
100         } else {
101                 /*
102                  * pi_state->list is already empty.
103                  * clear pi_state->owner.
104                  * refcount is at 0 - put it back to 1.
105                  */
106                 pi_state->owner = NULL;
107                 refcount_set(&pi_state->refcount, 1);
108                 current->pi_state_cache = pi_state;
109         }
110 }
111
112 /*
113  * We need to check the following states:
114  *
115  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
116  *
117  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
118  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
119  *
120  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
121  *
122  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
123  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
124  *
125  * [6]  Found  | Found    | task      | 0         | 1      | Valid
126  *
127  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
128  *
129  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
130  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
131  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
132  *
133  * [1]  Indicates that the kernel can acquire the futex atomically. We
134  *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
135  *
136  * [2]  Valid, if TID does not belong to a kernel thread. If no matching
137  *      thread is found then it indicates that the owner TID has died.
138  *
139  * [3]  Invalid. The waiter is queued on a non-PI futex.
140  *
141  * [4]  Valid state after exit_robust_list(), which sets the user space
142  *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
143  *
144  * [5]  The user space value got manipulated between exit_robust_list()
145  *      and exit_pi_state_list()
146  *
147  * [6]  Valid state after exit_pi_state_list() which sets the new owner in
148  *      the pi_state but cannot access the user space value.
149  *
150  * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
151  *
152  * [8]  Owner and user space value match
153  *
154  * [9]  There is no transient state which sets the user space TID to 0
155  *      except exit_robust_list(), but this is indicated by the
156  *      FUTEX_OWNER_DIED bit. See [4]
157  *
158  * [10] There is no transient state which leaves owner and user space
159  *      TID out of sync. Except one error case where the kernel is denied
160  *      write access to the user address, see fixup_pi_state_owner().
161  *
162  *
163  * Serialization and lifetime rules:
164  *
165  * hb->lock:
166  *
167  *      hb -> futex_q, relation
168  *      futex_q -> pi_state, relation
169  *
170  *      (cannot be raw because hb can contain an arbitrary number
171  *       of futex_q's)
172  *
173  * pi_mutex->wait_lock:
174  *
175  *      {uval, pi_state}
176  *
177  *      (and pi_mutex 'obviously')
178  *
179  * p->pi_lock:
180  *
181  *      p->pi_state_list -> pi_state->list, relation
182  *      pi_mutex->owner -> pi_state->owner, relation
183  *
184  * pi_state->refcount:
185  *
186  *      pi_state lifetime
187  *
188  *
189  * Lock order:
190  *
191  *   hb->lock
192  *     pi_mutex->wait_lock
193  *       p->pi_lock
194  *
195  */
196
197 /*
198  * Validate that the existing waiter has a pi_state and sanity check
199  * the pi_state against the user space value. If correct, attach to
200  * it.
201  */
202 static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
203                               struct futex_pi_state *pi_state,
204                               struct futex_pi_state **ps)
205 {
206         pid_t pid = uval & FUTEX_TID_MASK;
207         u32 uval2;
208         int ret;
209
210         /*
211          * Userspace might have messed up non-PI and PI futexes [3]
212          */
213         if (unlikely(!pi_state))
214                 return -EINVAL;
215
216         /*
217          * We get here with hb->lock held, and having found a
218          * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
219          * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
220          * which in turn means that futex_lock_pi() still has a reference on
221          * our pi_state.
222          *
223          * The waiter holding a reference on @pi_state also protects against
224          * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
225          * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
226          * free pi_state before we can take a reference ourselves.
227          */
228         WARN_ON(!refcount_read(&pi_state->refcount));
229
230         /*
231          * Now that we have a pi_state, we can acquire wait_lock
232          * and do the state validation.
233          */
234         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
235
236         /*
237          * Since {uval, pi_state} is serialized by wait_lock, and our current
238          * uval was read without holding it, it can have changed. Verify it
239          * still is what we expect it to be, otherwise retry the entire
240          * operation.
241          */
242         if (futex_get_value_locked(&uval2, uaddr))
243                 goto out_efault;
244
245         if (uval != uval2)
246                 goto out_eagain;
247
248         /*
249          * Handle the owner died case:
250          */
251         if (uval & FUTEX_OWNER_DIED) {
252                 /*
253                  * exit_pi_state_list sets owner to NULL and wakes the
254                  * topmost waiter. The task which acquires the
255                  * pi_state->rt_mutex will fixup owner.
256                  */
257                 if (!pi_state->owner) {
258                         /*
259                          * No pi state owner, but the user space TID
260                          * is not 0. Inconsistent state. [5]
261                          */
262                         if (pid)
263                                 goto out_einval;
264                         /*
265                          * Take a ref on the state and return success. [4]
266                          */
267                         goto out_attach;
268                 }
269
270                 /*
271                  * If TID is 0, then either the dying owner has not
272                  * yet executed exit_pi_state_list() or some waiter
273                  * acquired the rtmutex in the pi state, but did not
274                  * yet fixup the TID in user space.
275                  *
276                  * Take a ref on the state and return success. [6]
277                  */
278                 if (!pid)
279                         goto out_attach;
280         } else {
281                 /*
282                  * If the owner died bit is not set, then the pi_state
283                  * must have an owner. [7]
284                  */
285                 if (!pi_state->owner)
286                         goto out_einval;
287         }
288
289         /*
290          * Bail out if user space manipulated the futex value. If pi
291          * state exists then the owner TID must be the same as the
292          * user space TID. [9/10]
293          */
294         if (pid != task_pid_vnr(pi_state->owner))
295                 goto out_einval;
296
297 out_attach:
298         get_pi_state(pi_state);
299         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
300         *ps = pi_state;
301         return 0;
302
303 out_einval:
304         ret = -EINVAL;
305         goto out_error;
306
307 out_eagain:
308         ret = -EAGAIN;
309         goto out_error;
310
311 out_efault:
312         ret = -EFAULT;
313         goto out_error;
314
315 out_error:
316         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
317         return ret;
318 }
319
320 static int handle_exit_race(u32 __user *uaddr, u32 uval,
321                             struct task_struct *tsk)
322 {
323         u32 uval2;
324
325         /*
326          * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
327          * caller that the alleged owner is busy.
328          */
329         if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
330                 return -EBUSY;
331
332         /*
333          * Reread the user space value to handle the following situation:
334          *
335          * CPU0                         CPU1
336          *
337          * sys_exit()                   sys_futex()
338          *  do_exit()                    futex_lock_pi()
339          *                                futex_lock_pi_atomic()
340          *   exit_signals(tsk)              No waiters:
341          *    tsk->flags |= PF_EXITING;     *uaddr == 0x00000PID
342          *  mm_release(tsk)                 Set waiter bit
343          *   exit_robust_list(tsk) {        *uaddr = 0x80000PID;
344          *      Set owner died              attach_to_pi_owner() {
345          *    *uaddr = 0xC0000000;           tsk = get_task(PID);
346          *   }                               if (!tsk->flags & PF_EXITING) {
347          *  ...                                attach();
348          *  tsk->futex_state =               } else {
349          *      FUTEX_STATE_DEAD;              if (tsk->futex_state !=
350          *                                        FUTEX_STATE_DEAD)
351          *                                       return -EAGAIN;
352          *                                     return -ESRCH; <--- FAIL
353          *                                   }
354          *
355          * Returning ESRCH unconditionally is wrong here because the
356          * user space value has been changed by the exiting task.
357          *
358          * The same logic applies to the case where the exiting task is
359          * already gone.
360          */
361         if (futex_get_value_locked(&uval2, uaddr))
362                 return -EFAULT;
363
364         /* If the user space value has changed, try again. */
365         if (uval2 != uval)
366                 return -EAGAIN;
367
368         /*
369          * The exiting task did not have a robust list, the robust list was
370          * corrupted or the user space value in *uaddr is simply bogus.
371          * Give up and tell user space.
372          */
373         return -ESRCH;
374 }
375
376 static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
377                                  struct futex_pi_state **ps)
378 {
379         /*
380          * No existing pi state. First waiter. [2]
381          *
382          * This creates pi_state, we have hb->lock held, this means nothing can
383          * observe this state, wait_lock is irrelevant.
384          */
385         struct futex_pi_state *pi_state = alloc_pi_state();
386
387         /*
388          * Initialize the pi_mutex in locked state and make @p
389          * the owner of it:
390          */
391         rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
392
393         /* Store the key for possible exit cleanups: */
394         pi_state->key = *key;
395
396         WARN_ON(!list_empty(&pi_state->list));
397         list_add(&pi_state->list, &p->pi_state_list);
398         /*
399          * Assignment without holding pi_state->pi_mutex.wait_lock is safe
400          * because there is no concurrency as the object is not published yet.
401          */
402         pi_state->owner = p;
403
404         *ps = pi_state;
405 }
406 /*
407  * Lookup the task for the TID provided from user space and attach to
408  * it after doing proper sanity checks.
409  */
410 static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
411                               struct futex_pi_state **ps,
412                               struct task_struct **exiting)
413 {
414         pid_t pid = uval & FUTEX_TID_MASK;
415         struct task_struct *p;
416
417         /*
418          * We are the first waiter - try to look up the real owner and attach
419          * the new pi_state to it, but bail out when TID = 0 [1]
420          *
421          * The !pid check is paranoid. None of the call sites should end up
422          * with pid == 0, but better safe than sorry. Let the caller retry
423          */
424         if (!pid)
425                 return -EAGAIN;
426         p = find_get_task_by_vpid(pid);
427         if (!p)
428                 return handle_exit_race(uaddr, uval, NULL);
429
430         if (unlikely(p->flags & PF_KTHREAD)) {
431                 put_task_struct(p);
432                 return -EPERM;
433         }
434
435         /*
436          * We need to look at the task state to figure out, whether the
437          * task is exiting. To protect against the change of the task state
438          * in futex_exit_release(), we do this protected by p->pi_lock:
439          */
440         raw_spin_lock_irq(&p->pi_lock);
441         if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
442                 /*
443                  * The task is on the way out. When the futex state is
444                  * FUTEX_STATE_DEAD, we know that the task has finished
445                  * the cleanup:
446                  */
447                 int ret = handle_exit_race(uaddr, uval, p);
448
449                 raw_spin_unlock_irq(&p->pi_lock);
450                 /*
451                  * If the owner task is between FUTEX_STATE_EXITING and
452                  * FUTEX_STATE_DEAD then store the task pointer and keep
453                  * the reference on the task struct. The calling code will
454                  * drop all locks, wait for the task to reach
455                  * FUTEX_STATE_DEAD and then drop the refcount. This is
456                  * required to prevent a live lock when the current task
457                  * preempted the exiting task between the two states.
458                  */
459                 if (ret == -EBUSY)
460                         *exiting = p;
461                 else
462                         put_task_struct(p);
463                 return ret;
464         }
465
466         __attach_to_pi_owner(p, key, ps);
467         raw_spin_unlock_irq(&p->pi_lock);
468
469         put_task_struct(p);
470
471         return 0;
472 }
473
474 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
475 {
476         int err;
477         u32 curval;
478
479         if (unlikely(should_fail_futex(true)))
480                 return -EFAULT;
481
482         err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
483         if (unlikely(err))
484                 return err;
485
486         /* If user space value changed, let the caller retry */
487         return curval != uval ? -EAGAIN : 0;
488 }
489
490 /**
491  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
492  * @uaddr:              the pi futex user address
493  * @hb:                 the pi futex hash bucket
494  * @key:                the futex key associated with uaddr and hb
495  * @ps:                 the pi_state pointer where we store the result of the
496  *                      lookup
497  * @task:               the task to perform the atomic lock work for.  This will
498  *                      be "current" except in the case of requeue pi.
499  * @exiting:            Pointer to store the task pointer of the owner task
500  *                      which is in the middle of exiting
501  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
502  *
503  * Return:
504  *  -  0 - ready to wait;
505  *  -  1 - acquired the lock;
506  *  - <0 - error
     *
     * Errors produced directly by this function are -EFAULT (the user space
     * value could not be read) and -EDEADLK (the TID in the futex word is
     * @task's own). All other negative values are passed through unchanged
     * from attach_to_pi_state(), lock_pi_update_atomic() or
     * attach_to_pi_owner().
     *
     * @ps is written when 0 is returned, and when 1 is returned with
     * @set_waiters set. When 1 is returned without @set_waiters, @ps is
     * left untouched.
507  *
508  * The hb->lock must be held by the caller.
509  *
510  * @exiting is only set when the return value is -EBUSY. If so, this holds
511  * a refcount on the exiting task on return and the caller needs to drop it
512  * after waiting for the exit to complete.
513  */
514 int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
515                          union futex_key *key,
516                          struct futex_pi_state **ps,
517                          struct task_struct *task,
518                          struct task_struct **exiting,
519                          int set_waiters)
520 {
521         u32 uval, newval, vpid = task_pid_vnr(task);
522         struct futex_q *top_waiter;
523         int ret;
524
525         /*
526          * Read the user space value first so we can validate a few
527          * things before proceeding further.
528          */
529         if (futex_get_value_locked(&uval, uaddr))
530                 return -EFAULT;
531
            /*
             * NOTE(review): should_fail_futex() is defined outside this file;
             * presumably a fault-injection hook. Here it forces the same
             * -EFAULT result as a failed read above.
             */
532         if (unlikely(should_fail_futex(true)))
533                 return -EFAULT;
534
535         /*
536          * Detect deadlocks.
537          */
538         if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
539                 return -EDEADLK;
540
            /* Same hook as above, this time forcing the -EDEADLK result. */
541         if ((unlikely(should_fail_futex(true))))
542                 return -EDEADLK;
543
544         /*
545          * Lookup existing state first. If it exists, try to attach to
546          * its pi_state.
            *
            * attach_to_pi_state() returns 0 or a negative error, so the
            * "acquired" result (1) is never produced on this path.
547          */
548         top_waiter = futex_top_waiter(hb, key);
549         if (top_waiter)
550                 return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
551
552         /*
553          * No waiter and user TID is 0. We are here because the
554          * waiters or the owner died bit is set or called from
555          * requeue_cmp_pi or for whatever reason something took the
556          * syscall.
557          */
558         if (!(uval & FUTEX_TID_MASK)) {
559                 /*
560                  * We take over the futex. No other waiters and the user space
561                  * TID is 0. We preserve the owner died bit.
562                  */
563                 newval = uval & FUTEX_OWNER_DIED;
564                 newval |= vpid;
565
566                 /* The futex requeue_pi code can enforce the waiters bit */
567                 if (set_waiters)
568                         newval |= FUTEX_WAITERS;
569
570                 ret = lock_pi_update_atomic(uaddr, uval, newval);
571                 if (ret)
572                         return ret;
573
574                 /*
575                  * If the waiter bit was requested the caller also needs PI
576                  * state attached to the new owner of the user space futex.
577                  *
578                  * @task is guaranteed to be alive and it cannot be exiting
579                  * because it is either sleeping or waiting in
580                  * futex_requeue_pi_wakeup_sync().
581                  *
582                  * No need to do the full attach_to_pi_owner() exercise
583                  * because @task is known and valid.
584                  */
585                 if (set_waiters) {
586                         raw_spin_lock_irq(&task->pi_lock);
587                         __attach_to_pi_owner(task, key, ps);
588                         raw_spin_unlock_irq(&task->pi_lock);
589                 }
                    /* The futex word now carries @task's TID: lock acquired. */
590                 return 1;
591         }
592
593         /*
594          * First waiter. Set the waiters bit before attaching ourself to
595          * the owner. If owner tries to unlock, it will be forced into
596          * the kernel and blocked on hb->lock.
597          */
598         newval = uval | FUTEX_WAITERS;
599         ret = lock_pi_update_atomic(uaddr, uval, newval);
600         if (ret)
601                 return ret;
602         /*
603          * If the update of the user space value succeeded, we try to
604          * attach to the owner. If that fails, no harm done, we only
605          * set the FUTEX_WAITERS bit in the user space variable.
            *
            * newval, not uval, is passed on: it is the value just written to
            * user space, and attach_to_pi_owner() hands it to
            * handle_exit_race() as the expected content for the re-read.
606          */
607         return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
608 }
609
610 /*
611  * Caller must hold a reference on @pi_state.
612  */
613 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
614 {
615         struct rt_mutex_waiter *top_waiter;
616         struct task_struct *new_owner;
617         bool postunlock = false;
618         DEFINE_RT_WAKE_Q(wqh);
619         u32 curval, newval;
620         int ret = 0;
621
622         top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
623         if (WARN_ON_ONCE(!top_waiter)) {
624                 /*
625                  * As per the comment in futex_unlock_pi() this should not happen.
626                  *
627                  * When this happens, give up our locks and try again, giving
628                  * the futex_lock_pi() instance time to complete, either by
629                  * waiting on the rtmutex or removing itself from the futex
630                  * queue.
631                  */
632                 ret = -EAGAIN;
633                 goto out_unlock;
634         }
635
636         new_owner = top_waiter->task;
637
638         /*
639          * We pass it to the next owner. The WAITERS bit is always kept
640          * enabled while there is PI state around. We cleanup the owner
641          * died bit, because we are the owner.
642          */
643         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
644
645         if (unlikely(should_fail_futex(true))) {
646                 ret = -EFAULT;
647                 goto out_unlock;
648         }
649
650         ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
651         if (!ret && (curval != uval)) {
652                 /*
653                  * If a unconditional UNLOCK_PI operation (user space did not
654                  * try the TID->0 transition) raced with a waiter setting the
655                  * FUTEX_WAITERS flag between get_user() and locking the hash
656                  * bucket lock, retry the operation.
657                  */
658                 if ((FUTEX_TID_MASK & curval) == uval)
659                         ret = -EAGAIN;
660                 else
661                         ret = -EINVAL;
662         }
663
664         if (!ret) {
665                 /*
666                  * This is a point of no return; once we modified the uval
667                  * there is no going back and subsequent operations must
668                  * not fail.
669                  */
670                 pi_state_update_owner(pi_state, new_owner);
671                 postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
672         }
673
674 out_unlock:
675         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
676
677         if (postunlock)
678                 rt_mutex_postunlock(&wqh);
679
680         return ret;
681 }
682
/*
 * Bring pi_state->owner and the TID in the user space futex word in line
 * with the actual owner of the rtmutex.
 *
 * @uaddr:      user address of the futex
 * @q:          futex_q providing the pi_state and the hash bucket lock
 * @argowner:   current when current took the lock, NULL when the lock was
 *              taken away from current
 *
 * Entered and left with pi_state->pi_mutex.wait_lock and *q->lock_ptr held.
 * Both locks are dropped and re-taken on the handle_err path, so the
 * pi_state may have been changed by another task in between.
 *
 * Returns 1 when current owns the lock as far as this function can tell,
 * 0 when it does not, or the non-zero error left over from the handle_err
 * path (the result of fault_in_user_writeable(), or an unexpected error
 * code passed through unchanged).
 */
683 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
684                                   struct task_struct *argowner)
685 {
686         struct futex_pi_state *pi_state = q->pi_state;
687         struct task_struct *oldowner, *newowner;
688         u32 uval, curval, newval, newtid;
689         int err = 0;
690
            /*
             * Snapshot of the owner on entry. It is not refreshed on retry and
             * serves as the reference for detecting a concurrent fixup after
             * the locks were dropped.
             */
691         oldowner = pi_state->owner;
692
693         /*
694          * We are here because either:
695          *
696          *  - we stole the lock and pi_state->owner needs updating to reflect
697          *    that (@argowner == current),
698          *
699          * or:
700          *
701          *  - someone stole our lock and we need to fix things to point to the
702          *    new owner (@argowner == NULL).
703          *
704          * Either way, we have to replace the TID in the user space variable.
705          * This must be atomic as we have to preserve the owner died bit here.
706          *
707          * Note: We write the user space value _before_ changing the pi_state
708          * because we can fault here. Imagine swapped out pages or a fork
709          * that marked all the anonymous memory readonly for cow.
710          *
711          * Modifying pi_state _before_ the user space value would leave the
712          * pi_state in an inconsistent state when we fault here, because we
713          * need to drop the locks to handle the fault. This might be observed
714          * in the PID checks when attaching to PI state.
715          */
716 retry:
717         if (!argowner) {
718                 if (oldowner != current) {
719                         /*
720                          * We raced against a concurrent self; things are
721                          * already fixed up. Nothing to do.
722                          */
723                         return 0;
724                 }
725
726                 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
727                         /* We got the lock. pi_state is correct. Tell caller. */
728                         return 1;
729                 }
730
731                 /*
732                  * The trylock just failed, so either there is an owner or
733                  * there is a higher priority waiter than this one.
734                  */
735                 newowner = rt_mutex_owner(&pi_state->pi_mutex);
736                 /*
737                  * If the higher priority waiter has not yet taken over the
738                  * rtmutex then newowner is NULL. We can't return here with
739                  * that state because it's inconsistent vs. the user space
740                  * state. So drop the locks and try again. It's a valid
741                  * situation and not any different from the other retry
742                  * conditions.
743                  */
744                 if (unlikely(!newowner)) {
745                         err = -EAGAIN;
746                         goto handle_err;
747                 }
748         } else {
749                 WARN_ON_ONCE(argowner != current);
750                 if (oldowner == current) {
751                         /*
752                          * We raced against a concurrent self; things are
753                          * already fixed up. Nothing to do.
754                          */
755                         return 1;
756                 }
757                 newowner = argowner;
758         }
759
            /* The WAITERS bit is always part of the value written back. */
760         newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
761         /* Owner died? */
762         if (!pi_state->owner)
763                 newtid |= FUTEX_OWNER_DIED;
764
765         err = futex_get_value_locked(&uval, uaddr);
766         if (err)
767                 goto handle_err;
768
            /*
             * Compare-and-exchange loop: keep the FUTEX_OWNER_DIED bit found in
             * user space, replace everything else with newtid, and start over
             * from the freshly observed value whenever the word changed
             * underneath us.
             */
769         for (;;) {
770                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
771
772                 err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
773                 if (err)
774                         goto handle_err;
775
776                 if (curval == uval)
777                         break;
778                 uval = curval;
779         }
780
781         /*
782          * We fixed up user space. Now we need to fix the pi_state
783          * itself.
784          */
785         pi_state_update_owner(pi_state, newowner);
786
            /* 1 if we recorded ourselves as owner, 0 if we recorded someone else. */
787         return argowner == current;
788
789         /*
790          * In order to reschedule or handle a page fault, we need to drop the
791          * locks here. In the case of a fault, this gives the other task
792          * (either the highest priority waiter itself or the task which stole
793          * the rtmutex) the chance to try the fixup of the pi_state. So once we
794          * are back from handling the fault we need to check the pi_state after
795          * reacquiring the locks and before trying to do another fixup. When
796          * the fixup has been done already we simply return.
797          *
798          * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
799          * drop hb->lock since the caller owns the hb -> futex_q relation.
800          * Dropping the pi_mutex->wait_lock requires the state revalidate.
801          */
802 handle_err:
803         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
804         spin_unlock(q->lock_ptr);
805
806         switch (err) {
807         case -EFAULT:
                    /* err becomes the outcome of the fault-in attempt. */
808                 err = fault_in_user_writeable(uaddr);
809                 break;
810
811         case -EAGAIN:
812                 cond_resched();
813                 err = 0;
814                 break;
815
816         default:
                    /*
                     * Unexpected error code: warn once and leave err as is, so it
                     * is handled like a failed fault-in further down.
                     */
817                 WARN_ON_ONCE(1);
818                 break;
819         }
820
            /* Re-take the locks in the same order as on entry: hb lock first. */
821         spin_lock(q->lock_ptr);
822         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
823
824         /*
825          * Check if someone else fixed it for us:
826          */
827         if (pi_state->owner != oldowner)
828                 return argowner == current;
829
830         /* Retry if err was -EAGAIN or the fault in succeeded */
831         if (!err)
832                 goto retry;
833
834         /*
835          * fault_in_user_writeable() failed so user state is immutable. At
836          * best we can make the kernel state consistent but user state will
837          * be most likely hosed and any subsequent unlock operation will be
838          * rejected due to PI futex rule [10].
839          *
840          * Ensure that the rtmutex owner is also the pi_state owner despite
841          * the user space value claiming something different. There is no
842          * point in unlocking the rtmutex if current is the owner as it
843          * would need to wait until the next waiter has taken the rtmutex
844          * to guarantee consistent state. Keep it simple. Userspace asked
845          * for this wreckaged state.
846          *
847          * The rtmutex has an owner - either current or some other
848          * task. See the EAGAIN loop above.
849          */
850         pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
851
852         return err;
853 }
854
855 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
856                                 struct task_struct *argowner)
857 {
858         struct futex_pi_state *pi_state = q->pi_state;
859         int ret;
860
861         lockdep_assert_held(q->lock_ptr);
862
863         raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
864         ret = __fixup_pi_state_owner(uaddr, q, argowner);
865         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
866         return ret;
867 }
868
869 /**
870  * fixup_pi_owner() - Post lock pi_state and corner case management
871  * @uaddr:      user address of the futex
872  * @q:          futex_q (contains pi_state and access to the rt_mutex)
873  * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
874  *
875  * After attempting to lock an rt_mutex, this function is called to cleanup
876  * the pi_state owner as well as handle race conditions that may allow us to
877  * acquire the lock. Must be called with the hb lock held.
878  *
879  * Return:
880  *  -  1 - success, lock taken;
881  *  -  0 - success, lock not taken;
882  *  - <0 - on error (-EFAULT)
883  */
884 int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
885 {
886         if (locked) {
887                 /*
888                  * Got the lock. We might not be the anticipated owner if we
889                  * did a lock-steal - fix up the PI-state in that case:
890                  *
891                  * Speculative pi_state->owner read (we don't hold wait_lock);
892                  * since we own the lock pi_state->owner == current is the
893                  * stable state, anything else needs more attention.
894                  */
895                 if (q->pi_state->owner != current)
896                         return fixup_pi_state_owner(uaddr, q, current);
897                 return 1;
898         }
899
900         /*
901          * If we didn't get the lock; check if anybody stole it from us. In
902          * that case, we need to fix up the uval to point to them instead of
903          * us, otherwise bad things happen. [10]
904          *
905          * Another speculative read; pi_state->owner == current is unstable
906          * but needs our attention.
907          */
908         if (q->pi_state->owner == current)
909                 return fixup_pi_state_owner(uaddr, q, NULL);
910
911         /*
912          * Paranoia check. If we did not take the lock, then we should not be
913          * the owner of the rt_mutex. Warn and establish consistent state.
914          */
915         if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
916                 return fixup_pi_state_owner(uaddr, q, current);
917
918         return 0;
919 }
920
921 /*
922  * Userspace tried a 0 -> TID atomic transition of the futex value
923  * and failed. The kernel side here does the whole locking operation:
924  * if there are waiters then it will block as a consequence of relying
925  * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
926  * a 0 value of the futex too.).
927  *
928  * Also serves as the futex trylock_pi() implementation: with @trylock
     * set, the rt-mutex is only trylocked after queueing and -EWOULDBLOCK
     * is returned instead of blocking.
     *
     * @uaddr:   user address of the futex
     * @flags:   futex flags; FLAGS_SHARED selects how the futex key is
     *           looked up and whether a fault retry must redo that lookup
     * @time:    timeout handed to futex_setup_timer(); if that yields a
     *           sleeper it is armed in HRTIMER_MODE_ABS before blocking
     * @trylock: non-zero to only try the lock and never block on it
     *
     * Return:
     *  -  0                - the lock was acquired
     *  - -ENOSYS           - CONFIG_FUTEX_PI is not enabled
     *  - -ENOMEM           - the pi_state cache could not be refilled
     *  - -EWOULDBLOCK      - @trylock was set and the rt-mutex trylock failed
     *  - -ERESTARTNOINTR   - returned in place of -EINTR
     *  - other <0          - error passed on from the key lookup, the
     *                        atomic lock attempt, the user space fault-in,
     *                        the rt-mutex wait or fixup_pi_owner()
929  */
930 int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
931 {
932         struct hrtimer_sleeper timeout, *to;
933         struct task_struct *exiting = NULL;
934         struct rt_mutex_waiter rt_waiter;
935         struct futex_hash_bucket *hb;
936         struct futex_q q = futex_q_init;
937         int res, ret;
938
939         if (!IS_ENABLED(CONFIG_FUTEX_PI))
940                 return -ENOSYS;
941
942         if (refill_pi_state_cache())
943                 return -ENOMEM;
944
945         to = futex_setup_timer(time, &timeout, flags, 0);
946
947 retry:
948         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
949         if (unlikely(ret != 0))
950                 goto out;
951
952 retry_private:
953         hb = futex_q_lock(&q);
954
955         ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
956                                    &exiting, 0);
957         if (unlikely(ret)) {
958                 /*
959                  * Atomic work succeeded and we got the lock,
960                  * or failed. Either way, we do _not_ block.
961                  */
962                 switch (ret) {
963                 case 1:
964                         /* We got the lock. */
965                         ret = 0;
966                         goto out_unlock_put_key;
967                 case -EFAULT:
968                         goto uaddr_faulted;
969                 case -EBUSY:
970                 case -EAGAIN:
971                         /*
972                          * Two reasons for this:
973                          * - EBUSY: Task is exiting and we just wait for the
974                          *   exit to complete.
975                          * - EAGAIN: The user space value changed.
976                          */
977                         futex_q_unlock(hb);
978                         /*
979                          * Handle the case where the owner is in the middle of
980                          * exiting. Wait for the exit to complete otherwise
981                          * this task might loop forever, aka. live lock.
982                          */
983                         wait_for_owner_exiting(ret, exiting);
984                         cond_resched();
985                         goto retry;
986                 default:
987                         goto out_unlock_put_key;
988                 }
989         }
990
991         WARN_ON(!q.pi_state);
992
993         /*
994          * Only actually queue now that the atomic ops are done:
995          */
996         __futex_queue(&q, hb);
997
998         if (trylock) {
999                 ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
1000                 /* Fixup the trylock return value: */
1001                 ret = ret ? 0 : -EWOULDBLOCK;
1002                 goto no_block;
1003         }
1004
1005         rt_mutex_init_waiter(&rt_waiter);
1006
1007         /*
1008          * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
1009          * hold it while doing rt_mutex_start_proxy(), because then it will
1010          * include hb->lock in the blocking chain, even though we'll not in
1011          * fact hold it while blocking. This will lead it to report -EDEADLK
1012          * and BUG when futex_unlock_pi() interleaves with this.
1013          *
1014          * Therefore acquire wait_lock while holding hb->lock, but drop the
1015          * latter before calling __rt_mutex_start_proxy_lock(). This
1016          * interleaves with futex_unlock_pi() -- which does a similar lock
1017          * handoff -- such that the latter can observe the futex_q::pi_state
1018          * before __rt_mutex_start_proxy_lock() is done.
1019          */
1020         raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
1021         spin_unlock(q.lock_ptr);
1022         /*
1023          * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
1024          * such that futex_unlock_pi() is guaranteed to observe the waiter when
1025          * it sees the futex_q::pi_state.
1026          */
1027         ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
1028         raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
1029
             /*
              * A non-zero result means we do not go to sleep. A result of 1 is
              * folded into 0; any other value is carried into the cleanup path
              * as the error.
              * NOTE(review): 1 presumably means the lock was acquired right
              * away -- confirm against __rt_mutex_start_proxy_lock().
              */
1030         if (ret) {
1031                 if (ret == 1)
1032                         ret = 0;
1033                 goto cleanup;
1034         }
1035
1036         if (unlikely(to))
1037                 hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
1038
1039         ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
1040
1041 cleanup:
1042         spin_lock(q.lock_ptr);
1043         /*
1044          * If we failed to acquire the lock (deadlock/signal/timeout), we must
1045          * first acquire the hb->lock before removing the lock from the
1046          * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
1047          * lists consistent.
1048          *
1049          * In particular; it is important that futex_unlock_pi() can not
1050          * observe this inconsistency.
1051          */
1052         if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
1053                 ret = 0;
1054
1055 no_block:
1056         /*
1057          * Fixup the pi_state owner and possibly acquire the lock if we
1058          * haven't already.
1059          */
1060         res = fixup_pi_owner(uaddr, &q, !ret);
1061         /*
1062          * If fixup_pi_owner() returned an error, propagate that.  If it acquired
1063          * the lock, clear our -ETIMEDOUT or -EINTR.
1064          */
1065         if (res)
1066                 ret = (res < 0) ? res : 0;
1067
1068         futex_unqueue_pi(&q);
1069         spin_unlock(q.lock_ptr);
1070         goto out;
1071
1072 out_unlock_put_key:
1073         futex_q_unlock(hb);
1074
1075 out:
1076         if (to) {
1077                 hrtimer_cancel(&to->timer);
1078                 destroy_hrtimer_on_stack(&to->timer);
1079         }
             /* -EINTR is never handed back; it is replaced by -ERESTARTNOINTR. */
1080         return ret != -EINTR ? ret : -ERESTARTNOINTR;
1081
1082 uaddr_faulted:
             /*
              * Drop the hb lock, fault the user page in and try again. For a
              * private futex (FLAGS_SHARED not set) the retry skips the key
              * lookup and restarts at retry_private.
              */
1083         futex_q_unlock(hb);
1084
1085         ret = fault_in_user_writeable(uaddr);
1086         if (ret)
1087                 goto out;
1088
1089         if (!(flags & FLAGS_SHARED))
1090                 goto retry_private;
1091
1092         goto retry;
1093 }
1094
1095 /*
1096  * Userspace attempted a TID -> 0 atomic transition, and failed.
1097  * This is the in-kernel slowpath: we look up the PI state (if any),
1098  * and do the rt-mutex unlock.
      *
      * @uaddr: user address of the futex
      * @flags: futex flags; FLAGS_SHARED selects how the futex key is looked up
      *
      * Return:
      *  -  0       - unlocked: either wake_futex_pi() succeeded for the top
      *               waiter, or there was no waiter and the user space value
      *               was switched to 0
      *  - -ENOSYS  - CONFIG_FUTEX_PI is not enabled
      *  - -EFAULT  - the futex value could not be read from user space
      *  - -EPERM   - the TID in the futex value is not the caller's
      *  - -EINVAL  - the top waiter has no pi_state, or the pi_state is not
      *               owned by the caller
      *  - -EAGAIN  - no waiters, and the value seen by the cmpxchg differs
      *               from the value read at entry
      *  - other <0 - error passed on from get_futex_key(), wake_futex_pi()
      *               or fault_in_user_writeable()
1099  */
1100 int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
1101 {
1102         u32 curval, uval, vpid = task_pid_vnr(current);
1103         union futex_key key = FUTEX_KEY_INIT;
1104         struct futex_hash_bucket *hb;
1105         struct futex_q *top_waiter;
1106         int ret;
1107
1108         if (!IS_ENABLED(CONFIG_FUTEX_PI))
1109                 return -ENOSYS;
1110
1111 retry:
1112         if (get_user(uval, uaddr))
1113                 return -EFAULT;
1114         /*
1115          * We release only a lock we actually own:
1116          */
1117         if ((uval & FUTEX_TID_MASK) != vpid)
1118                 return -EPERM;
1119
1120         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
1121         if (ret)
1122                 return ret;
1123
1124         hb = futex_hash(&key);
1125         spin_lock(&hb->lock);
1126
1127         /*
1128          * Check waiters first. We do not trust user space values at
1129          * all and we at least want to know if user space fiddled
1130          * with the futex value instead of blindly unlocking.
1131          */
1132         top_waiter = futex_top_waiter(hb, &key);
1133         if (top_waiter) {
1134                 struct futex_pi_state *pi_state = top_waiter->pi_state;
1135
1136                 ret = -EINVAL;
1137                 if (!pi_state)
1138                         goto out_unlock;
1139
1140                 /*
1141                  * If current does not own the pi_state then the futex is
1142                  * inconsistent and user space fiddled with the futex value.
1143                  */
1144                 if (pi_state->owner != current)
1145                         goto out_unlock;
1146
1147                 get_pi_state(pi_state);
1148                 /*
1149                  * By taking wait_lock while still holding hb->lock, we ensure
1150                  * there is no point where we hold neither; and therefore
1151                  * wake_futex_pi() must observe a state consistent with what we
1152                  * observed.
1153                  *
1154                  * In particular; this forces __rt_mutex_start_proxy_lock() to
1155                  * complete such that we're guaranteed to observe the
1156                  * rt_waiter. Also see the WARN in wake_futex_pi().
1157                  */
1158                 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1159                 spin_unlock(&hb->lock);
1160
1161                 /* drops pi_state->pi_mutex.wait_lock */
1162                 ret = wake_futex_pi(uaddr, uval, pi_state);
1163
1164                 put_pi_state(pi_state);
1165
1166                 /*
1167                  * Success, we're done! No tricky corner cases.
1168                  */
1169                 if (!ret)
1170                         return ret;
1171                 /*
1172                  * The atomic access to the futex value generated a
1173                  * pagefault, so retry the user-access and the wakeup:
1174                  */
1175                 if (ret == -EFAULT)
1176                         goto pi_faulted;
1177                 /*
1178                  * An unconditional UNLOCK_PI op raced against a waiter
1179                  * setting the FUTEX_WAITERS bit. Try again.
1180                  */
1181                 if (ret == -EAGAIN)
1182                         goto pi_retry;
1183                 /*
1184                  * wake_futex_pi has detected invalid state. Tell user
1185                  * space.
1186                  */
1187                 return ret;
1188         }
1189
1190         /*
1191          * We have no kernel internal state, i.e. no waiters in the
1192          * kernel. Waiters which are about to queue themselves are stuck
1193          * on hb->lock. So we can safely ignore them. We preserve neither
1194          * the WAITERS bit nor the OWNER_DIED one. We are the
1195          * owner.
1196          */
1197         if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
1198                 spin_unlock(&hb->lock);
1199                 switch (ret) {
1200                 case -EFAULT:
1201                         goto pi_faulted;
1202
1203                 case -EAGAIN:
1204                         goto pi_retry;
1205
1206                 default:
1207                         WARN_ON_ONCE(1);
1208                         return ret;
1209                 }
1210         }
1211
1212         /*
1213          * If uval has changed, let user space handle it.
1214          */
1215         ret = (curval == uval) ? 0 : -EAGAIN;
1216
1217 out_unlock:
1218         spin_unlock(&hb->lock);
1219         return ret;
1220
1221 pi_retry:
1222         cond_resched();
1223         goto retry;
1224
1225 pi_faulted:
1226
              /*
               * hb->lock has already been dropped on every path that gets here.
               * Fault the user page in and restart from the top; if that fails,
               * hand its error back.
               */
1227         ret = fault_in_user_writeable(uaddr);
1228         if (!ret)
1229                 goto retry;
1230
1231         return ret;
1232 }
1233