1 // SPDX-License-Identifier: GPL-2.0-or-later
3 #include <linux/slab.h>
4 #include <linux/sched/task.h>
7 #include "../locking/rtmutex_common.h"
/*
 * refill_pi_state_cache - ensure current->pi_state_cache holds a
 * pre-allocated futex_pi_state.
 *
 * Allocation uses GFP_KERNEL (may sleep), so callers run this before
 * entering the atomic futex paths, which later consume the cached
 * object via alloc_pi_state() without allocating.
 *
 * NOTE(review): this extraction is elided -- braces, the early return
 * and the kzalloc() failure check are not visible here; the visible
 * statements are kept verbatim.
 */
int refill_pi_state_cache(void)
	struct futex_pi_state *pi_state;

	/* Fast path: cache already primed by an earlier call. */
	if (likely(current->pi_state_cache))

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	/* A fresh pi_state starts life with a single reference. */
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;
/*
 * alloc_pi_state - hand out the pre-allocated pi_state from the cache.
 *
 * Must be paired with a prior successful refill_pi_state_cache() on the
 * same task.  The cache slot is cleared so the returned object has a
 * single owner (the caller).
 */
static struct futex_pi_state *alloc_pi_state(void)
	struct futex_pi_state *pi_state = current->pi_state_cache;

	current->pi_state_cache = NULL;
/*
 * pi_state_update_owner - rechain @pi_state from its old owner to
 * @new_owner.
 *
 * Unlinks pi_state->list from the old owner's pi_state_list, links it
 * onto @new_owner's list and publishes pi_state->owner.  Each per-task
 * pi_lock is taken around its list manipulation to serialize against
 * exit_pi_state_list() and the PI chain walk.
 *
 * pi_mutex.wait_lock must be held to stabilize pi_state->owner.
 *
 * NOTE(review): elided view -- the NULL guards around the old/new owner
 * branches are not visible here; confirm against the full source.
 */
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	/* Unhook from the previous owner's pi_state_list. */
	raw_spin_lock(&old_owner->pi_lock);
	WARN_ON(list_empty(&pi_state->list));
	list_del_init(&pi_state->list);
	raw_spin_unlock(&old_owner->pi_lock);

	/* Hook onto the new owner's list and publish the owner pointer. */
	raw_spin_lock(&new_owner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &new_owner->pi_state_list);
	pi_state->owner = new_owner;
	raw_spin_unlock(&new_owner->pi_lock);
/*
 * get_pi_state - acquire an additional reference on @pi_state.
 *
 * The caller must already hold a reference; a failing inc_not_zero
 * means the object was concurrently freed, which is a bug -- hence the
 * WARN rather than error handling.
 */
void get_pi_state(struct futex_pi_state *pi_state)
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
/*
 * put_pi_state - drops a reference to the pi_state object and frees or
 * caches it when the last reference is gone.
 *
 * On the final put: if an owner is still attached, the pi_state is
 * detached from it and the proxy-locked rt_mutex is released under
 * pi_mutex.wait_lock.  The object is then recycled into
 * current->pi_state_cache (the free path for an already-primed cache is
 * elided from this view).
 */
void put_pi_state(struct futex_pi_state *pi_state)
	if (!refcount_dec_and_test(&pi_state->refcount))

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);

	if (current->pi_state_cache) {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
113 * We need to check the following states:
115 * Waiter | pi_state | pi->owner | uTID | uODIED | ?
117 * [1] NULL | --- | --- | 0 | 0/1 | Valid
118 * [2] NULL | --- | --- | >0 | 0/1 | Valid
120 * [3] Found | NULL | -- | Any | 0/1 | Invalid
122 * [4] Found | Found | NULL | 0 | 1 | Valid
123 * [5] Found | Found | NULL | >0 | 1 | Invalid
125 * [6] Found | Found | task | 0 | 1 | Valid
127 * [7] Found | Found | NULL | Any | 0 | Invalid
129 * [8] Found | Found | task | ==taskTID | 0/1 | Valid
130 * [9] Found | Found | task | 0 | 0 | Invalid
131 * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
133 * [1] Indicates that the kernel can acquire the futex atomically. We
134 * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
136 * [2] Valid, if TID does not belong to a kernel thread. If no matching
137 * thread is found then it indicates that the owner TID has died.
139 * [3] Invalid. The waiter is queued on a non PI futex
141 * [4] Valid state after exit_robust_list(), which sets the user space
142 * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
144 * [5] The user space value got manipulated between exit_robust_list()
145 * and exit_pi_state_list()
147 * [6] Valid state after exit_pi_state_list() which sets the new owner in
148 * the pi_state but cannot access the user space value.
150 * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
152 * [8] Owner and user space value match
154 * [9] There is no transient state which sets the user space TID to 0
155 * except exit_robust_list(), but this is indicated by the
156 * FUTEX_OWNER_DIED bit. See [4]
158 * [10] There is no transient state which leaves owner and user space
159 * TID out of sync. Except one error case where the kernel is denied
160 * write access to the user address, see fixup_pi_state_owner().
163 * Serialization and lifetime rules:
167 * hb -> futex_q, relation
168 * futex_q -> pi_state, relation
170 * (cannot be raw because hb can contain arbitrary amount
173 * pi_mutex->wait_lock:
177 * (and pi_mutex 'obviously')
181 * p->pi_state_list -> pi_state->list, relation
182 * pi_mutex->owner -> pi_state->owner, relation
184 * pi_state->refcount:
192 * pi_mutex->wait_lock
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it and return it via @ps.
 *
 * Implements the state table above: rejects a missing pi_state [3],
 * inconsistent OWNER_DIED combinations [5]/[7] and owner-TID mismatches
 * [9]/[10]; accepts and takes a reference for [4], [6] and [8].
 *
 * NOTE(review): elided view -- the error returns / labels between the
 * checks are not visible; the visible statements are kept verbatim.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
	pid_t pid = uval & FUTEX_TID_MASK;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			/*
			 * Take a ref on the state and return success. [4]
			 */

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */

	/*
	 * If the owner died bit is not set, then the pi_state
	 * must have an owner. [7]
	 */
	if (!pi_state->owner)

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))

	/* Success path: take a reference and publish via @ps. */
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	/* Error path (elided labels): drop wait_lock before returning. */
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
/*
 * handle_exit_race - decide how to treat an (allegedly) exiting owner.
 *
 * Returns -EBUSY while the owner has not yet reached FUTEX_STATE_DEAD
 * (caller must wait and retry); otherwise re-reads the futex word to
 * distinguish a race with exit_robust_list() (value changed -> retry,
 * elided here) from genuinely bogus user state (-ESRCH, elided).
 */
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *				     ...
	 *   tsk->futex_state =		     } else {
	 *	FUTEX_STATE_DEAD;	      if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				       ...
	 *				       return -ESRCH; <--- FAIL
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))

	/* If the user space value has changed, try again. */

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
/*
 * __attach_to_pi_owner - create a fresh pi_state owned by @p for @key
 * and return it via @ps.
 *
 * Consumes the task's cached pi_state (alloc_pi_state()), initializes
 * the rt_mutex in proxy-locked state with @p as owner, records @key for
 * exit-time cleanup and links the pi_state onto @p's pi_state_list.
 * Caller holds hb->lock (and, per call sites, p->pi_lock), so the not
 * yet published object needs no wait_lock protection.
 */
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);

	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 *
 * Rejects TID 0 and kernel threads (elided returns), handles an exiting
 * owner via handle_exit_race(), and otherwise creates/attaches a new
 * pi_state via __attach_to_pi_owner() under p->pi_lock.  On -EBUSY the
 * exiting task's reference is handed back through @exiting.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry.
	 */
	p = find_get_task_by_vpid(pid);
	/* No such task: treat as a possible exit race (tsk == NULL). */
		return handle_exit_race(uaddr, uval, NULL);

	/* A PI futex can never be owned by a kernel thread. */
	if (unlikely(p->flags & PF_KTHREAD)) {

	/*
	 * We need to look at the task state to figure out, whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);
/*
 * lock_pi_update_atomic - cmpxchg the futex word from @uval to @newval.
 *
 * Returns 0 on success, -EAGAIN when the user space value changed under
 * us (caller retries); fault-injection and cmpxchg error handling are
 * elided from this view.
 */
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
	if (unlikely(should_fail_futex(true)))

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error (elided in this view)
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 *
 * NOTE(review): elided view -- the @set_waiters parameter line and
 * several returns/branch bodies are not visible; code kept verbatim.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))

	if (unlikely(should_fail_futex(true)))

	/* Deadlock check: we already own this futex in user space. */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))

	if ((unlikely(should_fail_futex(true))))

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;

		/* The futex requeue_pi code can enforce the waiters bit */
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);

	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
/*
 * wake_futex_pi - hand the futex over to the top rt_mutex waiter.
 *
 * Writes the new owner's TID (plus FUTEX_WAITERS) into the user space
 * word via cmpxchg, rechains the pi_state to the new owner and unlocks
 * the rt_mutex, waking the new owner after dropping wait_lock.
 *
 * Caller must hold a reference on @pi_state and pi_mutex.wait_lock;
 * wait_lock is dropped before returning.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)

	/*
	 * This is a point of no return; once we modified the uval
	 * there is no going back and subsequent operations must
	 * not fail.
	 */
	pi_state_update_owner(pi_state, new_owner);
	postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);

	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	/* Wake the new owner outside of wait_lock. */
		rt_mutex_postunlock(&wqh);
/*
 * __fixup_pi_state_owner - make pi_state->owner and the user space TID
 * agree with the actual rt_mutex owner after a lock steal or hand-off.
 *
 * Returns 1 when @argowner (== current) ended up owning the lock,
 * 0 otherwise; error returns are elided from this view.  Runs with
 * q->lock_ptr and pi_mutex.wait_lock held; both are dropped and retaken
 * around the user space fault handling.
 */
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
	if (oldowner != current) {
		/*
		 * We raced against a concurrent self; things are
		 * already fixed up. Nothing to do.
		 */

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {

		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died: preserve the OWNER_DIED semantics in the new TID. */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);

	newval = (uval & FUTEX_OWNER_DIED) | newtid;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	err = fault_in_user_writeable(uaddr);

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault in succeeded */

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
/*
 * fixup_pi_state_owner - locked wrapper around __fixup_pi_state_owner().
 *
 * Takes and releases pi_mutex.wait_lock around the fixup; the caller
 * must hold q->lock_ptr (asserted below).
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
	struct futex_pi_state *pi_state = q->pi_state;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
	/*
	 * Got the lock. We might not be the anticipated owner if we
	 * did a lock-steal - fix up the PI-state in that case:
	 *
	 * Speculative pi_state->owner read (we don't hold wait_lock);
	 * since we own the lock pi_state->owner == current is the
	 * stable state, anything else needs more attention.
	 */
	if (q->pi_state->owner != current)
		return fixup_pi_state_owner(uaddr, q, current);

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);
/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.)
 *
 * Also serves as futex trylock_pi()'ing, and due semantics.
 *
 * NOTE(review): elided view -- labels (retry/cleanup/uaddr_faulted and
 * friends), gotos and several branch bodies are not visible; the
 * visible statements are kept verbatim.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))

	/* Pre-allocate pi_state while we may still sleep. */
	if (refill_pi_state_cache())

	to = futex_setup_timer(time, &timeout, flags, 0);

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))

	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
			/* We got the lock. */
			goto out_unlock_put_key;
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			goto out_unlock_put_key;

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	/* Trylock mode: single attempt, never block on the rtmutex. */
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even through we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

	spin_lock(q.lock_ptr);
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * first acquire the hb->lock before removing the lock from the
	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
	 * lists consistent.
	 *
	 * In particular; it is important that futex_unlock_pi() can not
	 * observe this inconsistency.
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))

	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);

		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	/* A pending signal restarts the syscall after signal handling. */
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

	/* Fault path (elided label): make the page writable and retry. */
	ret = fault_in_user_writeable(uaddr);

	if (!(flags & FLAGS_SHARED))
/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 *
 * NOTE(review): elided view -- the function continues past the visible
 * end (retry/fault labels) and several returns are missing; visible
 * statements are kept verbatim.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))

	if (get_user(uval, uaddr))
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);

	hb = futex_hash(&key);
	spin_lock(&hb->lock);

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
		struct futex_pi_state *pi_state = top_waiter->pi_state;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)

		get_pi_state(pi_state);
		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and therefore
		 * wake_futex_pi() must observe a state consistent with what we
		 * observed.
		 *
		 * In particular; this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter. Also see the WARN in wake_futex_pi().
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We do neither
	 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
	 * owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

	spin_unlock(&hb->lock);

	/* Fault path (elided label): fix up the mapping and retry. */
	ret = fault_in_user_writeable(uaddr);