* Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
* Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
* enough at me, Linus for the original (flawed) idea, Matthew
* Kirkwood for proof-of-concept implementation.
&& key1->both.offset == key2->both.offset);
}
-/*
- * Get parameters which are the keys for a futex.
+/**
+ * get_futex_key - Get parameters which are the keys for a futex.
+ * @uaddr: virtual address of the futex
+ * @shared: NULL for a PROCESS_PRIVATE futex,
+ * ¤t->mm->mmap_sem for a PROCESS_SHARED futex
+ * @key: address where result is stored.
+ *
+ * Returns a negative error code or 0
+ * The key words are stored in *key on success.
*
* For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
- * Returns: 0, or negative error code.
- * The key words are stored in *key on success.
- *
- * Should be called with ¤t->mm->mmap_sem but NOT any spinlocks.
+ * fshared is NULL for PROCESS_PRIVATE futexes
+ * For other futexes, it points to ¤t->mm->mmap_sem and
+ * caller must have taken the reader lock. but NOT any spinlocks.
*/
-int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
+ union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
* The futex address must be "naturally" aligned.
*/
key->both.offset = address % PAGE_SIZE;
- if (unlikely((key->both.offset % sizeof(u32)) != 0))
+ if (unlikely((address % sizeof(u32)) != 0))
return -EINVAL;
address -= key->both.offset;
/*
+ * PROCESS_PRIVATE futexes are fast.
+ * As the mm cannot disappear under us and the 'key' only needs
+ * virtual address, we dont even have to find the underlying vma.
+ * Note : We do have to check 'uaddr' is a valid user address,
+ * but access_ok() should be faster than find_vma()
+ */
+ if (!fshared) {
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+ return -EFAULT;
+ key->private.mm = mm;
+ key->private.address = address;
+ return 0;
+ }
+ /*
* The futex is hashed differently depending on whether
* it's in a shared or private mapping. So check vma first.
*/
* mappings of _writable_ handles.
*/
if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+ key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
key->private.mm = mm;
key->private.address = address;
return 0;
* Linear file mappings are also simple.
*/
key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
+ vma->vm_pgoff);
* Take a reference to the resource addressed by a key.
* Can be called while holding spinlocks.
*
- * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
- * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
inline void get_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
atomic_inc(&key->shared.inode->i_count);
- else
+ break;
+ case FUT_OFF_MMSHARED:
atomic_inc(&key->private.mm->mm_count);
+ break;
}
}
EXPORT_SYMBOL_GPL(get_futex_key_refs);
*/
void drop_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr != 0) {
- if (key->both.offset & 1)
+ if (key->both.ptr == 0)
+ return;
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
iput(key->shared.inode);
- else
+ break;
+ case FUT_OFF_MMSHARED:
mmdrop(key->private.mm);
+ break;
}
}
EXPORT_SYMBOL_GPL(drop_futex_key_refs);
}
/*
- * Fault handling. Called with current->mm->mmap_sem held.
+ * Fault handling.
+ * if fshared is non NULL, current->mm->mmap_sem is already held
*/
-static int futex_handle_fault(unsigned long address, int attempt)
+static int futex_handle_fault(unsigned long address,
+ struct rw_semaphore *fshared, int attempt)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
+ int ret = -EFAULT;
- if (attempt > 2 || !(vma = find_vma(mm, address)) ||
- vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
- return -EFAULT;
+ if (attempt > 2)
+ return ret;
- switch (handle_mm_fault(mm, vma, address, 1)) {
- case VM_FAULT_MINOR:
- current->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- current->maj_flt++;
- break;
- default:
- return -EFAULT;
+ if (!fshared)
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ if (vma && address >= vma->vm_start &&
+ (vma->vm_flags & VM_WRITE)) {
+ switch (handle_mm_fault(mm, vma, address, 1)) {
+ case VM_FAULT_MINOR:
+ ret = 0;
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ ret = 0;
+ current->maj_flt++;
+ break;
+ }
}
- return 0;
+ if (!fshared)
+ up_read(&mm->mmap_sem);
+ return ret;
}
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int nr_wake)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
union futex_key key;
int ret;
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &key);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
spin_unlock(&hb->lock);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
* and requeue the next nr_requeue waiters following hashed on
* one physical page to another physical page (PI-futex uaddr2)
*/
-static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue_pi(u32 __user *uaddr1,
+ struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
union futex_key key1, key2;
/*
* First take all the futex related locks:
*/
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
* If we would have faulted, release mmap_sem, fault
* it in and start all over again.
*/
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(curval, uaddr1);
drop_futex_key_refs(&key1);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
* to this virtual address:
*/
static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
union futex_key key1, key2;
int ret, op_ret, attempt = 0;
retryfull:
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr2,
- attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr2,
+ fshared, attempt);
+ if (ret)
goto out;
- }
goto retry;
}
* If we would have faulted, release mmap_sem,
* fault it in and start all over again.
*/
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(dummy, uaddr2);
if (ret)
if (hb1 != hb2)
spin_unlock(&hb2->lock);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
+ u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
union futex_key key1, key2;
int ret, drop_count = 0;
retry:
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr1, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
* If we would have faulted, release mmap_sem, fault
* it in and start all over again.
*/
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(curval, uaddr1);
drop_futex_key_refs(&key1);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
* The cur->mm semaphore must be held, it is released at return of this
* function.
*/
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
+ struct futex_q *q,
struct futex_hash_bucket *hb,
struct task_struct *curr)
{
/* Unqueue and drop the lock */
unqueue_me_pi(q);
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
/*
* We own it, so we have to replace the pending owner
* TID. This must be atomic as we have preserve the
return ret;
}
+/*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode in the 'arg3' shared capability
+ */
+#define ARG3_SHARED 1
+
static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
+static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+ u32 val, ktime_t *abs_time)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
q.pi_state = NULL;
retry:
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
* a wakeup when *uaddr != val on entry to the syscall. This is
* rare, but normal.
*
- * We hold the mmap semaphore, so the mapping cannot have changed
- * since we looked it up in get_futex_key.
+ * for shared futexes, we hold the mmap semaphore, so the mapping
+ * cannot have changed since we looked it up in get_futex_key.
*/
ret = get_futex_value_locked(&uval, uaddr);
* If we would have faulted, release mmap_sem, fault it in and
* start all over again.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
/*
* There might have been scheduling since the queue_me(), as we
else
ret = rt_mutex_timed_lock(lock, to, 1);
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
spin_lock(q.lock_ptr);
/*
/* mmap_sem and hash_bucket lock are unlocked at
return of this function */
- ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+ ret = fixup_pi_state_owner(uaddr, fshared,
+ &q, hb, curr);
} else {
/*
* Catch the rare case, where the lock was released
}
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
}
debug_rt_mutex_free_waiter(&q.waiter);
restart->arg0 = (unsigned long)uaddr;
restart->arg1 = (unsigned long)val;
restart->arg2 = (unsigned long)abs_time;
+ restart->arg3 = 0;
+ if (fshared)
+ restart->arg3 |= ARG3_SHARED;
return -ERESTART_RESTARTBLOCK;
}
queue_unlock(&q, hb);
out_release_sem:
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
}
u32 __user *uaddr = (u32 __user *)restart->arg0;
u32 val = (u32)restart->arg1;
ktime_t *abs_time = (ktime_t *)restart->arg2;
+ struct rw_semaphore *fshared = NULL;
restart->fn = do_no_restart_syscall;
- return (long)futex_wait(uaddr, val, abs_time);
+ if (restart->arg3 & ARG3_SHARED)
+ fshared = ¤t->mm->mmap_sem;
+ return (long)futex_wait(uaddr, fshared, val, abs_time);
}
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
- int trylock)
+static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+ int detect, ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *curr = current;
q.pi_state = NULL;
retry:
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
* Now the futex is queued and we have checked the data, we
* don't want to hold mmap_sem while we sleep.
*/
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
WARN_ON(!q.pi_state);
/*
ret = ret ? 0 : -EWOULDBLOCK;
}
- down_read(&curr->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
spin_lock(q.lock_ptr);
/*
*/
if (!ret && q.pi_state->owner != curr)
/* mmap_sem is unlocked at return of this function */
- ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+ ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
else {
/*
* Catch the rare case, where the lock was released
}
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
}
if (!detect && ret == -EDEADLK && 0)
queue_unlock(&q, hb);
out_release_sem:
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
uaddr_faulted:
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
goto out_unlock_release_sem;
- }
goto retry_locked;
}
queue_unlock(&q, hb);
- up_read(&curr->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
*/
-static int futex_unlock_pi(u32 __user *uaddr)
+static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
/*
* First take all the futex related locks:
*/
- down_read(¤t->mm->mmap_sem);
+ if (fshared)
+ down_read(fshared);
- ret = get_futex_key(uaddr, &key);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
out_unlock:
spin_unlock(&hb->lock);
out:
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
return ret;
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared,
+ attempt);
+ if (ret)
goto out_unlock;
- }
goto retry_locked;
}
spin_unlock(&hb->lock);
- up_read(¤t->mm->mmap_sem);
+ if (fshared)
+ up_read(fshared);
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
struct futex_q *q;
struct file *filp;
int ret, err;
+ struct rw_semaphore *fshared;
static unsigned long printk_interval;
if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
}
q->pi_state = NULL;
- down_read(¤t->mm->mmap_sem);
- err = get_futex_key(uaddr, &q->key);
+ fshared = ¤t->mm->mmap_sem;
+ down_read(fshared);
+ err = get_futex_key(uaddr, fshared, &q->key);
if (unlikely(err != 0)) {
- up_read(¤t->mm->mmap_sem);
+ up_read(fshared);
kfree(q);
goto error;
}
filp->private_data = q;
queue_me(q, ret, filp);
- up_read(¤t->mm->mmap_sem);
+ up_read(fshared);
/* Now we map fd to filp, so userspace can access it */
fd_install(ret, filp);
*/
if (!pi) {
if (uval & FUTEX_WAITERS)
- futex_wake(uaddr, 1);
+ futex_wake(uaddr, &curr->mm->mmap_sem, 1);
}
}
return 0;
return;
if (pending)
- handle_futex_death((void __user *)pending + futex_offset, curr, pip);
+ handle_futex_death((void __user *)pending + futex_offset,
+ curr, pip);
while (entry != &head->list) {
/*
u32 __user *uaddr2, u32 val2, u32 val3)
{
int ret;
+ int cmd = op & FUTEX_CMD_MASK;
+ struct rw_semaphore *fshared = NULL;
+
+ if (!(op & FUTEX_PRIVATE_FLAG))
+ fshared = ¤t->mm->mmap_sem;
- switch (op) {
+ switch (cmd) {
case FUTEX_WAIT:
- ret = futex_wait(uaddr, val, timeout);
+ ret = futex_wait(uaddr, fshared, val, timeout);
break;
case FUTEX_WAKE:
- ret = futex_wake(uaddr, val);
+ ret = futex_wake(uaddr, fshared, val);
break;
case FUTEX_FD:
/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
ret = futex_fd(uaddr, val);
break;
case FUTEX_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
break;
case FUTEX_CMP_REQUEUE:
- ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
+ ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
break;
case FUTEX_WAKE_OP:
- ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+ ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
- ret = futex_lock_pi(uaddr, val, timeout, 0);
+ ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
- ret = futex_unlock_pi(uaddr);
+ ret = futex_unlock_pi(uaddr, fshared);
break;
case FUTEX_TRYLOCK_PI:
- ret = futex_lock_pi(uaddr, 0, timeout, 1);
+ ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
break;
case FUTEX_CMP_REQUEUE_PI:
- ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3);
+ ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
break;
default:
ret = -ENOSYS;
struct timespec ts;
ktime_t t, *tp = NULL;
u32 val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
- if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
if (!timespec_valid(&ts))
return -EINVAL;
t = timespec_to_ktime(ts);
- if (op == FUTEX_WAIT)
+ if (cmd == FUTEX_WAIT)
t = ktime_add(ktime_get(), t);
tp = &t;
}
/*
- * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
+ * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
*/
- if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
- || op == FUTEX_CMP_REQUEUE_PI)
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
+ || cmd == FUTEX_CMP_REQUEUE_PI)
val2 = (u32) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);