futex: Prevent inconsistent state and exit race
The recent rework of the requeue PI code introduced a possibility for
going back to user space in inconsistent state:
CPU 0 CPU 1
requeue_futex()
if (lock_pifutex_user()) {
dequeue_waiter();
wake_waiter(task);
sched_in(task);
return_from_futex_syscall();
---> Inconsistent state because PI state is not established
It becomes worse if the woken up task immediately exits:
sys_exit();
attach_pistate(vpid); <--- FAIL
Attach the pi state before dequeuing and waking the waiter. If the waiter
gets a spurious wakeup before the dequeue operation it will wait in
futex_requeue_pi_wakeup_sync() and therefore cannot return and exit.
Fixes: 07d91ef510
("futex: Prevent requeue_pi() lock nesting issue on RT")
Reported-by: syzbot+4d1bd0725ef09168e1a0@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210902094414.558914045@linutronix.de
This commit is contained in:
Родитель
a974b54036
Коммит
4f07ec0d76
|
@ -1454,8 +1454,23 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
|
||||||
newval |= FUTEX_WAITERS;
|
newval |= FUTEX_WAITERS;
|
||||||
|
|
||||||
ret = lock_pi_update_atomic(uaddr, uval, newval);
|
ret = lock_pi_update_atomic(uaddr, uval, newval);
|
||||||
/* If the take over worked, return 1 */
|
if (ret)
|
||||||
return ret < 0 ? ret : 1;
|
return ret;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the waiter bit was requested the caller also needs PI
|
||||||
|
* state attached to the new owner of the user space futex.
|
||||||
|
*
|
||||||
|
* @task is guaranteed to be alive and it cannot be exiting
|
||||||
|
* because it is either sleeping or waiting in
|
||||||
|
* futex_requeue_pi_wakeup_sync().
|
||||||
|
*/
|
||||||
|
if (set_waiters) {
|
||||||
|
ret = attach_to_pi_owner(uaddr, newval, key, ps,
|
||||||
|
exiting);
|
||||||
|
WARN_ON(ret);
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -2036,17 +2051,24 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
|
||||||
return -EAGAIN;
|
return -EAGAIN;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
|
* Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
|
||||||
* the contended case or if set_waiters is 1. The pi_state is returned
|
* in the contended case or if @set_waiters is true.
|
||||||
* in ps in contended cases.
|
*
|
||||||
|
* In the contended case PI state is attached to the lock owner. If
|
||||||
|
* the user space lock can be acquired then PI state is attached to
|
||||||
|
* the new owner (@top_waiter->task) when @set_waiters is true.
|
||||||
*/
|
*/
|
||||||
vpid = task_pid_vnr(top_waiter->task);
|
vpid = task_pid_vnr(top_waiter->task);
|
||||||
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
|
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
|
||||||
exiting, set_waiters);
|
exiting, set_waiters);
|
||||||
if (ret == 1) {
|
if (ret == 1) {
|
||||||
/* Dequeue, wake up and update top_waiter::requeue_state */
|
/*
|
||||||
|
* Lock was acquired in user space and PI state was
|
||||||
|
* attached to @top_waiter->task. That means state is fully
|
||||||
|
* consistent and the waiter can return to user space
|
||||||
|
* immediately after the wakeup.
|
||||||
|
*/
|
||||||
requeue_pi_wake_futex(top_waiter, key2, hb2);
|
requeue_pi_wake_futex(top_waiter, key2, hb2);
|
||||||
return vpid;
|
|
||||||
} else if (ret < 0) {
|
} else if (ret < 0) {
|
||||||
/* Rewind top_waiter::requeue_state */
|
/* Rewind top_waiter::requeue_state */
|
||||||
futex_requeue_pi_complete(top_waiter, ret);
|
futex_requeue_pi_complete(top_waiter, ret);
|
||||||
|
@ -2208,19 +2230,26 @@ retry_private:
|
||||||
&exiting, nr_requeue);
|
&exiting, nr_requeue);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* At this point the top_waiter has either taken uaddr2 or is
|
* At this point the top_waiter has either taken uaddr2 or
|
||||||
* waiting on it. If the former, then the pi_state will not
|
* is waiting on it. In both cases pi_state has been
|
||||||
* exist yet, look it up one more time to ensure we have a
|
* established and an initial refcount on it. In case of an
|
||||||
* reference to it. If the lock was taken, @ret contains the
|
* error there's nothing.
|
||||||
* VPID of the top waiter task.
|
|
||||||
* If the lock was not taken, we have pi_state and an initial
|
|
||||||
* refcount on it. In case of an error we have nothing.
|
|
||||||
*
|
*
|
||||||
* The top waiter's requeue_state is up to date:
|
* The top waiter's requeue_state is up to date:
|
||||||
*
|
*
|
||||||
* - If the lock was acquired atomically (ret > 0), then
|
* - If the lock was acquired atomically (ret == 1), then
|
||||||
* the state is Q_REQUEUE_PI_LOCKED.
|
* the state is Q_REQUEUE_PI_LOCKED.
|
||||||
*
|
*
|
||||||
|
* The top waiter has been dequeued and woken up and can
|
||||||
|
* return to user space immediately. The kernel/user
|
||||||
|
* space state is consistent. In case that there must be
|
||||||
|
* more waiters requeued the WAITERS bit in the user
|
||||||
|
* space futex is set so the top waiter task has to go
|
||||||
|
* into the syscall slowpath to unlock the futex. This
|
||||||
|
* will block until this requeue operation has been
|
||||||
|
* completed and the hash bucket locks have been
|
||||||
|
* dropped.
|
||||||
|
*
|
||||||
* - If the trylock failed with an error (ret < 0) then
|
* - If the trylock failed with an error (ret < 0) then
|
||||||
* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
|
* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
|
||||||
* happened", or Q_REQUEUE_PI_IGNORE when there was an
|
* happened", or Q_REQUEUE_PI_IGNORE when there was an
|
||||||
|
@ -2234,36 +2263,20 @@ retry_private:
|
||||||
* the same sanity checks for requeue_pi as the loop
|
* the same sanity checks for requeue_pi as the loop
|
||||||
* below does.
|
* below does.
|
||||||
*/
|
*/
|
||||||
if (ret > 0) {
|
|
||||||
WARN_ON(pi_state);
|
|
||||||
task_count++;
|
|
||||||
/*
|
|
||||||
* If futex_proxy_trylock_atomic() acquired the
|
|
||||||
* user space futex, then the user space value
|
|
||||||
* @uaddr2 has been set to the @hb1's top waiter
|
|
||||||
* task VPID. This task is guaranteed to be alive
|
|
||||||
* and cannot be exiting because it is either
|
|
||||||
* sleeping or blocked on @hb2 lock.
|
|
||||||
*
|
|
||||||
* The @uaddr2 futex cannot have waiters either as
|
|
||||||
* otherwise futex_proxy_trylock_atomic() would not
|
|
||||||
* have succeeded.
|
|
||||||
*
|
|
||||||
* In order to requeue waiters to @hb2, pi state is
|
|
||||||
* required. Hand in the VPID value (@ret) and
|
|
||||||
* allocate PI state with an initial refcount on
|
|
||||||
* it.
|
|
||||||
*/
|
|
||||||
ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state,
|
|
||||||
&exiting);
|
|
||||||
WARN_ON(ret);
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (ret) {
|
switch (ret) {
|
||||||
case 0:
|
case 0:
|
||||||
/* We hold a reference on the pi state. */
|
/* We hold a reference on the pi state. */
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case 1:
|
||||||
|
/*
|
||||||
|
* futex_proxy_trylock_atomic() acquired the user space
|
||||||
|
* futex. Adjust task_count.
|
||||||
|
*/
|
||||||
|
task_count++;
|
||||||
|
ret = 0;
|
||||||
|
break;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If the above failed, then pi_state is NULL and
|
* If the above failed, then pi_state is NULL and
|
||||||
* waiter::requeue_state is correct.
|
* waiter::requeue_state is correct.
|
||||||
|
@ -2395,9 +2408,8 @@ retry_private:
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We took an extra initial reference to the pi_state either in
|
* We took an extra initial reference to the pi_state in
|
||||||
* futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need
|
* futex_proxy_trylock_atomic(). We need to drop it here again.
|
||||||
* to drop it here again.
|
|
||||||
*/
|
*/
|
||||||
put_pi_state(pi_state);
|
put_pi_state(pi_state);
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче