perf: Fix missing SIGTRAPs
Marco reported:
Due to the implementation of how SIGTRAPs are delivered if
perf_event_attr::sigtrap is set, we've noticed 3 issues:
  1. Missing SIGTRAP due to a race with event_sched_out() (more
     details below).

  2. Hardware PMU events being disabled due to returning 1 from
     perf_event_overflow(). The only way to re-enable the event is
     for user space to first "properly" disable the event and then
     re-enable it.

  3. The inability to automatically disable an event after a
     specified number of overflows via PERF_EVENT_IOC_REFRESH.
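
For context (not part of the report): the user-space side of this feature,
per the documented perf_event_open ABI, looks roughly like the sketch below.
attr.sigtrap requires attr.remove_on_exec and Linux >= 5.13 uapi headers;
the program is illustrative only and does not reliably reproduce the race.

  /* Illustrative sketch only -- not part of this patch. */
  #define _GNU_SOURCE
  #include <linux/perf_event.h>
  #include <signal.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static volatile sig_atomic_t sigtraps;

  static void on_sigtrap(int sig, siginfo_t *info, void *uc)
  {
          /* si_code is TRAP_PERF; si_perf_data carries attr.sig_data. */
          (void)sig; (void)info; (void)uc;
          sigtraps++;
  }

  int main(void)
  {
          struct perf_event_attr attr;
          struct sigaction sa;
          int fd;

          memset(&sa, 0, sizeof(sa));
          sa.sa_sigaction = on_sigtrap;
          sa.sa_flags = SA_SIGINFO;
          sigaction(SIGTRAP, &sa, NULL);

          memset(&attr, 0, sizeof(attr));
          attr.size           = sizeof(attr);
          attr.type           = PERF_TYPE_HARDWARE;
          attr.config         = PERF_COUNT_HW_INSTRUCTIONS;
          attr.sample_period  = 1000000;
          attr.disabled       = 1;
          attr.exclude_kernel = 1;
          attr.sigtrap        = 1;          /* SIGTRAP on overflow (issues 1 and 2) */
          attr.remove_on_exec = 1;          /* required when sigtrap is set */
          attr.sig_data       = 0xdeadbeef; /* reported back via siginfo */

          /* Measure the calling task on any CPU. */
          fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
          if (fd < 0) {
                  perror("perf_event_open");
                  return 1;
          }

          /* Issue 3: arm for exactly one overflow, then auto-disable. */
          ioctl(fd, PERF_EVENT_IOC_REFRESH, 1);

          for (volatile long i = 0; i < 100000000; i++)
                  ;

          printf("SIGTRAPs delivered: %d\n", (int)sigtraps);
          close(fd);
          return 0;
  }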
The worst of the 3 issues is problem (1), which occurs when a
pending_disable is "consumed" by a racing event_sched_out(), observed
as follows:
                 CPU0               |              CPU1
  --------------------------------+---------------------------
  __perf_event_overflow()         |
   perf_event_disable_inatomic()  |
    pending_disable = CPU0        | ...
                                  | _perf_event_enable()
                                  |   event_function_call()
                                  |     task_function_call()
                                  |       /* sends IPI to CPU0 */
  <IPI>                           | ...
   __perf_event_enable()          +---------------------------
    ctx_resched()
     task_ctx_sched_out()
      ctx_sched_out()
       group_sched_out()
        event_sched_out()
          pending_disable = -1
  </IPI>
  <IRQ-work>
   perf_pending_event()
    perf_pending_event_disable()
    /* Fails to send SIGTRAP because no pending_disable! */
  </IRQ-work>
In the above case, not only is that particular SIGTRAP missed, but also
all future SIGTRAPs because 'event_limit' is not reset back to 1.
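
That terminal behaviour follows from the pre-patch delivery scheme; condensed
below from the '-' lines of the diff that follows (fragments, not a standalone
listing), the only place event_limit is ever re-armed is the IRQ-work handler:

  /* perf_event_alloc(): sigtrap events start with event_limit == 1. */
  if (event->attr.sigtrap)
          atomic_set(&event->event_limit, 1);

  /* __perf_event_overflow(): consume the limit and request the disable. */
  if (events && atomic_dec_and_test(&event->event_limit)) {
          ret = 1;
          event->pending_kill = POLL_HUP;
          event->pending_addr = data->addr;
          perf_event_disable_inatomic(event); /* pending_disable = CPU, queue irq_work */
  }

  /* perf_pending_event_disable(): only reached if pending_disable survived. */
  if (event->attr.sigtrap) {
          perf_sigtrap(event);
          atomic_set_release(&event->event_limit, 1); /* rearm event */
          return;
  }

When event_sched_out() consumes pending_disable first, the handler returns
before that branch, so neither the signal nor the re-arm happens and
event_limit stays at 0.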
To fix this, rework the pending delivery of SIGTRAP via IRQ-work by
introducing a separate 'pending_sigtrap', no longer using 'event_limit'
and 'pending_disable' for its delivery.
Additionally, and different from Marco's proposed patch:
 - recognise that pending_disable effectively duplicates oncpu for
   the case where it is set. As such, change the irq_work handler to
   use ->oncpu to target the event and use pending_* as boolean toggles.

 - observe that SIGTRAP targets the ctx->task, so the context switch
   optimization that carries contexts between tasks is invalid. If the
   irq_work were delayed enough to hit after a context switch the
   SIGTRAP would be delivered to the wrong task.

 - observe that if the event gets scheduled out
   (rotation/migration/context-switch/...) the irq-work would be
   insufficient to deliver the SIGTRAP when the event gets scheduled
   back in (the irq-work might still be pending on the old CPU).

   Therefore have event_sched_out() convert the pending sigtrap into a
   task_work which will deliver the signal at return_to_user.
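
In sum, the delivery paths after this patch, as a simplified outline of the
code in the diff below (throttling, recursion and error handling omitted):

  /* NMI/overflow context: __perf_event_overflow() */
  if (event->attr.sigtrap) {
          if (!event->pending_sigtrap) {
                  event->pending_sigtrap = 1;
                  local_inc(&event->ctx->nr_pending);
          }
          event->pending_addr = data->addr;
          irq_work_queue(&event->pending_irq);
  }

  /* irq_work, event still on this CPU: __perf_pending_irq() */
  if (event->pending_sigtrap) {
          event->pending_sigtrap = 0;
          perf_sigtrap(event);
          local_dec(&event->ctx->nr_pending);
  }

  /* event scheduled out before the irq_work ran: event_sched_out() */
  event->pending_sigtrap = 0;
  event->pending_work = 1;
  task_work_add(current, &event->pending_task, TWA_RESUME);

  /* task_work at return-to-user: perf_pending_task() */
  if (event->pending_work) {
          event->pending_work = 0;
          perf_sigtrap(event);
          local_dec(&event->ctx->nr_pending);
  }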
Fixes: 97ba62b278 ("perf: Add support for SIGTRAP on perf events")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Debugged-by: Dmitry Vyukov <dvyukov@google.com>
Reported-by: Marco Elver <elver@google.com>
Debugged-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Marco Elver <elver@google.com>
Tested-by: Marco Elver <elver@google.com>
Parent: 9abf2313ad
Commit: ca6c21327c
---
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -756,11 +756,14 @@ struct perf_event {
 	struct fasync_struct		*fasync;
 
 	/* delayed work for NMIs and such */
-	int				pending_wakeup;
-	int				pending_kill;
-	int				pending_disable;
+	unsigned int			pending_wakeup;
+	unsigned int			pending_kill;
+	unsigned int			pending_disable;
+	unsigned int			pending_sigtrap;
 	unsigned long			pending_addr;	/* SIGTRAP */
-	struct irq_work			pending;
+	struct irq_work			pending_irq;
+	struct callback_head		pending_task;
+	unsigned int			pending_work;
 
 	atomic_t			event_limit;
 
@@ -877,6 +880,14 @@ struct perf_event_context {
 #endif
 	void				*task_ctx_data; /* pmu specific data */
 	struct rcu_head			rcu_head;
+
+	/*
+	 * Sum (event->pending_sigtrap + event->pending_work)
+	 *
+	 * The SIGTRAP is targeted at ctx->task, as such it won't do changing
+	 * that until the signal is delivered.
+	 */
+	local_t				nr_pending;
 };
 
 /*
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -54,6 +54,7 @@
 #include <linux/highmem.h>
 #include <linux/pgtable.h>
 #include <linux/buildid.h>
+#include <linux/task_work.h>
 
 #include "internal.h"
 
@@ -2276,11 +2277,26 @@ event_sched_out(struct perf_event *event,
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
-	if (READ_ONCE(event->pending_disable) >= 0) {
-		WRITE_ONCE(event->pending_disable, -1);
+	if (event->pending_disable) {
+		event->pending_disable = 0;
 		perf_cgroup_event_disable(event, ctx);
 		state = PERF_EVENT_STATE_OFF;
 	}
+
+	if (event->pending_sigtrap) {
+		bool dec = true;
+
+		event->pending_sigtrap = 0;
+		if (state != PERF_EVENT_STATE_OFF &&
+		    !event->pending_work) {
+			event->pending_work = 1;
+			dec = false;
+			task_work_add(current, &event->pending_task, TWA_RESUME);
+		}
+		if (dec)
+			local_dec(&event->ctx->nr_pending);
+	}
+
 	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
@@ -2432,7 +2448,7 @@ static void __perf_event_disable(struct perf_event *event,
 	 * hold the top-level event's child_mutex, so any descendant that
 	 * goes to exit will block in perf_event_exit_event().
 	 *
-	 * When called from perf_pending_event it's OK because event->ctx
+	 * When called from perf_pending_irq it's OK because event->ctx
 	 * is the current context on this CPU and preemption is disabled,
 	 * hence we can't get into perf_event_task_sched_out for this context.
 	 */
@@ -2471,9 +2487,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
 
 void perf_event_disable_inatomic(struct perf_event *event)
 {
-	WRITE_ONCE(event->pending_disable, smp_processor_id());
-	/* can fail, see perf_pending_event_disable() */
-	irq_work_queue(&event->pending);
+	event->pending_disable = 1;
+	irq_work_queue(&event->pending_irq);
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -3428,11 +3443,23 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
 
+			perf_pmu_disable(pmu);
+
+			/* PMIs are disabled; ctx->nr_pending is stable. */
+			if (local_read(&ctx->nr_pending) ||
+			    local_read(&next_ctx->nr_pending)) {
+				/*
+				 * Must not swap out ctx when there's pending
+				 * events that rely on the ctx->task relation.
+				 */
+				raw_spin_unlock(&next_ctx->lock);
+				rcu_read_unlock();
+				goto inside_switch;
+			}
+
 			WRITE_ONCE(ctx->task, next);
 			WRITE_ONCE(next_ctx->task, task);
 
-			perf_pmu_disable(pmu);
-
 			if (cpuctx->sched_cb_usage && pmu->sched_task)
 				pmu->sched_task(ctx, false);
 
@@ -3473,6 +3500,7 @@ unlock:
 	raw_spin_lock(&ctx->lock);
 	perf_pmu_disable(pmu);
 
+inside_switch:
 	if (cpuctx->sched_cb_usage && pmu->sched_task)
 		pmu->sched_task(ctx, false);
 	task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
@@ -4939,7 +4967,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
 
 static void _free_event(struct perf_event *event)
 {
-	irq_work_sync(&event->pending);
+	irq_work_sync(&event->pending_irq);
 
 	unaccount_event(event);
 
@@ -6439,7 +6467,8 @@ static void perf_sigtrap(struct perf_event *event)
 		return;
 
 	/*
-	 * perf_pending_event() can race with the task exiting.
+	 * Both perf_pending_task() and perf_pending_irq() can race with the
+	 * task exiting.
 	 */
 	if (current->flags & PF_EXITING)
 		return;
@@ -6448,23 +6477,33 @@ static void perf_sigtrap(struct perf_event *event)
 		      event->attr.type, event->attr.sig_data);
 }
 
-static void perf_pending_event_disable(struct perf_event *event)
+/*
+ * Deliver the pending work in-event-context or follow the context.
+ */
+static void __perf_pending_irq(struct perf_event *event)
 {
-	int cpu = READ_ONCE(event->pending_disable);
+	int cpu = READ_ONCE(event->oncpu);
 
+	/*
+	 * If the event isn't running; we done. event_sched_out() will have
+	 * taken care of things.
+	 */
 	if (cpu < 0)
 		return;
 
+	/*
+	 * Yay, we hit home and are in the context of the event.
+	 */
 	if (cpu == smp_processor_id()) {
-		WRITE_ONCE(event->pending_disable, -1);
-
-		if (event->attr.sigtrap) {
+		if (event->pending_sigtrap) {
+			event->pending_sigtrap = 0;
 			perf_sigtrap(event);
-			atomic_set_release(&event->event_limit, 1); /* rearm event */
-			return;
+			local_dec(&event->ctx->nr_pending);
+		}
+		if (event->pending_disable) {
+			event->pending_disable = 0;
+			perf_event_disable_local(event);
 		}
-
-		perf_event_disable_local(event);
 		return;
 	}
 
@@ -6484,35 +6523,62 @@ static void perf_pending_event_disable(struct perf_event *event)
 	 *				  irq_work_queue(); // FAILS
 	 *
 	 *  irq_work_run()
-	 *    perf_pending_event()
+	 *    perf_pending_irq()
 	 *
 	 * But the event runs on CPU-B and wants disabling there.
 	 */
-	irq_work_queue_on(&event->pending, cpu);
+	irq_work_queue_on(&event->pending_irq, cpu);
 }
 
-static void perf_pending_event(struct irq_work *entry)
+static void perf_pending_irq(struct irq_work *entry)
 {
-	struct perf_event *event = container_of(entry, struct perf_event, pending);
+	struct perf_event *event = container_of(entry, struct perf_event, pending_irq);
 	int rctx;
 
-	rctx = perf_swevent_get_recursion_context();
 	/*
 	 * If we 'fail' here, that's OK, it means recursion is already disabled
 	 * and we won't recurse 'further'.
 	 */
+	rctx = perf_swevent_get_recursion_context();
 
-	perf_pending_event_disable(event);
-
+	/*
+	 * The wakeup isn't bound to the context of the event -- it can happen
+	 * irrespective of where the event is.
+	 */
 	if (event->pending_wakeup) {
 		event->pending_wakeup = 0;
 		perf_event_wakeup(event);
 	}
 
+	__perf_pending_irq(event);
+
 	if (rctx >= 0)
 		perf_swevent_put_recursion_context(rctx);
 }
 
+static void perf_pending_task(struct callback_head *head)
+{
+	struct perf_event *event = container_of(head, struct perf_event, pending_task);
+	int rctx;
+
+	/*
+	 * If we 'fail' here, that's OK, it means recursion is already disabled
+	 * and we won't recurse 'further'.
+	 */
+	preempt_disable_notrace();
+	rctx = perf_swevent_get_recursion_context();
+
+	if (event->pending_work) {
+		event->pending_work = 0;
+		perf_sigtrap(event);
+		local_dec(&event->ctx->nr_pending);
+	}
+
+	if (rctx >= 0)
+		perf_swevent_put_recursion_context(rctx);
+	preempt_enable_notrace();
+}
+
 #ifdef CONFIG_GUEST_PERF_EVENTS
 struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
 
@@ -9212,8 +9278,8 @@ int perf_event_account_interrupt(struct perf_event *event)
  */
 
 static int __perf_event_overflow(struct perf_event *event,
-				   int throttle, struct perf_sample_data *data,
-				   struct pt_regs *regs)
+				 int throttle, struct perf_sample_data *data,
+				 struct pt_regs *regs)
 {
 	int events = atomic_read(&event->event_limit);
 	int ret = 0;
@@ -9236,24 +9302,36 @@ static int __perf_event_overflow(struct perf_event *event,
 	if (events && atomic_dec_and_test(&event->event_limit)) {
 		ret = 1;
 		event->pending_kill = POLL_HUP;
-		event->pending_addr = data->addr;
-
 		perf_event_disable_inatomic(event);
 	}
 
+	if (event->attr.sigtrap) {
+		/*
+		 * Should not be able to return to user space without processing
+		 * pending_sigtrap (kernel events can overflow multiple times).
+		 */
+		WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+		if (!event->pending_sigtrap) {
+			event->pending_sigtrap = 1;
+			local_inc(&event->ctx->nr_pending);
+		}
+		event->pending_addr = data->addr;
+		irq_work_queue(&event->pending_irq);
+	}
+
 	READ_ONCE(event->overflow_handler)(event, data, regs);
 
 	if (*perf_event_fasync(event) && event->pending_kill) {
 		event->pending_wakeup = 1;
-		irq_work_queue(&event->pending);
+		irq_work_queue(&event->pending_irq);
 	}
 
 	return ret;
 }
 
 int perf_event_overflow(struct perf_event *event,
-			  struct perf_sample_data *data,
-			  struct pt_regs *regs)
+			struct perf_sample_data *data,
+			struct pt_regs *regs)
 {
 	return __perf_event_overflow(event, 1, data, regs);
 }
@@ -11570,8 +11648,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 
 	init_waitqueue_head(&event->waitq);
-	event->pending_disable = -1;
-	init_irq_work(&event->pending, perf_pending_event);
+	init_irq_work(&event->pending_irq, perf_pending_irq);
+	init_task_work(&event->pending_task, perf_pending_task);
 
 	mutex_init(&event->mmap_mutex);
 	raw_spin_lock_init(&event->addr_filters.lock);
@@ -11593,9 +11671,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (parent_event)
 		event->event_caps = parent_event->event_caps;
 
-	if (event->attr.sigtrap)
-		atomic_set(&event->event_limit, 1);
-
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
 		/*
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -22,7 +22,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 	atomic_set(&handle->rb->poll, EPOLLIN);
 
 	handle->event->pending_wakeup = 1;
-	irq_work_queue(&handle->event->pending);
+	irq_work_queue(&handle->event->pending_irq);
 }
 
 /*