drm/i915: Drop fake breadcrumb irq
Missed breadcrumb detection is defunct due to the tight coupling with
dma_fence signaling and the myriad ways we may signal fences from
everywhere but from an interrupt, i.e. we frequently signal a fence
before we even see its interrupt. This means that even if we miss an
interrupt for a fence, it is still signaled before our breadcrumb
hangcheck fires, so simplify the breadcrumb hangchecking by moving it
into the GPU hangcheck and forgo fake interrupts.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190129205230.19056-3-chris@chris-wilson.co.uk
Parent: 52c0fdb25c
Commit: 789659f430
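The idea is simple to model outside the driver: the periodic GPU hangcheck now flushes breadcrumbs itself before it samples engine progress, so a fence whose interrupt was missed is still signalled within one hangcheck period, and the per-engine fake-irq and breadcrumb-hangcheck timers become redundant. The sketch below is a minimal userspace C model of that flow, not i915 code; every type and function name in it is invented for illustration. In the driver the equivalent step is the intel_engine_signal_breadcrumbs() call added to i915_hangcheck_elapsed() in the diff that follows.

/*
 * Minimal userspace model of the new flow, NOT kernel code: every name
 * here is invented for illustration.  The periodic hangcheck flushes
 * breadcrumbs itself, so a fence whose interrupt was missed is still
 * signalled within one hangcheck period and no fake-irq timer is needed.
 */
#include <stdbool.h>
#include <stdio.h>

struct engine {
        const char *name;
        unsigned int hw_seqno;        /* last breadcrumb written by the GPU */
        unsigned int signalled_seqno; /* last fence completed by the CPU */
        unsigned int hangcheck_seqno; /* sample taken by the previous check */
};

/* Stand-in for intel_engine_signal_breadcrumbs(): complete any fences
 * whose interrupt we never saw. */
static bool signal_breadcrumbs(struct engine *e)
{
        if (e->signalled_seqno == e->hw_seqno)
                return false;

        e->signalled_seqno = e->hw_seqno;
        return true;
}

/* Stand-in for the periodic GPU hangcheck worker. */
static void hangcheck(struct engine *e)
{
        /* Flush breadcrumbs first: a missed interrupt is then never
         * mistaken for a hung engine, and the fence still signals. */
        signal_breadcrumbs(e);

        if (e->hw_seqno == e->hangcheck_seqno)
                printf("%s: no progress since last check, run hang detection\n",
                       e->name);
        else
                printf("%s: advancing (seqno %u)\n", e->name, e->hw_seqno);

        e->hangcheck_seqno = e->hw_seqno;
}

int main(void)
{
        struct engine rcs = { .name = "rcs0", .hw_seqno = 2 };

        hangcheck(&rcs); /* interrupt was missed, but the fence completes */
        hangcheck(&rcs); /* no new breadcrumbs: flagged for hang detection */
        return 0;
}

Running the model, the first check absorbs the missed interrupt (the fence completes and the engine is reported as advancing), while the second check sees no new breadcrumbs and defers to the normal hang detection, which is the behaviour the patch relies on instead of fake interrupts.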
@@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
                            intel_engine_last_submit(engine),
                            jiffies_to_msecs(jiffies -
                                             engine->hangcheck.action_timestamp));
-               seq_printf(m, "\tfake irq active? %s\n",
-                          yesno(test_bit(engine->id,
-                                         &dev_priv->gpu_error.missed_irq_rings)));
 
                seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
                           (long long)engine->hangcheck.acthd,
@@ -3899,94 +3896,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
                        i915_wedged_get, i915_wedged_set,
                        "%llu\n");
 
-static int
-fault_irq_set(struct drm_i915_private *i915,
-             unsigned long *irq,
-             unsigned long val)
-{
-       int err;
-
-       err = mutex_lock_interruptible(&i915->drm.struct_mutex);
-       if (err)
-               return err;
-
-       err = i915_gem_wait_for_idle(i915,
-                                    I915_WAIT_LOCKED |
-                                    I915_WAIT_INTERRUPTIBLE,
-                                    MAX_SCHEDULE_TIMEOUT);
-       if (err)
-               goto err_unlock;
-
-       *irq = val;
-       mutex_unlock(&i915->drm.struct_mutex);
-
-       /* Flush idle worker to disarm irq */
-       drain_delayed_work(&i915->gt.idle_work);
-
-       return 0;
-
-err_unlock:
-       mutex_unlock(&i915->drm.struct_mutex);
-       return err;
-}
-
-static int
-i915_ring_missed_irq_get(void *data, u64 *val)
-{
-       struct drm_i915_private *dev_priv = data;
-
-       *val = dev_priv->gpu_error.missed_irq_rings;
-       return 0;
-}
-
-static int
-i915_ring_missed_irq_set(void *data, u64 val)
-{
-       struct drm_i915_private *i915 = data;
-
-       return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
-                       i915_ring_missed_irq_get, i915_ring_missed_irq_set,
-                       "0x%08llx\n");
-
-static int
-i915_ring_test_irq_get(void *data, u64 *val)
-{
-       struct drm_i915_private *dev_priv = data;
-
-       *val = dev_priv->gpu_error.test_irq_rings;
-
-       return 0;
-}
-
-static int
-i915_ring_test_irq_set(void *data, u64 val)
-{
-       struct drm_i915_private *i915 = data;
-
-       /* GuC keeps the user interrupt permanently enabled for submission */
-       if (USES_GUC_SUBMISSION(i915))
-               return -ENODEV;
-
-       /*
-        * From icl, we can no longer individually mask interrupt generation
-        * from each engine.
-        */
-       if (INTEL_GEN(i915) >= 11)
-               return -ENODEV;
-
-       val &= INTEL_INFO(i915)->ring_mask;
-       DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
-
-       return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
-                       i915_ring_test_irq_get, i915_ring_test_irq_set,
-                       "0x%08llx\n");
-
 #define DROP_UNBOUND BIT(0)
 #define DROP_BOUND BIT(1)
 #define DROP_RETIRE BIT(2)
@@ -4750,8 +4659,6 @@ static const struct i915_debugfs_files {
 } i915_debugfs_files[] = {
        {"i915_wedged", &i915_wedged_fops},
        {"i915_cache_sharing", &i915_cache_sharing_fops},
-       {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
-       {"i915_ring_test_irq", &i915_ring_test_irq_fops},
        {"i915_gem_drop_caches", &i915_drop_caches_fops},
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
        {"i915_error_state", &i915_error_state_fops},
@@ -723,8 +723,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
        err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
        err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
        err_printf(m, "CCID: 0x%08x\n", error->ccid);
-       err_printf(m, "Missed interrupts: 0x%08lx\n",
-                  m->i915->gpu_error.missed_irq_rings);
 
        for (i = 0; i < error->nfence; i++)
                err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
@@ -204,8 +204,6 @@ struct i915_gpu_error {
 
        atomic_t pending_fb_pin;
 
-       unsigned long missed_irq_rings;
-
        /**
         * State variable controlling the reset flow and count
         *
@@ -274,9 +272,6 @@ struct i915_gpu_error {
         */
        wait_queue_head_t reset_queue;
 
-       /* For missed irq/seqno simulation. */
-       unsigned long test_irq_rings;
-
        struct i915_gpu_restart *restart;
 };
 
@@ -91,7 +91,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine)
 
        spin_lock(&b->irq_lock);
 
-       b->irq_fired = true;
        if (b->irq_armed && list_empty(&b->signalers))
                __intel_breadcrumbs_disarm_irq(b);
 
@@ -172,86 +171,6 @@ static void signal_irq_work(struct irq_work *work)
        intel_engine_breadcrumbs_irq(engine);
 }
 
-static unsigned long wait_timeout(void)
-{
-       return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
-}
-
-static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
-{
-       if (GEM_SHOW_DEBUG()) {
-               struct drm_printer p = drm_debug_printer(__func__);
-
-               intel_engine_dump(engine, &p,
-                                 "%s missed breadcrumb at %pS\n",
-                                 engine->name, __builtin_return_address(0));
-       }
-
-       set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
-}
-
-static void intel_breadcrumbs_hangcheck(struct timer_list *t)
-{
-       struct intel_engine_cs *engine =
-               from_timer(engine, t, breadcrumbs.hangcheck);
-       struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-       if (!b->irq_armed)
-               return;
-
-       if (b->irq_fired)
-               goto rearm;
-
-       /*
-        * We keep the hangcheck timer alive until we disarm the irq, even
-        * if there are no waiters at present.
-        *
-        * If the waiter was currently running, assume it hasn't had a chance
-        * to process the pending interrupt (e.g, low priority task on a loaded
-        * system) and wait until it sleeps before declaring a missed interrupt.
-        *
-        * If the waiter was asleep (and not even pending a wakeup), then we
-        * must have missed an interrupt as the GPU has stopped advancing
-        * but we still have a waiter. Assuming all batches complete within
-        * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
-        */
-       synchronize_hardirq(engine->i915->drm.irq);
-       if (intel_engine_signal_breadcrumbs(engine)) {
-               missed_breadcrumb(engine);
-               mod_timer(&b->fake_irq, jiffies + 1);
-       } else {
-rearm:
-               b->irq_fired = false;
-               mod_timer(&b->hangcheck, wait_timeout());
-       }
-}
-
-static void intel_breadcrumbs_fake_irq(struct timer_list *t)
-{
-       struct intel_engine_cs *engine =
-               from_timer(engine, t, breadcrumbs.fake_irq);
-       struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-       /*
-        * The timer persists in case we cannot enable interrupts,
-        * or if we have previously seen seqno/interrupt incoherency
-        * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
-        * Here the worker will wake up every jiffie in order to kick the
-        * oldest waiter to do the coherent seqno check.
-        */
-
-       if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
-               return;
-
-       /* If the user has disabled the fake-irq, restore the hangchecking */
-       if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
-               mod_timer(&b->hangcheck, wait_timeout());
-               return;
-       }
-
-       mod_timer(&b->fake_irq, jiffies + 1);
-}
-
 void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
 {
        struct intel_breadcrumbs *b = &engine->breadcrumbs;
@@ -274,43 +193,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine)
        spin_unlock_irq(&b->irq_lock);
 }
 
-static bool use_fake_irq(const struct intel_breadcrumbs *b)
-{
-       const struct intel_engine_cs *engine =
-               container_of(b, struct intel_engine_cs, breadcrumbs);
-
-       if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
-               return false;
-
-       /*
-        * Only start with the heavy weight fake irq timer if we have not
-        * seen any interrupts since enabling it the first time. If the
-        * interrupts are still arriving, it means we made a mistake in our
-        * engine->seqno_barrier(), a timing error that should be transient
-        * and unlikely to reoccur.
-        */
-       return !b->irq_fired;
-}
-
-static void enable_fake_irq(struct intel_breadcrumbs *b)
-{
-       /* Ensure we never sleep indefinitely */
-       if (!b->irq_enabled || use_fake_irq(b))
-               mod_timer(&b->fake_irq, jiffies + 1);
-       else
-               mod_timer(&b->hangcheck, wait_timeout());
-}
-
-static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
+static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 {
        struct intel_engine_cs *engine =
                container_of(b, struct intel_engine_cs, breadcrumbs);
-       struct drm_i915_private *i915 = engine->i915;
-       bool enabled;
 
        lockdep_assert_held(&b->irq_lock);
        if (b->irq_armed)
-               return false;
+               return;
 
        /*
         * The breadcrumb irq will be disarmed on the interrupt after the
@@ -328,16 +218,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
         * the driver is idle) we disarm the breadcrumbs.
         */
 
-       /* No interrupts? Kick the waiter every jiffie! */
-       enabled = false;
-       if (!b->irq_enabled++ &&
-           !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
+       if (!b->irq_enabled++)
                irq_enable(engine);
-               enabled = true;
-       }
-
-       enable_fake_irq(b);
-       return enabled;
 }
 
 void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
@@ -348,18 +230,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
        INIT_LIST_HEAD(&b->signalers);
 
        init_irq_work(&b->irq_work, signal_irq_work);
-
-       timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
-       timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
-}
-
-static void cancel_fake_irq(struct intel_engine_cs *engine)
-{
-       struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-       del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
-       del_timer_sync(&b->hangcheck);
-       clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
 }
 
 void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
@@ -369,13 +239,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 
        spin_lock_irqsave(&b->irq_lock, flags);
 
-       /*
-        * Leave the fake_irq timer enabled (if it is running), but clear the
-        * bit so that it turns itself off on its next wake up and goes back
-        * to the long hangcheck interval if still required.
-        */
-       clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
-
        if (b->irq_enabled)
                irq_enable(engine);
        else
@@ -386,7 +249,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 
 void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
 {
-       cancel_fake_irq(engine);
 }
 
 bool i915_request_enable_breadcrumb(struct i915_request *rq)
@@ -482,7 +344,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
                }
        }
        spin_unlock_irq(&b->irq_lock);
-
-       if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
-               drm_printf(p, "Fake irq active\n");
 }
@@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
        for_each_engine(engine, dev_priv, id) {
                struct hangcheck hc;
 
+               intel_engine_signal_breadcrumbs(engine);
+
                hangcheck_load_sample(engine, &hc);
                hangcheck_accumulate_sample(engine, &hc);
                hangcheck_store_sample(engine, &hc);
@@ -387,14 +387,9 @@ struct intel_engine_cs {
 
                struct irq_work irq_work; /* for use from inside irq_lock */
 
-               struct timer_list fake_irq; /* used after a missed interrupt */
-               struct timer_list hangcheck; /* detect missed interrupts */
-
-               unsigned int hangcheck_interrupts;
                unsigned int irq_enabled;
 
                bool irq_armed;
-               bool irq_fired;
        } breadcrumbs;
 
        struct {
@@ -35,7 +35,6 @@ int igt_live_test_begin(struct igt_live_test *t,
                return err;
        }
 
-       i915->gpu_error.missed_irq_rings = 0;
        t->reset_global = i915_reset_count(&i915->gpu_error);
 
        for_each_engine(engine, i915, id)
@@ -75,11 +74,5 @@ int igt_live_test_end(struct igt_live_test *t)
                return -EIO;
        }
 
-       if (i915->gpu_error.missed_irq_rings) {
-               pr_err("%s(%s): Missed interrupts on engines %lx\n",
-                      t->func, t->name, i915->gpu_error.missed_irq_rings);
-               return -EIO;
-       }
-
        return 0;
 }