2019-05-27 09:55:01 +03:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2017-04-05 10:54:50 +03:00
|
|
|
/*
|
|
|
|
* Copyright 2016,2017 IBM Corporation.
|
|
|
|
*/
|
|
|
|
#ifndef _ASM_POWERPC_XIVE_H
|
|
|
|
#define _ASM_POWERPC_XIVE_H
|
|
|
|
|
2020-04-16 08:58:40 +03:00
|
|
|
#include <asm/opal-api.h>
|
|
|
|
|
2017-04-05 10:54:50 +03:00
|
|
|
#define XIVE_INVALID_VP 0xffffffff
|
|
|
|
|
|
|
|
#ifdef CONFIG_PPC_XIVE
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Thread Interrupt Management Area (TIMA)
|
|
|
|
*
|
|
|
|
* This is a global MMIO region divided in 4 pages of varying access
|
|
|
|
* permissions, providing access to per-cpu interrupt management
|
|
|
|
* functions. It always identifies the CPU doing the access based
|
|
|
|
* on the PowerBus initiator ID, thus we always access via the
|
|
|
|
* same offset regardless of where the code is executing
|
|
|
|
*/
|
|
|
|
extern void __iomem *xive_tima;
|
2019-04-18 13:39:37 +03:00
|
|
|
extern unsigned long xive_tima_os;
|
2017-04-05 10:54:50 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Offset in the TM area of our current execution level (provided by
|
|
|
|
* the backend)
|
|
|
|
*/
|
|
|
|
extern u32 xive_tima_offset;
|
|
|
|
|
|
|
|
/*
 * Per-irq data (irq_get_handler_data for normal IRQs), IPIs
 * have it stored in the xive_cpu structure. We also cache
 * for normal interrupts the current target CPU.
 *
 * This structure is setup by the backend for each interrupt.
 */
struct xive_irq_data {
	u64 flags;		/* XIVE_IRQ_FLAG_* bits */
	u64 eoi_page;		/* EOI ESB page address */
	void __iomem *eoi_mmio;	/* MMIO mapping of the EOI page */
	u64 trig_page;		/* trigger page address */
	void __iomem *trig_mmio; /* MMIO mapping of the trigger page */
	u32 esb_shift;		/* ESB page size shift */
	int src_chip;		/* source chip, or XIVE_INVALID_CHIP_ID */
	u32 hw_irq;		/* hardware interrupt number */

	/* Setup/used by frontend */
	int target;		/* cached current target CPU */
	/*
	 * saved_p means that there is a queue entry for this interrupt
	 * in some CPU's queue (not including guest vcpu queues), even
	 * if P is not set in the source ESB.
	 * stale_p means that there is no queue entry for this interrupt
	 * in some CPU's queue, even if P is set in the source ESB.
	 */
	bool saved_p;
	bool stale_p;
};
|
|
|
|
#define XIVE_IRQ_FLAG_STORE_EOI 0x01
|
|
|
|
#define XIVE_IRQ_FLAG_LSI 0x02
|
2020-12-10 20:14:45 +03:00
|
|
|
/* #define XIVE_IRQ_FLAG_SHIFT_BUG 0x04 */ /* P9 DD1.0 workaround */
|
2020-12-10 20:14:46 +03:00
|
|
|
/* #define XIVE_IRQ_FLAG_MASK_FW 0x08 */ /* P9 DD1.0 workaround */
|
2020-12-10 20:14:47 +03:00
|
|
|
/* #define XIVE_IRQ_FLAG_EOI_FW 0x10 */ /* P9 DD1.0 workaround */
|
2017-08-30 22:46:15 +03:00
|
|
|
#define XIVE_IRQ_FLAG_H_INT_ESB 0x20
|
2017-04-05 10:54:50 +03:00
|
|
|
|
2018-01-12 05:39:28 +03:00
|
|
|
/* Special flag set by KVM for escalation interrupts */
|
2020-12-10 20:14:39 +03:00
|
|
|
#define XIVE_IRQ_FLAG_NO_EOI 0x80
|
2018-01-12 05:39:28 +03:00
|
|
|
|
2017-04-05 10:54:50 +03:00
|
|
|
#define XIVE_INVALID_CHIP_ID -1
|
|
|
|
|
|
|
|
/* A queue tracking structure in a CPU */
struct xive_q {
	__be32 *qpage;		/* queue page of big-endian event entries */
	u32 msk;		/* index mask */
	u32 idx;		/* current queue index */
	u32 toggle;		/* toggle (generation) bit */
	u64 eoi_phys;		/* physical address of the queue EOI page */
	u32 esc_irq;		/* escalation interrupt number */
	atomic_t count;
	atomic_t pending_count;
	/* Guest queue configuration (address and size shift) */
	u64 guest_qaddr;
	u32 guest_qshift;
};
|
|
|
|
|
|
|
|
/* Global enable flags for the XIVE support */
|
|
|
|
extern bool __xive_enabled;
|
|
|
|
|
|
|
|
/* Whether XIVE support is active on this system. */
static inline bool xive_enabled(void)
{
	return __xive_enabled;
}
|
|
|
|
|
2019-11-15 21:10:58 +03:00
|
|
|
bool xive_spapr_init(void);
|
|
|
|
bool xive_native_init(void);
|
|
|
|
void xive_smp_probe(void);
|
|
|
|
int xive_smp_prepare_cpu(unsigned int cpu);
|
|
|
|
void xive_smp_setup_cpu(void);
|
|
|
|
void xive_smp_disable_cpu(void);
|
|
|
|
void xive_teardown_cpu(void);
|
|
|
|
void xive_shutdown(void);
|
|
|
|
void xive_flush_interrupt(void);
|
2017-04-05 10:54:50 +03:00
|
|
|
|
|
|
|
/* xmon hook */
|
2019-11-15 21:10:58 +03:00
|
|
|
void xmon_xive_do_dump(int cpu);
|
|
|
|
int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d);
|
2021-03-31 17:45:11 +03:00
|
|
|
void xmon_xive_get_irq_all(void);
|
2017-04-05 10:54:50 +03:00
|
|
|
|
|
|
|
/* APIs used by KVM */
|
2019-11-15 21:10:58 +03:00
|
|
|
u32 xive_native_default_eq_shift(void);
|
|
|
|
u32 xive_native_alloc_vp_block(u32 max_vcpus);
|
|
|
|
void xive_native_free_vp_block(u32 vp_base);
|
|
|
|
int xive_native_populate_irq_data(u32 hw_irq,
|
|
|
|
struct xive_irq_data *data);
|
|
|
|
void xive_cleanup_irq_data(struct xive_irq_data *xd);
|
|
|
|
void xive_native_free_irq(u32 irq);
|
|
|
|
int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
|
|
|
|
|
|
|
|
int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
|
|
|
|
__be32 *qpage, u32 order, bool can_escalate);
|
|
|
|
void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
|
|
|
|
|
|
|
|
void xive_native_sync_source(u32 hw_irq);
|
|
|
|
void xive_native_sync_queue(u32 hw_irq);
|
|
|
|
bool is_xive_irq(struct irq_chip *chip);
|
|
|
|
int xive_native_enable_vp(u32 vp_id, bool single_escalation);
|
|
|
|
int xive_native_disable_vp(u32 vp_id);
|
|
|
|
int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
|
|
|
|
bool xive_native_has_single_escalation(void);
|
|
|
|
|
|
|
|
int xive_native_get_queue_info(u32 vp_id, uint32_t prio,
|
|
|
|
u64 *out_qpage,
|
|
|
|
u64 *out_qsize,
|
|
|
|
u64 *out_qeoi_page,
|
|
|
|
u32 *out_escalate_irq,
|
|
|
|
u64 *out_qflags);
|
|
|
|
|
|
|
|
int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle,
|
|
|
|
u32 *qindex);
|
|
|
|
int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle,
|
|
|
|
u32 qindex);
|
|
|
|
int xive_native_get_vp_state(u32 vp_id, u64 *out_state);
|
|
|
|
bool xive_native_has_queue_state_support(void);
|
2020-04-16 08:58:40 +03:00
|
|
|
extern u32 xive_native_alloc_irq_on_chip(u32 chip_id);
|
|
|
|
|
|
|
|
static inline u32 xive_native_alloc_irq(void)
|
|
|
|
{
|
|
|
|
return xive_native_alloc_irq_on_chip(OPAL_XIVE_ANY_CHIP);
|
|
|
|
}
|
2019-04-10 20:04:33 +03:00
|
|
|
|
2017-04-05 10:54:50 +03:00
|
|
|
#else
|
|
|
|
|
|
|
|
/*
 * Stubs used when CONFIG_PPC_XIVE is not set: XIVE is reported as
 * disabled and all setup operations fail or do nothing.
 */
static inline bool xive_enabled(void) { return false; }

static inline bool xive_spapr_init(void) { return false; }
static inline bool xive_native_init(void) { return false; }
static inline void xive_smp_probe(void) { }
static inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
static inline void xive_smp_setup_cpu(void) { }
static inline void xive_smp_disable_cpu(void) { }
static inline void xive_shutdown(void) { }
static inline void xive_flush_interrupt(void) { }

static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
static inline void xive_native_free_vp_block(u32 vp_base) { }
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _ASM_POWERPC_XIVE_H */
|