Merge branch 'x86/asm' into perf/x86, to avoid conflicts with upcoming patches
Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit 0967160ad6
@@ -78,9 +78,6 @@ The expensive (paranoid) way is to read back the MSR_GS_BASE value
xorl %ebx,%ebx
1:	ret

and the whole paranoid non-paranoid macro complexity is about whether
to suffer that RDMSR cost.

If we are at an interrupt or user-trap/gate-alike boundary then we can
use the faster check: the stack will be a reliable indicator of
whether SWAPGS was already done: if we see that we are a secondary
@@ -93,6 +90,15 @@ which might have triggered right after a normal entry wrote CS to the
stack but before we executed SWAPGS, then the only safe way to check
for GS is the slower method: the RDMSR.

So we try only to mark those entry methods 'paranoid' that absolutely
need the more expensive check for the GS base - and we generate all
'normal' entry points with the regular (faster) entry macros.
Therefore, super-atomic entries (except NMI, which is handled separately)
must use idtentry with paranoid=1 to handle gsbase correctly. This
triggers three main behavior changes:

- Interrupt entry will use the slower gsbase check.
- Interrupt entry from user mode will switch off the IST stack.
- Interrupt exit to kernel mode will not attempt to reschedule.

We try to only use IST entries and the paranoid entry code for vectors
that absolutely need the more expensive check for the GS base - and we
generate all 'normal' entry points with the regular (faster) paranoid=0
variant.
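
For illustration, a minimal C sketch of the two checks described above
(a sketch only: the real logic lives in assembly in entry_64.S, the
function names here are hypothetical, and rdmsrl()/MSR_GS_BASE are the
usual kernel helpers):

#include <linux/types.h>
#include <asm/msr.h>		/* rdmsrl(), MSR_GS_BASE */
#include <asm/ptrace.h>		/* struct pt_regs */

/* Fast check: the CPL that the CPU saved on the stack frame. */
static inline bool came_from_user_mode(struct pt_regs *regs)
{
	return (regs->cs & 3) != 0;
}

/* Slow (paranoid) check: read the MSR back; a kernel GS base is negative. */
static inline bool kernel_gsbase_active(void)
{
	unsigned long gsbase;

	rdmsrl(MSR_GS_BASE, gsbase);
	return (long)gsbase < 0;
}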
@@ -40,9 +40,11 @@ An IST is selected by a non-zero value in the IST field of an
interrupt-gate descriptor. When an interrupt occurs and the hardware
loads such a descriptor, the hardware automatically sets the new stack
pointer based on the IST value, then invokes the interrupt handler. If
software wants to allow nested IST interrupts then the handler must
adjust the IST values on entry to and exit from the interrupt handler.
(This is occasionally done, e.g. for debug exceptions.)
the interrupt came from user mode, then the interrupt handler prologue
will switch back to the per-thread stack. If software wants to allow
nested IST interrupts then the handler must adjust the IST values on
entry to and exit from the interrupt handler. (This is occasionally
done, e.g. for debug exceptions.)
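
A minimal sketch of how a vector is given an IST stack at init time,
assuming the set_intr_gate_ist() helper and stack indices of this
kernel generation (example_ist_setup() is a hypothetical name):

#include <linux/init.h>
#include <asm/desc.h>		/* set_intr_gate_ist() */
#include <asm/traps.h>		/* double_fault, X86_TRAP_DF */

static void __init example_ist_setup(void)
{
	/* #DF always runs on its own IST stack (DOUBLEFAULT_STACK). */
	set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
}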

Events with different IST codes (i.e. with different stacks) can be
nested. For example, a debug interrupt can safely be interrupted by an
@ -179,8 +179,8 @@ sysenter_dispatch:
|
|||
sysexit_from_sys_call:
|
||||
andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
|
||||
/* clear IF, that popfq doesn't enable interrupts early */
|
||||
andl $~0x200,EFLAGS-R11(%rsp)
|
||||
movl RIP-R11(%rsp),%edx /* User %eip */
|
||||
andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
|
||||
movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
|
||||
CFI_REGISTER rip,rdx
|
||||
RESTORE_ARGS 0,24,0,0,0,0
|
||||
xorq %r8,%r8
|
||||
|
|
|
@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
|
|||
#define SS 160
|
||||
|
||||
#define ARGOFFSET R11
|
||||
#define SWFRAME ORIG_RAX
|
||||
|
||||
.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
|
||||
subq $9*8+\addskip, %rsp
|
||||
|
|
|
@ -190,7 +190,6 @@ enum mcp_flags {
|
|||
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
|
||||
|
||||
int mce_notify_irq(void);
|
||||
void mce_notify_process(void);
|
||||
|
||||
DECLARE_PER_CPU(struct mce, injectm);
|
||||
|
||||
|
|
|
@ -75,7 +75,6 @@ struct thread_info {
|
|||
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
|
||||
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
|
||||
#define TIF_SECCOMP 8 /* secure computing */
|
||||
#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
|
||||
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
|
||||
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
|
||||
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
|
||||
|
@ -100,7 +99,6 @@ struct thread_info {
|
|||
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
|
||||
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
|
||||
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
|
||||
#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
|
||||
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
|
||||
#define _TIF_UPROBE (1 << TIF_UPROBE)
|
||||
#define _TIF_NOTSC (1 << TIF_NOTSC)
|
||||
|
@ -140,7 +138,7 @@ struct thread_info {
|
|||
|
||||
/* Only used for 64 bit */
|
||||
#define _TIF_DO_NOTIFY_MASK \
|
||||
(_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
|
||||
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
|
||||
_TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
|
||||
|
||||
/* flags to check in __switch_to() */
|
||||
|
@@ -170,6 +168,17 @@ static inline struct thread_info *current_thread_info(void)
	return ti;
}

static inline unsigned long current_stack_pointer(void)
{
	unsigned long sp;
#ifdef CONFIG_X86_64
	asm("mov %%rsp,%0" : "=g" (sp));
#else
	asm("mov %%esp,%0" : "=g" (sp));
#endif
	return sp;
}
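
A one-line usage sketch of the helper above (it mirrors the irq_32.c
hunk later in this merge; the function name is hypothetical):

/* THREAD_SIZE-aligned base of the stack we are currently running on. */
static inline void *example_current_stack_base(void)
{
	return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
}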

#else /* !__ASSEMBLY__ */

/* how to get the thread information struct from ASM */
@ -1,6 +1,7 @@
|
|||
#ifndef _ASM_X86_TRAPS_H
|
||||
#define _ASM_X86_TRAPS_H
|
||||
|
||||
#include <linux/context_tracking_state.h>
|
||||
#include <linux/kprobes.h>
|
||||
|
||||
#include <asm/debugreg.h>
|
||||
|
@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void);
|
|||
asmlinkage void mce_threshold_interrupt(void);
|
||||
#endif
|
||||
|
||||
extern enum ctx_state ist_enter(struct pt_regs *regs);
|
||||
extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
|
||||
extern void ist_begin_non_atomic(struct pt_regs *regs);
|
||||
extern void ist_end_non_atomic(void);
|
||||
|
||||
/* Interrupts/Exceptions */
|
||||
enum {
|
||||
X86_TRAP_DE = 0, /* 0, Divide-by-zero */
|
||||
|
|
|
@ -43,6 +43,7 @@
|
|||
#include <linux/export.h>
|
||||
|
||||
#include <asm/processor.h>
|
||||
#include <asm/traps.h>
|
||||
#include <asm/mce.h>
|
||||
#include <asm/msr.h>
|
||||
|
||||
|
@ -1002,51 +1003,6 @@ static void mce_clear_state(unsigned long *toclear)
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Need to save faulting physical address associated with a process
|
||||
* in the machine check handler some place where we can grab it back
|
||||
* later in mce_notify_process()
|
||||
*/
|
||||
#define MCE_INFO_MAX 16
|
||||
|
||||
struct mce_info {
|
||||
atomic_t inuse;
|
||||
struct task_struct *t;
|
||||
__u64 paddr;
|
||||
int restartable;
|
||||
} mce_info[MCE_INFO_MAX];
|
||||
|
||||
static void mce_save_info(__u64 addr, int c)
|
||||
{
|
||||
struct mce_info *mi;
|
||||
|
||||
for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
|
||||
if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
|
||||
mi->t = current;
|
||||
mi->paddr = addr;
|
||||
mi->restartable = c;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
mce_panic("Too many concurrent recoverable errors", NULL, NULL);
|
||||
}
|
||||
|
||||
static struct mce_info *mce_find_info(void)
|
||||
{
|
||||
struct mce_info *mi;
|
||||
|
||||
for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
|
||||
if (atomic_read(&mi->inuse) && mi->t == current)
|
||||
return mi;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void mce_clear_info(struct mce_info *mi)
|
||||
{
|
||||
atomic_set(&mi->inuse, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* The actual machine check handler. This only handles real
|
||||
* exceptions when something got corrupted coming in through int 18.
|
||||
|
@ -1063,6 +1019,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
{
|
||||
struct mca_config *cfg = &mca_cfg;
|
||||
struct mce m, *final;
|
||||
enum ctx_state prev_state;
|
||||
int i;
|
||||
int worst = 0;
|
||||
int severity;
|
||||
|
@ -1084,6 +1041,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
|
||||
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
|
||||
char *msg = "Unknown";
|
||||
u64 recover_paddr = ~0ull;
|
||||
int flags = MF_ACTION_REQUIRED;
|
||||
|
||||
prev_state = ist_enter(regs);
|
||||
|
||||
this_cpu_inc(mce_exception_count);
|
||||
|
||||
|
@ -1203,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
if (no_way_out)
|
||||
mce_panic("Fatal machine check on current CPU", &m, msg);
|
||||
if (worst == MCE_AR_SEVERITY) {
|
||||
/* schedule action before return to userland */
|
||||
mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
|
||||
set_thread_flag(TIF_MCE_NOTIFY);
|
||||
recover_paddr = m.addr;
|
||||
if (!(m.mcgstatus & MCG_STATUS_RIPV))
|
||||
flags |= MF_MUST_KILL;
|
||||
} else if (kill_it) {
|
||||
force_sig(SIGBUS, current);
|
||||
}
|
||||
|
@ -1216,6 +1177,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
|
||||
out:
|
||||
sync_core();
|
||||
|
||||
if (recover_paddr == ~0ull)
|
||||
goto done;
|
||||
|
||||
pr_err("Uncorrected hardware memory error in user-access at %llx",
|
||||
recover_paddr);
|
||||
/*
|
||||
* We must call memory_failure() here even if the current process is
|
||||
* doomed. We still need to mark the page as poisoned and alert any
|
||||
* other users of the page.
|
||||
*/
|
||||
ist_begin_non_atomic(regs);
|
||||
local_irq_enable();
|
||||
if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
|
||||
pr_err("Memory error not recovered");
|
||||
force_sig(SIGBUS, current);
|
||||
}
|
||||
local_irq_disable();
|
||||
ist_end_non_atomic();
|
||||
done:
|
||||
ist_exit(regs, prev_state);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(do_machine_check);
|
||||
|
||||
|
@ -1232,42 +1214,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
|
|||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Called in process context that interrupted by MCE and marked with
|
||||
* TIF_MCE_NOTIFY, just before returning to erroneous userland.
|
||||
* This code is allowed to sleep.
|
||||
* Attempt possible recovery such as calling the high level VM handler to
|
||||
* process any corrupted pages, and kill/signal current process if required.
|
||||
* Action required errors are handled here.
|
||||
*/
|
||||
void mce_notify_process(void)
|
||||
{
|
||||
unsigned long pfn;
|
||||
struct mce_info *mi = mce_find_info();
|
||||
int flags = MF_ACTION_REQUIRED;
|
||||
|
||||
if (!mi)
|
||||
mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
|
||||
pfn = mi->paddr >> PAGE_SHIFT;
|
||||
|
||||
clear_thread_flag(TIF_MCE_NOTIFY);
|
||||
|
||||
pr_err("Uncorrected hardware memory error in user-access at %llx",
|
||||
mi->paddr);
|
||||
/*
|
||||
* We must call memory_failure() here even if the current process is
|
||||
* doomed. We still need to mark the page as poisoned and alert any
|
||||
* other users of the page.
|
||||
*/
|
||||
if (!mi->restartable)
|
||||
flags |= MF_MUST_KILL;
|
||||
if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
|
||||
pr_err("Memory error not recovered");
|
||||
force_sig(SIGBUS, current);
|
||||
}
|
||||
mce_clear_info(mi);
|
||||
}
|
||||
|
||||
/*
|
||||
* Action optional processing happens here (picking up
|
||||
* from the list of faulting pages that do_machine_check()
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include <linux/smp.h>
|
||||
|
||||
#include <asm/processor.h>
|
||||
#include <asm/traps.h>
|
||||
#include <asm/mce.h>
|
||||
#include <asm/msr.h>
|
||||
|
||||
|
@ -17,8 +18,11 @@ int mce_p5_enabled __read_mostly;
|
|||
/* Machine check handler for Pentium class Intel CPUs: */
|
||||
static void pentium_machine_check(struct pt_regs *regs, long error_code)
|
||||
{
|
||||
enum ctx_state prev_state;
|
||||
u32 loaddr, hi, lotype;
|
||||
|
||||
prev_state = ist_enter(regs);
|
||||
|
||||
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
|
||||
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
|
||||
|
||||
|
@ -33,6 +37,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
|
|||
}
|
||||
|
||||
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
|
||||
|
||||
ist_exit(regs, prev_state);
|
||||
}
|
||||
|
||||
/* Set up machine check reporting for processors with Intel style MCE: */
|
||||
|
|
|
@ -7,14 +7,19 @@
|
|||
#include <linux/types.h>
|
||||
|
||||
#include <asm/processor.h>
|
||||
#include <asm/traps.h>
|
||||
#include <asm/mce.h>
|
||||
#include <asm/msr.h>
|
||||
|
||||
/* Machine check handler for WinChip C6: */
|
||||
static void winchip_machine_check(struct pt_regs *regs, long error_code)
|
||||
{
|
||||
enum ctx_state prev_state = ist_enter(regs);
|
||||
|
||||
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
|
||||
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
|
||||
|
||||
ist_exit(regs, prev_state);
|
||||
}
|
||||
|
||||
/* Set up machine check reporting on the Winchip C6 series */
|
||||
|
|
|
@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64)
|
|||
movq \tmp,RSP+\offset(%rsp)
|
||||
movq $__USER_DS,SS+\offset(%rsp)
|
||||
movq $__USER_CS,CS+\offset(%rsp)
|
||||
movq $-1,RCX+\offset(%rsp)
|
||||
movq RIP+\offset(%rsp),\tmp /* get rip */
|
||||
movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
|
||||
movq R11+\offset(%rsp),\tmp /* get eflags */
|
||||
movq \tmp,EFLAGS+\offset(%rsp)
|
||||
.endm
|
||||
|
@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64)
|
|||
movq \tmp,R11+\offset(%rsp)
|
||||
.endm
|
||||
|
||||
.macro FAKE_STACK_FRAME child_rip
|
||||
/* push in order ss, rsp, eflags, cs, rip */
|
||||
xorl %eax, %eax
|
||||
pushq_cfi $__KERNEL_DS /* ss */
|
||||
/*CFI_REL_OFFSET ss,0*/
|
||||
pushq_cfi %rax /* rsp */
|
||||
CFI_REL_OFFSET rsp,0
|
||||
pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
|
||||
/*CFI_REL_OFFSET rflags,0*/
|
||||
pushq_cfi $__KERNEL_CS /* cs */
|
||||
/*CFI_REL_OFFSET cs,0*/
|
||||
pushq_cfi \child_rip /* rip */
|
||||
CFI_REL_OFFSET rip,0
|
||||
pushq_cfi %rax /* orig rax */
|
||||
.endm
|
||||
|
||||
.macro UNFAKE_STACK_FRAME
|
||||
addq $8*6, %rsp
|
||||
CFI_ADJUST_CFA_OFFSET -(6*8)
|
||||
.endm
|
||||
|
||||
/*
|
||||
* initial frame state for interrupts (and exceptions without error code)
|
||||
*/
|
||||
|
@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
|
|||
CFI_REL_OFFSET r15, R15+\offset
|
||||
.endm
|
||||
|
||||
/* save partial stack frame */
|
||||
.macro SAVE_ARGS_IRQ
|
||||
cld
|
||||
/* start from rbp in pt_regs and jump over */
|
||||
movq_cfi rdi, (RDI-RBP)
|
||||
movq_cfi rsi, (RSI-RBP)
|
||||
movq_cfi rdx, (RDX-RBP)
|
||||
movq_cfi rcx, (RCX-RBP)
|
||||
movq_cfi rax, (RAX-RBP)
|
||||
movq_cfi r8, (R8-RBP)
|
||||
movq_cfi r9, (R9-RBP)
|
||||
movq_cfi r10, (R10-RBP)
|
||||
movq_cfi r11, (R11-RBP)
|
||||
|
||||
/* Save rbp so that we can unwind from get_irq_regs() */
|
||||
movq_cfi rbp, 0
|
||||
|
||||
/* Save previous stack value */
|
||||
movq %rsp, %rsi
|
||||
|
||||
leaq -RBP(%rsp),%rdi /* arg1 for handler */
|
||||
testl $3, CS-RBP(%rsi)
|
||||
je 1f
|
||||
SWAPGS
|
||||
/*
|
||||
* irq_count is used to check if a CPU is already on an interrupt stack
|
||||
* or not. While this is essentially redundant with preempt_count it is
|
||||
* a little cheaper to use a separate counter in the PDA (short of
|
||||
* moving irq_enter into assembly, which would be too much work)
|
||||
*/
|
||||
1: incl PER_CPU_VAR(irq_count)
|
||||
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
|
||||
CFI_DEF_CFA_REGISTER rsi
|
||||
|
||||
/* Store previous stack value */
|
||||
pushq %rsi
|
||||
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
|
||||
0x77 /* DW_OP_breg7 */, 0, \
|
||||
0x06 /* DW_OP_deref */, \
|
||||
0x08 /* DW_OP_const1u */, SS+8-RBP, \
|
||||
0x22 /* DW_OP_plus */
|
||||
/* We entered an interrupt context - irqs are off: */
|
||||
TRACE_IRQS_OFF
|
||||
.endm
|
||||
|
||||
ENTRY(save_paranoid)
|
||||
XCPT_FRAME 1 RDI+8
|
||||
cld
|
||||
|
@ -426,15 +361,12 @@ system_call_fastpath:
|
|||
* Has incomplete stack frame and undefined top of stack.
|
||||
*/
|
||||
ret_from_sys_call:
|
||||
movl $_TIF_ALLWORK_MASK,%edi
|
||||
/* edi: flagmask */
|
||||
sysret_check:
|
||||
testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
|
||||
jnz int_ret_from_sys_call_fixup /* Go to the slow path */
|
||||
|
||||
LOCKDEP_SYS_EXIT
|
||||
DISABLE_INTERRUPTS(CLBR_NONE)
|
||||
TRACE_IRQS_OFF
|
||||
movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
|
||||
andl %edi,%edx
|
||||
jnz sysret_careful
|
||||
CFI_REMEMBER_STATE
|
||||
/*
|
||||
* sysretq will re-enable interrupts:
|
||||
|
@ -448,49 +380,10 @@ sysret_check:
|
|||
USERGS_SYSRET64
|
||||
|
||||
CFI_RESTORE_STATE
|
||||
/* Handle reschedules */
|
||||
/* edx: work, edi: workmask */
|
||||
sysret_careful:
|
||||
bt $TIF_NEED_RESCHED,%edx
|
||||
jnc sysret_signal
|
||||
TRACE_IRQS_ON
|
||||
ENABLE_INTERRUPTS(CLBR_NONE)
|
||||
pushq_cfi %rdi
|
||||
SCHEDULE_USER
|
||||
popq_cfi %rdi
|
||||
jmp sysret_check
|
||||
|
||||
/* Handle a signal */
|
||||
sysret_signal:
|
||||
TRACE_IRQS_ON
|
||||
ENABLE_INTERRUPTS(CLBR_NONE)
|
||||
#ifdef CONFIG_AUDITSYSCALL
|
||||
bt $TIF_SYSCALL_AUDIT,%edx
|
||||
jc sysret_audit
|
||||
#endif
|
||||
/*
|
||||
* We have a signal, or exit tracing or single-step.
|
||||
* These all wind up with the iret return path anyway,
|
||||
* so just join that path right now.
|
||||
*/
|
||||
int_ret_from_sys_call_fixup:
|
||||
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
|
||||
jmp int_check_syscall_exit_work
|
||||
|
||||
#ifdef CONFIG_AUDITSYSCALL
|
||||
/*
|
||||
* Return fast path for syscall audit. Call __audit_syscall_exit()
|
||||
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
|
||||
* masked off.
|
||||
*/
|
||||
sysret_audit:
|
||||
movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
|
||||
cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
|
||||
setbe %al /* 1 if so, 0 if not */
|
||||
movzbl %al,%edi /* zero-extend that into %edi */
|
||||
call __audit_syscall_exit
|
||||
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
|
||||
jmp sysret_check
|
||||
#endif /* CONFIG_AUDITSYSCALL */
|
||||
jmp int_ret_from_sys_call
|
||||
|
||||
/* Do syscall tracing */
|
||||
tracesys:
|
||||
|
@ -626,19 +519,6 @@ END(\label)
|
|||
FORK_LIKE vfork
|
||||
FIXED_FRAME stub_iopl, sys_iopl
|
||||
|
||||
ENTRY(ptregscall_common)
|
||||
DEFAULT_FRAME 1 8 /* offset 8: return address */
|
||||
RESTORE_TOP_OF_STACK %r11, 8
|
||||
movq_cfi_restore R15+8, r15
|
||||
movq_cfi_restore R14+8, r14
|
||||
movq_cfi_restore R13+8, r13
|
||||
movq_cfi_restore R12+8, r12
|
||||
movq_cfi_restore RBP+8, rbp
|
||||
movq_cfi_restore RBX+8, rbx
|
||||
ret $REST_SKIP /* pop extended registers */
|
||||
CFI_ENDPROC
|
||||
END(ptregscall_common)
|
||||
|
||||
ENTRY(stub_execve)
|
||||
CFI_STARTPROC
|
||||
addq $8, %rsp
|
||||
|
@ -779,7 +659,48 @@ END(interrupt)
|
|||
/* reserve pt_regs for scratch regs and rbp */
|
||||
subq $ORIG_RAX-RBP, %rsp
|
||||
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
|
||||
SAVE_ARGS_IRQ
|
||||
cld
|
||||
/* start from rbp in pt_regs and jump over */
|
||||
movq_cfi rdi, (RDI-RBP)
|
||||
movq_cfi rsi, (RSI-RBP)
|
||||
movq_cfi rdx, (RDX-RBP)
|
||||
movq_cfi rcx, (RCX-RBP)
|
||||
movq_cfi rax, (RAX-RBP)
|
||||
movq_cfi r8, (R8-RBP)
|
||||
movq_cfi r9, (R9-RBP)
|
||||
movq_cfi r10, (R10-RBP)
|
||||
movq_cfi r11, (R11-RBP)
|
||||
|
||||
/* Save rbp so that we can unwind from get_irq_regs() */
|
||||
movq_cfi rbp, 0
|
||||
|
||||
/* Save previous stack value */
|
||||
movq %rsp, %rsi
|
||||
|
||||
leaq -RBP(%rsp),%rdi /* arg1 for handler */
|
||||
testl $3, CS-RBP(%rsi)
|
||||
je 1f
|
||||
SWAPGS
|
||||
/*
|
||||
* irq_count is used to check if a CPU is already on an interrupt stack
|
||||
* or not. While this is essentially redundant with preempt_count it is
|
||||
* a little cheaper to use a separate counter in the PDA (short of
|
||||
* moving irq_enter into assembly, which would be too much work)
|
||||
*/
|
||||
1: incl PER_CPU_VAR(irq_count)
|
||||
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
|
||||
CFI_DEF_CFA_REGISTER rsi
|
||||
|
||||
/* Store previous stack value */
|
||||
pushq %rsi
|
||||
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
|
||||
0x77 /* DW_OP_breg7 */, 0, \
|
||||
0x06 /* DW_OP_deref */, \
|
||||
0x08 /* DW_OP_const1u */, SS+8-RBP, \
|
||||
0x22 /* DW_OP_plus */
|
||||
/* We entered an interrupt context - irqs are off: */
|
||||
TRACE_IRQS_OFF
|
||||
|
||||
call \func
|
||||
.endm
|
||||
|
||||
|
@ -831,6 +752,60 @@ retint_swapgs: /* return to user-space */
|
|||
*/
|
||||
DISABLE_INTERRUPTS(CLBR_ANY)
|
||||
TRACE_IRQS_IRETQ
|
||||
|
||||
/*
|
||||
* Try to use SYSRET instead of IRET if we're returning to
|
||||
* a completely clean 64-bit userspace context.
|
||||
*/
|
||||
movq (RCX-R11)(%rsp), %rcx
|
||||
cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
|
||||
jne opportunistic_sysret_failed
|
||||
|
||||
/*
|
||||
* On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
|
||||
* in kernel space. This essentially lets the user take over
|
||||
* the kernel, since userspace controls RSP. It's not worth
|
||||
* testing for canonicalness exactly -- this check detects any
|
||||
* of the 17 high bits set, which is true for non-canonical
|
||||
* or kernel addresses. (This will pessimize vsyscall=native.
|
||||
* Big deal.)
|
||||
*
|
||||
* If virtual addresses ever become wider, this will need
|
||||
* to be updated to remain correct on both old and new CPUs.
|
||||
*/
|
||||
.ifne __VIRTUAL_MASK_SHIFT - 47
|
||||
.error "virtual address width changed -- sysret checks need update"
|
||||
.endif
|
||||
shr $__VIRTUAL_MASK_SHIFT, %rcx
|
||||
jnz opportunistic_sysret_failed
|
||||
|
||||
cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
|
||||
jne opportunistic_sysret_failed
|
||||
|
||||
movq (R11-ARGOFFSET)(%rsp), %r11
|
||||
cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
|
||||
jne opportunistic_sysret_failed
|
||||
|
||||
testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
|
||||
jnz opportunistic_sysret_failed
|
||||
|
||||
/* nothing to check for RSP */
|
||||
|
||||
cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
|
||||
jne opportunistic_sysret_failed
|
||||
|
||||
/*
|
||||
* We win! This label is here just for ease of understanding
|
||||
* perf profiles. Nothing jumps here.
|
||||
*/
|
||||
irq_return_via_sysret:
|
||||
CFI_REMEMBER_STATE
|
||||
RESTORE_ARGS 1,8,1
|
||||
movq (RSP-RIP)(%rsp),%rsp
|
||||
USERGS_SYSRET64
|
||||
CFI_RESTORE_STATE
|
||||
|
||||
opportunistic_sysret_failed:
|
||||
SWAPGS
|
||||
jmp restore_args
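
The conditions tested by the opportunistic-SYSRET assembly above can be
restated in C roughly as follows (a sketch under the assumption that
field names follow struct pt_regs; the helper itself is hypothetical):

#include <linux/types.h>
#include <asm/ptrace.h>
#include <asm/segment.h>		/* __USER_CS, __USER_DS */
#include <asm/processor-flags.h>	/* X86_EFLAGS_RF */
#include <asm/page_types.h>		/* __VIRTUAL_MASK_SHIFT */

static bool can_return_via_sysret(const struct pt_regs *regs)
{
	if (regs->cx != regs->ip)		/* SYSRET reloads RIP from RCX */
		return false;
	if (regs->ip >> __VIRTUAL_MASK_SHIFT)	/* non-canonical or kernel RIP */
		return false;
	if (regs->cs != __USER_CS || regs->ss != __USER_DS)
		return false;
	if (regs->r11 != regs->flags)		/* SYSRET reloads RFLAGS from R11 */
		return false;
	if (regs->flags & X86_EFLAGS_RF)	/* SYSRET cannot restore RF */
		return false;
	return true;				/* RSP itself needs no check */
}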
@ -1048,6 +1023,11 @@ ENTRY(\sym)
|
|||
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
|
||||
|
||||
.if \paranoid
|
||||
.if \paranoid == 1
|
||||
CFI_REMEMBER_STATE
|
||||
testl $3, CS(%rsp) /* If coming from userspace, switch */
|
||||
jnz 1f /* stacks. */
|
||||
.endif
|
||||
call save_paranoid
|
||||
.else
|
||||
call error_entry
|
||||
|
@ -1088,6 +1068,36 @@ ENTRY(\sym)
|
|||
jmp error_exit /* %ebx: no swapgs flag */
|
||||
.endif
|
||||
|
||||
.if \paranoid == 1
|
||||
CFI_RESTORE_STATE
|
||||
/*
|
||||
* Paranoid entry from userspace. Switch stacks and treat it
|
||||
* as a normal entry. This means that paranoid handlers
|
||||
* run in real process context if user_mode(regs).
|
||||
*/
|
||||
1:
|
||||
call error_entry
|
||||
|
||||
DEFAULT_FRAME 0
|
||||
|
||||
movq %rsp,%rdi /* pt_regs pointer */
|
||||
call sync_regs
|
||||
movq %rax,%rsp /* switch stack */
|
||||
|
||||
movq %rsp,%rdi /* pt_regs pointer */
|
||||
|
||||
.if \has_error_code
|
||||
movq ORIG_RAX(%rsp),%rsi /* get error code */
|
||||
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
|
||||
.else
|
||||
xorl %esi,%esi /* no error code */
|
||||
.endif
|
||||
|
||||
call \do_sym
|
||||
|
||||
jmp error_exit /* %ebx: no swapgs flag */
|
||||
.endif
|
||||
|
||||
CFI_ENDPROC
|
||||
END(\sym)
|
||||
.endm
|
||||
|
@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0
|
|||
idtentry bounds do_bounds has_error_code=0
|
||||
idtentry invalid_op do_invalid_op has_error_code=0
|
||||
idtentry device_not_available do_device_not_available has_error_code=0
|
||||
idtentry double_fault do_double_fault has_error_code=1 paranoid=1
|
||||
idtentry double_fault do_double_fault has_error_code=1 paranoid=2
|
||||
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
|
||||
idtentry invalid_TSS do_invalid_TSS has_error_code=1
|
||||
idtentry segment_not_present do_segment_not_present has_error_code=1
|
||||
|
@ -1289,16 +1299,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
|
|||
#endif
|
||||
|
||||
/*
|
||||
* "Paranoid" exit path from exception stack.
|
||||
* Paranoid because this is used by NMIs and cannot take
|
||||
* any kernel state for granted.
|
||||
* We don't do kernel preemption checks here, because only
|
||||
* NMI should be common and it does not enable IRQs and
|
||||
* cannot get reschedule ticks.
|
||||
* "Paranoid" exit path from exception stack. This is invoked
|
||||
* only on return from non-NMI IST interrupts that came
|
||||
* from kernel space.
|
||||
*
|
||||
* "trace" is 0 for the NMI handler only, because irq-tracing
|
||||
* is fundamentally NMI-unsafe. (we cannot change the soft and
|
||||
* hard flags at once, atomically)
|
||||
* We may be returning to very strange contexts (e.g. very early
|
||||
* in syscall entry), so checking for preemption here would
|
||||
* be complicated. Fortunately, there's no good reason
|
||||
* to try to handle preemption here.
|
||||
*/
|
||||
|
||||
/* ebx: no swapgs flag */
|
||||
|
@ -1308,43 +1316,14 @@ ENTRY(paranoid_exit)
|
|||
TRACE_IRQS_OFF_DEBUG
|
||||
testl %ebx,%ebx /* swapgs needed? */
|
||||
jnz paranoid_restore
|
||||
testl $3,CS(%rsp)
|
||||
jnz paranoid_userspace
|
||||
paranoid_swapgs:
|
||||
TRACE_IRQS_IRETQ 0
|
||||
SWAPGS_UNSAFE_STACK
|
||||
RESTORE_ALL 8
|
||||
jmp irq_return
|
||||
INTERRUPT_RETURN
|
||||
paranoid_restore:
|
||||
TRACE_IRQS_IRETQ_DEBUG 0
|
||||
RESTORE_ALL 8
|
||||
jmp irq_return
|
||||
paranoid_userspace:
|
||||
GET_THREAD_INFO(%rcx)
|
||||
movl TI_flags(%rcx),%ebx
|
||||
andl $_TIF_WORK_MASK,%ebx
|
||||
jz paranoid_swapgs
|
||||
movq %rsp,%rdi /* &pt_regs */
|
||||
call sync_regs
|
||||
movq %rax,%rsp /* switch stack for scheduling */
|
||||
testl $_TIF_NEED_RESCHED,%ebx
|
||||
jnz paranoid_schedule
|
||||
movl %ebx,%edx /* arg3: thread flags */
|
||||
TRACE_IRQS_ON
|
||||
ENABLE_INTERRUPTS(CLBR_NONE)
|
||||
xorl %esi,%esi /* arg2: oldset */
|
||||
movq %rsp,%rdi /* arg1: &pt_regs */
|
||||
call do_notify_resume
|
||||
DISABLE_INTERRUPTS(CLBR_NONE)
|
||||
TRACE_IRQS_OFF
|
||||
jmp paranoid_userspace
|
||||
paranoid_schedule:
|
||||
TRACE_IRQS_ON
|
||||
ENABLE_INTERRUPTS(CLBR_ANY)
|
||||
SCHEDULE_USER
|
||||
DISABLE_INTERRUPTS(CLBR_ANY)
|
||||
TRACE_IRQS_OFF
|
||||
jmp paranoid_userspace
|
||||
INTERRUPT_RETURN
|
||||
CFI_ENDPROC
|
||||
END(paranoid_exit)
|
||||
|
||||
|
|
|
@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
|
|||
: "memory", "cc", "edx", "ecx", "eax");
|
||||
}
|
||||
|
||||
/* how to get the current stack pointer from C */
|
||||
#define current_stack_pointer ({ \
|
||||
unsigned long sp; \
|
||||
asm("mov %%esp,%0" : "=g" (sp)); \
|
||||
sp; \
|
||||
})
|
||||
|
||||
static inline void *current_stack(void)
|
||||
{
|
||||
return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
|
||||
return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
|
||||
}
|
||||
|
||||
static inline int
|
||||
|
@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
|
|||
|
||||
/* Save the next esp at the bottom of the stack */
|
||||
prev_esp = (u32 *)irqstk;
|
||||
*prev_esp = current_stack_pointer;
|
||||
*prev_esp = current_stack_pointer();
|
||||
|
||||
if (unlikely(overflow))
|
||||
call_on_stack(print_stack_overflow, isp);
|
||||
|
@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
|
|||
|
||||
/* Push the previous esp onto the stack */
|
||||
prev_esp = (u32 *)irqstk;
|
||||
*prev_esp = current_stack_pointer;
|
||||
*prev_esp = current_stack_pointer();
|
||||
|
||||
call_on_stack(__do_softirq, isp);
|
||||
}
|
||||
|
|
|
@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
|
|||
{
|
||||
user_exit();
|
||||
|
||||
#ifdef CONFIG_X86_MCE
|
||||
/* notify userspace of pending MCEs */
|
||||
if (thread_info_flags & _TIF_MCE_NOTIFY)
|
||||
mce_notify_process();
|
||||
#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
|
||||
|
||||
if (thread_info_flags & _TIF_UPROBE)
|
||||
uprobe_notify_resume(regs);
|
||||
|
||||
|
|
|
@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
|
|||
preempt_count_dec();
|
||||
}
|
||||
|
||||
enum ctx_state ist_enter(struct pt_regs *regs)
|
||||
{
|
||||
enum ctx_state prev_state;
|
||||
|
||||
if (user_mode_vm(regs)) {
|
||||
/* Other than that, we're just an exception. */
|
||||
prev_state = exception_enter();
|
||||
} else {
|
||||
/*
|
||||
* We might have interrupted pretty much anything. In
|
||||
* fact, if we're a machine check, we can even interrupt
|
||||
* NMI processing. We don't want in_nmi() to return true,
|
||||
* but we need to notify RCU.
|
||||
*/
|
||||
rcu_nmi_enter();
|
||||
prev_state = IN_KERNEL; /* the value is irrelevant. */
|
||||
}
|
||||
|
||||
/*
|
||||
* We are atomic because we're on the IST stack (or we're on x86_32,
|
||||
* in which case we still shouldn't schedule).
|
||||
*
|
||||
* This must be after exception_enter(), because exception_enter()
|
||||
* won't do anything if in_interrupt() returns true.
|
||||
*/
|
||||
preempt_count_add(HARDIRQ_OFFSET);
|
||||
|
||||
/* This code is a bit fragile. Test it. */
|
||||
rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
|
||||
|
||||
return prev_state;
|
||||
}
|
||||
|
||||
void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
|
||||
{
|
||||
/* Must be before exception_exit. */
|
||||
preempt_count_sub(HARDIRQ_OFFSET);
|
||||
|
||||
if (user_mode_vm(regs))
|
||||
return exception_exit(prev_state);
|
||||
else
|
||||
rcu_nmi_exit();
|
||||
}
|
||||
|
||||
/**
|
||||
* ist_begin_non_atomic() - begin a non-atomic section in an IST exception
|
||||
* @regs: regs passed to the IST exception handler
|
||||
*
|
||||
* IST exception handlers normally cannot schedule. As a special
|
||||
* exception, if the exception interrupted userspace code (i.e.
|
||||
* user_mode_vm(regs) would return true) and the exception was not
|
||||
* a double fault, it can be safe to schedule. ist_begin_non_atomic()
|
||||
* begins a non-atomic section within an ist_enter()/ist_exit() region.
|
||||
* Callers are responsible for enabling interrupts themselves inside
|
||||
* the non-atomic section, and callers must call ist_end_non_atomic()
|
||||
* before ist_exit().
|
||||
*/
|
||||
void ist_begin_non_atomic(struct pt_regs *regs)
|
||||
{
|
||||
BUG_ON(!user_mode_vm(regs));
|
||||
|
||||
/*
|
||||
* Sanity check: we need to be on the normal thread stack. This
|
||||
* will catch asm bugs and any attempt to use ist_preempt_enable
|
||||
* from double_fault.
|
||||
*/
|
||||
BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
|
||||
& ~(THREAD_SIZE - 1)) != 0);
|
||||
|
||||
preempt_count_sub(HARDIRQ_OFFSET);
|
||||
}
|
||||
|
||||
/**
|
||||
* ist_end_non_atomic() - end a non-atomic section in an IST exception
|
||||
*
|
||||
* Ends a non-atomic section started with ist_begin_non_atomic().
|
||||
*/
|
||||
void ist_end_non_atomic(void)
|
||||
{
|
||||
preempt_count_add(HARDIRQ_OFFSET);
|
||||
}
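
A minimal sketch of how the ist_*() helpers above fit together, modeled
on the do_machine_check() hunk earlier in this merge
(example_ist_handler() is a hypothetical name):

static void example_ist_handler(struct pt_regs *regs)
{
	enum ctx_state prev_state = ist_enter(regs);

	/* ... the atomic part of the handler runs here ... */

	if (user_mode_vm(regs)) {
		/* Sleeping is only safe if we interrupted user mode. */
		ist_begin_non_atomic(regs);
		local_irq_enable();
		/* ... work that may sleep, e.g. memory_failure() ... */
		local_irq_disable();
		ist_end_non_atomic();
	}

	ist_exit(regs, prev_state);
}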
static nokprobe_inline int
|
||||
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
|
||||
struct pt_regs *regs, long error_code)
|
||||
|
@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
|
|||
* end up promoting it to a doublefault. In that case, modify
|
||||
* the stack to make it look like we just entered the #GP
|
||||
* handler from user space, similar to bad_iret.
|
||||
*
|
||||
* No need for ist_enter here because we don't use RCU.
|
||||
*/
|
||||
if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
|
||||
regs->cs == __KERNEL_CS &&
|
||||
|
@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
|
|||
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
|
||||
regs->ip = (unsigned long)general_protection;
|
||||
regs->sp = (unsigned long)&normal_regs->orig_ax;
|
||||
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
exception_enter();
|
||||
/* Return not checked because double check cannot be ignored */
|
||||
ist_enter(regs); /* Discard prev_state because we won't return. */
|
||||
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
|
||||
|
||||
tsk->thread.error_code = error_code;
|
||||
|
@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
|
|||
if (poke_int3_handler(regs))
|
||||
return;
|
||||
|
||||
prev_state = exception_enter();
|
||||
prev_state = ist_enter(regs);
|
||||
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
|
||||
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
|
||||
SIGTRAP) == NOTIFY_STOP)
|
||||
|
@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
|
|||
preempt_conditional_cli(regs);
|
||||
debug_stack_usage_dec();
|
||||
exit:
|
||||
exception_exit(prev_state);
|
||||
ist_exit(regs, prev_state);
|
||||
}
|
||||
NOKPROBE_SYMBOL(do_int3);
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/*
|
||||
* Help handler running on IST stack to switch back to user stack
|
||||
* for scheduling or signal handling. The actual stack switch is done in
|
||||
* entry.S
|
||||
* Help handler running on IST stack to switch off the IST stack if the
|
||||
* interrupted code was in user mode. The actual stack switch is done in
|
||||
* entry_64.S
|
||||
*/
|
||||
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
|
||||
{
|
||||
struct pt_regs *regs = eregs;
|
||||
/* Did already sync */
|
||||
if (eregs == (struct pt_regs *)eregs->sp)
|
||||
;
|
||||
/* Exception from user space */
|
||||
else if (user_mode(eregs))
|
||||
regs = task_pt_regs(current);
|
||||
/*
|
||||
* Exception from kernel and interrupts are enabled. Move to
|
||||
* kernel process stack.
|
||||
*/
|
||||
else if (eregs->flags & X86_EFLAGS_IF)
|
||||
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
|
||||
if (eregs != regs)
|
||||
*regs = *eregs;
|
||||
struct pt_regs *regs = task_pt_regs(current);
|
||||
*regs = *eregs;
|
||||
return regs;
|
||||
}
|
||||
NOKPROBE_SYMBOL(sync_regs);
|
||||
|
@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
|
|||
unsigned long dr6;
|
||||
int si_code;
|
||||
|
||||
prev_state = exception_enter();
|
||||
prev_state = ist_enter(regs);
|
||||
|
||||
get_debugreg(dr6, 6);
|
||||
|
||||
|
@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
|
|||
debug_stack_usage_dec();
|
||||
|
||||
exit:
|
||||
exception_exit(prev_state);
|
||||
ist_exit(regs, prev_state);
|
||||
}
|
||||
NOKPROBE_SYMBOL(do_debug);
|
||||
|
||||
|
|
|
@ -205,4 +205,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
|
|||
PHONY += vdso_install $(vdso_img_insttargets)
|
||||
vdso_install: $(vdso_img_insttargets) FORCE
|
||||
|
||||
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
|
||||
clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64*
|
||||
|
|
|
@ -759,39 +759,71 @@ void rcu_irq_enter(void)
|
|||
/**
|
||||
* rcu_nmi_enter - inform RCU of entry to NMI context
|
||||
*
|
||||
* If the CPU was idle with dynamic ticks active, and there is no
|
||||
* irq handler running, this updates rdtp->dynticks_nmi to let the
|
||||
* RCU grace-period handling know that the CPU is active.
|
||||
* If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
|
||||
* rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
|
||||
* that the CPU is active. This implementation permits nested NMIs, as
|
||||
* long as the nesting level does not overflow an int. (You will probably
|
||||
* run out of stack space first.)
|
||||
*/
|
||||
void rcu_nmi_enter(void)
|
||||
{
|
||||
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
|
||||
int incby = 2;
|
||||
|
||||
if (rdtp->dynticks_nmi_nesting == 0 &&
|
||||
(atomic_read(&rdtp->dynticks) & 0x1))
|
||||
return;
|
||||
rdtp->dynticks_nmi_nesting++;
|
||||
smp_mb__before_atomic(); /* Force delay from prior write. */
|
||||
atomic_inc(&rdtp->dynticks);
|
||||
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
|
||||
smp_mb__after_atomic(); /* See above. */
|
||||
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
|
||||
/* Complain about underflow. */
|
||||
WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);
|
||||
|
||||
/*
|
||||
* If idle from RCU viewpoint, atomically increment ->dynticks
|
||||
* to mark non-idle and increment ->dynticks_nmi_nesting by one.
|
||||
* Otherwise, increment ->dynticks_nmi_nesting by two. This means
|
||||
* if ->dynticks_nmi_nesting is equal to one, we are guaranteed
|
||||
* to be in the outermost NMI handler that interrupted an RCU-idle
|
||||
* period (observation due to Andy Lutomirski).
|
||||
*/
|
||||
if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
|
||||
smp_mb__before_atomic(); /* Force delay from prior write. */
|
||||
atomic_inc(&rdtp->dynticks);
|
||||
/* atomic_inc() before later RCU read-side crit sects */
|
||||
smp_mb__after_atomic(); /* See above. */
|
||||
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
|
||||
incby = 1;
|
||||
}
|
||||
rdtp->dynticks_nmi_nesting += incby;
|
||||
barrier();
|
||||
}
|
||||
|
||||
/**
|
||||
* rcu_nmi_exit - inform RCU of exit from NMI context
|
||||
*
|
||||
* If the CPU was idle with dynamic ticks active, and there is no
|
||||
* irq handler running, this updates rdtp->dynticks_nmi to let the
|
||||
* RCU grace-period handling know that the CPU is no longer active.
|
||||
* If we are returning from the outermost NMI handler that interrupted an
|
||||
* RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
|
||||
* to let the RCU grace-period handling know that the CPU is back to
|
||||
* being RCU-idle.
|
||||
*/
|
||||
void rcu_nmi_exit(void)
|
||||
{
|
||||
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
|
||||
|
||||
if (rdtp->dynticks_nmi_nesting == 0 ||
|
||||
--rdtp->dynticks_nmi_nesting != 0)
|
||||
/*
|
||||
* Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
|
||||
* (We are exiting an NMI handler, so RCU better be paying attention
|
||||
* to us!)
|
||||
*/
|
||||
WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
|
||||
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
|
||||
|
||||
/*
|
||||
* If the nesting level is not 1, the CPU wasn't RCU-idle, so
|
||||
* leave it in non-RCU-idle state.
|
||||
*/
|
||||
if (rdtp->dynticks_nmi_nesting != 1) {
|
||||
rdtp->dynticks_nmi_nesting -= 2;
|
||||
return;
|
||||
}
|
||||
|
||||
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
|
||||
rdtp->dynticks_nmi_nesting = 0;
|
||||
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
|
||||
smp_mb__before_atomic(); /* See above. */
|
||||
atomic_inc(&rdtp->dynticks);
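
A standalone toy model of the nesting arithmetic described in the
comments above (dynticks and nmi_nesting stand in for rdtp->dynticks
and rdtp->dynticks_nmi_nesting, without the real code's atomics,
barriers and warnings):

static int dynticks;		/* odd value => RCU is watching this CPU */
static int nmi_nesting;

static void model_nmi_enter(void)
{
	int incby = 2;

	if (!(dynticks & 1)) {	/* CPU was RCU-idle: mark it non-idle */
		dynticks++;
		incby = 1;	/* nesting == 1 marks the outermost such NMI */
	}
	nmi_nesting += incby;
}

static void model_nmi_exit(void)
{
	if (nmi_nesting != 1) {	/* nested NMI, or idle was never interrupted */
		nmi_nesting -= 2;
		return;
	}
	nmi_nesting = 0;	/* outermost NMI: return the CPU to RCU-idle */
	dynticks++;
}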