x86/mm, tracing: Fix CR2 corruption
Despite the current efforts to read CR2 before tracing happens there still
exist a number of possible holes:

  idtentry page_fault             do_page_fault           has_error_code=1
    call error_entry
      TRACE_IRQS_OFF
        call trace_hardirqs_off*
          #PF // modifies CR2

      CALL_enter_from_user_mode
        __context_tracking_exit()
          trace_user_exit(0)
            #PF // modifies CR2

    call do_page_fault
      address = read_cr2(); /* whoopsie */

And similar for i386.

Fix it by pulling the CR2 read into the entry code, before any of that
stuff gets a chance to run and ruin things.

Reported-by: He Zhe <zhe.he@windriver.com>
Reported-by: Eiichi Tsukata <devel@etsukata.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Cc: bp@alien8.de
Cc: rostedt@goodmis.org
Cc: torvalds@linux-foundation.org
Cc: hpa@zytor.com
Cc: dave.hansen@linux.intel.com
Cc: jgross@suse.com
Cc: joel@joelfernandes.org
Link: https://lkml.kernel.org/r/20190711114336.116812491@infradead.org
Debugged-by: Steven Rostedt <rostedt@goodmis.org>
Parent: 4234653e88
Commit: a0d14b8909
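The hole is easy to reproduce in miniature: anything that can itself fault or trace between the original #PF and the read of CR2 makes the late read return the nested fault's address instead of the user's. The user-space sketch below is not kernel code; fake_cr2, tracing_hook() and the handle_fault_*() helpers are made-up stand-ins that only illustrate why the address has to be latched before any traceable code runs, which is what the entry-code change does.

#include <stdio.h>

static unsigned long fake_cr2;	/* stands in for the %cr2 register */

/* A tracepoint/context-tracking hook that touches unmapped memory would
 * re-enter the page-fault path and overwrite CR2 with its own address. */
static void tracing_hook(void)
{
	fake_cr2 = 0xdead0000;	/* the nested fault's address */
}

static void handle_fault_broken(void)
{
	tracing_hook();				/* runs before CR2 is read */
	printf("broken: address = %#lx\n", fake_cr2);	/* whoopsie */
}

static void handle_fault_fixed(void)
{
	unsigned long address = fake_cr2;	/* latched by the "entry code" */

	tracing_hook();				/* can no longer corrupt it */
	printf("fixed:  address = %#lx\n", address);
}

int main(void)
{
	fake_cr2 = 0x1000;	/* address the user actually faulted on */
	handle_fault_broken();

	fake_cr2 = 0x1000;
	handle_fault_fixed();
	return 0;
}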
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
 ENTRY(page_fault)
 	ASM_CLAC
-	pushl	$do_page_fault
-	ALIGN
-	jmp	common_exception
+	pushl	$0; /* %gs's slot on the stack */
+
+	SAVE_ALL switch_stacks=1 skip_gs=1
+
+	ENCODE_FRAME_POINTER
+	UNWIND_ESPFIX_STACK
+
+	/* fixup %gs */
+	GS_TO_REG %ecx
+	REG_TO_PTGS %ecx
+	SET_KERNEL_GS %ecx
+
+	GET_CR2_INTO(%ecx)		# might clobber %eax
+
+	/* fixup orig %eax */
+	movl	PT_ORIG_EAX(%esp), %edx	# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)	# no syscall to restart
+
+	TRACE_IRQS_OFF
+	movl	%esp, %eax		# pt_regs pointer
+	call	do_page_fault
+	jmp	ret_from_exception
 END(page_fault)
 
 common_exception:
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -864,7 +864,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
-.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0
+.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
 
 	.if \paranoid
 	call	paranoid_entry
@@ -874,12 +874,21 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
 	.endif
 	UNWIND_HINT_REGS
 
-	.if \paranoid
+	.if \read_cr2
+	GET_CR2_INTO(%rdx);			/* can clobber %rax */
+	.endif
+
 	.if \shift_ist != -1
 	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
 	.else
 	TRACE_IRQS_OFF
 	.endif
-	.endif
 
+	.if \paranoid == 0
+	testb	$3, CS(%rsp)
+	jz	.Lfrom_kernel_no_context_tracking_\@
+	CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
+	.endif
+
 	movq	%rsp, %rdi			/* pt_regs pointer */
@@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
  * fresh stack. (This is for #DB, which has a nasty habit
  * of recursing.)
  * @create_gap: create a 6-word stack gap when coming from kernel mode.
+ * @read_cr2: load CR2 into the 3rd argument; done before calling any C code
  *
  * idtentry generates an IDT stub that sets up a usable kernel context,
  * creates struct pt_regs, and calls @do_sym. The stub has the following
@@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
 * @paranoid == 2 is special: the stub will never switch stacks. This is for
 * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
 */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -985,7 +995,7 @@ ENTRY(\sym)
 .Lfrom_usermode_no_gap_\@:
 	.endif
 
-	idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset
+	idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
 
 	.if \paranoid == 1
 	/*
@@ -994,7 +1004,7 @@ ENTRY(\sym)
 	 * run in real process context if user_mode(regs).
 	 */
 .Lfrom_usermode_switch_stack_\@:
-	idtentry_part \do_sym, \has_error_code, paranoid=0
+	idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0
 	.endif
 
 _ASM_NOKPROBE(\sym)
@@ -1006,7 +1016,7 @@ idtentry overflow do_overflow has_error_code=0
 idtentry bounds				do_bounds			has_error_code=0
 idtentry invalid_op			do_invalid_op			has_error_code=0
 idtentry device_not_available		do_device_not_available		has_error_code=0
-idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
+idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2 read_cr2=1
 idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
 idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
 idtentry segment_not_present		do_segment_not_present		has_error_code=1
@@ -1179,10 +1189,10 @@ idtentry xenint3 do_int3 has_error_code=0
 #endif
 
 idtentry general_protection	do_general_protection	has_error_code=1
-idtentry page_fault		do_page_fault		has_error_code=1
+idtentry page_fault		do_page_fault		has_error_code=1	read_cr2=1
 
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault	do_async_page_fault	has_error_code=1
+idtentry async_page_fault	do_async_page_fault	has_error_code=1	read_cr2=1
 #endif
 
 #ifdef CONFIG_X86_MCE
@@ -1281,18 +1291,9 @@ ENTRY(error_entry)
 	movq	%rax, %rsp			/* switch stack */
 	ENCODE_FRAME_POINTER
 	pushq	%r12
-
-	/*
-	 * We need to tell lockdep that IRQs are off. We can't do this until
-	 * we fix gsbase, and we should do it before enter_from_user_mode
-	 * (which can take locks).
-	 */
-	TRACE_IRQS_OFF
-	CALL_enter_from_user_mode
 	ret
 
 .Lerror_entry_done:
-	TRACE_IRQS_OFF
 	ret
 
 	/*
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -92,7 +92,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
 extern void kvm_disable_steal_time(void);
-void do_async_page_fault(struct pt_regs *regs, unsigned long error_code);
+void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init kvm_spinlock_init(void);
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,14 +74,14 @@ dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code);
 dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code);
 dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code);
 #ifdef CONFIG_X86_64
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code);
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long address);
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
 asmlinkage __visible notrace
 struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
 void __init trap_init(void);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code);
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code);
+dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address);
 dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code);
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code);
 dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code);
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -242,23 +242,23 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
 NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 
 dotraplinkage void
-do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
 	enum ctx_state prev_state;
 
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
-		do_page_fault(regs, error_code);
+		do_page_fault(regs, error_code, address);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
 		prev_state = exception_enter();
-		kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs));
+		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
 		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
 		rcu_irq_enter();
-		kvm_async_pf_task_wake((u32)read_cr2());
+		kvm_async_pf_task_wake((u32)address);
 		rcu_irq_exit();
 		break;
 	}
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -313,13 +313,10 @@ __visible void __noreturn handle_stack_overflow(const char *message,
 
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
-dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
 {
 	static const char str[] = "double fault";
 	struct task_struct *tsk = current;
-#ifdef CONFIG_VMAP_STACK
-	unsigned long cr2;
-#endif
 
 #ifdef CONFIG_X86_ESPFIX64
 	extern unsigned char native_irq_return_iret[];
@@ -415,7 +412,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	 * stack even if the actual trigger for the double fault was
 	 * something else.
 	 */
-	cr2 = read_cr2();
 	if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
 		handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
 #endif
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1507,9 +1507,8 @@ good_area:
 NOKPROBE_SYMBOL(do_user_addr_fault);
 
 /*
- * This routine handles page faults. It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * Explicitly marked noinline such that the function tracer sees this as the
+ * page_fault entry point.
  */
 static noinline void
 __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
@@ -1528,33 +1527,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
 }
 NOKPROBE_SYMBOL(__do_page_fault);
 
-static nokprobe_inline void
-trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
-			 unsigned long error_code)
+static __always_inline void
+trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
+			 unsigned long address)
 {
+	if (!trace_pagefault_enabled())
+		return;
+
 	if (user_mode(regs))
 		trace_page_fault_user(address, regs, error_code);
 	else
 		trace_page_fault_kernel(address, regs, error_code);
 }
 
-/*
- * We must have this function blacklisted from kprobes, tagged with notrace
- * and call read_cr2() before calling anything else. To avoid calling any
- * kind of tracing machinery before we've observed the CR2 value.
- *
- * exception_{enter,exit}() contains all sorts of tracepoints.
- */
-dotraplinkage void notrace
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
+dotraplinkage void
+do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-	unsigned long address = read_cr2(); /* Get the faulting address */
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
-	if (trace_pagefault_enabled())
-		trace_page_fault_entries(address, regs, error_code);
+	trace_page_fault_entries(regs, error_code, address);
 
 	__do_page_fault(regs, error_code, address);
 	exception_exit(prev_state);
 }
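For reference, a stand-alone sketch of the resulting C-side calling convention; struct pt_regs is mocked here and the handlers are stubs, so only the flow of the latched address is real. The idtentry stub has already read CR2 into the third-argument register before any traceable code can run, and do_page_fault()/do_async_page_fault() simply consume the value they are handed instead of calling read_cr2() themselves.

#include <stdio.h>

struct pt_regs { unsigned long ip, sp; };	/* mock, not the real layout */

static void do_page_fault(struct pt_regs *regs, unsigned long error_code,
			  unsigned long address)
{
	/* 'address' is the CR2 value captured by the entry stub. */
	printf("#PF at %#lx, error_code=%#lx, ip=%#lx\n",
	       address, error_code, regs->ip);
}

static void do_async_page_fault(struct pt_regs *regs, unsigned long error_code,
				unsigned long address)
{
	/* The KVM async-PF path forwards the same latched address. */
	do_page_fault(regs, error_code, address);
}

int main(void)
{
	struct pt_regs regs = { .ip = 0x401000, .sp = 0x7ffffff00000 };

	/* In the real kernel the entry stub has already done the
	 * equivalent of: address = read_cr2(); */
	do_async_page_fault(&regs, 0x2, 0x7f0000001000UL);
	return 0;
}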