diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 4bde0dc66100..e9c86299b835 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -5,6 +5,7 @@ #include #ifndef __ASSEMBLY__ +#include #include /* duplicated to the one in bootmem.h */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 448cd01eb3ec..a34430b7af4a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -22,6 +22,7 @@ #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) #ifndef __ASSEMBLY__ +#include #include #include #include diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 5db5d083c873..331474b150f1 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -89,6 +89,7 @@ static inline void set_real_mode_mem(phys_addr_t mem) } void reserve_real_mode(void); +void load_trampoline_pgtable(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index b587a9ee9cb2..98fa0a114074 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -261,4 +261,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); #endif /* !MODULE */ +static inline void __native_tlb_flush_global(unsigned long cr4) +{ + native_write_cr4(cr4 ^ X86_CR4_PGE); + native_write_cr4(cr4); +} #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0083464de5e3..79b3d67addcc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -384,7 +384,7 @@ set_register: } EXPORT_SYMBOL(native_write_cr0); -void native_write_cr4(unsigned long val) +void __no_profile native_write_cr4(unsigned long val) { unsigned long bits_changed = 0; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 3be9dd213dad..de563db9cdcd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -487,6 +487,10 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); + /* + * This needs to happen *before* kasan_early_init() because latter maps stuff + * into that page. + */ clear_page(init_top_pgt); /* @@ -498,6 +502,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) kasan_early_init(); + /* + * Flush global TLB entries which could be left over from the trampoline page + * table. + * + * This needs to happen *after* kasan_early_init() as KASAN-enabled .configs + * instrument native_write_cr4() so KASAN must be initialized for that + * instrumentation to work. + */ + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + idt_setup_early_handler(); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d8b3ebd2bb85..9c63fc5988cd 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -166,9 +166,26 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) call sev_verify_cbit popq %rsi - /* Switch to new page-table */ + /* + * Switch to new page-table + * + * For the boot CPU this switches to early_top_pgt which still has the + * indentity mappings present. The secondary CPUs will switch to the + * init_top_pgt here, away from the trampoline_pgd and unmap the + * indentity mapped ranges. + */ movq %rax, %cr3 + /* + * Do a global TLB flush after the CR3 switch to make sure the TLB + * entries from the identity mapping are flushed. + */ + movq %cr4, %rcx + movq %rcx, %rax + xorq $X86_CR4_PGE, %rcx + movq %rcx, %cr4 + movq %rax, %cr4 + /* Ensure I am executing from virtual addresses */ movq $1f, %rax ANNOTATE_RETPOLINE_SAFE diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 0a40df66a40d..fa700b46588e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type) spin_unlock(&rtc_lock); /* - * Switch back to the initial page table. + * Switch to the trampoline page table. */ -#ifdef CONFIG_X86_32 - load_cr3(initial_page_table); -#else - write_cr3(real_mode_header->trampoline_pgd); - - /* Exiting long mode will fail if CR4.PCIDE is set. */ - if (boot_cpu_has(X86_FEATURE_PCID)) - cr4_clear_bits(X86_CR4_PCIDE); -#endif + load_trampoline_pgtable(); /* Jump to the identity-mapped low memory code */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1895986842b9..4ba024d5b63a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -714,6 +714,11 @@ static void __init memory_map_bottom_up(unsigned long map_start, static void __init init_trampoline(void) { #ifdef CONFIG_X86_64 + /* + * The code below will alias kernel page-tables in the user-range of the + * address space, including the Global bit. So global TLB entries will + * be created when using the trampoline page-table. + */ if (!kaslr_memory_enabled()) trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; else diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 59ba2968af1b..1e6513f95133 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1148,7 +1148,7 @@ void flush_tlb_one_user(unsigned long addr) */ STATIC_NOPV void native_flush_tlb_global(void) { - unsigned long cr4, flags; + unsigned long flags; if (static_cpu_has(X86_FEATURE_INVPCID)) { /* @@ -1168,11 +1168,7 @@ STATIC_NOPV void native_flush_tlb_global(void) */ raw_local_irq_save(flags); - cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* toggle PGE */ - native_write_cr4(cr4 ^ X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); raw_local_irq_restore(flags); } diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 38d24d2ab38b..c5e29db02a46 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -17,6 +17,32 @@ u32 *trampoline_cr4_features; /* Hold the pgd entry used on booting additional CPUs */ pgd_t trampoline_pgd_entry; +void load_trampoline_pgtable(void) +{ +#ifdef CONFIG_X86_32 + load_cr3(initial_page_table); +#else + /* + * This function is called before exiting to real-mode and that will + * fail with CR4.PCIDE still set. + */ + if (boot_cpu_has(X86_FEATURE_PCID)) + cr4_clear_bits(X86_CR4_PCIDE); + + write_cr3(real_mode_header->trampoline_pgd); +#endif + + /* + * The CR3 write above will not flush global TLB entries. + * Stale, global entries from previous page tables may still be + * present. Flush those stale entries. + * + * This ensures that memory accessed while running with + * trampoline_pgd is *actually* mapped into trampoline_pgd. + */ + __flush_tlb_all(); +} + void __init reserve_real_mode(void) { phys_addr_t mem;