From e1cd82a339024beda8439fb2e20718363ee989a8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 16 Nov 2021 06:13:05 +0100
Subject: [PATCH 1/6] x86/mm: Add missing <asm/cpufeatures.h> dependency to
 <asm/page_64.h>

In the following commit:

  025768a966a3 x86/cpu: Use alternative to generate the TASK_SIZE_MAX constant

... we added the new task_size_max() inline, which uses X86_FEATURE_LA57,
but doesn't include <asm/cpufeatures.h> which defines the constant.

Due to the way alternatives macros work currently this doesn't get reported
as an immediate build error, only as a link error, if a .c file happens to
include <asm/page_64.h> first:

  > ld: kernel/fork.o:(.altinstructions+0x98): undefined reference to `X86_FEATURE_LA57'

In the current upstream kernel no .c file includes <asm/page_64.h> before
including some other header that includes <asm/cpufeatures.h>, which is why
this dependency bug went unnoticed.

Cc: Linus Torvalds
Cc: Borislav Petkov
Cc: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/page_64.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 4bde0dc66100..e9c86299b835 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -5,6 +5,7 @@
 #include <asm/page_64_types.h>
 
 #ifndef __ASSEMBLY__
+#include <asm/cpufeatures.h>
 #include <asm/alternative.h>
 
 /* duplicated to the one in bootmem.h */
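
A note on the failure mode in patch 1: the alternatives macros paste the
feature-bit macro into the inline-asm string via stringification, so an
undefined X86_FEATURE_LA57 is not caught by the C preprocessor; the
assembler instead emits it as a symbol reference into .altinstructions,
and the failure only surfaces in ld. A minimal userspace sketch of this
effect (hypothetical demo code, not from the kernel; FEATURE_BIT stands
in for X86_FEATURE_LA57):

	/* demo.c - build with "gcc demo.c" on x86-64. With FEATURE_BIT
	 * undefined, compilation succeeds but linking fails with
	 * "undefined reference to `FEATURE_BIT'", mirroring the report
	 * above. Defining the macro (the analogue of including
	 * <asm/cpufeatures.h>) makes the build succeed. */
	#define __stringify_1(x)	#x
	#define __stringify(x)		__stringify_1(x)

	/* #define FEATURE_BIT 42 */

	int main(void)
	{
		/* The undefined macro survives stringification verbatim
		 * and becomes a symbol operand of the .long directive. */
		asm(".pushsection .altinstructions, \"a\"\n"
		    "  .long " __stringify(FEATURE_BIT) "\n"
		    ".popsection\n");
		return 0;
	}
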
From 9de4999050b5f2e847c84372c6a1aa1fe32bb269 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 2 Dec 2021 16:32:23 +0100
Subject: [PATCH 2/6] x86/realmode: Add comment for Global bit usage in
 trampoline_pgd

Document the fact that using the trampoline_pgd will result in the
creation of global TLB entries in the user range of the address space.

Signed-off-by: Joerg Roedel
Signed-off-by: Borislav Petkov
Link: https://lkml.kernel.org/r/20211202153226.22946-2-joro@8bytes.org
---
 arch/x86/mm/init.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1895986842b9..4ba024d5b63a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -714,6 +714,11 @@ static void __init memory_map_bottom_up(unsigned long map_start,
 static void __init init_trampoline(void)
 {
 #ifdef CONFIG_X86_64
+	/*
+	 * The code below will alias kernel page-tables in the user-range of the
+	 * address space, including the Global bit. So global TLB entries will
+	 * be created when using the trampoline page-table.
+	 */
 	if (!kaslr_memory_enabled())
 		trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
 	else

From f154f290855b070cc94dd44ad253c0ef8a9337bb Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 2 Dec 2021 16:32:24 +0100
Subject: [PATCH 3/6] x86/mm/64: Flush global TLB on boot and AP bringup

The AP bringup code uses the trampoline_pgd page-table which
establishes global mappings in the user range of the address space.
Flush the global TLB entries after the identity mappings are removed so
no stale entries remain in the TLB.

Signed-off-by: Joerg Roedel
Signed-off-by: Borislav Petkov
Link: https://lore.kernel.org/r/20211202153226.22946-3-joro@8bytes.org
---
 arch/x86/include/asm/tlbflush.h |  5 +++++
 arch/x86/kernel/head64.c        |  2 ++
 arch/x86/kernel/head_64.S       | 19 ++++++++++++++++++-
 arch/x86/mm/tlb.c               |  8 ++------
 4 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b587a9ee9cb2..98fa0a114074 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,4 +261,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 
 #endif /* !MODULE */
 
+static inline void __native_tlb_flush_global(unsigned long cr4)
+{
+	native_write_cr4(cr4 ^ X86_CR4_PGE);
+	native_write_cr4(cr4);
+}
 #endif /* _ASM_X86_TLBFLUSH_H */

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index fc5371a7e9d1..75acb6027a87 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -483,6 +483,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 	/* Kill off the identity-map trampoline */
 	reset_early_page_tables();
 
+	__native_tlb_flush_global(native_read_cr4());
+
 	clear_bss();
 
 	clear_page(init_top_pgt);

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index d8b3ebd2bb85..9c63fc5988cd 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -166,9 +166,26 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
 	call	sev_verify_cbit
 	popq	%rsi
 
-	/* Switch to new page-table */
+	/*
+	 * Switch to new page-table
+	 *
+	 * For the boot CPU this switches to early_top_pgt which still has the
+	 * identity mappings present. The secondary CPUs will switch to the
+	 * init_top_pgt here, away from the trampoline_pgd and unmap the
+	 * identity mapped ranges.
+	 */
 	movq	%rax, %cr3
 
+	/*
+	 * Do a global TLB flush after the CR3 switch to make sure the TLB
+	 * entries from the identity mapping are flushed.
+	 */
+	movq	%cr4, %rcx
+	movq	%rcx, %rax
+	xorq	$X86_CR4_PGE, %rcx
+	movq	%rcx, %cr4
+	movq	%rax, %cr4
+
 	/* Ensure I am executing from virtual addresses */
 	movq	$1f, %rax
 	ANNOTATE_RETPOLINE_SAFE

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 59ba2968af1b..1e6513f95133 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1148,7 +1148,7 @@ void flush_tlb_one_user(unsigned long addr)
  */
 STATIC_NOPV void native_flush_tlb_global(void)
 {
-	unsigned long cr4, flags;
+	unsigned long flags;
 
 	if (static_cpu_has(X86_FEATURE_INVPCID)) {
 		/*
@@ -1168,11 +1168,7 @@ STATIC_NOPV void native_flush_tlb_global(void)
 	 */
 	raw_local_irq_save(flags);
 
-	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	/* toggle PGE */
-	native_write_cr4(cr4 ^ X86_CR4_PGE);
-	/* write old PGE again and flush TLBs */
-	native_write_cr4(cr4);
+	__native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
 
 	raw_local_irq_restore(flags);
 }
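
For context on patch 3: the x86 architecture guarantees that a MOV to CR4
which changes CR4.PGE invalidates all TLB entries, global ones included,
which is why __native_tlb_flush_global() and the new assembly in head_64.S
both flip PGE and then restore it. The hand-written assembly is needed
because C helpers are not usable at that point in the bringup path; in C
the sequence would read roughly like this sketch (same logic as the helper
above, shown only to connect the two hunks):

	unsigned long cr4 = native_read_cr4();

	native_write_cr4(cr4 ^ X86_CR4_PGE);	/* flip PGE: full TLB flush */
	native_write_cr4(cr4);			/* restore CR4: flushes again */

This must run with interrupts disabled, as native_flush_tlb_global() does,
or in an early-boot context where nothing else can rewrite CR4 between the
two writes.
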
From 71d5049b053876afbde6c3273250b76935494ab2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 2 Dec 2021 16:32:25 +0100
Subject: [PATCH 4/6] x86/mm: Flush global TLB when switching to trampoline
 page-table

Move the switching code into a function so that it can be re-used and
add a global TLB flush. This makes sure that usage of memory which is
not mapped in the trampoline page-table is reliably caught.

Also move the clearing of CR4.PCIDE before the CR3 switch, because the
cr4_clear_bits() function will access data not mapped into the
trampoline page-table.

Signed-off-by: Joerg Roedel
Signed-off-by: Borislav Petkov
Link: https://lore.kernel.org/r/20211202153226.22946-4-joro@8bytes.org
---
 arch/x86/include/asm/realmode.h |  1 +
 arch/x86/kernel/reboot.c        | 12 ++----------
 arch/x86/realmode/init.c        | 26 ++++++++++++++++++++++++++
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 5db5d083c873..331474b150f1 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -89,6 +89,7 @@ static inline void set_real_mode_mem(phys_addr_t mem)
 }
 
 void reserve_real_mode(void);
+void load_trampoline_pgtable(void);
 
 #endif /* __ASSEMBLY__ */

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 0a40df66a40d..fa700b46588e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -113,17 +113,9 @@ void __noreturn machine_real_restart(unsigned int type)
 	spin_unlock(&rtc_lock);
 
 	/*
-	 * Switch back to the initial page table.
+	 * Switch to the trampoline page table.
 	 */
-#ifdef CONFIG_X86_32
-	load_cr3(initial_page_table);
-#else
-	write_cr3(real_mode_header->trampoline_pgd);
-
-	/* Exiting long mode will fail if CR4.PCIDE is set. */
-	if (boot_cpu_has(X86_FEATURE_PCID))
-		cr4_clear_bits(X86_CR4_PCIDE);
-#endif
+	load_trampoline_pgtable();
 
 	/* Jump to the identity-mapped low memory code */
 #ifdef CONFIG_X86_32

diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 4a3da7592b99..6d98609387ba 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -17,6 +17,32 @@ u32 *trampoline_cr4_features;
 /* Hold the pgd entry used on booting additional CPUs */
 pgd_t trampoline_pgd_entry;
 
+void load_trampoline_pgtable(void)
+{
+#ifdef CONFIG_X86_32
+	load_cr3(initial_page_table);
+#else
+	/*
+	 * This function is called before exiting to real-mode and that will
+	 * fail with CR4.PCIDE still set.
+	 */
+	if (boot_cpu_has(X86_FEATURE_PCID))
+		cr4_clear_bits(X86_CR4_PCIDE);
+
+	write_cr3(real_mode_header->trampoline_pgd);
+#endif
+
+	/*
+	 * The CR3 write above will not flush global TLB entries.
+	 * Stale, global entries from previous page tables may still be
+	 * present. Flush those stale entries.
+	 *
+	 * This ensures that memory accessed while running with
+	 * trampoline_pgd is *actually* mapped into trampoline_pgd.
+	 */
+	__flush_tlb_all();
+}
+
 void __init reserve_real_mode(void)
 {
 	phys_addr_t mem;
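
Why patch 4 moves the CR4.PCIDE clearing before the CR3 write:
cr4_clear_bits() goes through the cached per-CPU CR4 value in
cpu_tlbstate, and per-CPU data is not guaranteed to be mapped in the
trampoline page-table. A sketch of the hazard the reordering avoids
(illustrative only, annotating the old order in reboot.c):

	/* Broken order (what the old reboot.c code effectively did): */
	write_cr3(real_mode_header->trampoline_pgd);
	/*
	 * cr4_clear_bits() reads and writes per-CPU data
	 * (cpu_tlbstate.cr4), which may not be mapped in trampoline_pgd,
	 * so this can fault while running on the wrong page-table.
	 */
	cr4_clear_bits(X86_CR4_PCIDE);

The final __flush_tlb_all() then makes sure no stale global TLB entries
can paper over mappings that are absent from trampoline_pgd.
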
From 35fa745286ac44ee26ed100c2bd2553368ad193b Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Tue, 2 Nov 2021 17:52:24 +0100
Subject: [PATCH 5/6] x86/mm: Include spinlock_t definition in pgtable.h

This header file provides a forward declaration for pgd_lock but does
not include the header defining its type. This works since the
definition of spinlock_t is usually included somehow via printk.

By trying to avoid recursive includes on PREEMPT_RT I avoided the loop
in printk and as a consequence kernel/intel.c failed to compile due to
a missing type definition.

Include the needed definition for spinlock_t.

Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Dave Hansen
Link: https://lkml.kernel.org/r/20211102165224.wpz4zyhsvwccx5p3@linutronix.de
---
 arch/x86/include/asm/pgtable.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 448cd01eb3ec..a34430b7af4a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -22,6 +22,7 @@
 #define pgprot_decrypted(prot)	__pgprot(__sme_clr(pgprot_val(prot)))
 
 #ifndef __ASSEMBLY__
+#include <linux/spinlock.h>
 #include <asm/x86_init.h>
 #include <asm/pkru.h>
 #include <asm/fpu/api.h>

From b64dfcde1ca9cb82e38e573753f0c0db8fb841c2 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Fri, 17 Dec 2021 16:48:29 +0100
Subject: [PATCH 6/6] x86/mm: Prevent early boot triple-faults with
 instrumentation

The commit in the Fixes tag added a global TLB flush on the early boot
path, after the kernel switches off of the trampoline page table.

Compiler profiling options enabled with GCOV_PROFILE add additional
measurement code on clang which needs to be initialized prior to use.
The global flush in x86_64_start_kernel() happens before those
initializations can happen, leading to accessing invalid memory.
GCOV_PROFILE builds with gcc are still ok, so this is clang-specific.

The second issue this fixes is with KASAN: for a similar reason,
kasan_early_init() needs to have happened before KASAN-instrumented
functions are called.

Therefore, reorder the flush to happen after the KASAN early init and
prevent the compilers from adding profiling instrumentation to
native_write_cr4().

Fixes: f154f290855b ("x86/mm/64: Flush global TLB on boot and AP bringup")
Reported-by: "J. Bruce Fields"
Reported-by: kernel test robot
Signed-off-by: Borislav Petkov
Tested-by: Carel Si
Tested-by: "J. Bruce Fields"
Link: https://lore.kernel.org/r/20211209144141.GC25654@xsang-OptiPlex-9020
---
 arch/x86/kernel/cpu/common.c |  2 +-
 arch/x86/kernel/head64.c     | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0083464de5e3..79b3d67addcc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -384,7 +384,7 @@ set_register:
 }
 EXPORT_SYMBOL(native_write_cr0);
 
-void native_write_cr4(unsigned long val)
+void __no_profile native_write_cr4(unsigned long val)
 {
 	unsigned long bits_changed = 0;
 
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 75acb6027a87..f5e80a8377ad 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -483,10 +483,12 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 	/* Kill off the identity-map trampoline */
 	reset_early_page_tables();
 
-	__native_tlb_flush_global(native_read_cr4());
-
 	clear_bss();
 
+	/*
+	 * This needs to happen *before* kasan_early_init() because the
+	 * latter maps stuff into that page.
+	 */
 	clear_page(init_top_pgt);
 
 	/*
@@ -498,6 +500,16 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 
 	kasan_early_init();
 
+	/*
+	 * Flush global TLB entries which could be left over from the
+	 * trampoline page table.
+	 *
+	 * This needs to happen *after* kasan_early_init() as KASAN-enabled
+	 * .configs instrument native_write_cr4() so KASAN must be
+	 * initialized for that instrumentation to work.
+	 */
+	__native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
+
 	idt_setup_early_handler();
 
 	copy_bootdata(__va(real_mode_data));
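
A closing note on __no_profile in patch 6: it is the kernel's wrapper for
the compiler attribute that suppresses profiling instrumentation on a
single function. In kernels of this vintage the definition lives in
include/linux/compiler_attributes.h and looks roughly like the sketch
below (quoted from memory, so treat the exact spelling of the guard as an
assumption):

	#if __has_attribute(__no_profile_instrument_function__)
	# define __no_profile	__attribute__((__no_profile_instrument_function__))
	#else
	# define __no_profile	/* older compilers: no-op */
	#endif

With that attribute in place, native_write_cr4() can be called from
x86_64_start_kernel() before the gcov/profiling runtime is set up,
without the compiler inserting counter updates into it.
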