From 51b75b5b563a2637f9d8dc5bd02a31b2ff9e5ea0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 19 Jul 2019 20:46:50 +0200 Subject: [PATCH 1/9] x86/mm: Check for pfn instead of page in vmalloc_sync_one() Do not require a struct page for the mapped memory location because it might not exist. This can happen when an ioremapped region is mapped with 2MB pages. Fixes: 5d72b4fba40ef ('x86, mm: support huge I/O mapping capability I/F') Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20190719184652.11391-2-joro@8bytes.org --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6c46095cd0d9..e64173db4970 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -183,7 +183,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) if (!pmd_present(*pmd)) set_pmd(pmd, *pmd_k); else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); return pmd_k; } From 8e998fc24de47c55b47a887f6c95ab91acd4a720 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 19 Jul 2019 20:46:51 +0200 Subject: [PATCH 2/9] x86/mm: Sync also unmappings in vmalloc_sync_all() With huge-page ioremap areas the unmappings also need to be synced between all page-tables. Otherwise it can cause data corruption when a region is unmapped and later re-used. Make the vmalloc_sync_one() function ready to sync unmappings and make sure vmalloc_sync_all() iterates over all page-tables even when an unmapped PMD is found. Fixes: 5d72b4fba40ef ('x86, mm: support huge I/O mapping capability I/F') Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20190719184652.11391-3-joro@8bytes.org --- arch/x86/mm/fault.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e64173db4970..9ceacd1156db 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -177,11 +177,12 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) pmd = pmd_offset(pud, address); pmd_k = pmd_offset(pud_k, address); + + if (pmd_present(*pmd) != pmd_present(*pmd_k)) + set_pmd(pmd, *pmd_k); + if (!pmd_present(*pmd_k)) return NULL; - - if (!pmd_present(*pmd)) - set_pmd(pmd, *pmd_k); else BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k)); @@ -203,17 +204,13 @@ void vmalloc_sync_all(void) spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { spinlock_t *pgt_lock; - pmd_t *ret; /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - ret = vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), address); spin_unlock(pgt_lock); - - if (!ret) - break; } spin_unlock(&pgd_lock); } From 3f8fd02b1bf1d7ba964485a56f2f4b53ae88c167 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 19 Jul 2019 20:46:52 +0200 Subject: [PATCH 3/9] mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy() On x86-32 with PTI enabled, parts of the kernel page-tables are not shared between processes. This can cause mappings in the vmalloc/ioremap area to persist in some page-tables after the region is unmapped and released. When the region is re-used the processes with the old mappings do not fault in the new mappings but still access the old ones. This causes undefined behavior, in reality often data corruption, kernel oopses and panics and even spontaneous reboots. Fix this problem by activly syncing unmaps in the vmalloc/ioremap area to all page-tables in the system before the regions can be re-used. References: https://bugzilla.suse.com/show_bug.cgi?id=1118689 Fixes: 5d72b4fba40ef ('x86, mm: support huge I/O mapping capability I/F') Signed-off-by: Joerg Roedel Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20190719184652.11391-4-joro@8bytes.org --- mm/vmalloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4fa8d84599b0..e0fc963acc41 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1258,6 +1258,12 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) if (unlikely(valist == NULL)) return false; + /* + * First make sure the mappings are removed from all page-tables + * before they are freed. + */ + vmalloc_sync_all(); + /* * TODO: to calculate a flush range without looping. * The list can be up to lazy_max_pages() elements. @@ -3038,6 +3044,9 @@ EXPORT_SYMBOL(remap_vmalloc_range); /* * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. + * + * The purpose of this function is to make sure the vmalloc area + * mappings are identical in all page-tables in the system. */ void __weak vmalloc_sync_all(void) { From 2af7c85714d8cafadf925d55441458eae312cd6b Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Mon, 22 Jul 2019 17:32:16 +0900 Subject: [PATCH 4/9] x86/stacktrace: Prevent access_ok() warnings in arch_stack_walk_user() When arch_stack_walk_user() is called from atomic contexts, access_ok() can trigger the following warning if compiled with CONFIG_DEBUG_ATOMIC_SLEEP=y. Reproducer: // CONFIG_DEBUG_ATOMIC_SLEEP=y # cd /sys/kernel/debug/tracing # echo 1 > options/userstacktrace # echo 1 > events/irq/irq_handler_entry/enable WARNING: CPU: 0 PID: 2649 at arch/x86/kernel/stacktrace.c:103 arch_stack_walk_user+0x6e/0xf6 CPU: 0 PID: 2649 Comm: bash Not tainted 5.3.0-rc1+ #99 RIP: 0010:arch_stack_walk_user+0x6e/0xf6 Call Trace: stack_trace_save_user+0x10a/0x16d trace_buffer_unlock_commit_regs+0x185/0x240 trace_event_buffer_commit+0xec/0x330 trace_event_raw_event_irq_handler_entry+0x159/0x1e0 __handle_irq_event_percpu+0x22d/0x440 handle_irq_event_percpu+0x70/0x100 handle_irq_event+0x5a/0x8b handle_edge_irq+0x12f/0x3f0 handle_irq+0x34/0x40 do_IRQ+0xa6/0x1f0 common_interrupt+0xf/0xf Fix it by calling __range_not_ok() directly instead of access_ok() as copy_from_user_nmi() does. This is fine here because the actual copy is inside a pagefault disabled region. Reported-by: Juri Lelli Signed-off-by: Eiichi Tsukata Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190722083216.16192-2-devel@etsukata.com --- arch/x86/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4f36d3241faf..2d6898c2cb64 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -100,7 +100,7 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; - if (!access_ok(fp, sizeof(*frame))) + if (__range_not_ok(fp, sizeof(*frame), TASK_SIZE)) return 0; ret = 1; From d02f1aa39189e0619c3525d5cd03254e61bf606a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 21 Jul 2019 17:24:18 +0200 Subject: [PATCH 5/9] x86/sysfb_efi: Add quirks for some devices with swapped width and height Some Lenovo 2-in-1s with a detachable keyboard have a portrait screen but advertise a landscape resolution and pitch, resulting in a messed up display if the kernel tries to show anything on the efifb (because of the wrong pitch). Fix this by adding a new DMI match table for devices which need to have their width and height swapped. At first it was tried to use the existing table for overriding some of the efifb parameters, but some of the affected devices have variants with different LCD resolutions which will not work with hardcoded override values. Reference: https://bugzilla.redhat.com/show_bug.cgi?id=1730783 Signed-off-by: Hans de Goede Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190721152418.11644-1-hdegoede@redhat.com --- arch/x86/kernel/sysfb_efi.c | 46 +++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c index 8eb67a670b10..653b7f617b61 100644 --- a/arch/x86/kernel/sysfb_efi.c +++ b/arch/x86/kernel/sysfb_efi.c @@ -230,9 +230,55 @@ static const struct dmi_system_id efifb_dmi_system_table[] __initconst = { {}, }; +/* + * Some devices have a portrait LCD but advertise a landscape resolution (and + * pitch). We simply swap width and height for these devices so that we can + * correctly deal with some of them coming with multiple resolutions. + */ +static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = { + { + /* + * Lenovo MIIX310-10ICR, only some batches have the troublesome + * 800x1280 portrait screen. Luckily the portrait version has + * its own BIOS version, so we match on that. + */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "MIIX 310-10ICR"), + DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1HCN44WW"), + }, + }, + { + /* Lenovo MIIX 320-10ICR with 800x1280 portrait screen */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, + "Lenovo MIIX 320-10ICR"), + }, + }, + { + /* Lenovo D330 with 800x1280 or 1200x1920 portrait screen */ + .matches = { + DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, + "Lenovo ideapad D330-10IGM"), + }, + }, + {}, +}; + __init void sysfb_apply_efi_quirks(void) { if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) dmi_check_system(efifb_dmi_system_table); + + if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI && + dmi_check_system(efifb_dmi_swap_width_height)) { + u16 temp = screen_info.lfb_width; + + screen_info.lfb_width = screen_info.lfb_height; + screen_info.lfb_height = temp; + screen_info.lfb_linelength = 4 * screen_info.lfb_width; + } } From 385065734cd417b9d7739b2ebb62c960aeb3ccb5 Mon Sep 17 00:00:00 2001 From: Cao jin Date: Fri, 19 Jul 2019 16:16:35 +0800 Subject: [PATCH 6/9] x86/irq/64: Update stale comment Commit e6401c130931 ("x86/irq/64: Split the IRQ stack into its own pages") missed to update one piece of comment as it did to its peer in Xen, which will confuse people who still need to read comment. Signed-off-by: Cao jin Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190719081635.26528-1-caoj.fnst@cn.fujitsu.com --- arch/x86/kernel/head_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a6342c899be5..f3d3e9646a99 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -193,10 +193,10 @@ ENTRY(secondary_startup_64) /* Set up %gs. * - * The base of %gs always points to the bottom of the irqstack - * union. If the stack protector canary is enabled, it is - * located at %gs:40. Note that, on SMP, the boot cpu uses - * init data section till per cpu areas are set up. + * The base of %gs always points to fixed_percpu_data. If the + * stack protector canary is enabled, it is located at %gs:40. + * Note that, on SMP, the boot cpu uses init data section until + * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx movl initial_gs(%rip),%eax From b8f70953c1251d8b16276995816a95639f598e70 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Tue, 23 Jul 2019 21:20:58 -0700 Subject: [PATCH 7/9] x86/entry/32: Pass cr2 to do_async_page_fault() Commit a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption") added the address parameter to do_async_page_fault(), but does not pass it from the 32-bit entry point. To plumb it through, factor-out common_exception_read_cr2 in the same fashion as common_exception, and uses it from both page_fault and async_page_fault. For a 32-bit KVM guest, this fixes: Run /sbin/init as init process Starting init: /sbin/init exists but couldn't execute it (error -14) Fixes: a0d14b8909de ("x86/mm, tracing: Fix CR2 corruption") Signed-off-by: Matt Mullins Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190724042058.24506-1-mmullins@fb.com --- arch/x86/entry/entry_32.S | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2bb986f305ac..4f86928246e7 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1443,8 +1443,12 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, ENTRY(page_fault) ASM_CLAC - pushl $0; /* %gs's slot on the stack */ + pushl $do_page_fault + jmp common_exception_read_cr2 +END(page_fault) +common_exception_read_cr2: + /* the function address is in %gs's slot on the stack */ SAVE_ALL switch_stacks=1 skip_gs=1 ENCODE_FRAME_POINTER @@ -1452,6 +1456,7 @@ ENTRY(page_fault) /* fixup %gs */ GS_TO_REG %ecx + movl PT_GS(%esp), %edi REG_TO_PTGS %ecx SET_KERNEL_GS %ecx @@ -1463,9 +1468,9 @@ ENTRY(page_fault) TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call do_page_fault + CALL_NOSPEC %edi jmp ret_from_exception -END(page_fault) +END(common_exception_read_cr2) common_exception: /* the function address is in %gs's slot on the stack */ @@ -1595,7 +1600,7 @@ END(general_protection) ENTRY(async_page_fault) ASM_CLAC pushl $do_async_page_fault - jmp common_exception + jmp common_exception_read_cr2 END(async_page_fault) #endif From 643d83f0a3518d6fbcf88f970de0340a5aa6b5a2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Jul 2019 08:28:45 +0200 Subject: [PATCH 8/9] x86/hpet: Undo the early counter is counting check Rui reported that on a Pentium D machine which has HPET forced enabled because it is not advertised by ACPI, the early counter is counting check leads to a silent boot hang. The reason is that the ordering of checking the counter first and then reconfiguring the HPET fails to work on that machine. As the HPET is not advertised and presumably not initialized by the BIOS the early enable and the following reconfiguration seems to bring it into a broken state. Adding clocksource=jiffies to the command line results in the following clocksource watchdog warning: clocksource: timekeeping watchdog on CPU1: Marking clocksource 'tsc-early' as unstable because the skew is too large: clocksource: 'hpet' wd_now: 33 wd_last: 33 mask: ffffffff That clearly shows that the HPET is not counting after it got reconfigured and reenabled. If the counter is not working then the HPET timer is not expiring either, which explains the boot hang. Move the counter is counting check after the full configuration again to unbreak these systems. Reported-by: Rui Salvaterra Fixes: 3222daf970f3 ("x86/hpet: Separate counter check out of clocksource register code") Signed-off-by: Thomas Gleixner Tested-by: Rui Salvaterra Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1907250810530.1791@nanos.tec.linutronix.de --- arch/x86/kernel/hpet.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c43e96a938d0..c6f791bc481e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -827,10 +827,6 @@ int __init hpet_enable(void) if (!hpet_cfg_working()) goto out_nohpet; - /* Validate that the counter is counting */ - if (!hpet_counting()) - goto out_nohpet; - /* * Read the period and check for a sane value: */ @@ -896,6 +892,14 @@ int __init hpet_enable(void) } hpet_print_config(); + /* + * Validate that the counter is counting. This needs to be done + * after sanitizing the config registers to properly deal with + * force enabled HPETs. + */ + if (!hpet_counting()) + goto out_nohpet; + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); if (id & HPET_ID_LEGSUP) { From 517c3ba00916383af6411aec99442c307c23f684 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Thu, 25 Jul 2019 10:39:09 +0800 Subject: [PATCH 9/9] x86/speculation/mds: Apply more accurate check on hypervisor platform X86_HYPER_NATIVE isn't accurate for checking if running on native platform, e.g. CONFIG_HYPERVISOR_GUEST isn't set or "nopv" is enabled. Checking the CPU feature bit X86_FEATURE_HYPERVISOR to determine if it's running on native platform is more accurate. This still doesn't cover the platforms on which X86_FEATURE_HYPERVISOR is unsupported, e.g. VMware, but there is nothing which can be done about this scenario. Fixes: 8a4b06d391b0 ("x86/speculation/mds: Add sysfs reporting for MDS") Signed-off-by: Zhenzhong Duan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1564022349-17338-1-git-send-email-zhenzhong.duan@oracle.com --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 66ca906aa790..801ecd1c3fd5 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1226,7 +1226,7 @@ static ssize_t l1tf_show_state(char *buf) static ssize_t mds_show_state(char *buf) { - if (!hypervisor_is_type(X86_HYPER_NATIVE)) { + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { return sprintf(buf, "%s; SMT Host state unknown\n", mds_strings[mds_mitigation]); }