From 103a8542cb35b5130f732d00b0419a594ba1b517 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 28 Aug 2020 15:38:52 +0530 Subject: [PATCH 1/7] powerpc/book3s64/radix: Fix boot failure with large amount of guest memory If the hypervisor doesn't support hugepages, the kernel ends up allocating a large number of page table pages. The early page table allocation was wrongly setting the max memblock limit to ppc64_rma_size with radix translation which resulted in boot failure as shown below. Kernel panic - not syncing: early_alloc_pgtable: Failed to allocate 16777216 bytes align=0x1000000 nid=-1 from=0x0000000000000000 max_addr=0xffffffffffffffff CPU: 0 PID: 0 Comm: swapper Not tainted 5.8.0-24.9-default+ #2 Call Trace: [c0000000016f3d00] [c0000000007c6470] dump_stack+0xc4/0x114 (unreliable) [c0000000016f3d40] [c00000000014c78c] panic+0x164/0x418 [c0000000016f3dd0] [c000000000098890] early_alloc_pgtable+0xe0/0xec [c0000000016f3e60] [c0000000010a5440] radix__early_init_mmu+0x360/0x4b4 [c0000000016f3ef0] [c000000001099bac] early_init_mmu+0x1c/0x3c [c0000000016f3f10] [c00000000109a320] early_setup+0x134/0x170 This was because the kernel was checking for the radix feature before we enable the feature via mmu_features. This resulted in the kernel using hash restrictions on radix. Rework the early init code such that the kernel boot with memblock restrictions as imposed by hash. At that point, the kernel still hasn't finalized the translation the kernel will end up using. We have three different ways of detecting radix. 1. dt_cpu_ftrs_scan -> used only in case of PowerNV 2. ibm,pa-features -> Used when we don't use cpu_dt_ftr_scan 3. CAS -> Where we negotiate with hypervisor about the supported translation. We look at 1 or 2 early in the boot and after that, we look at the CAS vector to finalize the translation the kernel will use. We also support a kernel command line option (disable_radix) to switch to hash. Update the memblock limit after mmu_early_init_devtree() if the kernel is going to use radix translation. This forces some of the memblock allocations we do before mmu_early_init_devtree() to be within the RMA limit. Fixes: 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for early init routines") Reported-by: Shirisha Ganta Signed-off-by: Aneesh Kumar K.V Reviewed-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200828100852.426575-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 10 +++++----- arch/powerpc/mm/book3s64/radix_pgtable.c | 15 --------------- arch/powerpc/mm/init_64.c | 11 +++++++++-- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 55442d45c597..b392384a3b15 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -239,14 +239,14 @@ static inline void early_init_mmu_secondary(void) extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size); -extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size); static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - if (early_radix_enabled()) - return radix__setup_initial_memory_limit(first_memblock_base, - first_memblock_size); + /* + * Hash has more strict restrictions. At this point we don't + * know which translations we will pick. Hence go with hash + * restrictions. + */ return hash__setup_initial_memory_limit(first_memblock_base, first_memblock_size); } diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 28c784976bed..d5f0c10d752a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -734,21 +734,6 @@ void radix__mmu_cleanup_all(void) } } -void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size) -{ - /* - * We don't currently support the first MEMBLOCK not mapping 0 - * physical on those processors - */ - BUG_ON(first_memblock_base != 0); - - /* - * Radix mode is not limited by RMA / VRMA addressing. - */ - ppc64_rma_size = ULONG_MAX; -} - #ifdef CONFIG_MEMORY_HOTPLUG static void free_pte_table(pte_t *pte_start, pmd_t *pmd) { diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 02e127fa5777..8459056cce67 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -433,9 +433,16 @@ void __init mmu_early_init_devtree(void) if (!(mfmsr() & MSR_HV)) early_check_vec5(); - if (early_radix_enabled()) + if (early_radix_enabled()) { radix__early_init_devtree(); - else + /* + * We have finalized the translation we are going to use by now. + * Radix mode is not limited by RMA / VRMA addressing. + * Hence don't limit memblock allocations. + */ + ppc64_rma_size = ULONG_MAX; + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + } else hash__early_init_devtree(); } #endif /* CONFIG_PPC_BOOK3S_64 */ From fc1f178cdb31783ff37296ecae817a1045a1a513 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 1 Sep 2020 22:45:17 +1000 Subject: [PATCH 2/7] selftests/powerpc: Skip PROT_SAO test in guests/LPARS In commit 9b725a90a8f1 ("powerpc/64s: Disallow PROT_SAO in LPARs by default") PROT_SAO was disabled in guests/LPARs by default. So skip the test if we are running in a guest to avoid a spurious failure. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200901124653.523182-1-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/mm/prot_sao.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/powerpc/mm/prot_sao.c b/tools/testing/selftests/powerpc/mm/prot_sao.c index e0cf8ebbf8cd..30b71b1d78d5 100644 --- a/tools/testing/selftests/powerpc/mm/prot_sao.c +++ b/tools/testing/selftests/powerpc/mm/prot_sao.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -18,9 +19,13 @@ int test_prot_sao(void) { char *p; - /* SAO was introduced in 2.06 and removed in 3.1 */ + /* + * SAO was introduced in 2.06 and removed in 3.1. It's disabled in + * guests/LPARs by default, so also skip if we are running in a guest. + */ SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06) || - have_hwcap2(PPC_FEATURE2_ARCH_3_1)); + have_hwcap2(PPC_FEATURE2_ARCH_3_1) || + access("/proc/device-tree/rtas/ibm,hypertas-functions", F_OK) == 0); /* * Ensure we can ask for PROT_SAO. From 675bceb097e6fb9769309093ec6f3d33a2fed9cc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 2 Sep 2020 09:31:22 +0530 Subject: [PATCH 3/7] powerpc/mm: Remove DEBUG_VM_PGTABLE support on powerpc The test is broken w.r.t page table update rules and results in kernel crash as below. Disable the support until we get the tests updated. [ 21.083519] kernel BUG at arch/powerpc/mm/pgtable.c:304! cpu 0x0: Vector: 700 (Program Check) at [c000000c6d1e76c0] pc: c00000000009a5ec: assert_pte_locked+0x14c/0x380 lr: c0000000005eeeec: pte_update+0x11c/0x190 sp: c000000c6d1e7950 msr: 8000000002029033 current = 0xc000000c6d172c80 paca = 0xc000000003ba0000 irqmask: 0x03 irq_happened: 0x01 pid = 1, comm = swapper/0 kernel BUG at arch/powerpc/mm/pgtable.c:304! [link register ] c0000000005eeeec pte_update+0x11c/0x190 [c000000c6d1e7950] 0000000000000001 (unreliable) [c000000c6d1e79b0] c0000000005eee14 pte_update+0x44/0x190 [c000000c6d1e7a10] c000000001a2ca9c pte_advanced_tests+0x160/0x3d8 [c000000c6d1e7ab0] c000000001a2d4fc debug_vm_pgtable+0x7e8/0x1338 [c000000c6d1e7ba0] c0000000000116ec do_one_initcall+0xac/0x5f0 [c000000c6d1e7c80] c0000000019e4fac kernel_init_freeable+0x4dc/0x5a4 [c000000c6d1e7db0] c000000000012474 kernel_init+0x24/0x160 [c000000c6d1e7e20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c With DEBUG_VM disabled [ 20.530152] BUG: Kernel NULL pointer dereference on read at 0x00000000 [ 20.530183] Faulting instruction address: 0xc0000000000df330 cpu 0x33: Vector: 380 (Data SLB Access) at [c000000c6d19f700] pc: c0000000000df330: memset+0x68/0x104 lr: c00000000009f6d8: hash__pmdp_huge_get_and_clear+0xe8/0x1b0 sp: c000000c6d19f990 msr: 8000000002009033 dar: 0 current = 0xc000000c6d177480 paca = 0xc00000001ec4f400 irqmask: 0x03 irq_happened: 0x01 pid = 1, comm = swapper/0 [link register ] c00000000009f6d8 hash__pmdp_huge_get_and_clear+0xe8/0x1b0 [c000000c6d19f990] c00000000009f748 hash__pmdp_huge_get_and_clear+0x158/0x1b0 (unreliable) [c000000c6d19fa10] c0000000019ebf30 pmd_advanced_tests+0x1f0/0x378 [c000000c6d19fab0] c0000000019ed088 debug_vm_pgtable+0x79c/0x1244 [c000000c6d19fba0] c0000000000116ec do_one_initcall+0xac/0x5f0 [c000000c6d19fc80] c0000000019a4fac kernel_init_freeable+0x4dc/0x5a4 [c000000c6d19fdb0] c000000000012474 kernel_init+0x24/0x160 [c000000c6d19fe20] c00000000000cbd0 ret_from_kernel_thread+0x5c/0x6c 33:mon> Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200902040122.136414-1-aneesh.kumar@linux.ibm.com --- Documentation/features/debug/debug-vm-pgtable/arch-support.txt | 2 +- arch/powerpc/Kconfig | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt index 53da483c8326..1c49723e7534 100644 --- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt +++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt @@ -22,7 +22,7 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | ok | + | powerpc: | TODO | | riscv: | ok | | s390: | ok | | sh: | TODO | diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 65bed1fdeaad..787e829b6f25 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -116,7 +116,6 @@ config PPC # select ARCH_32BIT_OFF_T if PPC32 select ARCH_HAS_DEBUG_VIRTUAL - select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE From 4c62285439f80f8996c38e0bda79b1125a192365 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 2 Sep 2020 22:14:39 +1000 Subject: [PATCH 4/7] Revert "powerpc/build: vdso linker warning for orphan sections" This reverts commit f2af201002a8bc22500c04cc474ea480bf361351. It added a usage of cc-ldoption, but cc-ldoption was removed in commit 055efab3120b ("kbuild: drop support for cc-ldoption"). Reported-by: Nick Desaulniers Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vdso32/Makefile | 2 +- arch/powerpc/kernel/vdso32/vdso32.lds.S | 1 - arch/powerpc/kernel/vdso64/Makefile | 2 +- arch/powerpc/kernel/vdso64/vdso64.lds.S | 3 +-- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index 87ab1152d5ce..e147bbdc12cd 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -50,7 +50,7 @@ $(obj-vdso32): %.o: %.S FORCE # actual build commands quiet_cmd_vdso32ld = VDSO32L $@ - cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ $(call cc-ldoption, -Wl$(comma)--orphan-handling=warn) -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) + cmd_vdso32ld = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) quiet_cmd_vdso32as = VDSO32A $@ cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) -c -o $@ $< diff --git a/arch/powerpc/kernel/vdso32/vdso32.lds.S b/arch/powerpc/kernel/vdso32/vdso32.lds.S index 4c985467a668..5206c2eb2a1d 100644 --- a/arch/powerpc/kernel/vdso32/vdso32.lds.S +++ b/arch/powerpc/kernel/vdso32/vdso32.lds.S @@ -111,7 +111,6 @@ SECTIONS *(.note.GNU-stack) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) - *(.glink .iplt .plt .rela*) } } diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index 38c317f25141..32ebb3522ea1 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -34,7 +34,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE # actual build commands quiet_cmd_vdso64ld = VDSO64L $@ - cmd_vdso64ld = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) $(call cc-ldoption, -Wl$(comma)--orphan-handling=warn) + cmd_vdso64ld = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) # install commands for the unstripped file quiet_cmd_vdso_install = INSTALL $@ diff --git a/arch/powerpc/kernel/vdso64/vdso64.lds.S b/arch/powerpc/kernel/vdso64/vdso64.lds.S index 4e3a8d4ee614..256fb9720298 100644 --- a/arch/powerpc/kernel/vdso64/vdso64.lds.S +++ b/arch/powerpc/kernel/vdso64/vdso64.lds.S @@ -30,7 +30,7 @@ SECTIONS . = ALIGN(16); .text : { *(.text .stub .text.* .gnu.linkonce.t.* __ftr_alt_*) - *(.sfpr) + *(.sfpr .glink) } :text PROVIDE(__etext = .); PROVIDE(_etext = .); @@ -111,7 +111,6 @@ SECTIONS *(.branch_lt) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) - *(.glink .iplt .plt .rela*) } } From 437ef802e0adc9f162a95213a3488e8646e5fc03 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 8 Sep 2020 11:51:06 +1000 Subject: [PATCH 5/7] powerpc/dma: Fix dma_map_ops::get_required_mask There are 2 problems with it: 1. "<" vs expected "<<" 2. the shift number is an IOMMU page number mask, not an address mask as the IOMMU page shift is missing. This did not hit us before f1565c24b596 ("powerpc: use the generic dma_ops_bypass mode") because we had additional code to handle bypass mask so this chunk (almost?) never executed.However there were reports that aacraid does not work with "iommu=nobypass". After f1565c24b596, aacraid (and probably others which call dma_get_required_mask() before setting the mask) was unable to enable 64bit DMA and fall back to using IOMMU which was known not to work, one of the problems is double free of an IOMMU page. This fixes DMA for aacraid, both with and without "iommu=nobypass" in the kernel command line. Verified with "stress-ng -d 4". Fixes: 6a5c7be5e484 ("powerpc: Override dma_get_required_mask by platform hook and ops") Cc: stable@vger.kernel.org # v3.2+ Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200908015106.79661-1-aik@ozlabs.ru --- arch/powerpc/kernel/dma-iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 569fecd7b5b2..9053fc9d20c7 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -120,7 +120,8 @@ u64 dma_iommu_get_required_mask(struct device *dev) if (!tbl) return 0; - mask = 1ULL < (fls_long(tbl->it_offset + tbl->it_size) - 1); + mask = 1ULL << (fls_long(tbl->it_offset + tbl->it_size) + + tbl->it_page_shift - 1); mask += mask - 1; return mask; From 1d3ee7df009a46440c58508b8819213c09503acd Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 3 Sep 2020 14:57:27 +0530 Subject: [PATCH 6/7] cpuidle: pseries: Fix CEDE latency conversion from tb to us Commit d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)") sets the exit latency of CEDE(0) based on the latency values of the Extended CEDE states advertised by the platform. The values advertised by the platform are in timebase ticks. However the cpuidle framework requires the latency values in microseconds. If the tb-ticks value advertised by the platform correspond to a value smaller than 1us, during the conversion from tb-ticks to microseconds, in the current code, the result becomes zero. This is incorrect as it puts a CEDE state on par with the snooze state. This patch fixes this by rounding up the result obtained while converting the latency value from tb-ticks to microseconds. It also prints a warning in case we discover an extended-cede state with wakeup latency to be 0. In such a case, ensure that CEDE(0) has a non-zero wakeup latency. Fixes: d947fb4c965c ("cpuidle: pseries: Fixup exit latency for CEDE(0)") Signed-off-by: Gautham R. Shenoy Reviewed-by: Vaidyanathan Srinivasan Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1599125247-28488-1-git-send-email-ego@linux.vnet.ibm.com --- drivers/cpuidle/cpuidle-pseries.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c index ff6d99e923a4..a2b5c6f60cf0 100644 --- a/drivers/cpuidle/cpuidle-pseries.c +++ b/drivers/cpuidle/cpuidle-pseries.c @@ -361,7 +361,10 @@ static void __init fixup_cede0_latency(void) for (i = 0; i < nr_xcede_records; i++) { struct xcede_latency_record *record = &payload->records[i]; u64 latency_tb = be64_to_cpu(record->latency_ticks); - u64 latency_us = tb_to_ns(latency_tb) / NSEC_PER_USEC; + u64 latency_us = DIV_ROUND_UP_ULL(tb_to_ns(latency_tb), NSEC_PER_USEC); + + if (latency_us == 0) + pr_warn("cpuidle: xcede record %d has an unrealistic latency of 0us.\n", i); if (latency_us < min_latency_us) min_latency_us = latency_us; @@ -378,10 +381,14 @@ static void __init fixup_cede0_latency(void) * Perform the fix-up. */ if (min_latency_us < dedicated_states[1].exit_latency) { - u64 cede0_latency = min_latency_us - 1; + /* + * We set a minimum of 1us wakeup latency for cede0 to + * distinguish it from snooze + */ + u64 cede0_latency = 1; - if (cede0_latency <= 0) - cede0_latency = min_latency_us; + if (min_latency_us > cede0_latency) + cede0_latency = min_latency_us - 1; dedicated_states[1].exit_latency = cede0_latency; dedicated_states[1].target_residency = 10 * (cede0_latency); From 0460534b532e5518c657c7d6492b9337d975eaa3 Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Mon, 7 Sep 2020 16:35:40 +0530 Subject: [PATCH 7/7] powerpc/papr_scm: Limit the readability of 'perf_stats' sysfs attribute The newly introduced 'perf_stats' attribute uses the default access mode of 0444, allowing non-root users to access performance stats of an nvdimm and potentially force the kernel into issuing a large number of expensive hypercalls. Since the information exposed by this attribute cannot be cached it is better to ward off access to this attribute from users who don't need to access to these performance statistics. Hence update the access mode of 'perf_stats' attribute to be only readable by root users. Fixes: 2d02bf835e57 ("powerpc/papr_scm: Fetch nvdimm performance stats from PHYP") Reported-by: Aneesh Kumar K.V Signed-off-by: Vaibhav Jain Reviewed-by: Ira Weiny Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200907110540.21349-1-vaibhav@linux.ibm.com --- arch/powerpc/platforms/pseries/papr_scm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f439f0dfea7d..a88a707a608a 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -822,7 +822,7 @@ free_stats: kfree(stats); return rc ? rc : seq_buf_used(&s); } -DEVICE_ATTR_RO(perf_stats); +DEVICE_ATTR_ADMIN_RO(perf_stats); static ssize_t flags_show(struct device *dev, struct device_attribute *attr, char *buf)