From 23dcab8f8e89bb25d7e156ffec4b27542d1f737a Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Thu, 22 Jul 2010 13:30:44 -0500 Subject: [PATCH 1/7] powerpc/kexec: Fix boundary case for book-e kexec memory limits The KEXEC_*_MEMORY_LIMITs are inclusive addresses. We define them as 2Gs as that is what we allow mapping via TLBs. However, this should be 2G - 1 to be inclusive, otherwise if we have >2G of memory in a system we fail to boot properly via kexec. Signed-off-by: Kumar Gala --- arch/powerpc/include/asm/kexec.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 2a9cd74a841e..076327f2eff7 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -8,9 +8,9 @@ * On FSL-BookE we setup a 1:1 mapping which covers the first 2GiB of memory * and therefore we can only deal with memory within this range */ -#define KEXEC_SOURCE_MEMORY_LIMIT (2 * 1024 * 1024 * 1024UL) -#define KEXEC_DESTINATION_MEMORY_LIMIT (2 * 1024 * 1024 * 1024UL) -#define KEXEC_CONTROL_MEMORY_LIMIT (2 * 1024 * 1024 * 1024UL) +#define KEXEC_SOURCE_MEMORY_LIMIT (2 * 1024 * 1024 * 1024UL - 1) +#define KEXEC_DESTINATION_MEMORY_LIMIT (2 * 1024 * 1024 * 1024UL - 1) +#define KEXEC_CONTROL_MEMORY_LIMIT (2 * 1024 * 1024 * 1024UL - 1) #else From b1623e7eb280f853f60338c7bb68bd3f3a970205 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 14 Jul 2010 19:31:48 +0000 Subject: [PATCH 2/7] powerpc/mm: Handle hypervisor pte insert failure in __hash_page_huge If the hypervisor gives us an error on a hugepage insert we panic. The normal page code already handles this by returning an error instead and we end calling low_hash_fault which will just kill the task if possible. The patch below does a similar thing for the hugepage case. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hugetlbpage-hash64.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 199539882f92..c9acd7910eea 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -121,8 +121,15 @@ repeat: } } - if (unlikely(slot == -2)) - panic("hash_huge_page: pte_insert failed\n"); + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + err = -1; + goto out; + } new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); } From ca91e6c09d656c6deb1f2bc5d57186c718106aa5 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 23 Jul 2010 08:53:23 +1000 Subject: [PATCH 3/7] powerpc/mm: Move around testing of _PAGE_PRESENT in hash code Instead of adding _PAGE_PRESENT to the access permission mask in each low level routine independently, we add it once from hash_page(). We also move the preliminary access check (the racy one before the PTE is locked) up so it applies to the huge page case. This duplicates code in __hash_page_huge() which we'll remove in a subsequent patch to fix a race in there. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_low_64.S | 9 --------- arch/powerpc/mm/hash_utils_64.c | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S index a719f53921a5..3079f6b44cf5 100644 --- a/arch/powerpc/mm/hash_low_64.S +++ b/arch/powerpc/mm/hash_low_64.S @@ -68,9 +68,6 @@ _GLOBAL(__hash_page_4K) std r8,STK_PARM(r8)(r1) std r9,STK_PARM(r9)(r1) - /* Add _PAGE_PRESENT to access */ - ori r4,r4,_PAGE_PRESENT - /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" @@ -347,9 +344,6 @@ _GLOBAL(__hash_page_4K) std r8,STK_PARM(r8)(r1) std r9,STK_PARM(r9)(r1) - /* Add _PAGE_PRESENT to access */ - ori r4,r4,_PAGE_PRESENT - /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" @@ -687,9 +681,6 @@ _GLOBAL(__hash_page_64K) std r8,STK_PARM(r8)(r1) std r9,STK_PARM(r9)(r1) - /* Add _PAGE_PRESENT to access */ - ori r4,r4,_PAGE_PRESENT - /* Save non-volatile registers. * r31 will hold "old PTE" * r30 is "new PTE" diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 98f262de5585..40847a9dd6db 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -955,6 +955,17 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) return 1; } + /* Add _PAGE_PRESENT to the required access perm */ + access |= _PAGE_PRESENT; + + /* Pre-check access permissions (will be re-checked atomically + * in __hash_page_XX but this pre-check is a fast path + */ + if (access & ~pte_val(*ptep)) { + DBG_LOW(" no access !\n"); + return 1; + } + #ifdef CONFIG_HUGETLB_PAGE if (hugeshift) return __hash_page_huge(ea, access, vsid, ptep, trap, local, @@ -967,14 +978,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), pte_val(*(ptep + PTRS_PER_PTE))); #endif - /* Pre-check access permissions (will be re-checked atomically - * in __hash_page_XX but this pre-check is a fast path - */ - if (access & ~pte_val(*ptep)) { - DBG_LOW(" no access !\n"); - return 1; - } - /* Do actual hashing */ #ifdef CONFIG_PPC_64K_PAGES /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ From 171aa2caaad16ed32b655d33565e112a12cb3537 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 23 Jul 2010 09:02:27 +1000 Subject: [PATCH 4/7] powerpc/mm: Fix bugs in huge page hashing There's a couple of nasty bugs lurking in our huge page hashing code. First, we don't check the access permission atomically with setting the _PAGE_BUSY bit, which means that the PTE value we end up using for the hashing might be different than the one we have checked the access permissions for. We've seen cases where that leads us to try to use an invalidated PTE for hashing, causing all sort of "interesting" issues. Then, we also failed to set _PAGE_DIRTY on a write access. Finally, a minor tweak but we should return 0 when we find the PTE busy, in order to just re-execute the access, rather than 1 which means going to do_page_fault(). Signed-off-by: Benjamin Herrenschmidt --- --- arch/powerpc/mm/hugetlbpage-hash64.c | 31 ++++++++++++---------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index c9acd7910eea..faae9ec4cb04 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -21,21 +21,13 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, unsigned long old_pte, new_pte; unsigned long va, rflags, pa, sz; long slot; - int err = 1; BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); /* Search the Linux page table for a match with va */ va = hpt_va(ea, vsid, ssize); - /* - * Check the user's access rights to the page. If access should be - * prevented then send the problem up to do_page_fault. - */ - if (unlikely(access & ~pte_val(*ptep))) - goto out; - /* - * At this point, we have a pte (old_pte) which can be used to build + /* At this point, we have a pte (old_pte) which can be used to build * or update an HPTE. There are 2 cases: * * 1. There is a valid (present) pte with no associated HPTE (this is @@ -49,9 +41,17 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, do { old_pte = pte_val(*ptep); - if (old_pte & _PAGE_BUSY) - goto out; + /* If PTE busy, retry the access */ + if (unlikely(old_pte & _PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(access & ~old_pte)) + return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access */ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_RW) + new_pte |= _PAGE_DIRTY; } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, old_pte, new_pte)); @@ -127,8 +127,7 @@ repeat: */ if (unlikely(slot == -2)) { *ptep = __pte(old_pte); - err = -1; - goto out; + return -1; } new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); @@ -138,9 +137,5 @@ repeat: * No need to use ldarx/stdcx here */ *ptep = __pte(new_pte & ~_PAGE_BUSY); - - err = 0; - - out: - return err; + return 0; } From 4b8692c022a4b149d0c2cc3f4f7a363453fde72a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 23 Jul 2010 10:31:13 +1000 Subject: [PATCH 5/7] powerpc/mm: Add some debug output when hash insertion fails This adds some debug output to our MMU hash code to print out some useful debug data if the hypervisor refuses the insertion (which should normally never happen). Signed-off-by: Benjamin Herrenschmidt --- --- arch/powerpc/include/asm/mmu-hash64.h | 4 +++- arch/powerpc/mm/hash_utils_64.c | 34 +++++++++++++++++++++++---- arch/powerpc/mm/hugetlbpage-hash64.c | 2 ++ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 2102b214a87c..0e398cfee2c8 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -250,7 +250,9 @@ extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap) int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, int local, int ssize, unsigned int shift, unsigned int mmu_psize); - +extern void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, unsigned long pte); extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long pstart, unsigned long prot, int psize, int ssize); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 40847a9dd6db..09dffe6efa46 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -871,6 +871,18 @@ static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) } #endif +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d psize=%d pte=0x%lx\n", + trap, vsid, ssize, psize, pte); +} + /* Result code is: * 0 - handled * 1 - normal page fault @@ -1036,6 +1048,12 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) local, ssize, spp); } + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + pte_val(*ptep)); #ifndef CONFIG_PPC_64K_PAGES DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); #else @@ -1054,8 +1072,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, void *pgdir; pte_t *ptep; unsigned long flags; - int local = 0; - int ssize; + int rc, ssize, local = 0; BUG_ON(REGION_ID(ea) != USER_REGION_ID); @@ -1101,11 +1118,18 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, /* Hash it in */ #ifdef CONFIG_PPC_HAS_HASH_64K if (mm->context.user_psize == MMU_PAGE_64K) - __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); + rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); else #endif /* CONFIG_PPC_HAS_HASH_64K */ - __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, - subpage_protection(pgdir, ea)); + rc = __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, + subpage_protection(pgdir, ea)); + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm->context.user_psize, pte_val(*ptep)); local_irq_restore(flags); } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index faae9ec4cb04..cc5c273086cf 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -127,6 +127,8 @@ repeat: */ if (unlikely(slot == -2)) { *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, old_pte); return -1; } From 3fdfd99051fbc210464378cd44a4b8914282bac3 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 23 Jul 2010 10:35:52 +1000 Subject: [PATCH 6/7] powerpc: Fix erroneous lmb->memblock conversions Oooops... we missed these. We incorrectly converted strings used when parsing the device-tree on pseries, thus breaking access to drconf memory and hotplug memory. While at it, also revert some variable names that represent something the FW calls "lmb" and thus don't need to be converted to "memblock". Signed-off-by: Benjamin Herrenschmidt --- --- arch/powerpc/kernel/prom.c | 2 +- arch/powerpc/mm/numa.c | 24 +++++++++---------- .../platforms/pseries/hotplug-memory.c | 22 ++++++++--------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 9d3953983fb7..fed9bf6187d1 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -414,7 +414,7 @@ static int __init early_init_dt_scan_drconf_memory(unsigned long node) u64 base, size, memblock_size; unsigned int is_kexec_kdump = 0, rngs; - ls = of_get_flat_dt_prop(node, "ibm,memblock-size", &l); + ls = of_get_flat_dt_prop(node, "ibm,lmb-size", &l); if (ls == NULL || l < dt_root_size_cells * sizeof(__be32)) return 0; memblock_size = dt_mem_next_cell(dt_root_size_cells, &ls); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f47364585ecd..aa731af720c0 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -398,15 +398,15 @@ static int of_get_drconf_memory(struct device_node *memory, const u32 **dm) } /* - * Retreive and validate the ibm,memblock-size property for drconf memory + * Retreive and validate the ibm,lmb-size property for drconf memory * from the device tree. */ -static u64 of_get_memblock_size(struct device_node *memory) +static u64 of_get_lmb_size(struct device_node *memory) { const u32 *prop; u32 len; - prop = of_get_property(memory, "ibm,memblock-size", &len); + prop = of_get_property(memory, "ibm,lmb-size", &len); if (!prop || len < sizeof(unsigned int)) return 0; @@ -562,7 +562,7 @@ static unsigned long __init numa_enforce_memory_limit(unsigned long start, static inline int __init read_usm_ranges(const u32 **usm) { /* - * For each memblock in ibm,dynamic-memory a corresponding + * For each lmb in ibm,dynamic-memory a corresponding * entry in linux,drconf-usable-memory property contains * a counter followed by that many (base, size) duple. * read the counter from linux,drconf-usable-memory @@ -578,7 +578,7 @@ static void __init parse_drconf_memory(struct device_node *memory) { const u32 *dm, *usm; unsigned int n, rc, ranges, is_kexec_kdump = 0; - unsigned long memblock_size, base, size, sz; + unsigned long lmb_size, base, size, sz; int nid; struct assoc_arrays aa; @@ -586,8 +586,8 @@ static void __init parse_drconf_memory(struct device_node *memory) if (!n) return; - memblock_size = of_get_memblock_size(memory); - if (!memblock_size) + lmb_size = of_get_lmb_size(memory); + if (!lmb_size) return; rc = of_get_assoc_arrays(memory, &aa); @@ -611,7 +611,7 @@ static void __init parse_drconf_memory(struct device_node *memory) continue; base = drmem.base_addr; - size = memblock_size; + size = lmb_size; ranges = 1; if (is_kexec_kdump) { @@ -1072,7 +1072,7 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory, { const u32 *dm; unsigned int drconf_cell_cnt, rc; - unsigned long memblock_size; + unsigned long lmb_size; struct assoc_arrays aa; int nid = -1; @@ -1080,8 +1080,8 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory, if (!drconf_cell_cnt) return -1; - memblock_size = of_get_memblock_size(memory); - if (!memblock_size) + lmb_size = of_get_lmb_size(memory); + if (!lmb_size) return -1; rc = of_get_assoc_arrays(memory, &aa); @@ -1100,7 +1100,7 @@ static int hot_add_drconf_scn_to_nid(struct device_node *memory, continue; if ((scn_addr < drmem.base_addr) - || (scn_addr >= (drmem.base_addr + memblock_size))) + || (scn_addr >= (drmem.base_addr + lmb_size))) continue; nid = of_drconf_to_nid_single(&drmem, &aa); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index deab5f946090..bc8803664140 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -69,7 +69,7 @@ static int pseries_remove_memory(struct device_node *np) const char *type; const unsigned int *regs; unsigned long base; - unsigned int memblock_size; + unsigned int lmb_size; int ret = -EINVAL; /* @@ -87,9 +87,9 @@ static int pseries_remove_memory(struct device_node *np) return ret; base = *(unsigned long *)regs; - memblock_size = regs[3]; + lmb_size = regs[3]; - ret = pseries_remove_memblock(base, memblock_size); + ret = pseries_remove_memblock(base, lmb_size); return ret; } @@ -98,7 +98,7 @@ static int pseries_add_memory(struct device_node *np) const char *type; const unsigned int *regs; unsigned long base; - unsigned int memblock_size; + unsigned int lmb_size; int ret = -EINVAL; /* @@ -116,36 +116,36 @@ static int pseries_add_memory(struct device_node *np) return ret; base = *(unsigned long *)regs; - memblock_size = regs[3]; + lmb_size = regs[3]; /* * Update memory region to represent the memory add */ - ret = memblock_add(base, memblock_size); + ret = memblock_add(base, lmb_size); return (ret < 0) ? -EINVAL : 0; } static int pseries_drconf_memory(unsigned long *base, unsigned int action) { struct device_node *np; - const unsigned long *memblock_size; + const unsigned long *lmb_size; int rc; np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (!np) return -EINVAL; - memblock_size = of_get_property(np, "ibm,memblock-size", NULL); - if (!memblock_size) { + lmb_size = of_get_property(np, "ibm,lmb-size", NULL); + if (!lmb_size) { of_node_put(np); return -EINVAL; } if (action == PSERIES_DRCONF_MEM_ADD) { - rc = memblock_add(*base, *memblock_size); + rc = memblock_add(*base, *lmb_size); rc = (rc < 0) ? -EINVAL : 0; } else if (action == PSERIES_DRCONF_MEM_REMOVE) { - rc = pseries_remove_memblock(*base, *memblock_size); + rc = pseries_remove_memblock(*base, *lmb_size); } else { rc = -EINVAL; } From da5e37efe8704fc2b354626467f80f73c5e3c020 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Tue, 13 Jul 2010 11:39:42 +0200 Subject: [PATCH 7/7] vmlinux.lds: fix .data..init_task output section (fix popwerpc boot) The .data..init_task output section was missing a load offset causing a popwerpc target to fail to boot. Sean MacLennan tracked it down to the definition of INIT_TASK_DATA_SECTION(). There are only two users of INIT_TASK_DATA_SECTION() in the kernel today: cris and popwerpc. cris do not support relocatable kernels and is thus not impacted by this change. Fix INIT_TASK_DATA_SECTION() to specify load offset like all other output sections. Reported-by: Sean MacLennan Signed-off-by: Sam Ravnborg Signed-off-by: Benjamin Herrenschmidt --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 48c5299cbf26..cdfff74e9739 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -435,7 +435,7 @@ */ #define INIT_TASK_DATA_SECTION(align) \ . = ALIGN(align); \ - .data..init_task : { \ + .data..init_task : AT(ADDR(.data..init_task) - LOAD_OFFSET) { \ INIT_TASK_DATA(align) \ }