2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* PPC64 (POWER4) Huge TLB Page Support for Kernel.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2003 David Gibson, IBM Corporation.
|
|
|
|
*
|
|
|
|
* Based on the IA-32 version:
|
|
|
|
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/mm.h>
|
2009-10-26 22:24:31 +03:00
|
|
|
#include <linux/io.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <linux/hugetlb.h>
|
2009-10-26 22:24:31 +03:00
|
|
|
#include <asm/pgtable.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <asm/pgalloc.h>
|
|
|
|
#include <asm/tlb.h>
|
|
|
|
|
2008-07-24 08:27:55 +04:00
|
|
|
#define PAGE_SHIFT_64K 16
|
|
|
|
#define PAGE_SHIFT_16M 24
|
|
|
|
#define PAGE_SHIFT_16G 34
|
2008-01-04 01:59:50 +03:00
|
|
|
|
2008-07-24 08:27:53 +04:00
|
|
|
#define MAX_NUMBER_GPAGES 1024
|
|
|
|
|
|
|
|
/* Tracks the 16G pages after the device tree is scanned and before the
|
|
|
|
* huge_boot_pages list is ready. */
|
|
|
|
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
|
|
|
|
static unsigned nr_gpages;
|
2005-08-11 10:55:21 +04:00
|
|
|
|
2006-04-28 09:02:51 +04:00
|
|
|
/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
|
|
|
|
* will choke on pointers to hugepte tables, which is handy for
|
|
|
|
* catching screwups early. */
|
|
|
|
|
2008-07-24 08:27:56 +04:00
|
|
|
static inline int shift_to_mmu_psize(unsigned int shift)
|
|
|
|
{
|
2009-10-26 22:24:31 +03:00
|
|
|
int psize;
|
|
|
|
|
|
|
|
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
|
|
|
|
if (mmu_psize_defs[psize].shift == shift)
|
|
|
|
return psize;
|
2008-07-24 08:27:56 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
|
|
|
|
{
|
|
|
|
if (mmu_psize_defs[mmu_psize].shift)
|
|
|
|
return mmu_psize_defs[mmu_psize].shift;
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
#define hugepd_none(hpd) ((hpd).pd == 0)
|
|
|
|
|
2006-04-28 09:02:51 +04:00
|
|
|
static inline pte_t *hugepd_page(hugepd_t hpd)
|
|
|
|
{
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
BUG_ON(!hugepd_ok(hpd));
|
|
|
|
return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int hugepd_shift(hugepd_t hpd)
|
|
|
|
{
|
|
|
|
return hpd.pd & HUGEPD_SHIFT_MASK;
|
2006-04-28 09:02:51 +04:00
|
|
|
}
|
|
|
|
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
|
2006-04-28 09:02:51 +04:00
|
|
|
{
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
|
2006-04-28 09:02:51 +04:00
|
|
|
pte_t *dir = hugepd_page(*hpdp);
|
|
|
|
|
|
|
|
return dir + idx;
|
|
|
|
}
|
|
|
|
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
|
|
|
|
{
|
|
|
|
pgd_t *pg;
|
|
|
|
pud_t *pu;
|
|
|
|
pmd_t *pm;
|
|
|
|
hugepd_t *hpdp = NULL;
|
|
|
|
unsigned pdshift = PGDIR_SHIFT;
|
|
|
|
|
|
|
|
if (shift)
|
|
|
|
*shift = 0;
|
|
|
|
|
|
|
|
pg = pgdir + pgd_index(ea);
|
|
|
|
if (is_hugepd(pg)) {
|
|
|
|
hpdp = (hugepd_t *)pg;
|
|
|
|
} else if (!pgd_none(*pg)) {
|
|
|
|
pdshift = PUD_SHIFT;
|
|
|
|
pu = pud_offset(pg, ea);
|
|
|
|
if (is_hugepd(pu))
|
|
|
|
hpdp = (hugepd_t *)pu;
|
|
|
|
else if (!pud_none(*pu)) {
|
|
|
|
pdshift = PMD_SHIFT;
|
|
|
|
pm = pmd_offset(pu, ea);
|
|
|
|
if (is_hugepd(pm))
|
|
|
|
hpdp = (hugepd_t *)pm;
|
|
|
|
else if (!pmd_none(*pm)) {
|
|
|
|
return pte_offset_map(pm, ea);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!hpdp)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (shift)
|
|
|
|
*shift = hugepd_shift(*hpdp);
|
|
|
|
return hugepte_offset(hpdp, ea, pdshift);
|
|
|
|
}
|
|
|
|
|
|
|
|
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
|
|
|
|
{
|
|
|
|
return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
|
|
|
|
}
|
|
|
|
|
2006-04-28 09:02:51 +04:00
|
|
|
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
unsigned long address, unsigned pdshift, unsigned pshift)
|
2006-04-28 09:02:51 +04:00
|
|
|
{
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
|
2009-10-28 19:27:18 +03:00
|
|
|
GFP_KERNEL|__GFP_REPEAT);
|
2006-04-28 09:02:51 +04:00
|
|
|
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
BUG_ON(pshift > HUGEPD_SHIFT_MASK);
|
|
|
|
BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
|
|
|
|
|
2006-04-28 09:02:51 +04:00
|
|
|
if (! new)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
spin_lock(&mm->page_table_lock);
|
|
|
|
if (!hugepd_none(*hpdp))
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
|
2006-04-28 09:02:51 +04:00
|
|
|
else
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
|
2006-04-28 09:02:51 +04:00
|
|
|
spin_unlock(&mm->page_table_lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
|
2008-09-05 05:49:54 +04:00
|
|
|
{
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
pgd_t *pg;
|
|
|
|
pud_t *pu;
|
|
|
|
pmd_t *pm;
|
|
|
|
hugepd_t *hpdp = NULL;
|
|
|
|
unsigned pshift = __ffs(sz);
|
|
|
|
unsigned pdshift = PGDIR_SHIFT;
|
|
|
|
|
|
|
|
addr &= ~(sz-1);
|
|
|
|
|
|
|
|
pg = pgd_offset(mm, addr);
|
|
|
|
if (pshift >= PUD_SHIFT) {
|
|
|
|
hpdp = (hugepd_t *)pg;
|
|
|
|
} else {
|
|
|
|
pdshift = PUD_SHIFT;
|
|
|
|
pu = pud_alloc(mm, pg, addr);
|
|
|
|
if (pshift >= PMD_SHIFT) {
|
|
|
|
hpdp = (hugepd_t *)pu;
|
|
|
|
} else {
|
|
|
|
pdshift = PMD_SHIFT;
|
|
|
|
pm = pmd_alloc(mm, pu, addr);
|
|
|
|
hpdp = (hugepd_t *)pm;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!hpdp)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
|
|
|
|
|
|
|
|
if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return hugepte_offset(hpdp, addr, pdshift);
|
2008-01-04 01:59:50 +03:00
|
|
|
}
|
|
|
|
|
2008-07-24 08:27:54 +04:00
|
|
|
/* Build list of addresses of gigantic pages. This function is used in early
|
|
|
|
* boot before the buddy or bootmem allocator is setup.
|
|
|
|
*/
|
|
|
|
void add_gpage(unsigned long addr, unsigned long page_size,
|
|
|
|
unsigned long number_of_pages)
|
|
|
|
{
|
|
|
|
if (!addr)
|
|
|
|
return;
|
|
|
|
while (number_of_pages > 0) {
|
|
|
|
gpage_freearray[nr_gpages] = addr;
|
|
|
|
nr_gpages++;
|
|
|
|
number_of_pages--;
|
|
|
|
addr += page_size;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-07-24 08:27:53 +04:00
|
|
|
/* Moves the gigantic page addresses from the temporary list to the
|
2008-07-24 08:27:56 +04:00
|
|
|
* huge_boot_pages list.
|
|
|
|
*/
|
|
|
|
int alloc_bootmem_huge_page(struct hstate *hstate)
|
2008-07-24 08:27:53 +04:00
|
|
|
{
|
|
|
|
struct huge_bootmem_page *m;
|
|
|
|
if (nr_gpages == 0)
|
|
|
|
return 0;
|
|
|
|
m = phys_to_virt(gpage_freearray[--nr_gpages]);
|
|
|
|
gpage_freearray[nr_gpages] = 0;
|
|
|
|
list_add(&m->list, &huge_boot_pages);
|
2008-07-24 08:27:56 +04:00
|
|
|
m->hstate = hstate;
|
2008-07-24 08:27:53 +04:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2006-12-07 07:32:03 +03:00
|
|
|
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
|
|
|
|
unsigned long start, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling)
|
2006-04-28 09:02:51 +04:00
|
|
|
{
|
|
|
|
pte_t *hugepte = hugepd_page(*hpdp);
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
unsigned shift = hugepd_shift(*hpdp);
|
|
|
|
unsigned long pdmask = ~((1UL << pdshift) - 1);
|
|
|
|
|
|
|
|
start &= pdmask;
|
|
|
|
if (start < floor)
|
|
|
|
return;
|
|
|
|
if (ceiling) {
|
|
|
|
ceiling &= pdmask;
|
|
|
|
if (! ceiling)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (end - 1 > ceiling - 1)
|
|
|
|
return;
|
2006-04-28 09:02:51 +04:00
|
|
|
|
|
|
|
hpdp->pd = 0;
|
|
|
|
tlb->need_flush = 1;
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
pgtable_free_tlb(tlb, hugepte, pdshift - shift);
|
2006-04-28 09:02:51 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
|
|
|
|
unsigned long addr, unsigned long end,
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
unsigned long floor, unsigned long ceiling)
|
2006-04-28 09:02:51 +04:00
|
|
|
{
|
|
|
|
pmd_t *pmd;
|
|
|
|
unsigned long next;
|
|
|
|
unsigned long start;
|
|
|
|
|
|
|
|
start = addr;
|
|
|
|
pmd = pmd_offset(pud, addr);
|
|
|
|
do {
|
|
|
|
next = pmd_addr_end(addr, end);
|
|
|
|
if (pmd_none(*pmd))
|
|
|
|
continue;
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
|
|
|
|
addr, next, floor, ceiling);
|
2006-04-28 09:02:51 +04:00
|
|
|
} while (pmd++, addr = next, addr != end);
|
|
|
|
|
|
|
|
start &= PUD_MASK;
|
|
|
|
if (start < floor)
|
|
|
|
return;
|
|
|
|
if (ceiling) {
|
|
|
|
ceiling &= PUD_MASK;
|
|
|
|
if (!ceiling)
|
|
|
|
return;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2006-04-28 09:02:51 +04:00
|
|
|
if (end - 1 > ceiling - 1)
|
|
|
|
return;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2006-04-28 09:02:51 +04:00
|
|
|
pmd = pmd_offset(pud, start);
|
|
|
|
pud_clear(pud);
|
mm: Pass virtual address to [__]p{te,ud,md}_free_tlb()
mm: Pass virtual address to [__]p{te,ud,md}_free_tlb()
Upcoming paches to support the new 64-bit "BookE" powerpc architecture
will need to have the virtual address corresponding to PTE page when
freeing it, due to the way the HW table walker works.
Basically, the TLB can be loaded with "large" pages that cover the whole
virtual space (well, sort-of, half of it actually) represented by a PTE
page, and which contain an "indirect" bit indicating that this TLB entry
RPN points to an array of PTEs from which the TLB can then create direct
entries. Thus, in order to invalidate those when PTE pages are deleted,
we need the virtual address to pass to tlbilx or tlbivax instructions.
The old trick of sticking it somewhere in the PTE page struct page sucks
too much, the address is almost readily available in all call sites and
almost everybody implemets these as macros, so we may as well add the
argument everywhere. I added it to the pmd and pud variants for consistency.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: David Howells <dhowells@redhat.com> [MN10300 & FRV]
Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com> [s390]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-07-22 09:44:28 +04:00
|
|
|
pmd_free_tlb(tlb, pmd, start);
|
2006-04-28 09:02:51 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling)
|
|
|
|
{
|
|
|
|
pud_t *pud;
|
|
|
|
unsigned long next;
|
|
|
|
unsigned long start;
|
|
|
|
|
|
|
|
start = addr;
|
|
|
|
pud = pud_offset(pgd, addr);
|
|
|
|
do {
|
|
|
|
next = pud_addr_end(addr, end);
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
if (!is_hugepd(pud)) {
|
2008-01-04 01:59:50 +03:00
|
|
|
if (pud_none_or_clear_bad(pud))
|
|
|
|
continue;
|
2008-07-24 08:27:56 +04:00
|
|
|
hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
ceiling);
|
2008-01-04 01:59:50 +03:00
|
|
|
} else {
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
|
|
|
|
addr, next, floor, ceiling);
|
2008-01-04 01:59:50 +03:00
|
|
|
}
|
2006-04-28 09:02:51 +04:00
|
|
|
} while (pud++, addr = next, addr != end);
|
|
|
|
|
|
|
|
start &= PGDIR_MASK;
|
|
|
|
if (start < floor)
|
|
|
|
return;
|
|
|
|
if (ceiling) {
|
|
|
|
ceiling &= PGDIR_MASK;
|
|
|
|
if (!ceiling)
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (end - 1 > ceiling - 1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
pud = pud_offset(pgd, start);
|
|
|
|
pgd_clear(pgd);
|
mm: Pass virtual address to [__]p{te,ud,md}_free_tlb()
mm: Pass virtual address to [__]p{te,ud,md}_free_tlb()
Upcoming paches to support the new 64-bit "BookE" powerpc architecture
will need to have the virtual address corresponding to PTE page when
freeing it, due to the way the HW table walker works.
Basically, the TLB can be loaded with "large" pages that cover the whole
virtual space (well, sort-of, half of it actually) represented by a PTE
page, and which contain an "indirect" bit indicating that this TLB entry
RPN points to an array of PTEs from which the TLB can then create direct
entries. Thus, in order to invalidate those when PTE pages are deleted,
we need the virtual address to pass to tlbilx or tlbivax instructions.
The old trick of sticking it somewhere in the PTE page struct page sucks
too much, the address is almost readily available in all call sites and
almost everybody implemets these as macros, so we may as well add the
argument everywhere. I added it to the pmd and pud variants for consistency.
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: David Howells <dhowells@redhat.com> [MN10300 & FRV]
Acked-by: Nick Piggin <npiggin@suse.de>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com> [s390]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-07-22 09:44:28 +04:00
|
|
|
pud_free_tlb(tlb, pud, start);
|
2006-04-28 09:02:51 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function frees user-level page tables of a process.
|
|
|
|
*
|
|
|
|
* Must be called with pagetable lock held.
|
|
|
|
*/
|
2008-07-24 08:27:10 +04:00
|
|
|
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
|
2006-04-28 09:02:51 +04:00
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
unsigned long floor, unsigned long ceiling)
|
|
|
|
{
|
|
|
|
pgd_t *pgd;
|
|
|
|
unsigned long next;
|
|
|
|
|
|
|
|
/*
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
* Because there are a number of different possible pagetable
|
|
|
|
* layouts for hugepage ranges, we limit knowledge of how
|
|
|
|
* things should be laid out to the allocation path
|
|
|
|
* (huge_pte_alloc(), above). Everything else works out the
|
|
|
|
* structure as it goes from information in the hugepd
|
|
|
|
* pointers. That means that we can't here use the
|
|
|
|
* optimization used in the normal page free_pgd_range(), of
|
|
|
|
* checking whether we're actually covering a large enough
|
|
|
|
* range to have to do anything at the top level of the walk
|
|
|
|
* instead of at the bottom.
|
2006-04-28 09:02:51 +04:00
|
|
|
*
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
* To make sense of this, you should probably go read the big
|
|
|
|
* block comment at the top of the normal free_pgd_range(),
|
|
|
|
* too.
|
2006-04-28 09:02:51 +04:00
|
|
|
*/
|
|
|
|
|
2008-07-24 08:27:10 +04:00
|
|
|
pgd = pgd_offset(tlb->mm, addr);
|
2006-04-28 09:02:51 +04:00
|
|
|
do {
|
|
|
|
next = pgd_addr_end(addr, end);
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
if (!is_hugepd(pgd)) {
|
2008-09-05 05:49:54 +04:00
|
|
|
if (pgd_none_or_clear_bad(pgd))
|
|
|
|
continue;
|
|
|
|
hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
|
|
|
|
} else {
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
|
|
|
|
addr, next, floor, ceiling);
|
2008-09-05 05:49:54 +04:00
|
|
|
}
|
2006-04-28 09:02:51 +04:00
|
|
|
} while (pgd++, addr = next, addr != end);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2005-08-05 13:39:06 +04:00
|
|
|
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pte_t *ptep, pte_t pte)
|
|
|
|
{
|
|
|
|
if (pte_present(*ptep)) {
|
2005-11-07 03:06:55 +03:00
|
|
|
/* We open-code pte_clear because we need to pass the right
|
2007-04-10 11:09:37 +04:00
|
|
|
* argument to hpte_need_flush (huge / !huge). Might not be
|
|
|
|
* necessary anymore if we make hpte_need_flush() get the
|
|
|
|
* page size from the slices
|
2005-11-07 03:06:55 +03:00
|
|
|
*/
|
2009-10-26 22:24:31 +03:00
|
|
|
pte_update(mm, addr, ptep, ~0UL, 1);
|
2005-08-05 13:39:06 +04:00
|
|
|
}
|
2005-11-07 03:06:55 +03:00
|
|
|
*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2005-08-05 13:39:06 +04:00
|
|
|
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
|
|
|
|
pte_t *ptep)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2007-04-10 11:09:37 +04:00
|
|
|
unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
|
2005-08-05 13:39:06 +04:00
|
|
|
return __pte(old);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
struct page *
|
|
|
|
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
|
|
|
|
{
|
|
|
|
pte_t *ptep;
|
|
|
|
struct page *page;
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
unsigned shift;
|
|
|
|
unsigned long mask;
|
|
|
|
|
|
|
|
ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-07-24 08:27:56 +04:00
|
|
|
/* Verify it is a huge page else bail. */
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
if (!ptep || !shift)
|
2005-04-17 02:20:36 +04:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
mask = (1UL << shift) - 1;
|
2005-04-17 02:20:36 +04:00
|
|
|
page = pte_page(*ptep);
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
if (page)
|
|
|
|
page += (address & mask) / PAGE_SIZE;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
int pmd_huge(pmd_t pmd)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-07-24 08:27:50 +04:00
|
|
|
int pud_huge(pud_t pud)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
struct page *
|
|
|
|
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
|
|
|
|
pmd_t *pmd, int write)
|
|
|
|
{
|
|
|
|
BUG();
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
|
|
|
|
unsigned long end, int write, struct page **pages, int *nr)
|
|
|
|
{
|
|
|
|
unsigned long mask;
|
|
|
|
unsigned long pte_end;
|
|
|
|
struct page *head, *page;
|
|
|
|
pte_t pte;
|
|
|
|
int refs;
|
|
|
|
|
|
|
|
pte_end = (addr + sz) & ~(sz-1);
|
|
|
|
if (pte_end < end)
|
|
|
|
end = pte_end;
|
|
|
|
|
|
|
|
pte = *ptep;
|
|
|
|
mask = _PAGE_PRESENT | _PAGE_USER;
|
|
|
|
if (write)
|
|
|
|
mask |= _PAGE_RW;
|
|
|
|
|
|
|
|
if ((pte_val(pte) & mask) != mask)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* hugepages are never "special" */
|
|
|
|
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
|
|
|
|
|
|
|
|
refs = 0;
|
|
|
|
head = pte_page(pte);
|
|
|
|
|
|
|
|
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
|
|
|
|
do {
|
|
|
|
VM_BUG_ON(compound_head(page) != head);
|
|
|
|
pages[*nr] = page;
|
|
|
|
(*nr)++;
|
|
|
|
page++;
|
|
|
|
refs++;
|
|
|
|
} while (addr += PAGE_SIZE, addr != end);
|
|
|
|
|
|
|
|
if (!page_cache_add_speculative(head, refs)) {
|
|
|
|
*nr -= refs;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
|
|
|
|
/* Could be optimized better */
|
|
|
|
while (*nr) {
|
|
|
|
put_page(page);
|
|
|
|
(*nr)--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
|
|
|
|
unsigned long addr, unsigned long end,
|
|
|
|
int write, struct page **pages, int *nr)
|
|
|
|
{
|
|
|
|
pte_t *ptep;
|
|
|
|
unsigned long sz = 1UL << hugepd_shift(*hugepd);
|
|
|
|
|
|
|
|
ptep = hugepte_offset(hugepd, addr, pdshift);
|
|
|
|
do {
|
|
|
|
if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
|
|
|
|
return 0;
|
|
|
|
} while (ptep++, addr += sz, addr != end);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags)
|
|
|
|
{
|
2008-07-24 08:27:56 +04:00
|
|
|
struct hstate *hstate = hstate_file(file);
|
|
|
|
int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
|
2008-12-04 07:07:54 +03:00
|
|
|
|
2008-07-24 08:27:56 +04:00
|
|
|
return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
2009-01-07 01:38:54 +03:00
|
|
|
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
|
|
|
|
|
|
|
|
return 1UL << mmu_psize_to_shift(psize);
|
|
|
|
}
|
|
|
|
|
2009-10-26 22:24:31 +03:00
|
|
|
static int __init add_huge_page_size(unsigned long long size)
|
2008-01-04 01:59:50 +03:00
|
|
|
{
|
2009-10-26 22:24:31 +03:00
|
|
|
int shift = __ffs(size);
|
|
|
|
int mmu_psize;
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
|
2008-01-04 01:59:50 +03:00
|
|
|
/* Check that it is a page size supported by the hardware and
|
2009-10-26 22:24:31 +03:00
|
|
|
* that it fits within pagetable and slice limits. */
|
|
|
|
if (!is_power_of_2(size)
|
|
|
|
|| (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
|
|
|
|
return -EINVAL;
|
2008-07-24 08:27:55 +04:00
|
|
|
|
2009-10-26 22:24:31 +03:00
|
|
|
if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SPU_FS_64K_LS
|
|
|
|
/* Disable support for 64K huge pages when 64K SPU local store
|
|
|
|
* support is enabled as the current implementation conflicts.
|
|
|
|
*/
|
|
|
|
if (shift == PAGE_SHIFT_64K)
|
|
|
|
return -EINVAL;
|
|
|
|
#endif /* CONFIG_SPU_FS_64K_LS */
|
|
|
|
|
|
|
|
BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
|
|
|
|
|
|
|
|
/* Return if huge page size has already been setup */
|
|
|
|
if (size_to_hstate(size))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
hugetlb_add_hstate(shift - PAGE_SHIFT);
|
|
|
|
|
|
|
|
return 0;
|
2008-01-04 01:59:50 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int __init hugepage_setup_sz(char *str)
|
|
|
|
{
|
|
|
|
unsigned long long size;
|
|
|
|
|
|
|
|
size = memparse(str, &str);
|
|
|
|
|
2009-10-26 22:24:31 +03:00
|
|
|
if (add_huge_page_size(size) != 0)
|
2008-01-04 01:59:50 +03:00
|
|
|
printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("hugepagesz=", hugepage_setup_sz);
|
|
|
|
|
2006-04-28 09:02:51 +04:00
|
|
|
static int __init hugetlbpage_init(void)
|
|
|
|
{
|
powerpc/mm: Allow more flexible layouts for hugepage pagetables
Currently each available hugepage size uses a slightly different
pagetable layout: that is, the bottem level table of pointers to
hugepages is a different size, and may branch off from the normal page
tables at a different level. Every hugepage aware path that needs to
walk the pagetables must therefore look up the hugepage size from the
slice info first, and work out the correct way to walk the pagetables
accordingly. Future hardware is likely to add more possible hugepage
sizes, more layout options and more mess.
This patch, therefore reworks the handling of hugepage pagetables to
reduce this complexity. In the new scheme, instead of having to
consult the slice mask, pagetable walking code can check a flag in the
PGD/PUD/PMD entries to see where to branch off to hugepage pagetables,
and the entry also contains the information (eseentially hugepage
shift) necessary to then interpret that table without recourse to the
slice mask. This scheme can be extended neatly to handle multiple
levels of self-describing "special" hugepage pagetables, although for
now we assume only one level exists.
This approach means that only the pagetable allocation path needs to
know how the pagetables should be set out. All other (hugepage)
pagetable walking paths can just interpret the structure as they go.
There already was a flag bit in PGD/PUD/PMD entries for hugepage
directory pointers, but it was only used for debug. We alter that
flag bit to instead be a 0 in the MSB to indicate a hugepage pagetable
pointer (normally it would be 1 since the pointer lies in the linear
mapping). This means that asm pagetable walking can test for (and
punt on) hugepage pointers with the same test that checks for
unpopulated page directory entries (beq becomes bge), since hugepage
pointers will always be positive, and normal pointers always negative.
While we're at it, we get rid of the confusing (and grep defeating)
#defining of hugepte_shift to be the same thing as mmu_huge_psizes.
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
2009-10-26 22:24:31 +03:00
|
|
|
int psize;
|
2008-07-24 08:27:56 +04:00
|
|
|
|
2006-04-28 09:02:51 +04:00
|
|
|
if (!cpu_has_feature(CPU_FTR_16M_PAGE))
|
|
|
|
return -ENODEV;
|
2008-07-28 10:13:18 +04:00
|
|
|
|
2009-10-26 22:24:31 +03:00
|
|
|
for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
|
|
|
|
unsigned shift;
|
|
|
|
unsigned pdshift;
|
2008-07-24 08:27:56 +04:00
|
|
|
|
2009-10-26 22:24:31 +03:00
|
|
|
if (!mmu_psize_defs[psize].shift)
|
|
|
|
continue;
|
2008-07-28 10:13:18 +04:00
|
|
|
|
2009-10-26 22:24:31 +03:00
|
|
|
shift = mmu_psize_to_shift(psize);
|
|
|
|
|
|
|
|
if (add_huge_page_size(1ULL << shift) < 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (shift < PMD_SHIFT)
|
|
|
|
pdshift = PMD_SHIFT;
|
|
|
|
else if (shift < PUD_SHIFT)
|
|
|
|
pdshift = PUD_SHIFT;
|
|
|
|
else
|
|
|
|
pdshift = PGDIR_SHIFT;
|
|
|
|
|
|
|
|
pgtable_cache_add(pdshift - shift, NULL);
|
|
|
|
if (!PGT_CACHE(pdshift - shift))
|
|
|
|
panic("hugetlbpage_init(): could not create "
|
|
|
|
"pgtable cache for %d bit pagesize\n", shift);
|
2008-07-24 08:27:56 +04:00
|
|
|
}
|
2006-04-28 09:02:51 +04:00
|
|
|
|
2009-10-26 22:24:31 +03:00
|
|
|
/* Set default large page size. Currently, we pick 16M or 1M
|
|
|
|
* depending on what is available
|
|
|
|
*/
|
|
|
|
if (mmu_psize_defs[MMU_PAGE_16M].shift)
|
|
|
|
HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
|
|
|
|
else if (mmu_psize_defs[MMU_PAGE_1M].shift)
|
|
|
|
HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
|
|
|
|
|
2006-04-28 09:02:51 +04:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
module_init(hugetlbpage_init);
|