s390/kvm: remove delayed reallocation of page tables for KVM
Replacing a 2K page table with a 4K page table while a VMA is active for the affected memory region is fundamentally broken. Rip out the page table reallocation code and replace it with a simple system control 'vm.allocate_pgste'. If the system control is set, the page tables for all processes are allocated as full 4K pages, even for processes that do not need them. Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
Родитель
7e01b5acd8
Коммит
0b46e0a3ec
|
@ -14,7 +14,9 @@ typedef struct {
|
||||||
unsigned long asce_bits;
|
unsigned long asce_bits;
|
||||||
unsigned long asce_limit;
|
unsigned long asce_limit;
|
||||||
unsigned long vdso_base;
|
unsigned long vdso_base;
|
||||||
/* The mmu context has extended page tables. */
|
/* The mmu context allocates 4K page tables. */
|
||||||
|
unsigned int alloc_pgste:1;
|
||||||
|
/* The mmu context uses extended page tables. */
|
||||||
unsigned int has_pgste:1;
|
unsigned int has_pgste:1;
|
||||||
/* The mmu context uses storage keys. */
|
/* The mmu context uses storage keys. */
|
||||||
unsigned int use_skey:1;
|
unsigned int use_skey:1;
|
||||||
|
|
|
@ -20,8 +20,11 @@ static inline int init_new_context(struct task_struct *tsk,
|
||||||
mm->context.flush_mm = 0;
|
mm->context.flush_mm = 0;
|
||||||
mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
|
mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS;
|
||||||
mm->context.asce_bits |= _ASCE_TYPE_REGION3;
|
mm->context.asce_bits |= _ASCE_TYPE_REGION3;
|
||||||
|
#ifdef CONFIG_PGSTE
|
||||||
|
mm->context.alloc_pgste = page_table_allocate_pgste;
|
||||||
mm->context.has_pgste = 0;
|
mm->context.has_pgste = 0;
|
||||||
mm->context.use_skey = 0;
|
mm->context.use_skey = 0;
|
||||||
|
#endif
|
||||||
mm->context.asce_limit = STACK_TOP_MAX;
|
mm->context.asce_limit = STACK_TOP_MAX;
|
||||||
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
|
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -21,6 +21,7 @@ void crst_table_free(struct mm_struct *, unsigned long *);
|
||||||
unsigned long *page_table_alloc(struct mm_struct *);
|
unsigned long *page_table_alloc(struct mm_struct *);
|
||||||
void page_table_free(struct mm_struct *, unsigned long *);
|
void page_table_free(struct mm_struct *, unsigned long *);
|
||||||
void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
|
void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
|
||||||
|
extern int page_table_allocate_pgste;
|
||||||
|
|
||||||
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
|
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
|
||||||
unsigned long key, bool nq);
|
unsigned long key, bool nq);
|
||||||
|
|
|
@ -423,6 +423,15 @@ static inline int mm_has_pgste(struct mm_struct *mm)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Tell whether the mmu context of @mm allocates 4K page tables, i.e.
 * whether context.alloc_pgste is set.
 *
 * Returns 1 if the flag is set, 0 otherwise.  Without CONFIG_PGSTE the
 * flag is never consulted and the result is always 0.
 */
static inline int mm_alloc_pgste(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	/* The flag is expected to be clear for most address spaces. */
	return unlikely(mm->context.alloc_pgste) ? 1 : 0;
#else
	return 0;
#endif
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In the case that a guest uses storage keys
|
* In the case that a guest uses storage keys
|
||||||
* faults should no longer be backed by zero pages
|
* faults should no longer be backed by zero pages
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
#include <linux/rcupdate.h>
|
#include <linux/rcupdate.h>
|
||||||
#include <linux/slab.h>
|
#include <linux/slab.h>
|
||||||
#include <linux/swapops.h>
|
#include <linux/swapops.h>
|
||||||
|
#include <linux/sysctl.h>
|
||||||
#include <linux/ksm.h>
|
#include <linux/ksm.h>
|
||||||
#include <linux/mman.h>
|
#include <linux/mman.h>
|
||||||
|
|
||||||
|
@ -920,6 +921,40 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(get_guest_storage_key);
|
EXPORT_SYMBOL(get_guest_storage_key);
|
||||||
|
|
||||||
|
static int page_table_allocate_pgste_min = 0;
|
||||||
|
static int page_table_allocate_pgste_max = 1;
|
||||||
|
int page_table_allocate_pgste = 0;
|
||||||
|
EXPORT_SYMBOL(page_table_allocate_pgste);
|
||||||
|
|
||||||
|
static struct ctl_table page_table_sysctl[] = {
|
||||||
|
{
|
||||||
|
.procname = "allocate_pgste",
|
||||||
|
.data = &page_table_allocate_pgste,
|
||||||
|
.maxlen = sizeof(int),
|
||||||
|
.mode = S_IRUGO | S_IWUSR,
|
||||||
|
.proc_handler = proc_dointvec,
|
||||||
|
.extra1 = &page_table_allocate_pgste_min,
|
||||||
|
.extra2 = &page_table_allocate_pgste_max,
|
||||||
|
},
|
||||||
|
{ }
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct ctl_table page_table_sysctl_dir[] = {
|
||||||
|
{
|
||||||
|
.procname = "vm",
|
||||||
|
.maxlen = 0,
|
||||||
|
.mode = 0555,
|
||||||
|
.child = page_table_sysctl,
|
||||||
|
},
|
||||||
|
{ }
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Initcall that registers the page_table_sysctl_dir table.
 *
 * Returns 0 when register_sysctl_table() hands back a header and
 * -ENOMEM when it returns NULL.
 */
static int __init page_table_register_sysctl(void)
{
	if (!register_sysctl_table(page_table_sysctl_dir))
		return -ENOMEM;

	return 0;
}
__initcall(page_table_register_sysctl);
|
||||||
|
|
||||||
#else /* CONFIG_PGSTE */
|
#else /* CONFIG_PGSTE */
|
||||||
|
|
||||||
static inline int page_table_with_pgste(struct page *page)
|
static inline int page_table_with_pgste(struct page *page)
|
||||||
|
@ -963,7 +998,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
|
||||||
struct page *uninitialized_var(page);
|
struct page *uninitialized_var(page);
|
||||||
unsigned int mask, bit;
|
unsigned int mask, bit;
|
||||||
|
|
||||||
if (mm_has_pgste(mm))
|
if (mm_alloc_pgste(mm))
|
||||||
return page_table_alloc_pgste(mm);
|
return page_table_alloc_pgste(mm);
|
||||||
/* Allocate fragments of a 4K page as 1K/2K page table */
|
/* Allocate fragments of a 4K page as 1K/2K page table */
|
||||||
spin_lock_bh(&mm->context.list_lock);
|
spin_lock_bh(&mm->context.list_lock);
|
||||||
|
@ -1165,116 +1200,25 @@ static inline void thp_split_mm(struct mm_struct *mm)
|
||||||
}
|
}
|
||||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||||
|
|
||||||
/*
 * Pre-change code, shown on the removed side of this diff.
 *
 * Walk the pmd entries below @pud that cover [addr, end).  For every page
 * table that page_table_with_pgste() reports as lacking pgstes, allocate a
 * replacement with page_table_alloc_pgste(), copy the first PAGE_SIZE/2
 * bytes of the old table into it, re-populate the pmd under pmd_lock() and
 * free the old table through page_table_free_rcu().
 *
 * Returns the address reached by the walk (== end) on success, or -ENOMEM
 * converted to unsigned long when a replacement table cannot be allocated.
 */
static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
|
|
||||||
struct mm_struct *mm, pud_t *pud,
|
|
||||||
unsigned long addr, unsigned long end)
|
|
||||||
{
|
|
||||||
unsigned long next, *table, *new;
|
|
||||||
struct page *page;
|
|
||||||
spinlock_t *ptl;
|
|
||||||
pmd_t *pmd;
|
|
||||||
|
|
||||||
pmd = pmd_offset(pud, addr);
|
|
||||||
do {
|
|
||||||
next = pmd_addr_end(addr, end);
|
|
||||||
again:
|
|
||||||
if (pmd_none_or_clear_bad(pmd))
|
|
||||||
continue;
|
|
||||||
table = (unsigned long *) pmd_deref(*pmd);
|
|
||||||
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
|
|
||||||
if (page_table_with_pgste(page))
|
|
||||||
continue;
|
|
||||||
/* Allocate new page table with pgstes */
|
|
||||||
new = page_table_alloc_pgste(mm);
|
|
||||||
if (!new)
|
|
||||||
/* Negative errno in an unsigned long; the callers test it with IS_ERR_VALUE() */
return -ENOMEM;
|
|
||||||
|
|
||||||
ptl = pmd_lock(mm, pmd);
|
|
||||||
if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
|
|
||||||
/* Nuke pmd entry pointing to the "short" page table */
|
|
||||||
pmdp_flush_lazy(mm, addr, pmd);
|
|
||||||
pmd_clear(pmd);
|
|
||||||
/* Copy ptes from old table to new table */
|
|
||||||
memcpy(new, table, PAGE_SIZE/2);
|
|
||||||
clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
|
|
||||||
/* Establish new table */
|
|
||||||
pmd_populate(mm, pmd, (pte_t *) new);
|
|
||||||
/* Free old table with rcu, there might be a walker! */
|
|
||||||
page_table_free_rcu(tlb, table, addr);
|
|
||||||
new = NULL;
|
|
||||||
}
|
|
||||||
spin_unlock(ptl);
|
|
||||||
/* new is still set only if the pmd no longer pointed at table: free the unused table and retry this entry */
if (new) {
|
|
||||||
page_table_free_pgste(new);
|
|
||||||
goto again;
|
|
||||||
}
|
|
||||||
} while (pmd++, addr = next, addr != end);
|
|
||||||
|
|
||||||
return addr;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Pre-change code, shown on the removed side of this diff.
 *
 * Walk the pud entries below @pgd that cover [addr, end) and hand every
 * populated entry to page_table_realloc_pmd().
 *
 * Returns the address reached by the walk (== end) on success; an error
 * value from page_table_realloc_pmd() is passed through unchanged.
 */
static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
|
|
||||||
struct mm_struct *mm, pgd_t *pgd,
|
|
||||||
unsigned long addr, unsigned long end)
|
|
||||||
{
|
|
||||||
unsigned long next;
|
|
||||||
pud_t *pud;
|
|
||||||
|
|
||||||
pud = pud_offset(pgd, addr);
|
|
||||||
do {
|
|
||||||
next = pud_addr_end(addr, end);
|
|
||||||
if (pud_none_or_clear_bad(pud))
|
|
||||||
continue;
|
|
||||||
next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
|
|
||||||
/* next now holds either the address reached or an error value */
if (unlikely(IS_ERR_VALUE(next)))
|
|
||||||
return next;
|
|
||||||
} while (pud++, addr = next, addr != end);
|
|
||||||
|
|
||||||
return addr;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Pre-change code, shown on the removed side of this diff.
 *
 * Top level of the reallocation walk: iterate over the pgd entries of @mm
 * that cover [addr, end) and hand every populated entry to
 * page_table_realloc_pud().
 *
 * Returns 0 when the whole range has been walked; an error value from the
 * lower levels is passed through unchanged.
 */
static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
|
|
||||||
unsigned long addr, unsigned long end)
|
|
||||||
{
|
|
||||||
unsigned long next;
|
|
||||||
pgd_t *pgd;
|
|
||||||
|
|
||||||
pgd = pgd_offset(mm, addr);
|
|
||||||
do {
|
|
||||||
next = pgd_addr_end(addr, end);
|
|
||||||
if (pgd_none_or_clear_bad(pgd))
|
|
||||||
continue;
|
|
||||||
next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
|
|
||||||
if (unlikely(IS_ERR_VALUE(next)))
|
|
||||||
return next;
|
|
||||||
} while (pgd++, addr = next, addr != end);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* switch on pgstes for its userspace process (for kvm)
|
* switch on pgstes for its userspace process (for kvm)
|
||||||
*/
|
*/
|
||||||
/*
 * The old and the new version of this function are interleaved below, as
 * rendered by the side-by-side diff: a row shows the old line first and
 * the new line second; rows with a single line exist on one side only.
 *
 * New version: return 0 at once if the current mm already has pgstes and
 * -EINVAL if mm_alloc_pgste() says the mm does not allocate 4K page
 * tables.  Otherwise set context.has_pgste and call thp_split_mm() with
 * mmap_sem held for writing, then return 0.
 *
 * Old version: tried to reallocate all page tables in [0, TASK_SIZE) with
 * page_table_realloc() and returned -ENOMEM if has_pgste was still clear
 * afterwards.
 */
int s390_enable_sie(void)
|
int s390_enable_sie(void)
|
||||||
{
|
{
|
||||||
struct task_struct *tsk = current;
|
struct mm_struct *mm = current->mm;
|
||||||
struct mm_struct *mm = tsk->mm;
|
|
||||||
struct mmu_gather tlb;
|
|
||||||
|
|
||||||
/* Do we have pgstes? if yes, we are done */
|
/* Do we have pgstes? if yes, we are done */
|
||||||
if (mm_has_pgste(tsk->mm))
|
if (mm_has_pgste(mm))
|
||||||
return 0;
|
return 0;
|
||||||
|
/* Fail if the page tables are 2K */
|
||||||
|
if (!mm_alloc_pgste(mm))
|
||||||
|
return -EINVAL;
|
||||||
down_write(&mm->mmap_sem);
|
down_write(&mm->mmap_sem);
|
||||||
|
mm->context.has_pgste = 1;
|
||||||
/* split thp mappings and disable thp for future mappings */
|
/* split thp mappings and disable thp for future mappings */
|
||||||
thp_split_mm(mm);
|
thp_split_mm(mm);
|
||||||
/* Reallocate the page tables with pgstes */
|
|
||||||
tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
|
|
||||||
if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
|
|
||||||
mm->context.has_pgste = 1;
|
|
||||||
tlb_finish_mmu(&tlb, 0, TASK_SIZE);
|
|
||||||
up_write(&mm->mmap_sem);
|
up_write(&mm->mmap_sem);
|
||||||
return mm->context.has_pgste ? 0 : -ENOMEM;
|
return 0;
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL_GPL(s390_enable_sie);
|
EXPORT_SYMBOL_GPL(s390_enable_sie);
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче