s390/pageattr: allow kernel page table splitting

set_memory_ro() and set_memory_rw() currently only work on 4k
mappings, which is good enough for module code aka the vmalloc area.

However, we have already run twice into the need to make this work
on larger mappings as well:
- the ro after init patch set
- the crash kernel resize code

Therefore this patch implements automatic kernel page table splitting
if e.g. set_memory_ro() would be called on parts of a 2G mapping.
This works quite the same as the x86 code, but is much simpler.

In order to make this work and to be architecturally compliant we now
always use the csp, cspg or crdte instructions to replace valid page
table entries. This means that set_memory_ro() and set_memory_rw()
will be much more expensive than before. In order to avoid huge
latencies the code contains a couple of cond_resched() calls.

The current code only splits page tables, but does not merge them if
it would be possible.  The reason for this is that currently there is
no real-life scenario where this would really happen. All current use
cases that I know of only change access rights once during the
lifetime. If that should change we can still implement kernel page table
merging at a later time.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
Heiko Carstens 2016-05-17 10:50:15 +02:00 коммит произвёл Martin Schwidefsky
Родитель 9e20b4dac1
Коммит e8a97e42dc
3 изменённых файлов: 252 добавлений и 35 удалений

Просмотреть файл

@ -34,6 +34,8 @@
extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096))); extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096)));
extern void paging_init(void); extern void paging_init(void);
extern void vmem_map_init(void); extern void vmem_map_init(void);
pmd_t *vmem_pmd_alloc(void);
pte_t *vmem_pte_alloc(void);
/* /*
* The S390 doesn't have any external MMU info: the kernel page * The S390 doesn't have any external MMU info: the kernel page
@ -477,6 +479,40 @@ static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new)
: "cc"); : "cc");
} }
/*
 * Replace the 64-bit value at *ptr with "new" by issuing the cspg
 * instruction, which is emitted through .insn rre with opcode 0xb98a0000.
 *
 * "old" is bound to register 2 and "new" to register 3, so both travel in
 * the pair starting at register 2. The operand address is passed in a
 * separate register with its lowest bit set.
 *
 * NOTE(review): presumably the store only happens if *ptr still equals
 * "old", and the set low bit of the address requests a TLB purge - confirm
 * against the z/Architecture Principles of Operation.
 *
 * Nothing is returned; the caller cannot tell whether the swap took place.
 */
static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new)
{
register unsigned long reg2 asm("2") = old;
register unsigned long reg3 asm("3") = new;
/* operand address with bit 0 set (see NOTE above) */
unsigned long address = (unsigned long)ptr | 1;
asm volatile(
" .insn rre,0xb98a0000,%0,%3"
/* reg2 is read and written; *ptr is declared as modified memory */
: "+d" (reg2), "+m" (*ptr)
: "d" (reg3), "d" (address)
: "cc");
}
/*
 * Table-type codes for crdte(): the selected value is OR-ed into the table
 * origin that is handed to the instruction (see reg4 in crdte()).
 */
#define CRDTE_DTT_PAGE 0x00UL
#define CRDTE_DTT_SEGMENT 0x10UL
#define CRDTE_DTT_REGION3 0x14UL
#define CRDTE_DTT_REGION2 0x18UL
#define CRDTE_DTT_REGION1 0x1cUL
/*
 * Issue the crdte instruction, emitted through .insn rrf with opcode
 * 0xb98f0000.
 *
 * @old:     value loaded into register 2 (also an output of the asm)
 * @new:     value loaded into register 3
 * @table:   table origin; combined with @dtt and loaded into register 4
 * @dtt:     one of the CRDTE_DTT_* codes above
 * @address: value loaded into register 5
 * @asce:    passed as an address-register operand
 *
 * NOTE(review): presumably @old/@new are the expected and replacement table
 * entries, @address is the virtual address whose translation is affected and
 * @asce selects the address space - confirm against the z/Architecture
 * Principles of Operation.
 *
 * The asm declares a "memory" clobber, so the compiler does not cache memory
 * contents across the call. Nothing is returned to the caller.
 */
static inline void crdte(unsigned long old, unsigned long new,
unsigned long table, unsigned long dtt,
unsigned long address, unsigned long asce)
{
register unsigned long reg2 asm("2") = old;
register unsigned long reg3 asm("3") = new;
register unsigned long reg4 asm("4") = table | dtt;
register unsigned long reg5 asm("5") = address;
asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0"
: "+d" (reg2)
: "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce)
: "memory", "cc");
}
/* /*
* pgd/pmd/pte query functions * pgd/pmd/pte query functions
*/ */

Просмотреть файл

@ -40,54 +40,235 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
} }
#endif #endif
static pte_t *walk_page_table(unsigned long addr) static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
unsigned long dtt)
{ {
pgd_t *pgdp; unsigned long table, mask;
pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
pgdp = pgd_offset_k(addr); mask = 0;
if (pgd_none(*pgdp)) if (MACHINE_HAS_EDAT2) {
return NULL; switch (dtt) {
pudp = pud_offset(pgdp, addr); case CRDTE_DTT_REGION3:
if (pud_none(*pudp) || pud_large(*pudp)) mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1);
return NULL; break;
pmdp = pmd_offset(pudp, addr); case CRDTE_DTT_SEGMENT:
if (pmd_none(*pmdp) || pmd_large(*pmdp)) mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
return NULL; break;
ptep = pte_offset_kernel(pmdp, addr); case CRDTE_DTT_PAGE:
if (pte_none(*ptep)) mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
return NULL; break;
return ptep; }
table = (unsigned long)old & mask;
crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
} else if (MACHINE_HAS_IDTE) {
cspg(old, *old, new);
} else {
csp((unsigned int *)old + 1, *old, new);
}
} }
static void change_page_attr(unsigned long addr, int numpages, struct cpa {
pte_t (*set) (pte_t)) unsigned int set_ro : 1;
{ unsigned int clear_ro : 1;
pte_t *ptep; };
int i;
for (i = 0; i < numpages; i++) { static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
ptep = walk_page_table(addr); struct cpa cpa)
if (WARN_ON_ONCE(!ptep)) {
break; pte_t *ptep, new;
*ptep = set(*ptep);
ptep = pte_offset(pmdp, addr);
do {
if (pte_none(*ptep))
return -EINVAL;
if (cpa.set_ro)
new = pte_wrprotect(*ptep);
else if (cpa.clear_ro)
new = pte_mkwrite(pte_mkdirty(*ptep));
pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
ptep++;
addr += PAGE_SIZE; addr += PAGE_SIZE;
cond_resched();
} while (addr < end);
return 0;
}
static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
{
unsigned long pte_addr, prot;
pte_t *pt_dir, *ptep;
pmd_t new;
int i, ro;
pt_dir = vmem_pte_alloc();
if (!pt_dir)
return -ENOMEM;
pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT;
ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT);
prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL);
ptep = pt_dir;
for (i = 0; i < PTRS_PER_PTE; i++) {
pte_val(*ptep) = pte_addr | prot;
pte_addr += PAGE_SIZE;
ptep++;
} }
__tlb_flush_kernel(); pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
return 0;
}
/*
 * Change the write protection of the segment table entry *pmdp in place.
 *
 * @pmdp: entry to modify
 * @addr: virtual address covered by the entry; forwarded to pgt_set()
 * @cpa:  requested change; set_ro takes precedence over clear_ro
 *
 * The new value starts out as a copy of the current entry, so a request
 * with neither flag set rewrites the entry unchanged instead of storing
 * an uninitialized value.
 */
static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, struct cpa cpa)
{
	pmd_t new = *pmdp;

	if (cpa.set_ro)
		new = pmd_wrprotect(new);
	else if (cpa.clear_ro)
		new = pmd_mkwrite(pmd_mkdirty(new));
	/* valid entries are only ever replaced through pgt_set() */
	pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
}
/*
 * Apply @cpa to every segment table entry that maps [addr, end).
 *
 * Entries that are not large are handed down to walk_pte_level(). A large
 * entry that is only partially covered by the range is first split with
 * split_pmd_page() and then visited again; a fully covered large entry is
 * changed directly with modify_pmd_page().
 *
 * Returns 0 on success, -EINVAL if an empty entry is found, or the error
 * reported by split_pmd_page() / walk_pte_level().
 */
static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
			  struct cpa cpa)
{
	pmd_t *entry = pmd_offset(pudp, addr);
	unsigned long boundary;
	int err;

	do {
		if (pmd_none(*entry))
			return -EINVAL;
		boundary = pmd_addr_end(addr, end);
		if (!pmd_large(*entry)) {
			err = walk_pte_level(entry, addr, boundary, cpa);
			if (err)
				return err;
		} else if ((addr & ~PMD_MASK) || addr + PMD_SIZE > boundary) {
			err = split_pmd_page(entry, addr);
			if (err)
				return err;
			/* same entry again, it now points to a page table */
			continue;
		} else {
			modify_pmd_page(entry, addr, cpa);
		}
		entry++;
		addr = boundary;
		cond_resched();
	} while (addr < end);
	return 0;
}
/*
 * Replace the large region-third-table entry *pudp with a pointer to a
 * freshly allocated segment table that maps the same memory.
 *
 * Every new segment entry inherits the protection of the original entry:
 * SEGMENT_KERNEL_RO if _REGION_ENTRY_PROTECT was set, SEGMENT_KERNEL
 * otherwise.
 *
 * @pudp: entry to split
 * @addr: virtual address being processed; forwarded to pgt_set()
 *
 * Returns 0 on success or -ENOMEM if no segment table could be allocated.
 */
static int split_pud_page(pud_t *pudp, unsigned long addr)
{
	unsigned long seg_addr, seg_prot;
	pmd_t *table;
	pud_t entry;
	int idx;

	table = vmem_pmd_alloc();
	if (!table)
		return -ENOMEM;
	seg_addr = pud_pfn(*pudp) << PAGE_SHIFT;
	if (pud_val(*pudp) & _REGION_ENTRY_PROTECT)
		seg_prot = pgprot_val(SEGMENT_KERNEL_RO);
	else
		seg_prot = pgprot_val(SEGMENT_KERNEL);
	/* fill the complete table before it becomes visible */
	for (idx = 0; idx < PTRS_PER_PMD; idx++, seg_addr += PMD_SIZE)
		pmd_val(table[idx]) = seg_addr | seg_prot;
	pud_val(entry) = __pa(table) | _REGION3_ENTRY;
	pgt_set((unsigned long *)pudp, pud_val(entry), addr, CRDTE_DTT_REGION3);
	return 0;
}
/*
 * Change the write protection of the region-third-table entry *pudp in
 * place.
 *
 * @pudp: entry to modify
 * @addr: virtual address covered by the entry; forwarded to pgt_set()
 * @cpa:  requested change; set_ro takes precedence over clear_ro
 *
 * The new value starts out as a copy of the current entry, so a request
 * with neither flag set rewrites the entry unchanged instead of storing
 * an uninitialized value.
 */
static void modify_pud_page(pud_t *pudp, unsigned long addr, struct cpa cpa)
{
	pud_t new = *pudp;

	if (cpa.set_ro)
		new = pud_wrprotect(new);
	else if (cpa.clear_ro)
		new = pud_mkwrite(pud_mkdirty(new));
	/* valid entries are only ever replaced through pgt_set() */
	pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
}
/*
 * Apply @cpa to every region-third-table entry that maps [addr, end).
 *
 * Entries that are not large are handed down to walk_pmd_level(). A large
 * entry that is only partially covered by the range is first split with
 * split_pud_page() and then visited again; a fully covered large entry is
 * changed directly with modify_pud_page().
 *
 * Returns 0 on success, -EINVAL if an empty entry is found, or the error
 * reported by split_pud_page() / walk_pmd_level().
 */
static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end,
			  struct cpa cpa)
{
	pud_t *entry = pud_offset(pgd, addr);
	unsigned long boundary;
	int err = 0;

	do {
		if (pud_none(*entry))
			return -EINVAL;
		boundary = pud_addr_end(addr, end);
		if (!pud_large(*entry)) {
			/* a failure ends the loop via the condition below */
			err = walk_pmd_level(entry, addr, boundary, cpa);
		} else if ((addr & ~PUD_MASK) || addr + PUD_SIZE > boundary) {
			err = split_pud_page(entry, addr);
			if (err)
				break;
			/* same entry again, it now points to a segment table */
			continue;
		} else {
			modify_pud_page(entry, addr, cpa);
		}
		entry++;
		addr = boundary;
		cond_resched();
	} while (addr < end && !err);
	return err;
}
/* Serializes all page attribute changes done by change_page_attr(). */
static DEFINE_MUTEX(cpa_mutex);

/*
 * Apply @cpa to the kernel mappings of the range [addr, end).
 *
 * @addr: first address of the range
 * @end:  first address behind the range
 * @cpa:  requested protection change
 *
 * An empty range (addr == end) is a no-op. Without this check the
 * do/while walk below would still process one page, because its body
 * always runs at least once.
 *
 * Returns 0 on success or for an empty range, -EINVAL if @end is not
 * below MODULES_END or if the first top-level entry is empty, otherwise
 * the error reported by walk_pud_level(). If a later top-level entry is
 * empty the walk stops and the (successful) result of the preceding
 * entries is returned.
 */
static int change_page_attr(unsigned long addr, unsigned long end,
			    struct cpa cpa)
{
	unsigned long next;
	int rc = -EINVAL;
	pgd_t *pgdp;

	if (addr == end)
		return 0;
	if (end >= MODULES_END)
		return -EINVAL;
	mutex_lock(&cpa_mutex);
	pgdp = pgd_offset_k(addr);
	do {
		if (pgd_none(*pgdp))
			break;
		next = pgd_addr_end(addr, end);
		rc = walk_pud_level(pgdp, addr, next, cpa);
		if (rc)
			break;
		cond_resched();
	} while (pgdp++, addr = next, addr < end && !rc);
	mutex_unlock(&cpa_mutex);
	return rc;
}
int set_memory_ro(unsigned long addr, int numpages) int set_memory_ro(unsigned long addr, int numpages)
{ {
change_page_attr(addr, numpages, pte_wrprotect); struct cpa cpa = {
return 0; .set_ro = 1,
};
addr &= PAGE_MASK;
return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa);
} }
int set_memory_rw(unsigned long addr, int numpages) int set_memory_rw(unsigned long addr, int numpages)
{ {
change_page_attr(addr, numpages, pte_mkwrite); struct cpa cpa = {
return 0; .clear_ro = 1,
};
addr &= PAGE_MASK;
return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa);
} }
/* not possible */ /* not possible */

Просмотреть файл

@ -47,7 +47,7 @@ static inline pud_t *vmem_pud_alloc(void)
return pud; return pud;
} }
static inline pmd_t *vmem_pmd_alloc(void) pmd_t *vmem_pmd_alloc(void)
{ {
pmd_t *pmd = NULL; pmd_t *pmd = NULL;
@ -58,7 +58,7 @@ static inline pmd_t *vmem_pmd_alloc(void)
return pmd; return pmd;
} }
static pte_t __ref *vmem_pte_alloc(void) pte_t __ref *vmem_pte_alloc(void)
{ {
pte_t *pte; pte_t *pte;