diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 700301bc5190..0ebfbc8f0449 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1021,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, */ return true; } + + +#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit +static inline bool arch_needs_pgtable_deposit(void) +{ + if (radix_enabled()) + return false; + return true; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2065e81701fc..18af2bcefe6a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -652,6 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif +#ifndef arch_needs_pgtable_deposit +#define arch_needs_pgtable_deposit() (false) +#endif /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/Kconfig b/mm/Kconfig index 33a9b06ec618..9b8fccb969dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -447,13 +447,9 @@ choice benefit. endchoice -# -# We don't deposit page tables on file THP mapping, -# but Power makes use of them to address MMU quirk. -# config TRANSPARENT_HUGE_PAGECACHE def_bool y - depends on TRANSPARENT_HUGEPAGE && !PPC + depends on TRANSPARENT_HUGEPAGE # # UP and nommu archs use km based percpu allocator diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b54044c21076..2b44ac11178f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1380,6 +1380,15 @@ out_unlocked: return ret; } +static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t pgtable; + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pte_free(mm, pgtable); + atomic_long_dec(&mm->nr_ptes); +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1421,6 +1430,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, atomic_long_dec(&tlb->mm->nr_ptes); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -1607,6 +1618,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * We are going to unmap this huge page. So + * just go ahead and zap it + */ + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7a50c726c5ae..09460955e818 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; unsigned long addr; pmd_t *pmd, _pmd; + bool deposited = false; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); + /* + * now deposit the pgtable for arch that need it + * otherwise free it. + */ + if (arch_needs_pgtable_deposit()) { + /* + * The deposit should be visibile only after + * collapse is seen by others. + */ + smp_wmb(); + pgtable_trans_huge_deposit(vma->vm_mm, pmd, + pmd_pgtable(_pmd)); + deposited = true; + } spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + if (!deposited) { + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } } } i_mmap_unlock_write(mapping); diff --git a/mm/memory.c b/mm/memory.c index 0a72f821ccdc..32e9b7aec366 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2935,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } +static void deposit_prealloc_pte(struct fault_env *fe) +{ + struct vm_area_struct *vma = fe->vma; + + pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + /* + * We are going to consume the prealloc table, + * count that as nr_ptes. + */ + atomic_long_inc(&vma->vm_mm->nr_ptes); + fe->prealloc_pte = 0; +} + static int do_set_pmd(struct fault_env *fe, struct page *page) { struct vm_area_struct *vma = fe->vma; @@ -2949,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = VM_FAULT_FALLBACK; page = compound_head(page); + /* + * Archs like ppc64 need additonal space to store information + * related to pte entry. Use the preallocated table for that. + */ + if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { + fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); + if (!fe->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); if (unlikely(!pmd_none(*fe->pmd))) goto out; @@ -2962,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); page_add_file_rmap(page, true); + /* + * deposit and withdraw with pmd lock held + */ + if (arch_needs_pgtable_deposit()) + deposit_prealloc_pte(fe); set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); @@ -2971,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = 0; count_vm_event(THP_FILE_MAPPED); out: + /* + * If we are going to fallback to pte mapping, do a + * withdraw with pmd lock held. + */ + if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) + fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + fe->pmd); spin_unlock(fe->ptl); return ret; } @@ -3010,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, ret = do_set_pmd(fe, page); if (ret != VM_FAULT_FALLBACK) - return ret; + goto fault_handled; } if (!fe->pte) { ret = pte_alloc_one_map(fe); if (ret) - return ret; + goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) - return VM_FAULT_NOPAGE; + if (unlikely(!pte_none(*fe->pte))) { + ret = VM_FAULT_NOPAGE; + goto fault_handled; + } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3041,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, fe->address, fe->pte); + ret = 0; - return 0; +fault_handled: + /* preallocated pagetable is unused: free it */ + if (fe->prealloc_pte) { + pte_free(fe->vma->vm_mm, fe->prealloc_pte); + fe->prealloc_pte = 0; + } + return ret; } static unsigned long fault_around_bytes __read_mostly = @@ -3141,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); - /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; - } /* Huge page is mapped? Page fault is solved */ if (pmd_trans_huge(*fe->pmd)) { ret = VM_FAULT_NOPAGE;