From c74df32c724a1652ad8399b4891bb02c9d43743a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:23 -0700 Subject: [PATCH] [PATCH] mm: ptd_alloc take ptlock Second step in pushing down the page_table_lock. Remove the temporary bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not to hold page_table_lock, whether it's on init_mm or a user mm; take page_table_lock internally to check if a racing task already allocated. Convert their callers from common code. But avoid coming back to change them again later: instead of moving the spin_lock(&mm->page_table_lock) down, switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which encapsulate the mapping+locking and unlocking+unmapping together, and in the end may use alternatives to the mm page_table_lock itself. These callers all hold mmap_sem (some exclusively, some not), so at no level can a page table be whipped away from beneath them; and pte_alloc uses the "atomic" pmd_present to test whether it needs to allocate. It appears that on all arches we can safely descend without page_table_lock. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 14 +++--- include/linux/mm.h | 18 ++++++++ kernel/fork.c | 2 - mm/fremap.c | 48 ++++++++------------- mm/hugetlb.c | 12 ++++-- mm/memory.c | 104 ++++++++++++++------------------------------- mm/mremap.c | 27 ++++-------- 7 files changed, 90 insertions(+), 135 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 9bb55c8cf224..ba73797eb4cb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -309,25 +309,24 @@ void install_arg_page(struct vm_area_struct *vma, pud_t * pud; pmd_t * pmd; pte_t * pte; + spinlock_t *ptl; if (unlikely(anon_vma_prepare(vma))) - goto out_sig; + goto out; flush_dcache_page(page); pgd = pgd_offset(mm, address); - - spin_lock(&mm->page_table_lock); pud = pud_alloc(mm, pgd, address); if (!pud) goto out; pmd = pmd_alloc(mm, pud, address); if (!pmd) goto out; - pte = pte_alloc_map(mm, pmd, address); + pte = pte_alloc_map_lock(mm, pmd, address, &ptl); if (!pte) goto out; if (!pte_none(*pte)) { - pte_unmap(pte); + pte_unmap_unlock(pte, ptl); goto out; } inc_mm_counter(mm, anon_rss); @@ -335,14 +334,11 @@ void install_arg_page(struct vm_area_struct *vma, set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); page_add_anon_rmap(page, vma, address); - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); /* no need for flush_tlb */ return; out: - spin_unlock(&mm->page_table_lock); -out_sig: __free_page(page); force_sig(SIGKILL, current); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 22c2d6922c0e..d4c3512e7db4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -779,10 +779,28 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#define pte_offset_map_lock(mm, pmd, address, ptlp) \ +({ \ + spinlock_t *__ptl = &(mm)->page_table_lock; \ + pte_t *__pte = pte_offset_map(pmd, address); \ + *(ptlp) = __ptl; \ + spin_lock(__ptl); \ + __pte; \ +}) + +#define pte_unmap_unlock(pte, ptl) do { \ + spin_unlock(ptl); \ + pte_unmap(pte); \ +} while (0) + #define pte_alloc_map(mm, pmd, address) \ ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map_lock(mm, pmd, address, ptlp) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) + #define pte_alloc_kernel(pmd, address) \ ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) diff --git a/kernel/fork.c b/kernel/fork.c index 2a587b3224e3..8a069612eac3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -255,7 +255,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) /* * Link in the new vma and copy the page table entries. */ - spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; @@ -265,7 +264,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->map_count++; retval = copy_page_range(mm, oldmm, tmp); - spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/fremap.c b/mm/fremap.c index 49719a35769a..d862be3bc3e3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -63,23 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pgd_t *pgd; pte_t pte_val; + spinlock_t *ptl; BUG_ON(vma->vm_flags & VM_RESERVED); pgd = pgd_offset(mm, addr); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, addr); if (!pud) - goto err_unlock; - + goto out; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto err_unlock; - - pte = pte_alloc_map(mm, pmd, addr); + goto out; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - goto err_unlock; + goto out; /* * This page may have been truncated. Tell the @@ -89,10 +86,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, inode = vma->vm_file->f_mapping->host; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (!page->mapping || page->index >= size) - goto err_unlock; + goto unlock; err = -ENOMEM; if (page_mapcount(page) > INT_MAX/2) - goto err_unlock; + goto unlock; if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) inc_mm_counter(mm, file_rss); @@ -101,17 +98,15 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, pte, mk_pte(page, prot)); page_add_file_rmap(page); pte_val = *pte; - pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - err = 0; -err_unlock: - spin_unlock(&mm->page_table_lock); +unlock: + pte_unmap_unlock(pte, ptl); +out: return err; } EXPORT_SYMBOL(install_page); - /* * Install a file pte to a given virtual memory address, release any * previously existing mapping. @@ -125,23 +120,20 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pgd_t *pgd; pte_t pte_val; + spinlock_t *ptl; BUG_ON(vma->vm_flags & VM_RESERVED); pgd = pgd_offset(mm, addr); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, addr); if (!pud) - goto err_unlock; - + goto out; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto err_unlock; - - pte = pte_alloc_map(mm, pmd, addr); + goto out; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - goto err_unlock; + goto out; if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { update_hiwater_rss(mm); @@ -150,17 +142,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; - pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - return 0; - -err_unlock: - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); + err = 0; +out: return err; } - /*** * sys_remap_file_pages - remap arbitrary pages of a shared backing store * file within an existing vma. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac5f044bf514..ea0826ff2663 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -277,12 +277,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long addr; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + src_pte = huge_pte_offset(src, addr); + if (!src_pte) + continue; dst_pte = huge_pte_alloc(dst, addr); if (!dst_pte) goto nomem; + spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); - src_pte = huge_pte_offset(src, addr); - if (src_pte && !pte_none(*src_pte)) { + if (!pte_none(*src_pte)) { entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -290,6 +293,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); } return 0; @@ -354,7 +358,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) hugetlb_prefault_arch_hook(mm); - spin_lock(&mm->page_table_lock); for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { unsigned long idx; pte_t *pte = huge_pte_alloc(mm, addr); @@ -389,11 +392,12 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) goto out; } } + spin_lock(&mm->page_table_lock); add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); + spin_unlock(&mm->page_table_lock); } out: - spin_unlock(&mm->page_table_lock); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 4bdd1186b43b..a40e4b1cee4f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -282,14 +282,11 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { - struct page *new; - - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); + struct page *new = pte_alloc_one(mm, address); if (!new) return -ENOMEM; + spin_lock(&mm->page_table_lock); if (pmd_present(*pmd)) /* Another has populated it */ pte_free(new); else { @@ -297,6 +294,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } + spin_unlock(&mm->page_table_lock); return 0; } @@ -344,9 +342,6 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. - * - * dst->page_table_lock is held on entry and exit, - * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). */ static inline void @@ -419,17 +414,19 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; int progress = 0; int rss[2]; again: rss[1] = rss[0] = 0; - dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = &src_mm->page_table_lock; + spin_lock(src_ptl); - spin_lock(&src_mm->page_table_lock); do { /* * We are holding two locks at this point - either of them @@ -438,8 +435,8 @@ again: if (progress >= 32) { progress = 0; if (need_resched() || - need_lockbreak(&src_mm->page_table_lock) || - need_lockbreak(&dst_mm->page_table_lock)) + need_lockbreak(src_ptl) || + need_lockbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { @@ -449,12 +446,12 @@ again: copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); - spin_unlock(&src_mm->page_table_lock); + spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); - pte_unmap(dst_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); - cond_resched_lock(&dst_mm->page_table_lock); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); if (addr != end) goto again; return 0; @@ -1049,8 +1046,9 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot) { pte_t *pte; + spinlock_t *ptl; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; do { @@ -1062,7 +1060,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1112,14 +1110,12 @@ int zeromap_page_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = zeromap_pud_range(mm, pgd, addr, next, prot); if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } @@ -1133,8 +1129,9 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long pfn, pgprot_t prot) { pte_t *pte; + spinlock_t *ptl; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; do { @@ -1142,7 +1139,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1210,7 +1207,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = remap_pud_range(mm, pgd, addr, next, @@ -1218,7 +1214,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -1985,17 +1980,9 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * Note the "page_table_lock". It is to protect against kswapd removing - * pages from under us. Note that kswapd only ever _removes_ pages, never - * adds them. As such, once we have noticed that the page is not present, - * we can drop the lock early. - * - * The adding of pages is protected by the MM semaphore (which we hold), - * so we don't need to worry about a page being suddenly been added into - * our VM. - * - * We enter with the pagetable spinlock held, we are supposed to - * release it when done. + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -2003,6 +1990,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, { pte_t entry; + spin_lock(&mm->page_table_lock); entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { @@ -2051,30 +2039,18 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); - /* - * We need the page table lock to synchronize with kswapd - * and the SMP-safe atomic PTE updates. - */ pgd = pgd_offset(mm, address); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, address); if (!pud) - goto oom; - + return VM_FAULT_OOM; pmd = pmd_alloc(mm, pud, address); if (!pmd) - goto oom; - + return VM_FAULT_OOM; pte = pte_alloc_map(mm, pmd, address); if (!pte) - goto oom; - - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); + return VM_FAULT_OOM; - oom: - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } #ifndef __PAGETABLE_PUD_FOLDED @@ -2084,24 +2060,16 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { - pud_t *new; - - if (mm != &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); - new = pud_alloc_one(mm, address); - if (!new) { - if (mm != &init_mm) /* Temporary bridging hack */ - spin_lock(&mm->page_table_lock); + pud_t *new = pud_alloc_one(mm, address); + if (!new) return -ENOMEM; - } spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ pud_free(new); else pgd_populate(mm, pgd, new); - if (mm == &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ @@ -2113,16 +2081,9 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { - pmd_t *new; - - if (mm != &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - if (!new) { - if (mm != &init_mm) /* Temporary bridging hack */ - spin_lock(&mm->page_table_lock); + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) return -ENOMEM; - } spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK @@ -2136,8 +2097,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) else pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - if (mm == &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/mremap.c b/mm/mremap.c index 616facc3d28a..8de77b632a20 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -28,9 +28,6 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) pud_t *pud; pmd_t *pmd; - /* - * We don't need page_table_lock: we have mmap_sem exclusively. - */ pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) return NULL; @@ -50,25 +47,20 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd = NULL; + pmd_t *pmd; - /* - * We do need page_table_lock: because allocators expect that. - */ - spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) - goto out; + return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto out; + return NULL; if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) - pmd = NULL; -out: - spin_unlock(&mm->page_table_lock); + return NULL; + return pmd; } @@ -80,6 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct address_space *mapping = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; + spinlock_t *old_ptl; if (vma->vm_file) { /* @@ -95,9 +88,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_vma->vm_truncate_count = 0; } - spin_lock(&mm->page_table_lock); - old_pte = pte_offset_map(old_pmd, old_addr); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); + new_pte = pte_offset_map_nested(new_pmd, new_addr); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -110,8 +102,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } pte_unmap_nested(new_pte - 1); - pte_unmap(old_pte - 1); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); }