mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
rmap_walk_anon() and try_to_unmap_anon() appear to be too careful about
locking the anon vma: while they need protection against anon vma list
modifications, they do not need exclusive access to the list itself.

Transforming this exclusive lock into a read-locked rwsem removes a
global lock from the hot path of page-migration intense threaded
workloads, which can cause pathological performance like this:

    96.43%        process 0  [kernel.kallsyms]  [k] perf_trace_sched_switch
                  |
                  --- perf_trace_sched_switch
                      __schedule
                      schedule
                      schedule_preempt_disabled
                      __mutex_lock_common.isra.6
                      __mutex_lock_slowpath
                      mutex_lock
                      |
                      |--50.61%-- rmap_walk
                      |          move_to_new_page
                      |          migrate_pages
                      |          migrate_misplaced_page
                      |          __do_numa_page.isra.69
                      |          handle_pte_fault
                      |          handle_mm_fault
                      |          __do_page_fault
                      |          do_page_fault
                      |          page_fault
                      |          __memset_sse2
                      |          |
                      |           --100.00%-- worker_thread
                      |                     |
                      |                      --100.00%-- start_thread
                      |
                       --49.39%-- page_lock_anon_vma
                                  try_to_unmap_anon
                                  try_to_unmap
                                  migrate_pages
                                  migrate_misplaced_page
                                  __do_numa_page.isra.69
                                  handle_pte_fault
                                  handle_mm_fault
                                  __do_page_fault
                                  do_page_fault
                                  page_fault
                                  __memset_sse2
                                  |
                                   --100.00%-- worker_thread
                                               start_thread

With this change applied the profile is now nicely flat and there is no
anon-vma related scheduling/blocking.

Also rename anon_vma_[un]lock() => anon_vma_[un]lock_write(), to make it
clearer that it is an exclusive write-lock in that case - suggested by
Rik van Riel.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Turner <pjt@google.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
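[Editor's note] The change boils down to taking anon_vma->root->rwsem shared on
the walk-only paths and exclusive only where the VMA list is actually modified.
As a rough, runnable userspace analogy (a minimal sketch using POSIX rwlocks;
the struct and function names here are invented for illustration, this is not
the kernel API):

#include <pthread.h>
#include <stdio.h>

/* Toy stand-in for an anon_vma's list of VMAs. */
struct vma_list {
        pthread_rwlock_t rwsem;  /* plays the role of anon_vma->root->rwsem */
        int nr_vmas;
};

/* Hot path (the rmap walk): needs the list to be stable, not exclusively
 * owned -- a shared lock lets any number of walkers run concurrently
 * instead of serializing on a mutex. */
static void *walker(void *arg)
{
        struct vma_list *list = arg;

        pthread_rwlock_rdlock(&list->rwsem);   /* cf. anon_vma_lock_read() */
        for (int i = 0; i < list->nr_vmas; i++)
                /* visit one VMA */;
        pthread_rwlock_unlock(&list->rwsem);   /* cf. anon_vma_unlock_read() */
        return NULL;
}

/* Rare path (mapping setup/teardown): actually edits the list, so it
 * still takes the lock exclusively. */
static void insert_vma(struct vma_list *list)
{
        pthread_rwlock_wrlock(&list->rwsem);   /* cf. anon_vma_lock_write() */
        list->nr_vmas++;
        pthread_rwlock_unlock(&list->rwsem);
}

int main(void)
{
        struct vma_list list = { .nr_vmas = 0 };
        pthread_t t[4];

        pthread_rwlock_init(&list.rwsem, NULL);
        insert_vma(&list);
        for (int i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, walker, &list);
        for (int i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        printf("%d vma(s), walked by 4 concurrent readers\n", list.nr_vmas);
        pthread_rwlock_destroy(&list.rwsem);
        return 0;
}

This is the same trade the patch makes: writers (mapping changes) are
unaffected, while the migration-heavy read paths stop contending on a single
sleeping lock.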
Parent: 5a505085f0
Commit: 4fc3f1d66b
include/linux/huge_mm.h

@@ -102,7 +102,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
 #define wait_split_huge_page(__anon_vma, __pmd)			\
 	do {								\
 		pmd_t *____pmd = (__pmd);				\
-		anon_vma_lock(__anon_vma);				\
+		anon_vma_lock_write(__anon_vma);			\
 		anon_vma_unlock(__anon_vma);				\
 		BUG_ON(pmd_trans_splitting(*____pmd) ||			\
 		       pmd_trans_huge(*____pmd));			\
include/linux/rmap.h

@@ -118,7 +118,7 @@ static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 	up_write(&anon_vma->root->rwsem);
 }
 
-static inline void anon_vma_lock(struct anon_vma *anon_vma)
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 {
 	down_write(&anon_vma->root->rwsem);
 }

@@ -128,6 +128,17 @@ static inline void anon_vma_unlock(struct anon_vma *anon_vma)
 	up_write(&anon_vma->root->rwsem);
 }
 
+static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
+{
+	down_read(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
+{
+	up_read(&anon_vma->root->rwsem);
+}
+
 /*
  * anon_vma helper functions.
  */

@@ -220,8 +231,8 @@ int try_to_munlock(struct page *);
 /*
  * Called by memory-failure.c to kill processes.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page);
-void page_unlock_anon_vma(struct anon_vma *anon_vma);
+struct anon_vma *page_lock_anon_vma_read(struct page *page);
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
 /*
mm/huge_memory.c

@@ -1549,7 +1549,7 @@ int split_huge_page(struct page *page)
 	int ret = 1;
 
 	BUG_ON(!PageAnon(page));
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma_read(page);
 	if (!anon_vma)
 		goto out;
 	ret = 0;

@@ -1562,7 +1562,7 @@ int split_huge_page(struct page *page)
 
 	BUG_ON(PageCompound(page));
 out_unlock:
-	page_unlock_anon_vma(anon_vma);
+	page_unlock_anon_vma_read(anon_vma);
 out:
 	return ret;
 }

@@ -2074,7 +2074,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
 		goto out;
 
-	anon_vma_lock(vma->anon_vma);
+	anon_vma_lock_write(vma->anon_vma);
 
 	pte = pte_offset_map(pmd, address);
 	ptl = pte_lockptr(mm, pmd);
mm/ksm.c

@@ -1634,7 +1634,7 @@ again:
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
 					       0, ULONG_MAX) {
 			vma = vmac->vma;

@@ -1688,7 +1688,7 @@ again:
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
 					       0, ULONG_MAX) {
 			vma = vmac->vma;

@@ -1741,7 +1741,7 @@ again:
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
 
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
 					       0, ULONG_MAX) {
 			vma = vmac->vma;
mm/memory-failure.c

@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	struct anon_vma *av;
 	pgoff_t pgoff;
 
-	av = page_lock_anon_vma(page);
+	av = page_lock_anon_vma_read(page);
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 

@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 		}
 	}
 	read_unlock(&tasklist_lock);
-	page_unlock_anon_vma(av);
+	page_unlock_anon_vma_read(av);
 }
 
 /*
mm/migrate.c

@@ -754,7 +754,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 	 */
 	if (PageAnon(page)) {
 		/*
-		 * Only page_lock_anon_vma() understands the subtleties of
+		 * Only page_lock_anon_vma_read() understands the subtleties of
 		 * getting a hold on an anon_vma from outside one of its mms.
 		 */
 		anon_vma = page_get_anon_vma(page);
mm/mmap.c

@@ -602,7 +602,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 	if (anon_vma) {
 		VM_BUG_ON(adjust_next && next->anon_vma &&
 			  anon_vma != next->anon_vma);
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 		anon_vma_interval_tree_pre_update_vma(vma);
 		if (adjust_next)
 			anon_vma_interval_tree_pre_update_vma(next);
mm/mremap.c

@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	}
 	if (vma->anon_vma) {
 		anon_vma = vma->anon_vma;
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 	}
 }
mm/rmap.c

@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
 	VM_BUG_ON(atomic_read(&anon_vma->refcount));
 
 	/*
-	 * Synchronize against page_lock_anon_vma() such that
+	 * Synchronize against page_lock_anon_vma_read() such that
 	 * we can safely hold the lock without the anon_vma getting
 	 * freed.
 	 *
 	 * Relies on the full mb implied by the atomic_dec_and_test() from
 	 * put_anon_vma() against the acquire barrier implied by
-	 * mutex_trylock() from page_lock_anon_vma(). This orders:
+	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
 	 *
-	 * page_lock_anon_vma()		VS	put_anon_vma()
-	 *   mutex_trylock()			  atomic_dec_and_test()
+	 * page_lock_anon_vma_read()	VS	put_anon_vma()
+	 *   down_read_trylock()		  atomic_dec_and_test()
 	 *   LOCK				  MB
-	 *   atomic_read()			  mutex_is_locked()
+	 *   atomic_read()			  rwsem_is_locked()
 	 *
 	 * LOCK should suffice since the actual taking of the lock must
 	 * happen _before_ what follows.
 	 */
 	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
-		anon_vma_lock(anon_vma);
+		anon_vma_lock_write(anon_vma);
 		anon_vma_unlock(anon_vma);
 	}

@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	 * allocate a new one.
 	 *
 	 * Anon-vma allocations are very subtle, because we may have
-	 * optimistically looked up an anon_vma in page_lock_anon_vma()
+	 * optimistically looked up an anon_vma in page_lock_anon_vma_read()
 	 * and that may actually touch the spinlock even in the newly
 	 * allocated vma (it depends on RCU to make sure that the
 	 * anon_vma isn't actually destroyed).

@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 		allocated = anon_vma;
 	}
 
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_write(anon_vma);
 	/* page_table_lock to protect against threads */
 	spin_lock(&mm->page_table_lock);
 	if (likely(!vma->anon_vma)) {

@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_write(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
 	anon_vma_unlock(anon_vma);
 

@@ -442,7 +442,7 @@ out:
  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
  * reference like with page_get_anon_vma() and then block on the mutex.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
 {
 	struct anon_vma *anon_vma = NULL;
 	struct anon_vma *root_anon_vma;

@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 
 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
 	root_anon_vma = ACCESS_ONCE(anon_vma->root);
-	if (down_write_trylock(&root_anon_vma->rwsem)) {
+	if (down_read_trylock(&root_anon_vma->rwsem)) {
 		/*
 		 * If the page is still mapped, then this anon_vma is still
 		 * its anon_vma, and holding the mutex ensures that it will
 		 * not go away, see anon_vma_free().
 		 */
 		if (!page_mapped(page)) {
-			up_write(&root_anon_vma->rwsem);
+			up_read(&root_anon_vma->rwsem);
 			anon_vma = NULL;
 		}
 		goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
 
 	/* we pinned the anon_vma, its safe to sleep */
 	rcu_read_unlock();
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_read(anon_vma);
 
 	if (atomic_dec_and_test(&anon_vma->refcount)) {
 		/*
 		 * Oops, we held the last refcount, release the lock
 		 * and bail -- can't simply use put_anon_vma() because
-		 * we'll deadlock on the anon_vma_lock() recursion.
+		 * we'll deadlock on the anon_vma_lock_write() recursion.
 		 */
-		anon_vma_unlock(anon_vma);
+		anon_vma_unlock_read(anon_vma);
 		__put_anon_vma(anon_vma);
 		anon_vma = NULL;
 	}
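[Editor's note] The two hunks above preserve the lookup's two-phase shape: an
opportunistic trylock on the fast path, and a refcount-pinned slow path that
may sleep and must handle having become the last reference holder. A rough,
runnable userspace sketch of that shape (an analogy only; the types, names,
and simplified lifetime rules are assumptions, not the kernel's code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

/* Toy refcounted object standing in for an anon_vma. */
struct obj {
        pthread_rwlock_t rwsem;
        atomic_int refcount;
};

static void obj_free(struct obj *o)
{
        pthread_rwlock_destroy(&o->rwsem);
        free(o);
}

/* Fast path: a read trylock never sleeps. Slow path: pin the object
 * with an extra reference so it cannot be freed under us, then block
 * on the read lock; if we end up holding the last reference, unlock
 * and free instead of returning. */
static struct obj *obj_lock_read(struct obj *o)
{
        if (pthread_rwlock_tryrdlock(&o->rwsem) == 0)
                return o;                       /* got it shared, no sleeping */

        atomic_fetch_add(&o->refcount, 1);      /* pin, then sleep on the lock */
        pthread_rwlock_rdlock(&o->rwsem);

        if (atomic_fetch_sub(&o->refcount, 1) == 1) {
                /* We were the last holder: drop the lock, then free. */
                pthread_rwlock_unlock(&o->rwsem);
                obj_free(o);
                return NULL;
        }
        return o;
}

int main(void)
{
        struct obj *o = malloc(sizeof(*o));

        pthread_rwlock_init(&o->rwsem, NULL);
        atomic_init(&o->refcount, 1);           /* baseline reference */

        struct obj *locked = obj_lock_read(o);  /* uncontended: fast path */
        if (locked)
                pthread_rwlock_unlock(&locked->rwsem);

        if (atomic_fetch_sub(&o->refcount, 1) == 1)  /* drop baseline ref */
                obj_free(o);
        return 0;
}

As in the kernel's slow path, the last-reference case has to release the lock
before freeing (in the kernel, anon_vma_free() briefly takes the same lock
again, which is the recursion the comment above warns about).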
@@ -504,9 +504,9 @@ out:
 	return anon_vma;
 }
 
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 {
-	anon_vma_unlock(anon_vma);
+	anon_vma_unlock_read(anon_vma);
 }
 
 /*

@@ -732,7 +732,7 @@ static int page_referenced_anon(struct page *page,
 	struct anon_vma_chain *avc;
 	int referenced = 0;
 
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma_read(page);
 	if (!anon_vma)
 		return referenced;
 

@@ -754,7 +754,7 @@ static int page_referenced_anon(struct page *page,
 			break;
 	}
 
-	page_unlock_anon_vma(anon_vma);
+	page_unlock_anon_vma_read(anon_vma);
 	return referenced;
 }
 

@@ -1474,7 +1474,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma_read(page);
 	if (!anon_vma)
 		return ret;
 

@@ -1501,7 +1501,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 			break;
 	}
 
-	page_unlock_anon_vma(anon_vma);
+	page_unlock_anon_vma_read(anon_vma);
 	return ret;
 }
 

@@ -1696,7 +1696,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	int ret = SWAP_AGAIN;
 
 	/*
-	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
 	 * because that depends on page_mapped(); but not all its usages
 	 * are holding mmap_sem. Users without mmap_sem are required to
 	 * take a reference count to prevent the anon_vma disappearing

@@ -1704,7 +1704,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
 		return ret;
-	anon_vma_lock(anon_vma);
+	anon_vma_lock_read(anon_vma);
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);

@@ -1712,7 +1712,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		if (ret != SWAP_AGAIN)
 			break;
 	}
-	anon_vma_unlock(anon_vma);
+	anon_vma_unlock_read(anon_vma);
 	return ret;
 }