WSL2-Linux-Kernel/mm/vmacache.c

/*
 * Copyright (C) 2014 Davidlohr Bueso.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmacache.h>

/*
 * Drop the cached vma pointers of every thread that uses @mm.
 *
 * No additional locking is needed to keep the vma cache consistent:
 * the caller holds the mmap_sem exclusively, and any other thread
 * that touches its vma cache holds mmap_sem at least for read.
 */
void vmacache_flush_all(struct mm_struct *mm)
{
	struct task_struct *leader, *thread;

	rcu_read_lock();
	for_each_process_thread(leader, thread) {
		/* Tasks running on some other mm are left untouched. */
		if (thread->mm != mm)
			continue;

		/*
		 * Clearing the cached pointers is all that is needed
		 * here: the mm seqnum is already set, and the task's
		 * own seqnum is brought up to date on invalidation,
		 * when it performs its next lookup.
		 */
		vmacache_flush(thread);
	}
	rcu_read_unlock();
}

/*
 * This task may be accessing a foreign mm via (for example)
 * get_user_pages()->find_vma().  The vmacache is task-local and this
 * task's vmacache pertains to a different mm (ie, its own).  There is
 * nothing we can do here.
 *
 * Also handle the case where a kernel thread has adopted this mm via use_mm().
 * That kernel thread's vmacache is not applicable to this mm.
 */
static bool vmacache_valid_mm(struct mm_struct *mm)
{
	return current->mm == mm && !(current->flags & PF_KTHREAD);
}

/*
 * Store @newvma in the current task's vma cache, in the slot selected
 * by VMACACHE_HASH(@addr).  Nothing is stored when the vma's mm does
 * not pass vmacache_valid_mm() for the current task.
 */
void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
{
	if (!vmacache_valid_mm(newvma->vm_mm))
		return;

	current->vmacache[VMACACHE_HASH(addr)] = newvma;
}

static bool vmacache_valid(struct mm_struct *mm)
{
	struct task_struct *curr;

	if (!vmacache_valid_mm(mm))
		return false;

	curr = current;
	if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
		/*
		 * First attempt will always be invalid, initialize
		 * the new cache for this task here.
		 */
		curr->vmacache_seqnum = mm->vmacache_seqnum;
		vmacache_flush(curr);
		return false;
	}
	return true;
}

/*
 * Search the current task's vma cache for a vma of @mm containing @addr.
 *
 * Returns the cached vma whose [vm_start, vm_end) range covers @addr,
 * or NULL when the cache is not valid for @mm, when no slot matches, or
 * when a slot is found to belong to a different mm (which also raises a
 * one-time warning and ends the search).
 */
struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
{
	int slot;

	if (!vmacache_valid(mm))
		return NULL;

	for (slot = 0; slot < VMACACHE_SIZE; slot++) {
		struct vm_area_struct *vma = current->vmacache[slot];

		/* Empty slot, keep looking. */
		if (!vma)
			continue;

		/* A cached vma from another mm: warn once and give up. */
		if (WARN_ON_ONCE(vma->vm_mm != mm))
			return NULL;

		if (addr >= vma->vm_start && addr < vma->vm_end)
			return vma;
	}

	return NULL;
}

#ifndef CONFIG_MMU
/*
 * Search the current task's vma cache for a vma spanning exactly
 * [@start, @end).  Only built when CONFIG_MMU is not defined.
 *
 * Returns the matching cached vma, or NULL when the cache is not valid
 * for @mm or no slot has precisely these boundaries.
 */
struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	int slot;

	if (!vmacache_valid(mm))
		return NULL;

	for (slot = 0; slot < VMACACHE_SIZE; slot++) {
		struct vm_area_struct *vma = current->vmacache[slot];

		if (!vma)
			continue;
		if (vma->vm_start != start || vma->vm_end != end)
			continue;

		return vma;
	}

	return NULL;
}
#endif
mm: per-thread vma caching This patch is a continuation of efforts trying to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, thus further comparison with other approaches were needed. There are two things to consider when dealing with this, the cache hit rate and the latency of find_vma(). Improving the hit-rate does not necessarily translate in finding the vma any faster, as the overhead of any fancy caching schemes can be too high to consider. We currently cache the last used vma for the whole address space, which provides a nice optimization, reducing the total cycles in find_vma() by up to 250%, for workloads with good locality. On the other hand, this simple scheme is pretty much useless for workloads with poor locality. Analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%. The proposed approach is to replace this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping up a 32-bit sequence number. The only expensive operation is in the rare case of a seq number overflow, where all caches that share the same address space are flushed. Upon a miss, the proposed replacement policy is based on the page number that contains the virtual address in question. Concretely, the following results are seen on an 80 core, 8 socket x86-64 box: 1) System bootup: Most programs are single threaded, so the per-thread scheme does improve ~50% hit rate by just adding a few more slots to the cache. 
+----------------+----------+------------------+ \| caching scheme \| hit-rate \| cycles (billion) \| +----------------+----------+------------------+ \| baseline \| 50.61% \| 19.90 \| \| patched \| 73.45% \| 13.58 \| +----------------+----------+------------------+ 2) Kernel build: This one is already pretty good with the current approach as we're dealing with good locality. +----------------+----------+------------------+ \| caching scheme \| hit-rate \| cycles (billion) \| +----------------+----------+------------------+ \| baseline \| 75.28% \| 11.03 \| \| patched \| 88.09% \| 9.31 \| +----------------+----------+------------------+ 3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload. +----------------+----------+------------------+ \| caching scheme \| hit-rate \| cycles (billion) \| +----------------+----------+------------------+ \| baseline \| 70.66% \| 17.14 \| \| patched \| 91.15% \| 12.57 \| +----------------+----------+------------------+ 4) Ebizzy: There's a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while baseline is just about non-existent. The amounts of cycles can fluctuate between anywhere from ~60 to ~116 for the baseline scheme, but this approach reduces it considerably. 
For instance, with 80 threads: +----------------+----------+------------------+ \| caching scheme \| hit-rate \| cycles (billion) \| +----------------+----------+------------------+ \| baseline \| 1.06% \| 91.54 \| \| patched \| 99.97% \| 14.18 \| +----------------+----------+------------------+ [akpm@linux-foundation.org: fix nommu build, per Davidlohr] [akpm@linux-foundation.org: document vmacache_valid() logic] [akpm@linux-foundation.org: attempt to untangle header files] [akpm@linux-foundation.org: add vmacache_find() BUG_ON] [hughd@google.com: add vmacache_valid_mm() (from Oleg)] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: adjust and enhance comments] Signed-off-by: Davidlohr Bueso <davidlohr@hp.com> Reviewed-by: Rik van Riel <riel@redhat.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Reviewed-by: Michel Lespinasse <walken@google.com> Cc: Oleg Nesterov <oleg@redhat.com> Tested-by: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-04-08 02:37:25 +04:00			`/*`
			`* Copyright (C) 2014 Davidlohr Bueso.`
			`*/`
			`#include <linux/sched.h>`
			`#include <linux/mm.h>`
			`#include <linux/vmacache.h>`

			`/*`
			`* Flush vma caches for threads that share a given mm.`
			`*`
			`* The operation is safe because the caller holds the mmap_sem`
			`* exclusively and other threads accessing the vma cache will`
			`* have mmap_sem held at least for read, so no extra locking`
			`* is required to maintain the vma cache.`
			`*/`
			`void vmacache_flush_all(struct mm_struct *mm)`
			`{`
			`struct task_struct *g, *p;`

			`rcu_read_lock();`
			`for_each_process_thread(g, p) {`
			`/*`
			`* Only flush the vmacache pointers as the`
			`* mm seqnum is already set and curr's will`
			`* be set upon invalidation when the next`
			`* lookup is done.`
			`*/`
			`if (mm == p->mm)`
			`vmacache_flush(p);`
			`}`
			`rcu_read_unlock();`
			`}`

			`/*`
			`* This task may be accessing a foreign mm via (for example)`
			`* get_user_pages()->find_vma(). The vmacache is task-local and this`
			`* task's vmacache pertains to a different mm (ie, its own). There is`
			`* nothing we can do here.`
			`*`
			`* Also handle the case where a kernel thread has adopted this mm via use_mm().`
			`* That kernel thread's vmacache is not applicable to this mm.`
			`*/`
			`static bool vmacache_valid_mm(struct mm_struct *mm)`
			`{`
			`return current->mm == mm && !(current->flags & PF_KTHREAD);`
			`}`

			`void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)`
			`{`
			`if (vmacache_valid_mm(newvma->vm_mm))`
			`current->vmacache[VMACACHE_HASH(addr)] = newvma;`
			`}`

			`static bool vmacache_valid(struct mm_struct *mm)`
			`{`
			`struct task_struct *curr;`

			`if (!vmacache_valid_mm(mm))`
			`return false;`

			`curr = current;`
			`if (mm->vmacache_seqnum != curr->vmacache_seqnum) {`
			`/*`
			`* First attempt will always be invalid, initialize`
			`* the new cache for this task here.`
			`*/`
			`curr->vmacache_seqnum = mm->vmacache_seqnum;`
			`vmacache_flush(curr);`
			`return false;`
			`}`
			`return true;`
			`}`

			`struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)`
			`{`
			`int i;`

			`if (!vmacache_valid(mm))`
			`return NULL;`

			`for (i = 0; i < VMACACHE_SIZE; i++) {`
			`struct vm_area_struct *vma = current->vmacache[i];`

mm: don't pointlessly use BUG_ON() for sanity check BUG_ON() is a big hammer, and should be used _only_ if there is some major corruption that you cannot possibly recover from, making it imperative that the current process (and possibly the whole machine) be terminated with extreme prejudice. The trivial sanity check in the vmacache code is not such a fatal error. Recovering from it is absolutely trivial, and using BUG_ON() just makes it harder to debug for no actual advantage. To make matters worse, the placement of the BUG_ON() (only if the range check matched) actually makes it harder to hit the sanity check to begin with, so _if_ there is a bug (and we just got a report from Srivatsa Bhat that this can indeed trigger), it is harder to debug not just because the machine is possibly dead, but because we don't have better coverage. BUG_ON() must die. Maybe we should add a checkpatch warning for it, because it is simply just about the worst thing you can ever do if you hit some "this cannot happen" situation. Reported-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Cc: Davidlohr Bueso <davidlohr@hp.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-04-29 01:24:09 +04:00			`if (!vma)`
			`continue;`
			`if (WARN_ON_ONCE(vma->vm_mm != mm))`
			`break;`
			`if (vma->vm_start <= addr && vma->vm_end > addr)`
mm: per-thread vma caching This patch is a continuation of efforts trying to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, thus further comparison with other approaches were needed. There are two things to consider when dealing with this, the cache hit rate and the latency of find_vma(). Improving the hit-rate does not necessarily translate in finding the vma any faster, as the overhead of any fancy caching schemes can be too high to consider. We currently cache the last used vma for the whole address space, which provides a nice optimization, reducing the total cycles in find_vma() by up to 250%, for workloads with good locality. On the other hand, this simple scheme is pretty much useless for workloads with poor locality. Analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%. The proposed approach is to replace this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping up a 32-bit sequence number. The only expensive operation is in the rare case of a seq number overflow, where all caches that share the same address space are flushed. Upon a miss, the proposed replacement policy is based on the page number that contains the virtual address in question. Concretely, the following results are seen on an 80 core, 8 socket x86-64 box: 1) System bootup: Most programs are single threaded, so the per-thread scheme does improve ~50% hit rate by just adding a few more slots to the cache. 
+----------------+----------+------------------+ \| caching scheme \| hit-rate \| cycles (billion) \| +----------------+----------+------------------+ \| baseline \| 50.61% \| 19.90 \| \| patched \| 73.45% \| 13.58 \| +----------------+----------+------------------+ 2) Kernel build: This one is already pretty good with the current approach as we're dealing with good locality. +----------------+----------+------------------+ \| caching scheme \| hit-rate \| cycles (billion) \| +----------------+----------+------------------+ \| baseline \| 75.28% \| 11.03 \| \| patched \| 88.09% \| 9.31 \| +----------------+----------+------------------+ 3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload. +----------------+----------+------------------+ \| caching scheme \| hit-rate \| cycles (billion) \| +----------------+----------+------------------+ \| baseline \| 70.66% \| 17.14 \| \| patched \| 91.15% \| 12.57 \| +----------------+----------+------------------+ 4) Ebizzy: There's a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while baseline is just about non-existent. The amounts of cycles can fluctuate between anywhere from ~60 to ~116 for the baseline scheme, but this approach reduces it considerably. 
For instance, with 80 threads: +----------------+----------+------------------+ \| caching scheme \| hit-rate \| cycles (billion) \| +----------------+----------+------------------+ \| baseline \| 1.06% \| 91.54 \| \| patched \| 99.97% \| 14.18 \| +----------------+----------+------------------+ [akpm@linux-foundation.org: fix nommu build, per Davidlohr] [akpm@linux-foundation.org: document vmacache_valid() logic] [akpm@linux-foundation.org: attempt to untangle header files] [akpm@linux-foundation.org: add vmacache_find() BUG_ON] [hughd@google.com: add vmacache_valid_mm() (from Oleg)] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: adjust and enhance comments] Signed-off-by: Davidlohr Bueso <davidlohr@hp.com> Reviewed-by: Rik van Riel <riel@redhat.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Reviewed-by: Michel Lespinasse <walken@google.com> Cc: Oleg Nesterov <oleg@redhat.com> Tested-by: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2014-04-08 02:37:25 +04:00			`return vma;`
			`}`

			`return NULL;`
			`}`

			`#ifndef CONFIG_MMU`
			`struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,`
			`unsigned long start,`
			`unsigned long end)`
			`{`
			`int i;`

			`if (!vmacache_valid(mm))`
			`return NULL;`

			`for (i = 0; i < VMACACHE_SIZE; i++) {`
			`struct vm_area_struct *vma = current->vmacache[i];`

			`if (vma && vma->vm_start == start && vma->vm_end == end)`
			`return vma;`
			`}`

			`return NULL;`
			`}`
			`#endif`