2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Macros for manipulating and testing page->flags
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef PAGE_FLAGS_H
|
|
|
|
#define PAGE_FLAGS_H
|
|
|
|
|
2006-06-23 13:03:06 +04:00
|
|
|
#include <linux/types.h>
|
2008-04-28 13:12:48 +04:00
|
|
|
#ifndef __GENERATING_BOUNDS_H
|
2007-05-07 01:49:40 +04:00
|
|
|
#include <linux/mm_types.h>
|
2009-04-19 23:57:19 +04:00
|
|
|
#include <generated/bounds.h>
|
2008-04-28 13:12:48 +04:00
|
|
|
#endif /* !__GENERATING_BOUNDS_H */
|
2006-06-23 13:03:06 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Various page->flags bits:
|
|
|
|
*
|
|
|
|
* PG_reserved is set for special pages, which can never be swapped out. Some
|
|
|
|
* of them might not even exist (eg empty_bad_page)...
|
|
|
|
*
|
2006-09-26 10:31:35 +04:00
|
|
|
* The PG_private bitflag is set on pagecache pages if they contain filesystem
|
|
|
|
* specific data (which is normally at page->private). It can be used by
|
|
|
|
* private allocations for its own usage.
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
2006-09-26 10:31:35 +04:00
|
|
|
* During initiation of disk I/O, PG_locked is set. This bit is set before I/O
|
|
|
|
* and cleared when writeback _starts_ or when read _completes_. PG_writeback
|
|
|
|
* is set before writeback starts and cleared when it finishes.
|
|
|
|
*
|
|
|
|
* PG_locked also pins a page in pagecache, and blocks truncation of the file
|
|
|
|
* while it is held.
|
|
|
|
*
|
|
|
|
* page_waitqueue(page) is a wait queue of all tasks waiting for the page
|
|
|
|
* to become unlocked.
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
|
|
|
* PG_uptodate tells whether the page's contents are valid. When a read
|
|
|
|
* completes, the page becomes uptodate, unless a disk I/O error happened.
|
|
|
|
*
|
2006-09-26 10:31:35 +04:00
|
|
|
* PG_referenced, PG_reclaim are used for page reclaim for anonymous and
|
|
|
|
* file-backed pagecache (see mm/vmscan.c).
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
|
|
|
* PG_error is set to indicate that an I/O error occurred on this page.
|
|
|
|
*
|
|
|
|
* PG_arch_1 is an architecture specific page state bit. The generic code
|
|
|
|
* guarantees that this bit is cleared for a page when it is first entered into
|
|
|
|
* the page cache.
|
|
|
|
*
|
|
|
|
* PG_highmem pages are not permanently mapped into the kernel virtual address
|
|
|
|
* space, they need to be kmapped separately for doing IO on the pages. The
|
|
|
|
* struct page (these bits with information) is always mapped into kernel
|
|
|
|
* address space...
|
2006-09-26 10:31:35 +04:00
|
|
|
*
|
|
|
|
* PG_buddy is set to indicate that the page is free and in the buddy system
|
|
|
|
* (see mm/page_alloc.c).
|
|
|
|
*
|
2009-09-16 13:50:03 +04:00
|
|
|
* PG_hwpoison indicates that a page got corrupted in hardware and contains
|
|
|
|
* data with incorrect ECC bits that triggered a machine check. Accessing is
|
|
|
|
* not safe since it may cause another machine check. Don't touch!
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't use the *_dontuse flags. Use the macros. Otherwise you'll break
|
2006-04-11 09:53:01 +04:00
|
|
|
* locked- and dirty-page accounting.
|
|
|
|
*
|
|
|
|
* The page flags field is split into two parts, the main flags area
|
|
|
|
* which extends from the low bits upwards, and the fields area which
|
|
|
|
* extends from the high bits downwards.
|
|
|
|
*
|
|
|
|
* | FIELD | ... | FLAGS |
|
2008-04-28 13:12:48 +04:00
|
|
|
* N-1 ^ 0
|
|
|
|
* (NR_PAGEFLAGS)
|
2006-04-11 09:53:01 +04:00
|
|
|
*
|
2008-04-28 13:12:48 +04:00
|
|
|
* The fields area is reserved for fields mapping zone, node (for NUMA) and
|
|
|
|
* SPARSEMEM section (for variants of SPARSEMEM that require section ids like
|
|
|
|
* SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2008-04-28 13:12:47 +04:00
|
|
|
enum pageflags {
|
|
|
|
PG_locked, /* Page is locked. Don't touch. */
|
|
|
|
PG_error,
|
|
|
|
PG_referenced,
|
|
|
|
PG_uptodate,
|
|
|
|
PG_dirty,
|
|
|
|
PG_lru,
|
|
|
|
PG_active,
|
|
|
|
PG_slab,
|
|
|
|
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
|
|
|
|
PG_arch_1,
|
|
|
|
PG_reserved,
|
|
|
|
PG_private, /* If pagecache, has fs-private data */
|
2009-04-03 19:42:36 +04:00
|
|
|
PG_private_2, /* If pagecache, has fs aux data */
|
2008-04-28 13:12:47 +04:00
|
|
|
PG_writeback, /* Page is under writeback */
|
2008-04-28 13:12:55 +04:00
|
|
|
#ifdef CONFIG_PAGEFLAGS_EXTENDED
|
|
|
|
PG_head, /* A head page */
|
|
|
|
PG_tail, /* A tail page */
|
|
|
|
#else
|
2008-04-28 13:12:47 +04:00
|
|
|
PG_compound, /* A compound page */
|
2008-04-28 13:12:55 +04:00
|
|
|
#endif
|
2008-04-28 13:12:47 +04:00
|
|
|
PG_swapcache, /* Swap page: swp_entry_t in private */
|
|
|
|
PG_mappedtodisk, /* Has blocks allocated on-disk */
|
|
|
|
PG_reclaim, /* To be reclaimed asap */
|
|
|
|
PG_buddy, /* Page is free, on buddy lists */
|
2008-10-19 07:26:30 +04:00
|
|
|
PG_swapbacked, /* Page is backed by RAM/swap */
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
PG_unevictable, /* Page is "unevictable" */
|
2009-12-15 04:58:59 +03:00
|
|
|
#ifdef CONFIG_MMU
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
|
|
|
PG_mlocked, /* Page is vma mlocked */
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
#endif
|
2009-07-10 20:57:37 +04:00
|
|
|
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
|
2008-04-28 13:12:52 +04:00
|
|
|
PG_uncached, /* Page has been mapped as uncached */
|
2009-09-16 13:50:03 +04:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_MEMORY_FAILURE
|
|
|
|
PG_hwpoison, /* hardware poisoned page. Don't touch */
|
2006-06-23 13:03:06 +04:00
|
|
|
#endif
|
2008-07-24 08:27:16 +04:00
|
|
|
__NR_PAGEFLAGS,
|
|
|
|
|
|
|
|
/* Filesystems */
|
|
|
|
PG_checked = PG_owner_priv_1,
|
|
|
|
|
2009-04-03 19:42:36 +04:00
|
|
|
/* Two page bits are conscripted by FS-Cache to maintain local caching
|
|
|
|
* state. These bits are set on pages belonging to the netfs's inodes
|
|
|
|
* when those inodes are being locally cached.
|
|
|
|
*/
|
|
|
|
PG_fscache = PG_private_2, /* page backed by cache */
|
|
|
|
|
2008-07-24 08:27:16 +04:00
|
|
|
/* XEN */
|
|
|
|
PG_pinned = PG_owner_priv_1,
|
|
|
|
PG_savepinned = PG_dirty,
|
2008-07-24 08:27:18 +04:00
|
|
|
|
2008-07-24 08:27:19 +04:00
|
|
|
/* SLOB */
|
|
|
|
PG_slob_free = PG_private,
|
|
|
|
|
2008-07-24 08:27:18 +04:00
|
|
|
/* SLUB */
|
|
|
|
PG_slub_frozen = PG_active,
|
|
|
|
PG_slub_debug = PG_error,
|
2008-04-28 13:12:47 +04:00
|
|
|
};
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-04-28 13:12:48 +04:00
|
|
|
#ifndef __GENERATING_BOUNDS_H
|
|
|
|
|
2008-04-28 13:12:49 +04:00
|
|
|
/*
|
|
|
|
* Macros to create function definitions for page flags
|
|
|
|
*/
|
|
|
|
#define TESTPAGEFLAG(uname, lname) \
|
|
|
|
static inline int Page##uname(struct page *page) \
|
|
|
|
{ return test_bit(PG_##lname, &page->flags); }
|
|
|
|
|
|
|
|
#define SETPAGEFLAG(uname, lname) \
|
|
|
|
static inline void SetPage##uname(struct page *page) \
|
|
|
|
{ set_bit(PG_##lname, &page->flags); }
|
|
|
|
|
|
|
|
#define CLEARPAGEFLAG(uname, lname) \
|
|
|
|
static inline void ClearPage##uname(struct page *page) \
|
|
|
|
{ clear_bit(PG_##lname, &page->flags); }
|
|
|
|
|
|
|
|
#define __SETPAGEFLAG(uname, lname) \
|
|
|
|
static inline void __SetPage##uname(struct page *page) \
|
|
|
|
{ __set_bit(PG_##lname, &page->flags); }
|
|
|
|
|
|
|
|
#define __CLEARPAGEFLAG(uname, lname) \
|
|
|
|
static inline void __ClearPage##uname(struct page *page) \
|
|
|
|
{ __clear_bit(PG_##lname, &page->flags); }
|
|
|
|
|
|
|
|
#define TESTSETFLAG(uname, lname) \
|
|
|
|
static inline int TestSetPage##uname(struct page *page) \
|
|
|
|
{ return test_and_set_bit(PG_##lname, &page->flags); }
|
|
|
|
|
|
|
|
#define TESTCLEARFLAG(uname, lname) \
|
|
|
|
static inline int TestClearPage##uname(struct page *page) \
|
|
|
|
{ return test_and_clear_bit(PG_##lname, &page->flags); }
|
|
|
|
|
2009-09-22 04:01:48 +04:00
|
|
|
#define __TESTCLEARFLAG(uname, lname) \
|
|
|
|
static inline int __TestClearPage##uname(struct page *page) \
|
|
|
|
{ return __test_and_clear_bit(PG_##lname, &page->flags); }
|
2008-04-28 13:12:49 +04:00
|
|
|
|
|
|
|
#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
|
|
|
|
SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
|
|
|
|
|
|
|
|
#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
|
|
|
|
__SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
|
|
|
|
|
2008-04-28 13:12:53 +04:00
|
|
|
#define PAGEFLAG_FALSE(uname) \
|
|
|
|
static inline int Page##uname(struct page *page) \
|
|
|
|
{ return 0; }
|
|
|
|
|
2008-04-28 13:12:49 +04:00
|
|
|
#define TESTSCFLAG(uname, lname) \
|
|
|
|
TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
|
|
|
|
|
2008-10-19 07:26:37 +04:00
|
|
|
#define SETPAGEFLAG_NOOP(uname) \
|
|
|
|
static inline void SetPage##uname(struct page *page) { }
|
|
|
|
|
|
|
|
#define CLEARPAGEFLAG_NOOP(uname) \
|
|
|
|
static inline void ClearPage##uname(struct page *page) { }
|
|
|
|
|
|
|
|
#define __CLEARPAGEFLAG_NOOP(uname) \
|
|
|
|
static inline void __ClearPage##uname(struct page *page) { }
|
|
|
|
|
|
|
|
#define TESTCLEARFLAG_FALSE(uname) \
|
|
|
|
static inline int TestClearPage##uname(struct page *page) { return 0; }
|
|
|
|
|
2009-09-22 04:01:48 +04:00
|
|
|
#define __TESTCLEARFLAG_FALSE(uname) \
|
|
|
|
static inline int __TestClearPage##uname(struct page *page) { return 0; }
|
|
|
|
|
2008-04-28 13:12:50 +04:00
|
|
|
struct page; /* forward declaration */
|
|
|
|
|
2009-04-03 19:42:35 +04:00
|
|
|
/* PG_locked: only the test and test-and-set helpers are instantiated here. */
TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
|
2008-04-28 13:12:50 +04:00
|
|
|
PAGEFLAG(Error, error)
|
|
|
|
PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
|
|
|
|
PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
|
|
|
|
PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
|
|
|
|
PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
TESTCLEARFLAG(Active, active)
|
2008-04-28 13:12:50 +04:00
|
|
|
__PAGEFLAG(Slab, slab)
|
2008-07-24 08:27:16 +04:00
|
|
|
PAGEFLAG(Checked, checked) /* Used by some filesystems */
|
|
|
|
PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
|
|
|
|
PAGEFLAG(SavePinned, savepinned); /* Xen */
|
2008-04-28 13:12:50 +04:00
|
|
|
PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
|
2008-10-19 07:26:30 +04:00
|
|
|
PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
|
2008-04-28 13:12:50 +04:00
|
|
|
|
2008-07-24 08:27:19 +04:00
|
|
|
__PAGEFLAG(SlobFree, slob_free)
|
|
|
|
|
2008-07-24 08:27:18 +04:00
|
|
|
__PAGEFLAG(SlubFrozen, slub_frozen)
|
|
|
|
__PAGEFLAG(SlubDebug, slub_debug)
|
|
|
|
|
2009-04-03 19:42:36 +04:00
|
|
|
/*
|
|
|
|
* Private page markings that may be used by the filesystem that owns the page
|
|
|
|
* for its own purposes.
|
|
|
|
* - PG_private and PG_private_2 cause releasepage() and co to be invoked
|
|
|
|
*/
|
|
|
|
PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private)
|
|
|
|
__CLEARPAGEFLAG(Private, private)
|
|
|
|
PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2)
|
|
|
|
PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
|
|
|
|
|
2008-04-28 13:12:50 +04:00
|
|
|
/*
|
|
|
|
* Only test-and-set and test-and-clear exist for PG_writeback (besides the plain test). The unconditional set/clear operators are
|
|
|
|
* risky: they bypass page accounting.
|
|
|
|
*/
|
|
|
|
TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
|
|
|
|
__PAGEFLAG(Buddy, buddy)
|
|
|
|
PAGEFLAG(MappedToDisk, mappedtodisk)
|
|
|
|
|
|
|
|
/* PageReadahead reuses the PG_reclaim bit: readahead is only used for file reads; PG_reclaim is only for writes */
|
|
|
|
PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
|
2008-04-28 13:12:52 +04:00
|
|
|
PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */
|
2008-04-28 13:12:50 +04:00
|
|
|
|
|
|
|
#ifdef CONFIG_HIGHMEM
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
2008-04-28 13:12:50 +04:00
|
|
|
* Must use a macro here due to header dependency issues. page_zone() is not
|
|
|
|
* available at this point.
|
2005-04-17 02:20:36 +04:00
|
|
|
*/
|
2008-04-28 13:12:52 +04:00
|
|
|
#define PageHighMem(__p) is_highmem(page_zone(__p))
|
2008-04-28 13:12:50 +04:00
|
|
|
#else
|
2008-04-28 13:12:53 +04:00
|
|
|
/* Without CONFIG_HIGHMEM, PageHighMem() is a stub that always returns 0. */
PAGEFLAG_FALSE(HighMem)
|
2008-04-28 13:12:50 +04:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_SWAP
|
|
|
|
PAGEFLAG(SwapCache, swapcache)
|
|
|
|
#else
|
2008-04-28 13:12:53 +04:00
|
|
|
/* Without CONFIG_SWAP, PageSwapCache() is a stub that always returns 0. */
PAGEFLAG_FALSE(SwapCache)
|
2009-01-07 01:39:24 +03:00
|
|
|
SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache)
|
2008-04-28 13:12:50 +04:00
|
|
|
#endif
|
|
|
|
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
|
|
|
|
TESTCLEARFLAG(Unevictable, unevictable)
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
|
|
|
|
2009-12-15 04:58:59 +03:00
|
|
|
#ifdef CONFIG_MMU
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism let's pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS.
because current get_user_pages() can't grab PROT_NONE pages theresore it
cause PROT_NONE pages can't munlock.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
|
|
|
PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
|
2009-09-22 04:01:48 +04:00
|
|
|
TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
#else
|
2009-09-22 04:01:48 +04:00
|
|
|
PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked)
|
|
|
|
TESTCLEARFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
#endif
|
|
|
|
|
2009-07-10 20:57:37 +04:00
|
|
|
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
|
2008-04-28 13:12:50 +04:00
|
|
|
PAGEFLAG(Uncached, uncached)
|
2008-04-28 13:12:52 +04:00
|
|
|
#else
|
2008-04-28 13:12:53 +04:00
|
|
|
PAGEFLAG_FALSE(Uncached)
|
2008-04-28 13:12:50 +04:00
|
|
|
#endif
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2009-09-16 13:50:03 +04:00
|
|
|
#ifdef CONFIG_MEMORY_FAILURE
|
|
|
|
PAGEFLAG(HWPoison, hwpoison)
|
2009-12-16 14:19:58 +03:00
|
|
|
TESTSCFLAG(HWPoison, hwpoison)
|
2009-09-16 13:50:03 +04:00
|
|
|
#define __PG_HWPOISON (1UL << PG_hwpoison)
|
|
|
|
#else
|
|
|
|
PAGEFLAG_FALSE(HWPoison)
|
|
|
|
#define __PG_HWPOISON 0
|
|
|
|
#endif
|
|
|
|
|
2009-12-16 14:19:59 +03:00
|
|
|
/*
 * Declaration only; the definition is not in this header.
 * NOTE(review): presumably returns a 64-bit, export-stable view of the
 * page's flags - confirm against the definition.
 */
u64 stable_page_flags(struct page *page);
|
|
|
|
|
mm: fix PageUptodate data race
After running SetPageUptodate, preceding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing so allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smp_wmb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, whereas flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:29:34 +03:00
|
|
|
static inline int PageUptodate(struct page *page)
|
|
|
|
{
|
|
|
|
int ret = test_bit(PG_uptodate, &(page)->flags);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Must ensure that the data we read out of the page is loaded
|
|
|
|
* _after_ we've loaded page->flags to check for PageUptodate.
|
|
|
|
* We can skip the barrier if the page is not uptodate, because
|
|
|
|
* we wouldn't be reading anything from it.
|
|
|
|
*
|
|
|
|
* See SetPageUptodate() for the other side of the story.
|
|
|
|
*/
|
|
|
|
if (ret)
|
|
|
|
smp_rmb();
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void __SetPageUptodate(struct page *page)
|
|
|
|
{
|
|
|
|
smp_wmb();
|
|
|
|
__set_bit(PG_uptodate, &(page)->flags);
|
|
|
|
}
|
|
|
|
|
2006-09-29 12:58:41 +04:00
|
|
|
static inline void SetPageUptodate(struct page *page)
|
|
|
|
{
|
mm: fix PageUptodate data race
After running SetPageUptodate, preceeding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, wheras flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:29:34 +03:00
|
|
|
#ifdef CONFIG_S390
|
2006-09-29 12:58:41 +04:00
|
|
|
if (!test_and_set_bit(PG_uptodate, &page->flags))
|
2007-04-27 18:01:57 +04:00
|
|
|
page_clear_dirty(page);
|
2006-06-30 12:55:32 +04:00
|
|
|
#else
|
mm: fix PageUptodate data race
After running SetPageUptodate, preceeding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, wheras flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:29:34 +03:00
|
|
|
/*
|
|
|
|
* Memory barrier must be issued before setting the PG_uptodate bit,
|
|
|
|
* so that all previous stores issued in order to bring the page
|
|
|
|
* uptodate are actually visible before PageUptodate becomes true.
|
|
|
|
*
|
|
|
|
* s390 doesn't need an explicit smp_wmb here because the test and
|
|
|
|
* set bit already provides full barriers.
|
|
|
|
*/
|
|
|
|
smp_wmb();
|
|
|
|
set_bit(PG_uptodate, &(page)->flags);
|
2005-04-17 02:20:36 +04:00
|
|
|
#endif
|
mm: fix PageUptodate data race
After running SetPageUptodate, preceeding stores to the page contents to
actually bring it uptodate may not be ordered with the store to set the
page uptodate.
Therefore, another CPU which checks PageUptodate is true, then reads the
page contents can get stale data.
Fix this by having an smp_wmb before SetPageUptodate, and smp_rmb after
PageUptodate.
Many places that test PageUptodate, do so with the page locked, and this
would be enough to ensure memory ordering in those places if
SetPageUptodate were only called while the page is locked. Unfortunately
that is not always the case for some filesystems, but it could be an idea
for the future.
Also bring the handling of anonymous page uptodateness in line with that of
file backed page management, by marking anon pages as uptodate when they
_are_ uptodate, rather than when our implementation requires that they be
marked as such. Doing allows us to get rid of the smp_wmb's in the page
copying functions, which were especially added for anonymous pages for an
analogous memory ordering problem. Both file and anonymous pages are
handled with the same barriers.
FAQ:
Q. Why not do this in flush_dcache_page?
A. Firstly, flush_dcache_page handles only one side (the smb side) of the
ordering protocol; we'd still need smp_rmb somewhere. Secondly, hiding away
memory barriers in a completely unrelated function is nasty; at least in the
PageUptodate macros, they are located together with (half) the operations
involved in the ordering. Thirdly, the smp_wmb is only required when first
bringing the page uptodate, wheras flush_dcache_page should be called each time
it is written to through the kernel mapping. It is logically the wrong place to
put it.
Q. Why does this increase my text size / reduce my performance / etc.
A. Because it is adding the necessary instructions to eliminate the data-race.
Q. Can it be improved?
A. Yes, eg. if you were to create a rule that all SetPageUptodate operations
run under the page lock, we could avoid the smp_rmb places where PageUptodate
is queried under the page lock. Requires audit of all filesystems and at least
some would need reworking. That's great you're interested, I'm eagerly awaiting
your patches.
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:29:34 +03:00
|
|
|
}
|
|
|
|
|
2008-04-28 13:12:50 +04:00
|
|
|
/*
 * Only the clear accessor for PG_uptodate is macro-generated.  The test and
 * set sides are the hand-written PageUptodate(), __SetPageUptodate() and
 * SetPageUptodate() in this header, which add memory barriers.
 */
CLEARPAGEFLAG(Uptodate, uptodate)
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-04-28 13:12:50 +04:00
|
|
|
/*
 * Declaration only; the definition is not in this header.
 * NOTE(review): account_size is presumably the number of bytes to
 * un-account when the dirty state is cancelled - confirm at the definition.
 */
extern void cancel_dirty_page(struct page *page, unsigned int account_size);
|
2007-07-19 12:47:55 +04:00
|
|
|
|
2008-04-28 13:12:50 +04:00
|
|
|
/*
 * Writeback flag helpers; declarations only, defined outside this header.
 * NOTE(review): going by the names, each presumably returns the previous
 * writeback state after clearing/setting it - confirm at the definitions.
 */
int test_clear_page_writeback(struct page *page);
|
|
|
|
int test_set_page_writeback(struct page *page);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-04-28 13:12:50 +04:00
|
|
|
/*
 * set_page_writeback - run test_set_page_writeback() for callers that do
 * not need its return value; the result is deliberately discarded.
 */
static inline void set_page_writeback(struct page *page)
{
	(void)test_set_page_writeback(page);
}
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-04-28 13:12:55 +04:00
|
|
|
#ifdef CONFIG_PAGEFLAGS_EXTENDED
|
|
|
|
/*
|
|
|
|
* System with lots of page flags available. This allows separate
|
|
|
|
* flags for PageHead() and PageTail() checks of compound pages so that bit
|
|
|
|
* tests can be used in performance sensitive paths. PageCompound is
|
|
|
|
* generally not used in hot code paths.
|
|
|
|
*/
|
|
|
|
__PAGEFLAG(Head, head)
|
|
|
|
__PAGEFLAG(Tail, tail)
|
|
|
|
|
|
|
|
static inline int PageCompound(struct page *page)
|
|
|
|
{
|
|
|
|
return page->flags & ((1L << PG_head) | (1L << PG_tail));
|
|
|
|
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/*
|
|
|
|
* Reduce page flag use as much as possible by overlapping
|
|
|
|
* compound page flags with the flags used for page cache pages. Possible
|
|
|
|
* because PageCompound is always set for compound pages and not for
|
|
|
|
* pages on the LRU and/or pagecache.
|
|
|
|
*/
|
2008-04-28 13:12:50 +04:00
|
|
|
TESTPAGEFLAG(Compound, compound)
|
|
|
|
__PAGEFLAG(Head, compound)
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-05-07 01:49:39 +04:00
|
|
|
/*
|
2007-05-07 01:49:40 +04:00
|
|
|
* PG_reclaim is used in combination with PG_compound to mark the
|
2008-04-28 13:12:50 +04:00
|
|
|
* head and tail of a compound page. This saves one page flag
|
|
|
|
* but makes it impossible to use compound pages for the page cache.
|
|
|
|
* The PG_reclaim bit would have to be used for reclaim or readahead
|
|
|
|
* if compound pages enter the page cache.
|
2007-05-07 01:49:40 +04:00
|
|
|
*
|
|
|
|
* PG_compound & PG_reclaim => Tail page
|
|
|
|
* PG_compound & ~PG_reclaim => Head page
|
2007-05-07 01:49:39 +04:00
|
|
|
*/
|
2007-05-07 01:49:40 +04:00
|
|
|
/* Both bits set => tail page; PG_compound without PG_reclaim => head page. */
#define PG_head_tail_mask	(1L << PG_compound | 1L << PG_reclaim)
|
|
|
|
|
2008-04-28 13:12:50 +04:00
|
|
|
static inline int PageTail(struct page *page)
|
|
|
|
{
|
|
|
|
return ((page->flags & PG_head_tail_mask) == PG_head_tail_mask);
|
|
|
|
}
|
2007-05-07 01:49:40 +04:00
|
|
|
|
|
|
|
static inline void __SetPageTail(struct page *page)
|
|
|
|
{
|
|
|
|
page->flags |= PG_head_tail_mask;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void __ClearPageTail(struct page *page)
|
|
|
|
{
|
|
|
|
page->flags &= ~PG_head_tail_mask;
|
|
|
|
}
|
|
|
|
|
2008-04-28 13:12:55 +04:00
|
|
|
#endif /* !PAGEFLAGS_EXTENDED */
|
2008-06-09 20:18:45 +04:00
|
|
|
|
2009-12-15 04:58:59 +03:00
|
|
|
#ifdef CONFIG_MMU
|
2009-04-01 02:23:26 +04:00
|
|
|
/*
 * Mask for the mlocked flag, ORed into PAGE_FLAGS_CHECK_AT_FREE.  Built
 * from 1UL, like __PG_HWPOISON, so the mask is an unsigned long and the
 * shift stays defined for bit indices of 31 and above.
 */
#define __PG_MLOCKED		(1UL << PG_mlocked)
|
|
|
|
#else
|
mlock: mlocked pages are unevictable
Make sure that mlocked pages also live on the unevictable LRU, so kswapd
will not scan them over and over again.
This is achieved through various strategies:
1) add yet another page flag--PG_mlocked--to indicate that
the page is locked for efficient testing in vmscan and,
optionally, fault path. This allows early culling of
unevictable pages, preventing them from getting to
page_referenced()/try_to_unmap(). Also allows separate
accounting of mlock'd pages, as Nick's original patch
did.
Note: Nick's original mlock patch used a PG_mlocked
flag. I had removed this in favor of the PG_unevictable
flag + an mlock_count [new page struct member]. I
restored the PG_mlocked flag to eliminate the new
count field.
2) add the mlock/unevictable infrastructure to mm/mlock.c,
with internal APIs in mm/internal.h. This is a rework
of Nick's original patch to these files, taking into
account that mlocked pages are now kept on unevictable
LRU list.
3) update vmscan.c:page_evictable() to check PageMlocked()
and, if vma passed in, the vm_flags. Note that the vma
will only be passed in for new pages in the fault path;
and then only if the "cull unevictable pages in fault
path" patch is included.
4) add try_to_unlock() to rmap.c to walk a page's rmap and
ClearPageMlocked() if no other vmas have it mlocked.
Reuses as much of try_to_unmap() as possible. This
effectively replaces the use of one of the lru list links
as an mlock count. If this mechanism lets pages in mlocked
vmas leak through w/o PG_mlocked set [I don't know that it
does], we should catch them later in try_to_unmap(). One
hopes this will be rare, as it will be relatively expensive.
Original mm/internal.h, mm/rmap.c and mm/mlock.c changes:
Signed-off-by: Nick Piggin <npiggin@suse.de>
splitlru: introduce __get_user_pages():
New munlock processing needs GUP_FLAGS_IGNORE_VMA_PERMISSIONS,
because the current get_user_pages() can't grab PROT_NONE pages and therefore
PROT_NONE pages can't be munlocked.
[akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch]
[akpm@linux-foundation.org: untangle patch interdependencies]
[akpm@linux-foundation.org: fix things after out-of-order merging]
[hugh@veritas.com: fix page-flags mess]
[lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm']
[kosaki.motohiro@jp.fujitsu.com: build fix]
[kosaki.motohiro@jp.fujitsu.com: fix truncate race and several comments]
[kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Matt Mackall <mpm@selenic.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:44 +04:00
|
|
|
/* !CONFIG_MMU: empty mask, so it adds no bit to PAGE_FLAGS_CHECK_AT_FREE. */
#define __PG_MLOCKED 0
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
#endif
|
|
|
|
|
2008-06-09 20:18:45 +04:00
|
|
|
/*
 * Flags checked when a page is freed.  Pages being freed should not have
 * these flags set.  If they are, there is a problem.
 *
 * Every term is built from 1UL (matching __PG_HWPOISON) so that the mask is
 * an unsigned long and no shift is performed in int, which would be
 * undefined for a bit index of 31 or above.
 */
#define PAGE_FLAGS_CHECK_AT_FREE \
	(1UL << PG_lru	 | 1UL << PG_locked    | \
	 1UL << PG_private | 1UL << PG_private_2 | \
	 1UL << PG_buddy	 | 1UL << PG_writeback | 1UL << PG_reserved | \
	 1UL << PG_slab	 | 1UL << PG_swapcache | 1UL << PG_active | \
	 1UL << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
|
2008-06-09 20:18:45 +04:00
|
|
|
|
|
|
|
/*
 * Flags checked when a page is prepped for return by the page allocator.
 * Pages being prepped should not have any flags set.  If they are set,
 * there has been a kernel bug or struct page corruption.
 *
 * The mask covers bits 0 .. NR_PAGEFLAGS-1.  It is computed in unsigned
 * long so the shift does not overflow int once NR_PAGEFLAGS reaches 31.
 */
#define PAGE_FLAGS_CHECK_AT_PREP	((1UL << NR_PAGEFLAGS) - 1)
|
2008-06-09 20:18:45 +04:00
|
|
|
|
2009-09-22 04:02:59 +04:00
|
|
|
/*
 * Flags tested by page_has_private().  Built from 1UL so that the mask is
 * an unsigned long rather than an int.
 */
#define PAGE_FLAGS_PRIVATE				\
	(1UL << PG_private | 1UL << PG_private_2)
|
2009-04-03 19:42:36 +04:00
|
|
|
/**
|
|
|
|
* page_has_private - Determine if page has private stuff
|
|
|
|
* @page: The page to be checked
|
|
|
|
*
|
|
|
|
* Determine if a page has private stuff, indicating that release routines
|
|
|
|
* should be invoked upon it.
|
|
|
|
*/
|
2009-09-22 04:02:59 +04:00
|
|
|
static inline int page_has_private(struct page *page)
|
|
|
|
{
|
|
|
|
return !!(page->flags & PAGE_FLAGS_PRIVATE);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* !__GENERATING_BOUNDS_H */
|
2009-04-03 19:42:36 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#endif /* PAGE_FLAGS_H */
|