2005-04-17 02:20:36 +04:00
|
|
|
#ifndef _LINUX_SWAP_H
|
|
|
|
#define _LINUX_SWAP_H
|
|
|
|
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/list.h>
|
2008-02-07 11:13:56 +03:00
|
|
|
#include <linux/memcontrol.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <linux/sched.h>
|
2008-10-19 07:26:53 +04:00
|
|
|
#include <linux/node.h>
|
2005-07-13 00:58:31 +04:00
|
|
|
|
2011-07-27 03:09:06 +04:00
|
|
|
#include <linux/atomic.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <asm/page.h>
|
|
|
|
|
2006-09-26 10:31:20 +04:00
|
|
|
struct notifier_block;
|
|
|
|
|
2006-09-26 10:32:42 +04:00
|
|
|
struct bio;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */
|
|
|
|
#define SWAP_FLAG_PRIO_MASK 0x7fff
|
|
|
|
#define SWAP_FLAG_PRIO_SHIFT 0
|
swap: discard while swapping only if SWAP_FLAG_DISCARD
Tests with recent firmware on Intel X25-M 80GB and OCZ Vertex 60GB SSDs
show a shift since I last tested in December: in part because of firmware
updates, in part because of the necessary move from barriers to awaiting
completion at the block layer. While discard at swapon still shows as
slightly beneficial on both, discarding 1MB swap cluster when allocating
is now disadvantageous: adds 25% overhead on Intel, adds 230% on OCZ (YMMV).
Surrender: discard as presently implemented is more hindrance than help
for swap; but might prove useful on other devices, or with improvements.
So continue to do the discard at swapon, but make discard while swapping
conditional on a SWAP_FLAG_DISCARD to sys_swapon() (which has been using
only the lower 16 bits of int flags).
We can add a --discard or -d to swapon(8), and a "discard" to swap in
/etc/fstab: matching the mount option for btrfs, ext4, fat, gfs2, nilfs2.
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Nigel Cunningham <nigel@tuxonice.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jaxboe@fusionio.com>
Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-09-10 03:38:11 +04:00
|
|
|
#define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2012-03-29 01:42:42 +04:00
|
|
|
#define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
|
|
|
|
SWAP_FLAG_DISCARD)
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
 * Test the PF_KSWAPD bit in current->flags.  The result is the masked
 * flag word itself: non-zero when the bit is set, zero otherwise.
 */
static inline int current_is_kswapd(void)
|
|
|
|
{
|
|
|
|
return current->flags & PF_KSWAPD;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MAX_SWAPFILES defines the maximum number of swaptypes: things which can
|
|
|
|
* be swapped to. The swap type and the offset into that swap type are
|
|
|
|
* encoded into pte's and into pgoff_t's in the swapcache. Using five bits
|
|
|
|
* for the type means that the maximum number of swapcache pages is 27 bits
|
|
|
|
* on 32-bit-pgoff_t architectures. And that assumes that the architecture packs
|
|
|
|
* the type/offset into the pte as 5/27 as well.
|
|
|
|
*/
|
|
|
|
#define MAX_SWAPFILES_SHIFT 5
|
2009-09-16 13:50:05 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Use some of the swap files numbers for other purposes. This
|
|
|
|
* is a convenient way to hook into the VM to trigger special
|
|
|
|
* actions on faults.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NUMA node memory migration support
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_MIGRATION
|
|
|
|
#define SWP_MIGRATION_NUM 2
|
|
|
|
#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
|
|
|
|
#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
|
[PATCH] Swapless page migration: add R/W migration entries
Implement read/write migration ptes
We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.
The VM is modified to handle the migration entries. migration entries can
only be encountered when the page they are pointing to is locked. This limits
the number of places one has to fix. We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.
We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock. This allows us to effectively stop all accesses
to a page.
Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration (I've no NUMA, just
hacking it up to migrate recklessly while running load), I've hit the
BUG_ON(!PageLocked(p)) in migration_entry_to_page.
This comes from an orphaned migration entry, unrelated to the current
correctly locked migration, but hit by remove_anon_migration_ptes as it
checks an address in each vma of the anon_vma list.
Such an orphan may be left behind if an earlier migration raced with fork:
copy_one_pte can duplicate a migration entry from parent to child, after
remove_anon_migration_ptes has checked the child vma, but before it has
removed it from the parent vma. (If the process were later to fault on this
orphaned entry, it would hit the same BUG from migration_entry_wait.)
This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
not. There's no such problem with file pages, because vma_prio_tree_add
adds child vma after parent vma, and the page table locking at each end is
enough to serialize. Follow that example with anon_vma: add new vmas to the
tail instead of the head.
(There's no corresponding problem when inserting migration entries,
because a missed pte will leave the page count and mapcount high, which is
allowed for. And there's no corresponding problem when migrating via swap,
because a leftover swap entry will be correctly faulted. But the swapless
method has no refcounting of its entries.)
From: Ingo Molnar <mingo@elte.hu>
pte_unmap_unlock() takes the pte pointer as an argument.
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration, gcc has tried to exec
a pointer instead of a string: smells like COW mappings are not being
properly write-protected on fork.
The protection in copy_one_pte looks very convincing, until at last you
realize that the second arg to make_migration_entry is a boolean "write",
and SWP_MIGRATION_READ is 30.
Anyway, it's better done like in change_pte_range, using
is_write_migration_entry and make_migration_entry_read.
From: Hugh Dickins <hugh@veritas.com>
Remove unnecessary obfuscation from sys_swapon's range check on swap type,
which blew up causing memory corruption once swapless migration made
MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:03:35 +04:00
|
|
|
#else
|
2009-09-16 13:50:05 +04:00
|
|
|
#define SWP_MIGRATION_NUM 0
|
[PATCH] Swapless page migration: add R/W migration entries
Implement read/write migration ptes
We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.
The VM is modified to handle the migration entries. migration entries can
only be encountered when the page they are pointing to is locked. This limits
the number of places one has to fix. We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.
We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock. This allows us to effectively stop all accesses
to a page.
Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration (I've no NUMA, just
hacking it up to migrate recklessly while running load), I've hit the
BUG_ON(!PageLocked(p)) in migration_entry_to_page.
This comes from an orphaned migration entry, unrelated to the current
correctly locked migration, but hit by remove_anon_migration_ptes as it
checks an address in each vma of the anon_vma list.
Such an orphan may be left behind if an earlier migration raced with fork:
copy_one_pte can duplicate a migration entry from parent to child, after
remove_anon_migration_ptes has checked the child vma, but before it has
removed it from the parent vma. (If the process were later to fault on this
orphaned entry, it would hit the same BUG from migration_entry_wait.)
This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
not. There's no such problem with file pages, because vma_prio_tree_add
adds child vma after parent vma, and the page table locking at each end is
enough to serialize. Follow that example with anon_vma: add new vmas to the
tail instead of the head.
(There's no corresponding problem when inserting migration entries,
because a missed pte will leave the page count and mapcount high, which is
allowed for. And there's no corresponding problem when migrating via swap,
because a leftover swap entry will be correctly faulted. But the swapless
method has no refcounting of its entries.)
From: Ingo Molnar <mingo@elte.hu>
pte_unmap_unlock() takes the pte pointer as an argument.
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration, gcc has tried to exec
a pointer instead of a string: smells like COW mappings are not being
properly write-protected on fork.
The protection in copy_one_pte looks very convincing, until at last you
realize that the second arg to make_migration_entry is a boolean "write",
and SWP_MIGRATION_READ is 30.
Anyway, it's better done like in change_pte_range, using
is_write_migration_entry and make_migration_entry_read.
From: Hugh Dickins <hugh@veritas.com>
Remove unnecessary obfuscation from sys_swapon's range check on swap type,
which blew up causing memory corruption once swapless migration made
MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:03:35 +04:00
|
|
|
#endif
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2009-09-16 13:50:05 +04:00
|
|
|
/*
|
|
|
|
* Handling of hardware poisoned pages with memory corruption.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_MEMORY_FAILURE
|
|
|
|
#define SWP_HWPOISON_NUM 1
|
|
|
|
#define SWP_HWPOISON MAX_SWAPFILES
|
|
|
|
#else
|
|
|
|
#define SWP_HWPOISON_NUM 0
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define MAX_SWAPFILES \
|
|
|
|
((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Magic header for a swap area. The first part of the union is
|
|
|
|
* what the swap magic looks like for the old (limited to 128MB)
|
|
|
|
* swap area format, the second part of the union adds - in the
|
|
|
|
* old reserved area - some extra information. Note that the first
|
|
|
|
* kilobyte is reserved for boot loader or disk label stuff...
|
|
|
|
*
|
|
|
|
* Having the magic at the end of the PAGE_SIZE makes detecting swap
|
|
|
|
* areas somewhat tricky on machines that support multiple page sizes.
|
|
|
|
* For 2.5 we'll probably want to move the magic to just beyond the
|
|
|
|
* bootbits...
|
|
|
|
*/
|
|
|
|
/*
 * Two overlapping views of the first page of a swap area: "magic" locates
 * the signature at the end of the page, "info" describes the fields that
 * follow the first kilobyte.
 */
union swap_header {
|
|
|
|
/* View used to find the signature: the last 10 bytes of a PAGE_SIZE page. */
struct {
|
|
|
|
char reserved[PAGE_SIZE - 10];
|
|
|
|
char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */
|
|
|
|
} magic;
|
|
|
|
/* View of the extra information stored after the first kilobyte. */
struct {
|
2006-06-23 13:03:14 +04:00
|
|
|
char bootbits[1024]; /* Space for disklabel etc. */
|
|
|
|
__u32 version;
|
|
|
|
__u32 last_page;
|
|
|
|
__u32 nr_badpages;
|
|
|
|
unsigned char sws_uuid[16];
|
|
|
|
unsigned char sws_volume[16];
|
|
|
|
/* 117 words of padding; with 4-byte __u32 this puts badpages[] at byte offset 1536 */
__u32 padding[117];
|
|
|
|
/* Declared with one element; MAX_SWAP_BADPAGES gives how many fit before magic.magic */
__u32 badpages[1];
|
2005-04-17 02:20:36 +04:00
|
|
|
} info;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* A swap entry has to fit into an "unsigned long", as
|
|
|
|
* the entry is hidden in the "index" field of the
|
|
|
|
* swapper address space.
|
|
|
|
*/
|
|
|
|
/*
 * Wrapping the value in a struct makes swp_entry_t a distinct type
 * instead of a bare unsigned long.
 */
typedef struct {
|
|
|
|
unsigned long val;
|
|
|
|
} swp_entry_t;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* current->reclaim_state points to one of these when a task is running
|
|
|
|
* memory reclaim
|
|
|
|
*/
|
|
|
|
/* Per-task reclaim bookkeeping; currently a single counter. */
struct reclaim_state {
|
|
|
|
/* NOTE(review): presumably the number of slab pages freed during reclaim - confirm against users */
unsigned long reclaimed_slab;
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
|
|
struct address_space;
|
|
|
|
struct sysinfo;
|
|
|
|
struct writeback_control;
|
|
|
|
struct zone;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
|
|
|
|
* disk blocks. A list of swap extents maps the entire swapfile. (Where the
|
|
|
|
* term `swapfile' refers to either a blockdevice or an IS_REG file. Apart
|
|
|
|
* from setup, they're handled identically.)
|
|
|
|
*
|
|
|
|
* We always assume that blocks are of size PAGE_SIZE.
|
|
|
|
*/
|
|
|
|
/* One entry in the list of extents that maps swapfile pages to disk blocks. */
struct swap_extent {
|
|
|
|
struct list_head list; /* links this extent into the swapfile's extent list */
|
|
|
|
pgoff_t start_page; /* presumably the first swapfile page covered - TODO confirm */
|
|
|
|
pgoff_t nr_pages; /* presumably the number of pages covered - TODO confirm */
|
|
|
|
sector_t start_block; /* presumably the disk block backing start_page - TODO confirm */
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Max bad pages in the new format..
|
|
|
|
*/
|
|
|
|
/*
 * __swapoffset(x) evaluates to the byte offset of member x inside
 * union swap_header (the member's address taken through a null pointer).
 * MAX_SWAP_BADPAGES is the distance from info.badpages to magic.magic
 * divided by sizeof(int): the number of int-sized badpages slots that
 * fit in front of the magic string.
 */
#define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x)
|
|
|
|
#define MAX_SWAP_BADPAGES \
|
|
|
|
((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int))
|
|
|
|
|
|
|
|
enum {
|
|
|
|
SWP_USED = (1 << 0), /* is slot in swap_info[] used? */
|
|
|
|
SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */
|
swap: discard while swapping only if SWAP_FLAG_DISCARD
Tests with recent firmware on Intel X25-M 80GB and OCZ Vertex 60GB SSDs
show a shift since I last tested in December: in part because of firmware
updates, in part because of the necessary move from barriers to awaiting
completion at the block layer. While discard at swapon still shows as
slightly beneficial on both, discarding 1MB swap cluster when allocating
is now disadvantageous: adds 25% overhead on Intel, adds 230% on OCZ (YMMV).
Surrender: discard as presently implemented is more hindrance than help
for swap; but might prove useful on other devices, or with improvements.
So continue to do the discard at swapon, but make discard while swapping
conditional on a SWAP_FLAG_DISCARD to sys_swapon() (which has been using
only the lower 16 bits of int flags).
We can add a --discard or -d to swapon(8), and a "discard" to swap in
/etc/fstab: matching the mount option for btrfs, ext4, fat, gfs2, nilfs2.
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Nigel Cunningham <nigel@tuxonice.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jaxboe@fusionio.com>
Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-09-10 03:38:11 +04:00
|
|
|
SWP_DISCARDABLE = (1 << 2), /* swapon+blkdev support discard */
|
2009-01-07 01:39:53 +03:00
|
|
|
SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
|
2009-01-07 01:39:54 +03:00
|
|
|
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
|
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).
swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)
We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.
Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.
This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this used to hold the continuation counts for that entry
and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 04:58:46 +03:00
|
|
|
SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
|
2010-05-17 09:32:42 +04:00
|
|
|
SWP_BLKDEV = (1 << 6), /* its a block device */
|
[PATCH] swap: scan_swap_map drop swap_device_lock
get_swap_page has often shown up on latency traces, doing lengthy scans while
holding two spinlocks. swap_list_lock is already dropped, now scan_swap_map
drop swap_device_lock before scanning the swap_map.
While scanning for an empty cluster, don't worry that racing tasks may
allocate what was free and free what was allocated; but when allocating an
entry, check it's still free after retaking the lock. Avoid dropping the lock
in the expected common path. No barriers beyond the locks, just let the
cookie crumble; highest_bit limit is volatile, but benign.
Guard against swapoff: must check SWP_WRITEOK before allocating, must raise
SWP_SCANNING reference count while in scan_swap_map, swapoff wait for that to
fall - just use schedule_timeout, we don't want to burden scan_swap_map
itself, and it's very unlikely that anyone can really still be in
scan_swap_map once swapoff gets this far.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-04 02:54:39 +04:00
|
|
|
/* add others here before... */
|
|
|
|
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
|
2005-04-17 02:20:36 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
#define SWAP_CLUSTER_MAX 32
|
2010-05-25 01:32:27 +04:00
|
|
|
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
|
2005-04-17 02:20:36 +04:00
|
|
|
|
mm: vmscan: kswapd should not free an excessive number of pages when balancing small zones
When reclaiming for order-0 pages, kswapd requires that all zones be
balanced. Each cycle through balance_pgdat() does background ageing on
all zones if necessary and applies equal pressure on the inactive zone
unless a lot of pages are free already.
A "lot of free pages" is defined as a "balance gap" above the high
watermark which is currently 7*high_watermark. Historically this was
reasonable as min_free_kbytes was small. However, on systems using huge
pages, it is recommended that min_free_kbytes is higher and it is tuned
with hugeadm --set-recommended-min_free_kbytes. With the introduction of
transparent huge page support, this recommended value is also applied. On
X86-64 with 4G of memory, min_free_kbytes becomes 67584 so one would
expect around 68M of memory to be free. The Normal zone is approximately
35000 pages so under even normal memory pressure such as copying a large
file, it gets exhausted quickly. As it is getting exhausted, kswapd
applies pressure equally to all zones, including the DMA32 zone. DMA32 is
approximately 700,000 pages with a high watermark of around 23,000 pages.
In this situation, kswapd will reclaim around (23000*8 where 8 is the high
watermark + balance gap of 7 * high watermark) pages or 718M of pages
before the zone is ignored. What the user sees is that free memory is far
higher than it should be.
To avoid an excessive number of pages being reclaimed from the larger
zones, explicitly define the "balance gap" to be either 1% of the zone
or the low watermark for the zone, whichever is smaller. While kswapd
will check all zones to apply pressure, it'll ignore zones that meet the
(high_wmark + balance_gap) watermark.
To test this, 80G were copied from a partition and the amount of memory
being used was recorded. A comparison of a patch and unpatched kernel can
be seen at
http://www.csn.ul.ie/~mel/postings/minfree-20110222/memory-usage-hydra.ps
and shows that kswapd is not reclaiming as much memory with the patch
applied.
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: "Chen, Tim C" <tim.c.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-23 02:33:04 +03:00
|
|
|
/*
|
|
|
|
* Ratio between the present memory in the zone and the "gap" that
|
|
|
|
* we're allowing kswapd to shrink in addition to the per-zone high
|
|
|
|
* wmark, even for zones that already have the high wmark satisfied,
|
|
|
|
* in order to provide better per-zone lru behavior. We are ok to
|
|
|
|
* spend not more than 1% of the memory for this zone balancing "gap".
|
|
|
|
*/
|
|
|
|
#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
|
|
|
|
|
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).
swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)
We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.
Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.
This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this used to hold the continuation counts for that entry
and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 04:58:46 +03:00
|
|
|
#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
|
|
|
|
#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
|
|
|
|
#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
|
|
|
|
#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */
|
|
|
|
#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */
|
2009-12-15 04:58:47 +03:00
|
|
|
#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */
|
2009-12-15 04:58:44 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* The in-memory structure used to track swap areas.
|
|
|
|
*/
|
|
|
|
struct swap_info_struct {
|
2009-12-15 04:58:41 +03:00
|
|
|
unsigned long flags; /* SWP_USED etc: see above */
|
|
|
|
signed short prio; /* swap priority of this type */
|
|
|
|
signed char type; /* strange name for an index */
|
|
|
|
signed char next; /* next type on the swap list */
|
2009-12-15 04:58:48 +03:00
|
|
|
unsigned int max; /* extent of the swap_map */
|
|
|
|
unsigned char *swap_map; /* vmalloc'ed array of usage counts */
|
|
|
|
unsigned int lowest_bit; /* index of first free in swap_map */
|
|
|
|
unsigned int highest_bit; /* index of last free in swap_map */
|
|
|
|
unsigned int pages; /* total of usable pages of swap */
|
|
|
|
unsigned int inuse_pages; /* number of those currently in use */
|
|
|
|
unsigned int cluster_next; /* likely index for next allocation */
|
|
|
|
unsigned int cluster_nr; /* countdown to next cluster search */
|
2009-01-07 01:39:53 +03:00
|
|
|
unsigned int lowest_alloc; /* while preparing discard cluster */
|
|
|
|
unsigned int highest_alloc; /* while preparing discard cluster */
|
2009-12-15 04:58:48 +03:00
|
|
|
struct swap_extent *curr_swap_extent;
|
|
|
|
struct swap_extent first_swap_extent;
|
|
|
|
struct block_device *bdev; /* swap device or bdev of swap file */
|
|
|
|
struct file *swap_file; /* seldom referenced */
|
|
|
|
unsigned int old_block_size; /* seldom referenced */
|
2005-04-17 02:20:36 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Two int indices describing the priority-ordered swapfile list:
 * where the list begins and which swapfile is to be used next.
 */
struct swap_list_t {
|
|
|
|
int head; /* head of priority-ordered swapfile list */
|
|
|
|
int next; /* swapfile to be used next */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Swap 50% full? Release swapcache more aggressively.. */
|
|
|
|
/* True once nr_swap_pages has dropped below half of total_swap_pages. */
#define vm_swap_full() (total_swap_pages > nr_swap_pages * 2)
|
|
|
|
|
|
|
|
/* linux/mm/page_alloc.c */
|
|
|
|
extern unsigned long totalram_pages;
|
2006-04-11 09:52:59 +04:00
|
|
|
extern unsigned long totalreserve_pages;
|
2012-01-11 03:07:42 +04:00
|
|
|
extern unsigned long dirty_balance_reserve;
|
2005-04-17 02:20:36 +04:00
|
|
|
extern unsigned int nr_free_buffer_pages(void);
|
|
|
|
extern unsigned int nr_free_pagecache_pages(void);
|
|
|
|
|
2007-02-10 12:43:03 +03:00
|
|
|
/* Definition of global_page_state not available yet */
|
|
|
|
#define nr_free_pages() global_page_state(NR_FREE_PAGES)
|
|
|
|
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* linux/mm/swap.c */
|
2008-10-19 07:26:19 +04:00
|
|
|
extern void __lru_cache_add(struct page *, enum lru_list lru);
|
|
|
|
extern void lru_cache_add_lru(struct page *, enum lru_list lru);
|
thp: transparent hugepage core
Lately I've been working to make KVM use hugepages transparently without
the usual restrictions of hugetlbfs. Some of the restrictions I'd like to
see removed:
1) hugepages have to be swappable or the guest physical memory remains
locked in RAM and can't be paged out to swap
2) if a hugepage allocation fails, regular pages should be allocated
instead and mixed in the same vma without any failure and without
userland noticing
3) if some task quits and more hugepages become available in the
buddy, guest physical memory backed by regular pages should be
relocated on hugepages automatically in regions under
madvise(MADV_HUGEPAGE) (ideally event driven by waking up the
kernel daemon if the order=HPAGE_PMD_SHIFT-PAGE_SHIFT list becomes
not null)
4) avoidance of reservation and maximization of use of hugepages whenever
possible. Reservation (needed to avoid runtime fatal failures) may be ok for
1 machine with 1 database with 1 database cache with 1 database cache size
known at boot time. It's definitely not feasible with a virtualization
hypervisor usage like RHEV-H that runs an unknown number of virtual machines
with an unknown size of each virtual machine with an unknown amount of
pagecache that could be potentially useful in the host for guest not using
O_DIRECT (aka cache=off).
hugepages in the virtualization hypervisor (and also in the guest!) are
much more important than in a regular host not using virtualization,
because with NPT/EPT they decrease the tlb-miss cacheline accesses from 24
to 19 in case only the hypervisor uses transparent hugepages, and they
decrease the tlb-miss cacheline accesses from 19 to 15 in case both the
linux hypervisor and the linux guest use this patch (though the
guest will limit the additional speedup to anonymous regions only for
now...). Even more important is that the tlb miss handler is much slower
on a NPT/EPT guest than for a regular shadow paging or no-virtualization
scenario. So maximizing the amount of virtual memory cached by the TLB
pays off significantly more with NPT/EPT than without (even if there would
be no significant speedup in the tlb-miss runtime).
The first (and more tedious) part of this work requires allowing the VM to
handle anonymous hugepages mixed with regular pages transparently on
regular anonymous vmas. This is what this patch tries to achieve in the
least intrusive possible way. We want hugepages and hugetlb to be used in
a way so that all applications can benefit without changes (as usual we
leverage the KVM virtualization design: by improving the Linux VM at
large, KVM gets the performance boost too).
The most important design choice is: always fallback to 4k allocation if
the hugepage allocation fails! This is the _very_ opposite of some large
pagecache patches that failed with -EIO back then if a 64k (or similar)
allocation failed...
Second important decision (to reduce the impact of the feature on the
existing pagetable handling code) is that at any time we can split an
hugepage into 512 regular pages and it has to be done with an operation
that can't fail. This way the reliability of the swapping isn't decreased
(no need to allocate memory when we are short on memory to swap) and it's
trivial to plug a split_huge_page* one-liner where needed without
polluting the VM. Over time we can teach mprotect, mremap and friends to
handle pmd_trans_huge natively without calling split_huge_page*. The fact
it can't fail isn't just for swap: if split_huge_page would return -ENOMEM
(instead of the current void) we'd need to rollback the mprotect from the
middle of it (ideally including undoing the split_vma) which would be a
big change and in the very wrong direction (it'd likely be simpler not to
call split_huge_page at all and to teach mprotect and friends to handle
hugepages instead of rolling them back from the middle). In short the
very value of split_huge_page is that it can't fail.
The collapsing and madvise(MADV_HUGEPAGE) part will remain separated and
incremental and it'll just be an "harmless" addition later if this initial
part is agreed upon. It also should be noted that locking-wise replacing
regular pages with hugepages is going to be very easy if compared to what
I'm doing below in split_huge_page, as it will only happen when
page_count(page) matches page_mapcount(page) if we can take the PG_lock
and mmap_sem in write mode. collapse_huge_page will be a "best effort"
that (unlike split_huge_page) can fail at the minimal sign of trouble and
we can try again later. collapse_huge_page will be similar to how KSM
works and the madvise(MADV_HUGEPAGE) will work similar to
madvise(MADV_MERGEABLE).
The default I like is that transparent hugepages are used at page fault
time. This can be changed with
/sys/kernel/mm/transparent_hugepage/enabled. The control knob can be set
to three values "always", "madvise", "never" which mean respectively that
hugepages are always used, or only inside madvise(MADV_HUGEPAGE) regions,
or never used. /sys/kernel/mm/transparent_hugepage/defrag instead
controls if the hugepage allocation should defrag memory aggressively
"always", only inside "madvise" regions, or "never".
The pmd_trans_splitting/pmd_trans_huge locking is very solid. The
put_page (from get_user_page users that can't use mmu notifier like
O_DIRECT) that runs against a __split_huge_page_refcount instead was a
pain to serialize in a way that would result always in a coherent page
count for both tail and head. I think my locking solution with a
compound_lock taken only after the page_first is valid and is still a
PageHead should be safe but it surely needs review from SMP race point of
view. In short there is no current existing way to serialize the O_DIRECT
final put_page against split_huge_page_refcount so I had to invent a new
one (O_DIRECT loses knowledge on the mapping status by the time gup_fast
returns so...). And I didn't want to impact all gup/gup_fast users for
now, maybe if we change the gup interface substantially we can avoid this
locking, I admit I didn't think too much about it because changing the gup
unpinning interface would be invasive.
If we ignored O_DIRECT we could stick to the existing compound refcounting
code, by simply adding a get_user_pages_fast_flags(foll_flags) where KVM
(and any other mmu notifier user) would call it without FOLL_GET (and if
FOLL_GET isn't set we'd just BUG_ON if nobody registered itself in the
current task mmu notifier list yet). But O_DIRECT is fundamental for
decent performance of virtualized I/O on fast storage so we can't avoid it
to solve the race of put_page against split_huge_page_refcount to achieve
a complete hugepage feature for KVM.
Swap and oom work fine (well just like with regular pages ;). MMU
notifier is handled transparently too, with the exception of the young bit
on the pmd, that didn't have a range check but I think KVM will be fine
because the whole point of hugepages is that EPT/NPT will also use a huge
pmd when they notice gup returns pages with PageCompound set, so they
won't care of a range and there's just the pmd young bit to check in that
case.
NOTE: in some cases if the L2 cache is small, this may slowdown and waste
memory during COWs because 4M of memory are accessed in a single fault
instead of 8k (the payoff is that after COW the program can run faster).
So we might want to switch the copy_huge_page (and clear_huge_page too) to
non-temporal stores. I also extensively researched ways to avoid this
cache trashing with a full prefault logic that would cow in 8k/16k/32k/64k
up to 1M (I can send those patches that fully implemented prefault) but I
concluded they're not worth it and they add a huge additional complexity
and they remove all tlb benefits until the full hugepage has been faulted
in, to save a little bit of memory and some cache during app startup, but
they still don't improve substantially the cache-trashing during startup
if the prefault happens in >4k chunks. One reason is that those 4k pte
entries copied are still mapped on a perfectly cache-colored hugepage, so
the trashing is the worst one can generate in those copies (cow of 4k page
copies aren't so well colored so they trashes less, but again this results
in software running faster after the page fault). Those prefault patches
allowed things like a pte where post-cow pages were local 4k regular anon
pages and the not-yet-cowed pte entries were pointing in the middle of
some hugepage mapped read-only. If it doesn't payoff substantially with
today's hardware it will payoff even less in the future with larger l2
caches, and the prefault logic would bloat the VM a lot. If one is
embedded transparent_hugepage can be disabled during boot with sysfs or
with the boot commandline parameter transparent_hugepage=0 (or
transparent_hugepage=2 to restrict hugepages inside madvise regions) that
will ensure not a single hugepage is allocated at boot time. It is simple
enough to just disable transparent hugepage globally and let transparent
hugepages be allocated selectively by applications in the MADV_HUGEPAGE
region (both at page fault time, and if enabled with the
collapse_huge_page too through the kernel daemon).
This patch supports only hugepages mapped in the pmd, archs that have
smaller hugepages will not fit in this patch alone. Also some archs like
power have certain tlb limits that prevent mixing different page sizes in
the same regions so they will not fit in this framework that requires
"graceful fallback" to basic PAGE_SIZE in case of physical memory
fragmentation. hugetlbfs remains a perfect fit for those because its
software limits happen to match the hardware limits. hugetlbfs also
remains a perfect fit for hugepage sizes like 1GByte that cannot be hoped
to be found not fragmented after a certain system uptime and that would be
very expensive to defragment with relocation, so requiring reservation.
hugetlbfs is the "reservation way", the point of transparent hugepages is
not to have any reservation at all and maximizing the use of cache and
hugepages at all times automatically.
Some performance results:
vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3
memset page fault 1566023
memset tlb miss 453854
memset second tlb miss 453321
random access tlb miss 41635
random access second tlb miss 41658
vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3
memset page fault 1566471
memset tlb miss 453375
memset second tlb miss 453320
random access tlb miss 41636
random access second tlb miss 41637
vmx andrea # ./largepages3
memset page fault 1566642
memset tlb miss 453417
memset second tlb miss 453313
random access tlb miss 41630
random access second tlb miss 41647
vmx andrea # ./largepages3
memset page fault 1566872
memset tlb miss 453418
memset second tlb miss 453315
random access tlb miss 41618
random access second tlb miss 41659
vmx andrea # echo 0 > /proc/sys/vm/transparent_hugepage
vmx andrea # ./largepages3
memset page fault 2182476
memset tlb miss 460305
memset second tlb miss 460179
random access tlb miss 44483
random access second tlb miss 44186
vmx andrea # ./largepages3
memset page fault 2182791
memset tlb miss 460742
memset second tlb miss 459962
random access tlb miss 43981
random access second tlb miss 43988
============
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#define SIZE (3UL*1024*1024*1024)
/* Microseconds elapsed between two gettimeofday() samples. */
static unsigned long elapsed_usec(const struct timeval *before,
				  const struct timeval *after)
{
	return (after->tv_sec - before->tv_sec) * 1000000UL +
	       after->tv_usec - before->tv_usec;
}

/* Time one memset() over the whole SIZE-byte buffer and print it as "<label> <usec>". */
static void time_memset(const char *label, char *p)
{
	struct timeval before, after;

	gettimeofday(&before, NULL);
	memset(p, 0, SIZE);
	gettimeofday(&after, NULL);
	printf("%s %lu\n", label, elapsed_usec(&before, &after));
}

/* Time one byte store every 4096 bytes across the buffer and print it as "<label> <usec>". */
static void time_page_touch(const char *label, char *p)
{
	struct timeval before, after;
	char *p2;

	gettimeofday(&before, NULL);
	for (p2 = p; p2 < p + SIZE; p2 += 4096)
		*p2 = 0;
	gettimeofday(&after, NULL);
	printf("%s %lu\n", label, elapsed_usec(&before, &after));
}

/*
 * Benchmark driver: allocate SIZE bytes, then time three full memset()
 * passes followed by two 4096-byte-stride write passes, printing the
 * elapsed microseconds of each pass on its own line.
 *
 * Returns 0 on success, 1 if the buffer could not be allocated.
 */
int main(void)
{
	char *p = malloc(SIZE);

	/* A multi-gigabyte malloc() can fail; don't memset() through NULL. */
	if (!p) {
		perror("malloc");
		return 1;
	}

	time_memset("memset page fault", p);
	time_memset("memset tlb miss", p);
	time_memset("memset second tlb miss", p);
	time_page_touch("random access tlb miss", p);
	time_page_touch("random access second tlb miss", p);

	free(p);
	return 0;
}
============
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-01-14 02:46:52 +03:00
|
|
|
extern void lru_add_page_tail(struct zone* zone,
|
|
|
|
struct page *page, struct page *page_tail);
|
2008-02-14 02:03:15 +03:00
|
|
|
extern void activate_page(struct page *);
|
|
|
|
extern void mark_page_accessed(struct page *);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern void lru_add_drain(void);
|
2012-03-22 03:34:06 +04:00
|
|
|
extern void lru_add_drain_cpu(int cpu);
|
2006-01-19 04:42:27 +03:00
|
|
|
extern int lru_add_drain_all(void);
|
2008-04-28 13:12:38 +04:00
|
|
|
extern void rotate_reclaimable_page(struct page *page);
|
2011-03-23 02:32:52 +03:00
|
|
|
extern void deactivate_page(struct page *page);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern void swap_setup(void);
|
|
|
|
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
extern void add_page_to_unevictable_list(struct page *page);
|
|
|
|
|
2008-10-19 07:26:19 +04:00
|
|
|
/**
|
|
|
|
* lru_cache_add_anon - add a page to the page lists
|
|
|
|
* @page: the page to add
|
|
|
|
*/
|
2008-10-19 07:26:32 +04:00
|
|
|
/* Hands @page to __lru_cache_add() with the list selector LRU_INACTIVE_ANON. */
static inline void lru_cache_add_anon(struct page *page)
|
2008-10-19 07:26:19 +04:00
|
|
|
{
|
2008-10-19 07:26:32 +04:00
|
|
|
__lru_cache_add(page, LRU_INACTIVE_ANON);
|
2008-10-19 07:26:19 +04:00
|
|
|
}
|
|
|
|
|
2008-10-19 07:26:32 +04:00
|
|
|
/* Hands @page to __lru_cache_add() with the list selector LRU_INACTIVE_FILE. */
static inline void lru_cache_add_file(struct page *page)
|
|
|
|
{
|
|
|
|
__lru_cache_add(page, LRU_INACTIVE_FILE);
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* linux/mm/vmscan.c */
|
2008-04-28 13:12:12 +04:00
|
|
|
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
|
2009-04-01 02:23:31 +04:00
|
|
|
gfp_t gfp_mask, nodemask_t *mask);
|
2011-11-01 04:06:47 +04:00
|
|
|
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file);
|
2011-09-15 03:21:58 +04:00
|
|
|
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
|
|
|
|
gfp_t gfp_mask, bool noswap);
|
|
|
|
extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
|
|
|
|
gfp_t gfp_mask, bool noswap,
|
|
|
|
struct zone *zone,
|
|
|
|
unsigned long *nr_scanned);
|
2006-03-22 11:08:19 +03:00
|
|
|
extern unsigned long shrink_all_memory(unsigned long nr_pages);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern int vm_swappiness;
|
2006-03-22 11:09:12 +03:00
|
|
|
extern int remove_mapping(struct address_space *mapping, struct page *page);
|
2006-06-23 13:03:47 +04:00
|
|
|
extern long vm_total_pages;
|
2006-03-22 11:09:12 +03:00
|
|
|
|
2006-01-19 04:42:31 +03:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
extern int zone_reclaim_mode;
|
2006-07-03 11:24:13 +04:00
|
|
|
extern int sysctl_min_unmapped_ratio;
|
2006-09-26 10:31:52 +04:00
|
|
|
extern int sysctl_min_slab_ratio;
|
2006-01-19 04:42:31 +03:00
|
|
|
extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
|
|
|
|
#else
|
|
|
|
#define zone_reclaim_mode 0
|
|
|
|
/* !CONFIG_NUMA stub: ignores all of its arguments and always returns 0. */
static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
extern int page_evictable(struct page *page, struct vm_area_struct *vma);
|
SHM_UNLOCK: fix Unevictable pages stranded after swap
Commit cc39c6a9bbde ("mm: account skipped entries to avoid looping in
find_get_pages") correctly fixed an infinite loop; but left a problem
that find_get_pages() on shmem would return 0 (appearing to callers to
mean end of tree) when it meets a run of nr_pages swap entries.
The only uses of find_get_pages() on shmem are via pagevec_lookup(),
called from invalidate_mapping_pages(), and from shmctl SHM_UNLOCK's
scan_mapping_unevictable_pages(). The first is already commented, and
not worth worrying about; but the second can leave pages on the
Unevictable list after an unusual sequence of swapping and locking.
Fix that by using shmem_find_get_pages_and_swap() (then ignoring the
swap) instead of pagevec_lookup().
But I don't want to contaminate vmscan.c with shmem internals, nor
shmem.c with LRU locking. So move scan_mapping_unevictable_pages() into
shmem.c, renaming it shmem_unlock_mapping(); and rename
check_move_unevictable_page() to check_move_unevictable_pages(), looping
down an array of pages, oftentimes under the same lock.
Leave out the "rotate unevictable list" block: that's a leftover from
when this was used for /proc/sys/vm/scan_unevictable_pages, whose flawed
handling involved looking at pages at tail of LRU.
Was there significance to the sequence first ClearPageUnevictable, then
test page_evictable, then SetPageUnevictable here? I think not, we're
under LRU lock, and have no barriers between those.
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Cc: <stable@vger.kernel.org> [back to 3.1 but will need respins]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-01-21 02:34:21 +04:00
|
|
|
extern void check_move_unevictable_pages(struct page **, int nr_pages);
|
2008-10-19 07:26:53 +04:00
|
|
|
|
|
|
|
extern unsigned long scan_unevictable_pages;
|
2009-09-24 02:57:19 +04:00
|
|
|
extern int scan_unevictable_handler(struct ctl_table *, int,
|
2008-10-19 07:26:53 +04:00
|
|
|
void __user *, size_t *, loff_t *);
|
2010-10-27 01:21:28 +04:00
|
|
|
#ifdef CONFIG_NUMA
|
2008-10-19 07:26:53 +04:00
|
|
|
extern int scan_unevictable_register_node(struct node *node);
|
|
|
|
extern void scan_unevictable_unregister_node(struct node *node);
|
2010-10-27 01:21:28 +04:00
|
|
|
#else
|
|
|
|
/* !CONFIG_NUMA stub: ignores @node and always returns 0. */
static inline int scan_unevictable_register_node(struct node *node)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
/* !CONFIG_NUMA stub: empty body, does nothing with @node. */
static inline void scan_unevictable_unregister_node(struct node *node)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
Unevictable LRU Infrastructure
When the system contains lots of mlocked or otherwise unevictable pages,
the pageout code (kswapd) can spend lots of time scanning over these
pages. Worse still, the presence of lots of unevictable pages can confuse
kswapd into thinking that more aggressive pageout modes are required,
resulting in all kinds of bad behaviour.
Infrastructure to manage pages excluded from reclaim--i.e., hidden from
vmscan. Based on a patch by Larry Woodman of Red Hat. Reworked to
maintain "unevictable" pages on a separate per-zone LRU list, to "hide"
them from vmscan.
Kosaki Motohiro added the support for the memory controller unevictable
lru list.
Pages on the unevictable list have both PG_unevictable and PG_lru set.
Thus, PG_unevictable is analogous to and mutually exclusive with
PG_active--it specifies which LRU list the page is on.
The unevictable infrastructure is enabled by a new mm Kconfig option
[CONFIG_]UNEVICTABLE_LRU.
A new function 'page_evictable(page, vma)' in vmscan.c tests whether or
not a page may be evictable. Subsequent patches will add the various
!evictable tests. We'll want to keep these tests light-weight for use in
shrink_active_list() and, possibly, the fault path.
To avoid races between tasks putting pages [back] onto an LRU list and
tasks that might be moving the page from non-evictable to evictable state,
the new function 'putback_lru_page()' -- inverse to 'isolate_lru_page()'
-- tests the "evictability" of a page after placing it on the LRU, before
dropping the reference. If the page has become unevictable,
putback_lru_page() will redo the 'putback', thus moving the page to the
unevictable list. This way, we avoid "stranding" evictable pages on the
unevictable list.
[akpm@linux-foundation.org: fix fallout from out-of-order merge]
[riel@redhat.com: fix UNEVICTABLE_LRU and !PROC_PAGE_MONITOR build]
[nishimura@mxp.nes.nec.co.jp: remove redundant mapping check]
[kosaki.motohiro@jp.fujitsu.com: unevictable-lru-infrastructure: putback_lru_page()/unevictable page handling rework]
[kosaki.motohiro@jp.fujitsu.com: kill unnecessary lock_page() in vmscan.c]
[kosaki.motohiro@jp.fujitsu.com: revert migration change of unevictable lru infrastructure]
[kosaki.motohiro@jp.fujitsu.com: revert to unevictable-lru-infrastructure-kconfig-fix.patch]
[kosaki.motohiro@jp.fujitsu.com: restore patch failure of vmstat-unevictable-and-mlocked-pages-vm-events.patch]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Debugged-by: Benjamin Kidwell <benjkidwell@yahoo.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 07:26:39 +04:00
|
|
|
|
2006-06-27 13:53:33 +04:00
|
|
|
extern int kswapd_run(int nid);
|
2009-12-15 04:58:33 +03:00
|
|
|
extern void kswapd_stop(int nid);
|
2011-07-27 03:08:21 +04:00
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
|
|
|
extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
|
|
|
|
#else
|
|
|
|
/*
 * Fallback used when CONFIG_CGROUP_MEM_RES_CTLR is not set: @mem is ignored
 * and the global vm_swappiness value is returned.
 */
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
|
|
|
|
{
|
|
|
|
return vm_swappiness;
|
|
|
|
}
|
|
|
|
#endif
|
2012-04-06 01:25:16 +04:00
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
|
|
|
extern void mem_cgroup_uncharge_swap(swp_entry_t ent);
|
|
|
|
#else
|
|
|
|
/* Stub used when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not set: empty body. */
static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
2005-04-17 02:20:36 +04:00
|
|
|
#ifdef CONFIG_SWAP
|
|
|
|
/* linux/mm/page_io.c */
|
2009-06-17 02:33:02 +04:00
|
|
|
extern int swap_readpage(struct page *);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern int swap_writepage(struct page *page, struct writeback_control *wbc);
|
2007-09-27 14:47:43 +04:00
|
|
|
extern void end_swap_bio_read(struct bio *bio, int err);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* linux/mm/swap_state.c */
|
|
|
|
extern struct address_space swapper_space;
|
|
|
|
#define total_swapcache_pages swapper_space.nrpages
|
|
|
|
extern void show_swap_cache_info(void);
|
2009-01-07 01:39:39 +03:00
|
|
|
extern int add_to_swap(struct page *);
|
2008-02-05 09:28:50 +03:00
|
|
|
extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern void __delete_from_swap_cache(struct page *);
|
|
|
|
extern void delete_from_swap_cache(struct page *);
|
|
|
|
extern void free_page_and_swap_cache(struct page *);
|
|
|
|
extern void free_pages_and_swap_cache(struct page **, int);
|
2008-02-05 09:28:41 +03:00
|
|
|
extern struct page *lookup_swap_cache(swp_entry_t);
|
swapin needs gfp_mask for loop on tmpfs
Building in a filesystem on a loop device on a tmpfs file can hang when
swapping, the loop thread caught in that infamous throttle_vm_writeout.
In theory this is a long standing problem, which I've either never seen in
practice, or long ago suppressed the recollection, after discounting my load
and my tmpfs size as unrealistically high. But now, with the new aops, it has
become easy to hang on one machine.
Loop used to grab_cache_page before the old prepare_write to tmpfs, which
seems to have been enough to free up some memory for any swapin needed; but
the new write_begin lets tmpfs find or allocate the page (much nicer, since
grab_cache_page missed tmpfs pages in swapcache).
When allocating a fresh page, tmpfs respects loop's mapping_gfp_mask, which
has __GFP_IO|__GFP_FS stripped off, and throttle_vm_writeout is designed to
break out when __GFP_IO or __GFP_FS is unset; but when tmpfs swaps in,
read_swap_cache_async allocates with GFP_HIGHUSER_MOVABLE regardless of the
mapping_gfp_mask - hence the hang.
So, pass gfp_mask down the line from shmem_getpage to shmem_swapin to
swapin_readahead to read_swap_cache_async to add_to_swap_cache.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:28:42 +03:00
|
|
|
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
|
2008-02-05 09:28:41 +03:00
|
|
|
struct vm_area_struct *vma, unsigned long addr);
|
swapin needs gfp_mask for loop on tmpfs
Building in a filesystem on a loop device on a tmpfs file can hang when
swapping, the loop thread caught in that infamous throttle_vm_writeout.
In theory this is a long standing problem, which I've either never seen in
practice, or long ago suppressed the recollection, after discounting my load
and my tmpfs size as unrealistically high. But now, with the new aops, it has
become easy to hang on one machine.
Loop used to grab_cache_page before the old prepare_write to tmpfs, which
seems to have been enough to free up some memory for any swapin needed; but
the new write_begin lets tmpfs find or allocate the page (much nicer, since
grab_cache_page missed tmpfs pages in swapcache).
When allocating a fresh page, tmpfs respects loop's mapping_gfp_mask, which
has __GFP_IO|__GFP_FS stripped off, and throttle_vm_writeout is designed to
break out when __GFP_IO or __GFP_FS is unset; but when tmpfs swaps in,
read_swap_cache_async allocates with GFP_HIGHUSER_MOVABLE regardless of the
mapping_gfp_mask - hence the hang.
So, pass gfp_mask down the line from shmem_getpage to shmem_swapin to
swapin_readahead to read_swap_cache_async to add_to_swap_cache.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:28:42 +03:00
|
|
|
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
|
2008-02-05 09:28:41 +03:00
|
|
|
struct vm_area_struct *vma, unsigned long addr);
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* linux/mm/swapfile.c */
|
2009-01-07 01:39:41 +03:00
|
|
|
extern long nr_swap_pages;
|
2005-04-17 02:20:36 +04:00
|
|
|
extern long total_swap_pages;
|
|
|
|
extern void si_swapinfo(struct sysinfo *);
|
|
|
|
extern swp_entry_t get_swap_page(void);
|
2010-09-10 03:38:07 +04:00
|
|
|
extern swp_entry_t get_swap_page_of_type(int);
|
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).
swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)
We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.
Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.
This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this used to hold the continuation counts for that entry
and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 04:58:46 +03:00
|
|
|
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
|
2009-12-15 04:58:47 +03:00
|
|
|
extern void swap_shmem_alloc(swp_entry_t);
|
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).
swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)
We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.
Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.
This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this used to hold the continuation counts for that entry
and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 04:58:46 +03:00
|
|
|
extern int swap_duplicate(swp_entry_t);
|
|
|
|
extern int swapcache_prepare(swp_entry_t);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern void swap_free(swp_entry_t);
|
2009-06-17 02:32:52 +04:00
|
|
|
extern void swapcache_free(swp_entry_t, struct page *page);
|
2009-01-07 01:40:10 +03:00
|
|
|
extern int free_swap_and_cache(swp_entry_t);
|
2007-01-06 03:36:28 +03:00
|
|
|
extern int swap_type_of(dev_t, sector_t, struct block_device **);
|
2006-03-23 13:59:59 +03:00
|
|
|
extern unsigned int count_swap_pages(int, int);
|
2009-12-15 04:58:49 +03:00
|
|
|
extern sector_t map_swap_page(struct page *, struct block_device **);
|
2006-12-07 07:34:10 +03:00
|
|
|
extern sector_t swapdev_block(int, pgoff_t);
|
2009-01-07 01:39:34 +03:00
|
|
|
extern int reuse_swap_page(struct page *);
|
2009-01-07 01:39:36 +03:00
|
|
|
extern int try_to_free_swap(struct page *);
|
2005-04-17 02:20:36 +04:00
|
|
|
struct backing_dev_info;
|
|
|
|
|
|
|
|
/* linux/mm/thrash.c */
|
2009-06-23 23:36:58 +04:00
|
|
|
extern struct mm_struct *swap_token_mm;
|
|
|
|
extern void grab_swap_token(struct mm_struct *);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern void __put_swap_token(struct mm_struct *);
|
2011-06-16 02:08:13 +04:00
|
|
|
extern void disable_swap_token(struct mem_cgroup *memcg);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
static inline int has_swap_token(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return (mm == swap_token_mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Hand @mm to __put_swap_token(), but only when has_swap_token(mm) is true.
 *
 * The inline check means the out-of-line __put_swap_token() is not called
 * at all for an mm that does not match swap_token_mm.
 */
static inline void put_swap_token(struct mm_struct *mm)
{
	if (has_swap_token(mm))
		__put_swap_token(mm);
}
|
|
|
|
|
2009-01-08 05:07:56 +03:00
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
2009-06-18 03:27:17 +04:00
|
|
|
extern void
|
|
|
|
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout);
|
2010-03-11 02:22:17 +03:00
|
|
|
extern int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep);
|
2009-01-08 05:07:56 +03:00
|
|
|
#else
|
2009-01-08 05:08:00 +03:00
|
|
|
static inline void
|
2009-06-18 03:27:17 +04:00
|
|
|
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
|
2009-01-08 05:08:00 +03:00
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
2009-01-08 05:07:56 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#else /* CONFIG_SWAP */
|
|
|
|
|
2009-01-07 01:39:41 +03:00
|
|
|
/* In the !CONFIG_SWAP branch this name expands to the constant long 0. */
#define nr_swap_pages 0L
|
|
|
|
/* In the !CONFIG_SWAP branch this name expands to the constant long 0. */
#define total_swap_pages 0L
|
2005-04-17 02:20:36 +04:00
|
|
|
/* In the !CONFIG_SWAP branch this name expands to the constant unsigned long 0. */
#define total_swapcache_pages 0UL
|
|
|
|
|
|
|
|
/*
 * !CONFIG_SWAP version of si_swapinfo(): set both the freeswap and the
 * totalswap member of *@val to 0.
 *
 * @val is evaluated twice and must point to an object that has both members.
 * The do/while (0) wrapper makes the expansion behave as one statement, so
 * it is safe in an unbraced if/else.
 */
#define si_swapinfo(val) \
	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
|
2005-08-07 20:42:24 +04:00
|
|
|
/* only sparc can not include linux/pagemap.h in this file
|
|
|
|
* so leave page_cache_release and release_pages undeclared... */
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
 * !CONFIG_SWAP version: expands to a bare page_cache_release(page).
 *
 * page_cache_release() is intentionally left undeclared by this header (see
 * the sparc note above), so any user of this macro must already have it in
 * scope.
 */
#define free_page_and_swap_cache(page) \
	page_cache_release(page)
|
|
|
|
/*
 * !CONFIG_SWAP version: forward @pages and @nr to release_pages(), passing 0
 * as the third argument.
 *
 * The expansion deliberately has no trailing semicolon; the caller's own ';'
 * ends the statement.  A semicolon baked into the macro would expand
 * "if (x) free_pages_and_swap_cache(p, n); else ..." into two statements and
 * orphan the else.  release_pages() is intentionally left undeclared by this
 * header, so users of the macro must already have it in scope.
 */
#define free_pages_and_swap_cache(pages, nr) \
	release_pages((pages), (nr), 0)
|
|
|
|
|
2006-06-23 13:03:42 +04:00
|
|
|
/* !CONFIG_SWAP stub: takes no arguments and does nothing. */
static inline void show_swap_cache_info(void)
{
}
|
|
|
|
|
2009-01-07 01:40:10 +03:00
|
|
|
/*
 * !CONFIG_SWAP version: the whole operation collapses to
 * is_migration_entry(swp), so the result is whatever that helper (not
 * declared in this part of the header) reports for @swp.
 */
#define free_swap_and_cache(swp) is_migration_entry(swp)
|
2009-06-17 02:32:52 +04:00
|
|
|
/*
 * !CONFIG_SWAP version: expands to is_migration_entry(swp), so the result is
 * whatever that helper (not declared in this part of the header) reports
 * for @swp.
 */
#define swapcache_prepare(swp) is_migration_entry(swp)
|
2006-06-23 13:03:42 +04:00
|
|
|
|
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).
swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)
We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.
Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.
This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this is used to hold the continuation counts for that entry
and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 04:58:46 +03:00
|
|
|
/*
 * !CONFIG_SWAP stub: ignores @swp and @gfp_mask and always returns 0.
 */
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
	return 0;
}
|
|
|
|
|
2009-12-15 04:58:47 +03:00
|
|
|
/* !CONFIG_SWAP stub: ignores @swp and does nothing. */
static inline void swap_shmem_alloc(swp_entry_t swp)
{
}
|
|
|
|
|
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same
swap page is inserted into another mm (when forking finds a swap entry in
place of a pte, or when reclaim unmaps a pte to insert the swap entry).
swap_info_struct's vmalloc'ed swap_map is the array of these reference
counts: but what happens when the unsigned short (or unsigned char since
the preceding patch) is full? (and its high bit is kept for a cache flag)
We then lose track of it, never freeing, leaving it in use until swapoff:
at which point we _hope_ that a single pass will have found all instances,
assume there are no more, and will lose user data if we're wrong.
Swapping of KSM pages has not yet been enabled; but it is implemented,
and makes it very easy for a user to overflow the maximum swap count:
possible with ordinary process pages, but unlikely, even when pid_max
has been raised from PID_MAX_DEFAULT.
This patch implements swap count continuations: when the count overflows,
a continuation page is allocated and linked to the original vmalloc'ed
map page, and this is used to hold the continuation counts for that entry
and its neighbours. These continuation pages are seldom referenced:
the common paths all work on the original swap_map, only referring to
a continuation page when the low "digit" of a count is incremented or
decremented through SWAP_MAP_MAX.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-15 04:58:46 +03:00
|
|
|
/*
 * !CONFIG_SWAP stub for swap_duplicate(): ignores @swp and always returns 0.
 */
static inline int swap_duplicate(swp_entry_t swp)
{
	return 0;
}
|
|
|
|
|
2006-06-23 13:03:42 +04:00
|
|
|
/* !CONFIG_SWAP stub for swap_free(): ignores @swp and does nothing. */
static inline void swap_free(swp_entry_t swp)
{
}
|
|
|
|
|
2009-06-17 02:32:52 +04:00
|
|
|
/*
 * !CONFIG_SWAP stub for swapcache_free(): ignores @swp and @page and does
 * nothing.
 */
static inline void swapcache_free(swp_entry_t swp, struct page *page)
{
}
|
|
|
|
|
swapin needs gfp_mask for loop on tmpfs
Building in a filesystem on a loop device on a tmpfs file can hang when
swapping, the loop thread caught in that infamous throttle_vm_writeout.
In theory this is a long standing problem, which I've either never seen in
practice, or long ago suppressed the recollection, after discounting my load
and my tmpfs size as unrealistically high. But now, with the new aops, it has
become easy to hang on one machine.
Loop used to grab_cache_page before the old prepare_write to tmpfs, which
seems to have been enough to free up some memory for any swapin needed; but
the new write_begin lets tmpfs find or allocate the page (much nicer, since
grab_cache_page missed tmpfs pages in swapcache).
When allocating a fresh page, tmpfs respects loop's mapping_gfp_mask, which
has __GFP_IO|__GFP_FS stripped off, and throttle_vm_writeout is designed to
break out when __GFP_IO or __GFP_FS is unset; but when tmpfs swaps in,
read_swap_cache_async allocates with GFP_HIGHUSER_MOVABLE regardless of the
mapping_gfp_mask - hence the hang.
So, pass gfp_mask down the line from shmem_getpage to shmem_swapin to
swapin_readahead to read_swap_cache_async to add_to_swap_cache.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:28:42 +03:00
|
|
|
/*
 * !CONFIG_SWAP stub for swapin_readahead(): ignores @swp, @gfp_mask, @vma
 * and @addr, and always returns NULL.
 */
static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	return NULL;
}
|
|
|
|
|
shmem: writepage directly to swap
Synopsis: if shmem_writepage calls swap_writepage directly, most shmem
swap loads benefit, and a catastrophic interaction between SLUB and some
flash storage is avoided.
shmem_writepage() has always been peculiar in making no attempt to write:
it has just transferred a shmem page from file cache to swap cache, then
let that page make its way around the LRU again before being written and
freed.
The idea was that people use tmpfs because they want those pages to stay
in RAM; so although we give it an overflow to swap, we should resist
writing too soon, giving those pages a second chance before they can be
reclaimed.
That was always questionable, and I've toyed with this patch for years;
but never had a clear justification to depart from the original design.
It became more questionable in 2.6.28, when the split LRU patches classed
shmem and tmpfs pages as SwapBacked rather than as file_cache: that in
itself gives them more resistance to reclaim than normal file pages. I
prepared this patch for 2.6.29, but the merge window arrived before I'd
completed gathering statistics to justify sending it in.
Then while comparing SLQB against SLUB, running SLUB on a laptop I'd
habitually used with SLAB, I found SLUB to run my tmpfs kbuild swapping
tests five times slower than SLAB or SLQB - other machines slower too, but
nowhere near so bad. Simpler "cp -a" swapping tests showed the same.
slub_max_order=0 brings sanity to all, but heavy swapping is too far from
normal to justify such a tuning. The crucial factor on that laptop turns
out to be that I'm using an SD card for swap. What happens is this:
By default, SLUB uses order-2 pages for shmem_inode_cache (and many other
fs inodes), so creating tmpfs files under memory pressure brings lumpy
reclaim into play. One subpage of the order is chosen from the bottom of
the LRU as usual, then the other three picked out from their random
positions on the LRUs.
In a tmpfs load, many of these pages will be ones which already passed
through shmem_writepage, so already have swap allocated. And though their
offsets on swap were probably allocated sequentially, now that the pages
are picked off at random, their swap offsets are scattered.
But the flash storage on the SD card is very sensitive to having its
writes merged: once swap is written at scattered offsets, performance
falls apart. Rotating disk seeks increase too, but less disastrously.
So: stop giving shmem/tmpfs pages a second pass around the LRU, write them
out to swap as soon as their swap has been allocated.
It's surely possible to devise an artificial load which runs faster the
old way, one whose sizing is such that the tmpfs pages on their second
pass are the ones that are wanted again, and other pages not.
But I've not yet found such a load: on all machines, under the loads I've
tried, immediate swap_writepage speeds up shmem swapping: especially when
using the SLUB allocator (and more effectively than slub_max_order=0), but
also with the others; and it also reduces the variance between runs. How
much faster varies widely: a factor of five is rare, 5% is common.
One load which might have suffered: imagine a swapping shmem load in a
limited mem_cgroup on a machine with plenty of memory. Before 2.6.29 the
swapcache was not charged, and such a load would have run quickest with
the shmem swapcache never written to swap. But now swapcache is charged,
so even this load benefits from shmem_writepage directly to swap.
Apologies for the #ifndef CONFIG_SWAP swap_writepage() stub in swap.h:
it's silly because that will never get called; but refactoring shmem.c
sensibly according to CONFIG_SWAP will be a separate task.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-01 02:23:33 +04:00
|
|
|
/*
 * !CONFIG_SWAP stub for swap_writepage(): touches neither @p nor @wbc and
 * always returns 0.  It is here so that code referring to swap_writepage()
 * still compiles when swap is configured out.
 */
static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
{
	return 0;
}
|
|
|
|
|
2006-06-23 13:03:42 +04:00
|
|
|
/*
 * !CONFIG_SWAP stub for lookup_swap_cache(): ignores @swp and always
 * returns NULL.
 */
static inline struct page *lookup_swap_cache(swp_entry_t swp)
{
	return NULL;
}
|
|
|
|
|
2009-01-07 01:39:40 +03:00
|
|
|
/* !CONFIG_SWAP stub for add_to_swap(): ignores @page and always returns 0. */
static inline int add_to_swap(struct page *page)
{
	return 0;
}
|
|
|
|
|
2008-02-05 09:28:50 +03:00
|
|
|
static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
|
|
|
|
gfp_t gfp_mask)
|
2006-06-23 13:03:42 +04:00
|
|
|
{
|
2008-02-05 09:28:50 +03:00
|
|
|
return -1;
|
2006-06-23 13:03:42 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * !CONFIG_SWAP stub for __delete_from_swap_cache(): ignores @page and does
 * nothing.
 */
static inline void __delete_from_swap_cache(struct page *page)
{
}
|
|
|
|
|
|
|
|
/*
 * !CONFIG_SWAP stub for delete_from_swap_cache(): ignores @page and does
 * nothing.
 */
static inline void delete_from_swap_cache(struct page *page)
{
}
|
|
|
|
|
2009-01-07 01:39:34 +03:00
|
|
|
/*
 * !CONFIG_SWAP version: true exactly when page_mapcount(page) equals 1.
 * page_mapcount() is not declared in this part of the header.
 */
#define reuse_swap_page(page) (page_mapcount(page) == 1)
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2009-01-07 01:39:36 +03:00
|
|
|
/*
 * !CONFIG_SWAP stub for try_to_free_swap(): ignores @page and always
 * returns 0.
 */
static inline int try_to_free_swap(struct page *page)
{
	return 0;
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
static inline swp_entry_t get_swap_page(void)
|
|
|
|
{
|
|
|
|
swp_entry_t entry;
|
|
|
|
entry.val = 0;
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* linux/mm/thrash.c */
|
2009-09-22 04:01:13 +04:00
|
|
|
/* !CONFIG_SWAP stub for put_swap_token(): ignores @mm and does nothing. */
static inline void put_swap_token(struct mm_struct *mm)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_SWAP stub for grab_swap_token(): ignores @mm and does nothing. */
static inline void grab_swap_token(struct mm_struct *mm)
{
}
|
|
|
|
|
|
|
|
/*
 * !CONFIG_SWAP stub for has_swap_token(): ignores @mm and always returns 0,
 * i.e. no mm is ever reported as matching.
 */
static inline int has_swap_token(struct mm_struct *mm)
{
	return 0;
}
|
|
|
|
|
2011-06-16 02:08:13 +04:00
|
|
|
/*
 * !CONFIG_SWAP stub for disable_swap_token(): ignores @memcg and does
 * nothing.
 */
static inline void disable_swap_token(struct mem_cgroup *memcg)
{
}
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2009-05-29 01:34:28 +04:00
|
|
|
static inline void
|
|
|
|
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2010-03-11 02:22:17 +03:00
|
|
|
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
|
|
|
static inline int
|
|
|
|
mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#endif /* CONFIG_SWAP */
|
|
|
|
#endif /* __KERNEL__*/
|
|
|
|
#endif /* _LINUX_SWAP_H */
|