2005-04-17 02:20:36 +04:00
|
|
|
#ifndef _LINUX_SWAP_H
|
|
|
|
#define _LINUX_SWAP_H
|
|
|
|
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <linux/mmzone.h>
|
|
|
|
#include <linux/list.h>
|
2008-02-07 11:13:56 +03:00
|
|
|
#include <linux/memcontrol.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <linux/sched.h>
|
2005-07-13 00:58:31 +04:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <asm/atomic.h>
|
|
|
|
#include <asm/page.h>
|
|
|
|
|
2006-09-26 10:31:20 +04:00
|
|
|
struct notifier_block;
|
|
|
|
|
2006-09-26 10:32:42 +04:00
|
|
|
struct bio;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
 * Swap flag word layout: SWAP_FLAG_PREFER marks that a priority was
 * specified; the priority itself lives in the bits selected by
 * SWAP_FLAG_PRIO_MASK (presumably extracted as
 * (flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT -- confirm at the
 * use site).
 */
#define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK	0x7fff
#define SWAP_FLAG_PRIO_SHIFT	0
|
|
|
|
|
|
|
|
static inline int current_is_kswapd(void)
|
|
|
|
{
|
|
|
|
return current->flags & PF_KSWAPD;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
 * be swapped to.  The swap type and the offset into that swap type are
 * encoded into pte's and into pgoff_t's in the swapcache.  Using five bits
 * for the type means that the maximum number of swapcache pages is 27 bits
 * on 32-bit-pgoff_t architectures.  And that assumes that the architecture packs
 * the type/offset into the pte as 5/27 as well.
 */
#define MAX_SWAPFILES_SHIFT	5
|
[PATCH] Swapless page migration: add R/W migration entries
Implement read/write migration ptes
We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.
The VM is modified to handle the migration entries. migration entries can
only be encountered when the page they are pointing to is locked. This limits
the number of places one has to fix. We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.
We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock. This allows us to effectively stop all accesses
to a page.
Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration (I've no NUMA, just
hacking it up to migrate recklessly while running load), I've hit the
BUG_ON(!PageLocked(p)) in migration_entry_to_page.
This comes from an orphaned migration entry, unrelated to the current
correctly locked migration, but hit by remove_anon_migration_ptes as it
checks an address in each vma of the anon_vma list.
Such an orphan may be left behind if an earlier migration raced with fork:
copy_one_pte can duplicate a migration entry from parent to child, after
remove_anon_migration_ptes has checked the child vma, but before it has
removed it from the parent vma. (If the process were later to fault on this
orphaned entry, it would hit the same BUG from migration_entry_wait.)
This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
not. There's no such problem with file pages, because vma_prio_tree_add
adds child vma after parent vma, and the page table locking at each end is
enough to serialize. Follow that example with anon_vma: add new vmas to the
tail instead of the head.
(There's no corresponding problem when inserting migration entries,
because a missed pte will leave the page count and mapcount high, which is
allowed for. And there's no corresponding problem when migrating via swap,
because a leftover swap entry will be correctly faulted. But the swapless
method has no refcounting of its entries.)
From: Ingo Molnar <mingo@elte.hu>
pte_unmap_unlock() takes the pte pointer as an argument.
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration, gcc has tried to exec
a pointer instead of a string: smells like COW mappings are not being
properly write-protected on fork.
The protection in copy_one_pte looks very convincing, until at last you
realize that the second arg to make_migration_entry is a boolean "write",
and SWP_MIGRATION_READ is 30.
Anyway, it's better done like in change_pte_range, using
is_write_migration_entry and make_migration_entry_read.
From: Hugh Dickins <hugh@veritas.com>
Remove unnecessary obfuscation from sys_swapon's range check on swap type,
which blew up causing memory corruption once swapless migration made
MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:03:35 +04:00
|
|
|
#ifndef CONFIG_MIGRATION
|
2005-04-17 02:20:36 +04:00
|
|
|
#define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT)
|
[PATCH] Swapless page migration: add R/W migration entries
Implement read/write migration ptes
We take the upper two swapfiles for the two types of migration ptes and define
a series of macros in swapops.h.
The VM is modified to handle the migration entries. migration entries can
only be encountered when the page they are pointing to is locked. This limits
the number of places one has to fix. We also check in copy_pte_range and in
mprotect_pte_range() for migration ptes.
We check for migration ptes in do_swap_cache and call a function that will
then wait on the page lock. This allows us to effectively stop all accesses
to a page.
Migration entries are created by try_to_unmap if called for migration and
removed by local functions in migrate.c
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration (I've no NUMA, just
hacking it up to migrate recklessly while running load), I've hit the
BUG_ON(!PageLocked(p)) in migration_entry_to_page.
This comes from an orphaned migration entry, unrelated to the current
correctly locked migration, but hit by remove_anon_migration_ptes as it
checks an address in each vma of the anon_vma list.
Such an orphan may be left behind if an earlier migration raced with fork:
copy_one_pte can duplicate a migration entry from parent to child, after
remove_anon_migration_ptes has checked the child vma, but before it has
removed it from the parent vma. (If the process were later to fault on this
orphaned entry, it would hit the same BUG from migration_entry_wait.)
This could be fixed by locking anon_vma in copy_one_pte, but we'd rather
not. There's no such problem with file pages, because vma_prio_tree_add
adds child vma after parent vma, and the page table locking at each end is
enough to serialize. Follow that example with anon_vma: add new vmas to the
tail instead of the head.
(There's no corresponding problem when inserting migration entries,
because a missed pte will leave the page count and mapcount high, which is
allowed for. And there's no corresponding problem when migrating via swap,
because a leftover swap entry will be correctly faulted. But the swapless
method has no refcounting of its entries.)
From: Ingo Molnar <mingo@elte.hu>
pte_unmap_unlock() takes the pte pointer as an argument.
From: Hugh Dickins <hugh@veritas.com>
Several times while testing swapless page migration, gcc has tried to exec
a pointer instead of a string: smells like COW mappings are not being
properly write-protected on fork.
The protection in copy_one_pte looks very convincing, until at last you
realize that the second arg to make_migration_entry is a boolean "write",
and SWP_MIGRATION_READ is 30.
Anyway, it's better done like in change_pte_range, using
is_write_migration_entry and make_migration_entry_read.
From: Hugh Dickins <hugh@veritas.com>
Remove unnecessary obfuscation from sys_swapon's range check on swap type,
which blew up causing memory corruption once swapless migration made
MAX_SWAPFILES no longer 2 ^ MAX_SWAPFILES_SHIFT.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
From: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 13:03:35 +04:00
|
|
|
#else
|
|
|
|
/*
 * Use last two entries for page migration swap entries: the usable swap
 * types shrink by two, and the two type numbers just past MAX_SWAPFILES
 * become the read and write migration entry types.
 */
#define MAX_SWAPFILES		((1 << MAX_SWAPFILES_SHIFT)-2)
#define SWP_MIGRATION_READ	MAX_SWAPFILES
#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + 1)
|
|
|
|
#endif
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Magic header for a swap area. The first part of the union is
|
|
|
|
* what the swap magic looks like for the old (limited to 128MB)
|
|
|
|
* swap area format, the second part of the union adds - in the
|
|
|
|
* old reserved area - some extra information. Note that the first
|
|
|
|
* kilobyte is reserved for boot loader or disk label stuff...
|
|
|
|
*
|
|
|
|
* Having the magic at the end of the PAGE_SIZE makes detecting swap
|
|
|
|
* areas somewhat tricky on machines that support multiple page sizes.
|
|
|
|
* For 2.5 we'll probably want to move the magic to just beyond the
|
|
|
|
* bootbits...
|
|
|
|
*/
|
|
|
|
union swap_header {
|
|
|
|
struct {
|
|
|
|
char reserved[PAGE_SIZE - 10];
|
|
|
|
char magic[10]; /* SWAP-SPACE or SWAPSPACE2 */
|
|
|
|
} magic;
|
|
|
|
struct {
|
2006-06-23 13:03:14 +04:00
|
|
|
char bootbits[1024]; /* Space for disklabel etc. */
|
|
|
|
__u32 version;
|
|
|
|
__u32 last_page;
|
|
|
|
__u32 nr_badpages;
|
|
|
|
unsigned char sws_uuid[16];
|
|
|
|
unsigned char sws_volume[16];
|
|
|
|
__u32 padding[117];
|
|
|
|
__u32 badpages[1];
|
2005-04-17 02:20:36 +04:00
|
|
|
} info;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * A swap entry has to fit into a "unsigned long", as
 * the entry is hidden in the "index" field of the
 * swapper address space.
 *
 * The value is wrapped in a struct so that it is a distinct type rather
 * than a bare integer.
 */
typedef struct {
	unsigned long val;
} swp_entry_t;
|
|
|
|
|
|
|
|
/*
 * current->reclaim_state points to one of these when a task is running
 * memory reclaim
 */
struct reclaim_state {
	unsigned long reclaimed_slab;	/* presumably a count of slab pages freed -- confirm with users */
};
|
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
|
|
struct address_space;
|
|
|
|
struct sysinfo;
|
|
|
|
struct writeback_control;
|
|
|
|
struct zone;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
|
|
|
|
* disk blocks. A list of swap extents maps the entire swapfile. (Where the
|
|
|
|
* term `swapfile' refers to either a blockdevice or an IS_REG file. Apart
|
|
|
|
* from setup, they're handled identically.
|
|
|
|
*
|
|
|
|
* We always assume that blocks are of size PAGE_SIZE.
|
|
|
|
*/
|
|
|
|
struct swap_extent {
|
|
|
|
struct list_head list;
|
|
|
|
pgoff_t start_page;
|
|
|
|
pgoff_t nr_pages;
|
|
|
|
sector_t start_block;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Max bad pages in the new format..
 *
 * __swapoffset(x) yields the byte offset of member x inside
 * union swap_header.  MAX_SWAP_BADPAGES is the number of int-sized slots
 * between the start of info.badpages and the start of magic.magic.
 *
 * NOTE(review): the divisor is sizeof(int) while badpages[] is declared
 * as __u32; these agree only where int is 32 bits wide.
 */
#define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x)
#define MAX_SWAP_BADPAGES \
	((__swapoffset(magic.magic) - __swapoffset(info.badpages)) / sizeof(int))
|
|
|
|
|
|
|
|
/*
 * Swap area state bits.  The low bits are single-bit state flags;
 * SWP_SCANNING sits at bit 8 and is described as a refcount taken in
 * scan_swap_map (presumably incremented in units of SWP_SCANNING so the
 * count occupies the bits above the flags -- confirm in swapfile.c).
 */
enum {
	SWP_USED	= (1 << 0),	/* is slot in swap_info[] used? */
	SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap?	*/
	SWP_ACTIVE	= (SWP_USED | SWP_WRITEOK),
					/* add others here before... */
	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
};
|
|
|
|
|
|
|
|
#define SWAP_CLUSTER_MAX 32

/*
 * SWAP_MAP_MAX and SWAP_MAP_BAD together span an unsigned short: the low
 * 15 bits and the top bit.  (Presumably these are the maximum use count
 * and the bad-slot marker for swap_map[] entries -- confirm in swapfile.c.)
 */
#define SWAP_MAP_MAX	0x7fff
#define SWAP_MAP_BAD	0x8000
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The in-memory structure used to track swap areas.
|
|
|
|
*/
|
|
|
|
struct swap_info_struct {
|
|
|
|
unsigned int flags;
|
[PATCH] swap: swap_lock replace list+device
The idea of a swap_device_lock per device, and a swap_list_lock over them all,
is appealing; but in practice almost every holder of swap_device_lock must
already hold swap_list_lock, which defeats the purpose of the split.
The only exceptions have been swap_duplicate, valid_swaphandles and an
untrodden path in try_to_unuse (plus a few places added in this series).
valid_swaphandles doesn't show up high in profiles, but swap_duplicate does
demand attention. However, with the hold time in get_swap_pages so much
reduced, I've not yet found a load and set of swap device priorities to show
even swap_duplicate benefitting from the split. Certainly the split is mere
overhead in the common case of a single swap device.
So, replace swap_list_lock and swap_device_lock by spinlock_t swap_lock
(generally we seem to prefer an _ in the name, and not hide in a macro).
If someone can show a regression in swap_duplicate, then probably we should
add a hashlock for the swap_map entries alone (shorts being anatomic), so as
to help the case of the single swap device too.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-04 02:54:41 +04:00
|
|
|
int prio; /* swap priority */
|
2005-04-17 02:20:36 +04:00
|
|
|
struct file *swap_file;
|
|
|
|
struct block_device *bdev;
|
|
|
|
struct list_head extent_list;
|
|
|
|
struct swap_extent *curr_swap_extent;
|
|
|
|
unsigned old_block_size;
|
|
|
|
unsigned short * swap_map;
|
|
|
|
unsigned int lowest_bit;
|
|
|
|
unsigned int highest_bit;
|
|
|
|
unsigned int cluster_next;
|
|
|
|
unsigned int cluster_nr;
|
2005-09-04 02:54:35 +04:00
|
|
|
unsigned int pages;
|
|
|
|
unsigned int max;
|
|
|
|
unsigned int inuse_pages;
|
2005-04-17 02:20:36 +04:00
|
|
|
int next; /* next entry on swap list */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Two integer cursors into the swapfile list. */
struct swap_list_t {
	int head;	/* head of priority-ordered swapfile list */
	int next;	/* swapfile to be used next */
};
|
|
|
|
|
|
|
|
/*
 * Swap 50% full? Release swapcache more aggressively..
 *
 * True when twice nr_swap_pages is still below total_swap_pages.
 */
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
|
|
|
|
|
|
|
|
/* linux/mm/page_alloc.c */
|
|
|
|
extern unsigned long totalram_pages;
|
2006-04-11 09:52:59 +04:00
|
|
|
extern unsigned long totalreserve_pages;
|
2005-04-17 02:20:36 +04:00
|
|
|
extern long nr_swap_pages;
|
|
|
|
extern unsigned int nr_free_buffer_pages(void);
|
|
|
|
extern unsigned int nr_free_pagecache_pages(void);
|
|
|
|
|
2007-02-10 12:43:03 +03:00
|
|
|
/* Definition of global_page_state not available yet */
|
|
|
|
#define nr_free_pages() global_page_state(NR_FREE_PAGES)
|
|
|
|
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* linux/mm/swap.c */
|
2008-02-14 02:03:15 +03:00
|
|
|
extern void lru_cache_add(struct page *);
|
|
|
|
extern void lru_cache_add_active(struct page *);
|
|
|
|
extern void activate_page(struct page *);
|
|
|
|
extern void mark_page_accessed(struct page *);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern void lru_add_drain(void);
|
2006-01-19 04:42:27 +03:00
|
|
|
extern int lru_add_drain_all(void);
|
2008-04-28 13:12:38 +04:00
|
|
|
extern void rotate_reclaimable_page(struct page *page);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern void swap_setup(void);
|
|
|
|
|
|
|
|
/* linux/mm/vmscan.c */
|
2008-04-28 13:12:12 +04:00
|
|
|
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
|
2007-07-17 15:03:16 +04:00
|
|
|
gfp_t gfp_mask);
|
2008-02-07 11:14:02 +03:00
|
|
|
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
|
|
|
|
gfp_t gfp_mask);
|
2008-02-07 11:13:56 +03:00
|
|
|
extern int __isolate_lru_page(struct page *page, int mode);
|
2006-03-22 11:08:19 +03:00
|
|
|
extern unsigned long shrink_all_memory(unsigned long nr_pages);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern int vm_swappiness;
|
2006-03-22 11:09:12 +03:00
|
|
|
extern int remove_mapping(struct address_space *mapping, struct page *page);
|
2006-06-23 13:03:47 +04:00
|
|
|
extern long vm_total_pages;
|
2006-03-22 11:09:12 +03:00
|
|
|
|
2006-01-19 04:42:31 +03:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
extern int zone_reclaim_mode;
|
2006-07-03 11:24:13 +04:00
|
|
|
extern int sysctl_min_unmapped_ratio;
|
2006-09-26 10:31:52 +04:00
|
|
|
extern int sysctl_min_slab_ratio;
|
2006-01-19 04:42:31 +03:00
|
|
|
extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
|
|
|
|
#else
|
|
|
|
#define zone_reclaim_mode 0
|
|
|
|
/*
 * !CONFIG_NUMA stub: performs no reclaim and always returns 0.
 * All arguments are ignored.
 */
static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
{
	return 0;
}
|
|
|
|
#endif
|
|
|
|
|
2006-06-27 13:53:33 +04:00
|
|
|
extern int kswapd_run(int nid);
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#ifdef CONFIG_MMU
|
|
|
|
/* linux/mm/shmem.c */
|
|
|
|
extern int shmem_unuse(swp_entry_t entry, struct page *page);
|
|
|
|
#endif /* CONFIG_MMU */
|
|
|
|
|
|
|
|
extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
|
|
|
|
|
|
|
|
#ifdef CONFIG_SWAP
|
|
|
|
/* linux/mm/page_io.c */
|
|
|
|
extern int swap_readpage(struct file *, struct page *);
|
|
|
|
extern int swap_writepage(struct page *page, struct writeback_control *wbc);
|
2007-09-27 14:47:43 +04:00
|
|
|
extern void end_swap_bio_read(struct bio *bio, int err);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* linux/mm/swap_state.c */
|
|
|
|
extern struct address_space swapper_space;
|
|
|
|
#define total_swapcache_pages swapper_space.nrpages
|
|
|
|
extern void show_swap_cache_info(void);
|
2006-01-08 12:00:53 +03:00
|
|
|
extern int add_to_swap(struct page *, gfp_t);
|
2008-02-05 09:28:50 +03:00
|
|
|
extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern void __delete_from_swap_cache(struct page *);
|
|
|
|
extern void delete_from_swap_cache(struct page *);
|
|
|
|
extern void free_page_and_swap_cache(struct page *);
|
|
|
|
extern void free_pages_and_swap_cache(struct page **, int);
|
2008-02-05 09:28:41 +03:00
|
|
|
extern struct page *lookup_swap_cache(swp_entry_t);
|
swapin needs gfp_mask for loop on tmpfs
Building in a filesystem on a loop device on a tmpfs file can hang when
swapping, the loop thread caught in that infamous throttle_vm_writeout.
In theory this is a long standing problem, which I've either never seen in
practice, or long ago suppressed the recollection, after discounting my load
and my tmpfs size as unrealistically high. But now, with the new aops, it has
become easy to hang on one machine.
Loop used to grab_cache_page before the old prepare_write to tmpfs, which
seems to have been enough to free up some memory for any swapin needed; but
the new write_begin lets tmpfs find or allocate the page (much nicer, since
grab_cache_page missed tmpfs pages in swapcache).
When allocating a fresh page, tmpfs respects loop's mapping_gfp_mask, which
has __GFP_IO|__GFP_FS stripped off, and throttle_vm_writeout is designed to
break out when __GFP_IO or GFP_FS is unset; but when tmpfs swaps in,
read_swap_cache_async allocates with GFP_HIGHUSER_MOVABLE regardless of the
mapping_gfp_mask - hence the hang.
So, pass gfp_mask down the line from shmem_getpage to shmem_swapin to
swapin_readahead to read_swap_cache_async to add_to_swap_cache.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:28:42 +03:00
|
|
|
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
|
2008-02-05 09:28:41 +03:00
|
|
|
struct vm_area_struct *vma, unsigned long addr);
|
swapin needs gfp_mask for loop on tmpfs
Building in a filesystem on a loop device on a tmpfs file can hang when
swapping, the loop thread caught in that infamous throttle_vm_writeout.
In theory this is a long standing problem, which I've either never seen in
practice, or long ago suppressed the recollection, after discounting my load
and my tmpfs size as unrealistically high. But now, with the new aops, it has
become easy to hang on one machine.
Loop used to grab_cache_page before the old prepare_write to tmpfs, which
seems to have been enough to free up some memory for any swapin needed; but
the new write_begin lets tmpfs find or allocate the page (much nicer, since
grab_cache_page missed tmpfs pages in swapcache).
When allocating a fresh page, tmpfs respects loop's mapping_gfp_mask, which
has __GFP_IO|__GFP_FS stripped off, and throttle_vm_writeout is designed to
break out when __GFP_IO or GFP_FS is unset; but when tmpfs swaps in,
read_swap_cache_async allocates with GFP_HIGHUSER_MOVABLE regardless of the
mapping_gfp_mask - hence the hang.
So, pass gfp_mask down the line from shmem_getpage to shmem_swapin to
swapin_readahead to read_swap_cache_async to add_to_swap_cache.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:28:42 +03:00
|
|
|
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
|
2008-02-05 09:28:41 +03:00
|
|
|
struct vm_area_struct *vma, unsigned long addr);
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* linux/mm/swapfile.c */
|
|
|
|
extern long total_swap_pages;
|
|
|
|
extern unsigned int nr_swapfiles;
|
|
|
|
extern void si_swapinfo(struct sysinfo *);
|
|
|
|
extern swp_entry_t get_swap_page(void);
|
2006-03-23 13:59:59 +03:00
|
|
|
extern swp_entry_t get_swap_page_of_type(int);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern int swap_duplicate(swp_entry_t);
|
|
|
|
extern int valid_swaphandles(swp_entry_t, unsigned long *);
|
|
|
|
extern void swap_free(swp_entry_t);
|
|
|
|
extern void free_swap_and_cache(swp_entry_t);
|
2007-01-06 03:36:28 +03:00
|
|
|
extern int swap_type_of(dev_t, sector_t, struct block_device **);
|
2006-03-23 13:59:59 +03:00
|
|
|
extern unsigned int count_swap_pages(int, int);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
|
2006-12-07 07:34:10 +03:00
|
|
|
extern sector_t swapdev_block(int, pgoff_t);
|
2005-04-17 02:20:36 +04:00
|
|
|
extern struct swap_info_struct *get_swap_info_struct(unsigned);
|
|
|
|
extern int can_share_swap_page(struct page *);
|
|
|
|
extern int remove_exclusive_swap_page(struct page *);
|
|
|
|
struct backing_dev_info;
|
|
|
|
|
[PATCH] swap: swap_lock replace list+device
The idea of a swap_device_lock per device, and a swap_list_lock over them all,
is appealing; but in practice almost every holder of swap_device_lock must
already hold swap_list_lock, which defeats the purpose of the split.
The only exceptions have been swap_duplicate, valid_swaphandles and an
untrodden path in try_to_unuse (plus a few places added in this series).
valid_swaphandles doesn't show up high in profiles, but swap_duplicate does
demand attention. However, with the hold time in get_swap_pages so much
reduced, I've not yet found a load and set of swap device priorities to show
even swap_duplicate benefitting from the split. Certainly the split is mere
overhead in the common case of a single swap device.
So, replace swap_list_lock and swap_device_lock by spinlock_t swap_lock
(generally we seem to prefer an _ in the name, and not hide in a macro).
If someone can show a regression in swap_duplicate, then probably we should
add a hashlock for the swap_map entries alone (shorts being atomic), so as
to help the case of the single swap device too.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-04 02:54:41 +04:00
|
|
|
extern spinlock_t swap_lock;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* linux/mm/thrash.c */
|
|
|
|
extern struct mm_struct * swap_token_mm;
|
|
|
|
extern void grab_swap_token(void);
|
|
|
|
extern void __put_swap_token(struct mm_struct *);
|
|
|
|
|
|
|
|
static inline int has_swap_token(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return (mm == swap_token_mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Release the swap token, but only if @mm is the current holder; for any
 * other mm this is a no-op.  The actual release is done by
 * __put_swap_token().
 */
static inline void put_swap_token(struct mm_struct *mm)
{
	if (has_swap_token(mm))
		__put_swap_token(mm);
}
|
|
|
|
|
2005-11-29 00:44:07 +03:00
|
|
|
static inline void disable_swap_token(void)
|
|
|
|
{
|
|
|
|
put_swap_token(swap_token_mm);
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#else /* CONFIG_SWAP */
|
|
|
|
|
|
|
|
#define total_swap_pages 0
|
|
|
|
#define total_swapcache_pages 0UL
|
|
|
|
|
|
|
|
/*
 * !CONFIG_SWAP: report no swap at all by zeroing both the free and total
 * swap fields of the structure @val points to.
 */
#define si_swapinfo(val) \
	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
|
2005-08-07 20:42:24 +04:00
|
|
|
/* only sparc can not include linux/pagemap.h in this file
 * so leave page_cache_release and release_pages undeclared... */
/* !CONFIG_SWAP: there is no swap cache, so just drop the page reference. */
#define free_page_and_swap_cache(page) \
	page_cache_release(page)
|
|
|
|
/*
 * !CONFIG_SWAP: there is no swap cache, so releasing @nr pages from the
 * @pages array is all that is needed.
 *
 * The expansion deliberately carries no trailing semicolon: the caller's
 * own ';' terminates the statement, which keeps the macro usable as the
 * body of an unbraced if/else.
 */
#define free_pages_and_swap_cache(pages, nr) \
	release_pages((pages), (nr), 0)
|
|
|
|
|
2006-06-23 13:03:42 +04:00
|
|
|
/* !CONFIG_SWAP stub: there is no swap cache to report on; does nothing. */
static inline void show_swap_cache_info(void)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_SWAP stub: no swap entries exist, so @swp is ignored. */
static inline void free_swap_and_cache(swp_entry_t swp)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_SWAP stub: ignores @swp and always returns 0. */
static inline int swap_duplicate(swp_entry_t swp)
{
	return 0;
}
|
|
|
|
|
|
|
|
/* !CONFIG_SWAP stub: nothing to free; @swp is ignored. */
static inline void swap_free(swp_entry_t swp)
{
}
|
|
|
|
|
swapin needs gfp_mask for loop on tmpfs
Building in a filesystem on a loop device on a tmpfs file can hang when
swapping, the loop thread caught in that infamous throttle_vm_writeout.
In theory this is a long standing problem, which I've either never seen in
practice, or long ago suppressed the recollection, after discounting my load
and my tmpfs size as unrealistically high. But now, with the new aops, it has
become easy to hang on one machine.
Loop used to grab_cache_page before the old prepare_write to tmpfs, which
seems to have been enough to free up some memory for any swapin needed; but
the new write_begin lets tmpfs find or allocate the page (much nicer, since
grab_cache_page missed tmpfs pages in swapcache).
When allocating a fresh page, tmpfs respects loop's mapping_gfp_mask, which
has __GFP_IO|__GFP_FS stripped off, and throttle_vm_writeout is designed to
break out when __GFP_IO or GFP_FS is unset; but when tmpfs swaps in,
read_swap_cache_async allocates with GFP_HIGHUSER_MOVABLE regardless of the
mapping_gfp_mask - hence the hang.
So, pass gfp_mask down the line from shmem_getpage to shmem_swapin to
swapin_readahead to read_swap_cache_async to add_to_swap_cache.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 09:28:42 +03:00
|
|
|
/*
 * !CONFIG_SWAP stub: nothing can be swapped in, so every argument is
 * ignored and NULL is returned.
 */
static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	return NULL;
}
|
|
|
|
|
|
|
|
/* !CONFIG_SWAP stub: the lookup never finds a page; always returns NULL. */
static inline struct page *lookup_swap_cache(swp_entry_t swp)
{
	return NULL;
}
|
|
|
|
|
2006-05-15 20:44:22 +04:00
|
|
|
#define can_share_swap_page(p) (page_mapcount(p) == 1)
|
2006-06-23 13:03:42 +04:00
|
|
|
|
2008-02-05 09:28:50 +03:00
|
|
|
static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
|
|
|
|
gfp_t gfp_mask)
|
2006-06-23 13:03:42 +04:00
|
|
|
{
|
2008-02-05 09:28:50 +03:00
|
|
|
return -1;
|
2006-06-23 13:03:42 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* !CONFIG_SWAP stub: does nothing; @page is ignored. */
static inline void __delete_from_swap_cache(struct page *page)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_SWAP stub: does nothing; @page is ignored. */
static inline void delete_from_swap_cache(struct page *page)
{
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#define swap_token_default_timeout 0
|
|
|
|
|
|
|
|
/* !CONFIG_SWAP stub: ignores @p and always returns 0. */
static inline int remove_exclusive_swap_page(struct page *p)
{
	return 0;
}
|
|
|
|
|
|
|
|
static inline swp_entry_t get_swap_page(void)
|
|
|
|
{
|
|
|
|
swp_entry_t entry;
|
|
|
|
entry.val = 0;
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* linux/mm/thrash.c */
/*
 * !CONFIG_SWAP: the swap-token operations expand to empty statements and
 * has_swap_token() is the constant 0.  The arguments are not evaluated.
 */
#define put_swap_token(x) do { } while (0)
#define grab_swap_token() do { } while (0)
#define has_swap_token(x) 0
#define disable_swap_token() do { } while (0)
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
#endif /* CONFIG_SWAP */
|
|
|
|
#endif /* __KERNEL__*/
|
|
|
|
#endif /* _LINUX_SWAP_H */
|