mm: make madvise(MADV_WILLNEED) support swap file prefetch

Make madvise(MADV_WILLNEED) support swap file prefetch.  If the advised
range is swapped out, the syscall starts asynchronous swap-in readahead
for it; it has no effect on memory that is not swapped out.
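
For illustration, a minimal userspace sketch of the call this enables; the
region size, mapping flags, and the assumption that the buffer was swapped
out are hypothetical choices, not part of the patch:

    #define _DEFAULT_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
    	size_t len = 64UL << 20;	/* 64MB anonymous region; size is arbitrary */
    	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
    			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    	if (buf == MAP_FAILED)
    		return 1;

    	/* ... buf is populated, then swapped out under memory pressure ... */

    	/*
    	 * Start swap-in readahead for the whole range.  The call returns
    	 * once the I/O is scheduled; it does not wait for completion.
    	 */
    	if (madvise(buf, len, MADV_WILLNEED))
    		perror("madvise");

    	/* Later faults on buf are now likely to hit the swap cache. */
    	return 0;
    }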

[akpm@linux-foundation.org: fix CONFIG_SWAP=n build]
[sasha.levin@oracle.com: fix BUG on madvise early failure]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Shaohua Li authored 2013-02-22 16:32:31 -08:00; committed by Linus Torvalds
Parent: a394cb8ee6
Commit: 1998cc0489
1 changed file with 101 additions and 4 deletions

mm/madvise.c

@@ -16,6 +16,9 @@
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,84 @@ out:
 	return error;
 }
 
+#ifdef CONFIG_SWAP
+static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+	unsigned long end, struct mm_walk *walk)
+{
+	pte_t *orig_pte;
+	struct vm_area_struct *vma = walk->private;
+	unsigned long index;
+
+	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+		return 0;
+
+	for (index = start; index != end; index += PAGE_SIZE) {
+		pte_t pte;
+		swp_entry_t entry;
+		struct page *page;
+		spinlock_t *ptl;
+
+		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
+		pte_unmap_unlock(orig_pte, ptl);
+
+		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+			continue;
+		entry = pte_to_swp_entry(pte);
+		if (unlikely(non_swap_entry(entry)))
+			continue;
+
+		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+						vma, index);
+		if (page)
+			page_cache_release(page);
+	}
+
+	return 0;
+}
+
+static void force_swapin_readahead(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	struct mm_walk walk = {
+		.mm = vma->vm_mm,
+		.pmd_entry = swapin_walk_pmd_entry,
+		.private = vma,
+	};
+
+	walk_page_range(start, end, &walk);
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+
+static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct address_space *mapping)
+{
+	pgoff_t index;
+	struct page *page;
+	swp_entry_t swap;
+
+	for (; start < end; start += PAGE_SIZE) {
+		index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+		page = find_get_page(mapping, index);
+		if (!radix_tree_exceptional_entry(page)) {
+			if (page)
+				page_cache_release(page);
+			continue;
+		}
+		swap = radix_to_swp_entry(page);
+		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
+						NULL, 0);
+		if (page)
+			page_cache_release(page);
+	}
+
+	lru_add_drain();	/* Push any new pages onto the LRU now */
+}
+#endif	/* CONFIG_SWAP */
+
 /*
  * Schedule all required I/O operations. Do not wait for completion.
  */
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
 	struct file *file = vma->vm_file;
 
+#ifdef CONFIG_SWAP
+	if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+		*prev = vma;
+		if (!file)
+			force_swapin_readahead(vma, start, end);
+		else
+			force_shm_swapin_readahead(vma, start, end,
+					file->f_mapping);
+		return 0;
+	}
+#endif
+
 	if (!file)
 		return -EBADF;
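
The branch above dispatches on the mapping type: anonymous VMAs take the
page-table walk, while swap-backed file mappings (shmem/tmpfs, SysV shm)
take the radix-tree walk. A minimal userspace sketch of the shmem side;
the helper name and the SHM_RDONLY flag are illustrative assumptions:

    #include <sys/mman.h>
    #include <sys/shm.h>

    /* Hypothetical helper: prefetch a possibly swapped-out SysV shm segment. */
    static int prefetch_shm(int shmid, size_t len)
    {
    	void *p = shmat(shmid, NULL, SHM_RDONLY);

    	if (p == (void *)-1)
    		return -1;
    	/* Exercises the force_shm_swapin_readahead() path added above. */
    	return madvise(p, len, MADV_WILLNEED);
    }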
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	int error = -EINVAL;
 	int write;
 	size_t len;
+	struct blk_plug plug;
 
 #ifdef CONFIG_MEMORY_FAILURE
 	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (vma && start > vma->vm_start)
 		prev = vma;
 
+	blk_start_plug(&plug);
 	for (;;) {
 		/* Still start < end. */
 		error = -ENOMEM;
 		if (!vma)
-			goto out;
+			goto out_plug;
 
 		/* Here start < (end|vma->vm_end). */
 		if (start < vma->vm_start) {
 			unmapped_error = -ENOMEM;
 			start = vma->vm_start;
 			if (start >= end)
-				goto out;
+				goto out_plug;
 		}
 
 		/* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
 		error = madvise_vma(vma, &prev, start, tmp, behavior);
 		if (error)
-			goto out;
+			goto out_plug;
 		start = tmp;
 		if (prev && start < prev->vm_end)
 			start = prev->vm_end;
 		error = unmapped_error;
 		if (start >= end)
-			goto out;
+			goto out_plug;
 		if (prev)
 			vma = prev->vm_next;
 		else	/* madvise_remove dropped mmap_sem */
 			vma = find_vma(current->mm, start);
 	}
+out_plug:
+	blk_finish_plug(&plug);
 out:
 	if (write)
 		up_write(&current->mm->mmap_sem);
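
For context on the blk_plug changes: plugging queues the many small reads
issued during the walk on a per-task list so the block layer can merge them
before submission. A reduced sketch of the pattern; the function name here
is hypothetical:

    /* Kernel context; blk_start_plug()/blk_finish_plug() are in <linux/blkdev.h>. */
    static void prefetch_batched(void)
    {
    	struct blk_plug plug;

    	blk_start_plug(&plug);	/* subsequent bios collect on a per-task list */
    	/* ... issue many small reads, e.g. via read_swap_cache_async() ... */
    	blk_finish_plug(&plug);	/* merge and flush the batch to the driver */
    }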