From b20a35035f983f4ac7e29c4a68f30e43510007e0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 22 Mar 2006 00:09:12 -0800 Subject: [PATCH] [PATCH] page migration reorg Centralize the page migration functions in anticipation of additional tinkering. Creates a new file mm/migrate.c 1. Extract buffer_migrate_page() from fs/buffer.c 2. Extract central migration code from vmscan.c 3. Extract some components from mempolicy.c 4. Export pageout() and remove_from_swap() from vmscan.c 5. Make it possible to configure NUMA systems without page migration and non-NUMA systems with page migration. I had to so some #ifdeffing in mempolicy.c that may need a cleanup. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 62 ---- fs/xfs/linux-2.6/xfs_buf.c | 1 + include/linux/migrate.h | 36 ++ include/linux/swap.h | 34 +- mm/Kconfig | 6 + mm/Makefile | 2 + mm/mempolicy.c | 113 ++----- mm/migrate.c | 655 +++++++++++++++++++++++++++++++++++++ mm/swap_state.c | 1 + mm/vmscan.c | 491 +-------------------------- 10 files changed, 741 insertions(+), 660 deletions(-) create mode 100644 include/linux/migrate.h create mode 100644 mm/migrate.c diff --git a/fs/buffer.c b/fs/buffer.c index a9b399402007..1d3683d496f8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3050,68 +3050,6 @@ asmlinkage long sys_bdflush(int func, long data) return 0; } -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. - */ -#ifdef CONFIG_MIGRATION -int buffer_migrate_page(struct page *newpage, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct buffer_head *bh, *head; - int rc; - - if (!mapping) - return -EAGAIN; - - if (!page_has_buffers(page)) - return migrate_page(newpage, page); - - head = page_buffers(page); - - rc = migrate_page_remove_references(newpage, page, 3); - if (rc) - return rc; - - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; - - } while (bh != head); - - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); - - bh = head; - do { - set_bh_page(bh, newpage, bh_offset(bh)); - bh = bh->b_this_page; - - } while (bh != head); - - SetPagePrivate(newpage); - - migrate_page_copy(newpage, page); - - bh = head; - do { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; - - } while (bh != head); - - return 0; -} -EXPORT_SYMBOL(buffer_migrate_page); -#endif - /* * Buffer-head allocation */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index bfb4f2917bb6..8cdfa4151659 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "xfs_linux.h" STATIC kmem_zone_t *xfs_buf_zone; diff --git a/include/linux/migrate.h b/include/linux/migrate.h new file mode 100644 index 000000000000..7d09962c3c0b --- /dev/null +++ b/include/linux/migrate.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_MIGRATE_H +#define _LINUX_MIGRATE_H + +#include +#include + +#ifdef CONFIG_MIGRATION +extern int isolate_lru_page(struct page *p, struct list_head *pagelist); +extern int putback_lru_pages(struct list_head *l); +extern int migrate_page(struct page *, struct page *); +extern void migrate_page_copy(struct page *, struct page *); +extern int migrate_page_remove_references(struct page *, struct page *, int); +extern int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed); +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest); +extern int fail_migrate_page(struct page *, struct page *); + +extern int migrate_prep(void); + +#else + +static inline int isolate_lru_page(struct page *p, struct list_head *list) + { return -ENOSYS; } +static inline int putback_lru_pages(struct list_head *l) { return 0; } +static inline int migrate_pages(struct list_head *l, struct list_head *t, + struct list_head *moved, struct list_head *failed) { return -ENOSYS; } + +static inline int migrate_prep(void) { return -ENOSYS; } + +/* Possible settings for the migrate_page() method in address_operations */ +#define migrate_page NULL +#define fail_migrate_page NULL + +#endif /* CONFIG_MIGRATION */ +#endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 3dc6c89c49b8..12415dd94451 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -175,6 +175,21 @@ extern void swap_setup(void); extern unsigned long try_to_free_pages(struct zone **, gfp_t); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; +extern int remove_mapping(struct address_space *mapping, struct page *page); + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write page out, page is locked */ + PAGE_KEEP, + /* move page to the active list, page is locked */ + PAGE_ACTIVATE, + /* page has been sent to the disk successfully, page is unlocked */ + PAGE_SUCCESS, + /* page is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +extern pageout_t pageout(struct page *page, struct address_space *mapping); #ifdef CONFIG_NUMA extern int zone_reclaim_mode; @@ -188,25 +203,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_MIGRATION -extern int isolate_lru_page(struct page *p); -extern unsigned long putback_lru_pages(struct list_head *l); -extern int migrate_page(struct page *, struct page *); -extern void migrate_page_copy(struct page *, struct page *); -extern int migrate_page_remove_references(struct page *, struct page *, int); -extern unsigned long migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed); -extern int fail_migrate_page(struct page *, struct page *); -#else -static inline int isolate_lru_page(struct page *p) { return -ENOSYS; } -static inline int putback_lru_pages(struct list_head *l) { return 0; } -static inline int migrate_pages(struct list_head *l, struct list_head *t, - struct list_head *moved, struct list_head *failed) { return -ENOSYS; } -/* Possible settings for the migrate_page() method in address_operations */ -#define migrate_page NULL -#define fail_migrate_page NULL -#endif - #ifdef CONFIG_MMU /* linux/mm/shmem.c */ extern int shmem_unuse(swp_entry_t entry, struct page *page); diff --git a/mm/Kconfig b/mm/Kconfig index a9cb80ae6409..bd80460360db 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS # support for page migration # config MIGRATION + bool "Page migration" def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM depends on SWAP + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful for + example on NUMA systems to put pages nearer to the processors accessing + the page. diff --git a/mm/Makefile b/mm/Makefile index 9aa03fa1dcc3..f10c753dce6d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o +obj-$(CONFIG_MIGRATION) += migrate.o + diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 96195dcb62e1..e93cc740c22b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include @@ -95,9 +96,6 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -/* The number of pages to migrate per call to migrate_pages() */ -#define MIGRATE_CHUNK_SIZE 256 - static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; @@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct vm_area_struct *first, *vma, *prev; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return ERR_PTR(-ENODEV); - /* - * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. - */ - lru_add_drain_all(); + err = migrate_prep(); + if (err) + return ERR_PTR(err); } first = find_vma(mm, start); @@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION /* * page migration */ - static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { - if (isolate_lru_page(page)) - list_add_tail(&page->lru, pagelist); - } -} - -/* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code - */ -static int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) -{ - LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); - int err = 0; - unsigned long offset = 0; - int nr_pages; - struct page *page; - struct list_head *p; - -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. - * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; - } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - - if (!page) { - err = -ENOMEM; - goto out; - } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; - } - err = migrate_pages(pagelist, &newlist, &moved, &failed); - - putback_lru_pages(&moved); /* Call release pages instead ?? */ - - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; -out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); - } - list_splice(&failed, pagelist); - if (err < 0) - return err; - - /* Calculate number of leftover pages */ - nr_pages = 0; - list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); } /* @@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm, if (err < 0) return err; return busy; + } +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} +#endif + long do_mbind(unsigned long start, unsigned long len, unsigned long mode, nodemask_t *nmask, unsigned long flags) { @@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len, if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } + if (!list_empty(&pagelist)) putback_lru_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 000000000000..09f6e4aa87fc --- /dev/null +++ b/mm/migrate.c @@ -0,0 +1,655 @@ +/* + * Memory Migration functionality - linux/mm/migration.c + * + * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter + * + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +#include +#include +#include +#include +#include /* for try_to_release_page(), + buffer_heads_over_limit */ +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#include "internal.h" + +/* The maximum number of pages to take off the LRU for migration */ +#define MIGRATE_CHUNK_SIZE 256 + +#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) + +/* + * Isolate one page from the LRU lists. If successful put it onto + * the indicated list with elevated page count. + * + * Result: + * -EBUSY: page not on LRU list + * 0: page removed from LRU list and added to the specified list. + */ +int isolate_lru_page(struct page *page, struct list_head *pagelist) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page)) { + ret = 0; + get_page(page); + ClearPageLRU(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + list_add_tail(&page->lru, pagelist); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + +/* + * migrate_prep() needs to be called after we have compiled the list of pages + * to be migrated using isolate_lru_page() but before we begin a series of calls + * to migrate_pages(). + */ +int migrate_prep(void) +{ + /* Must have swap device for migration */ + if (nr_swap_pages <= 0) + return -ENODEV; + + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ + lru_add_drain_all(); + + return 0; +} + +static inline void move_to_lru(struct page *page) +{ + list_del(&page->lru); + if (PageActive(page)) { + /* + * lru_cache_add_active checks that + * the PG_active bit is off. + */ + ClearPageActive(page); + lru_cache_add_active(page); + } else { + lru_cache_add(page); + } + put_page(page); +} + +/* + * Add isolated pages on the list back to the LRU. + * + * returns the number of pages put back. + */ +int putback_lru_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + int count = 0; + + list_for_each_entry_safe(page, page2, l, lru) { + move_to_lru(page); + count++; + } + return count; +} + +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + +/* + * swapout a single page + * page is locked upon entry, unlocked on exit + */ +static int swap_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (page_mapped(page) && mapping) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) + goto unlock_retry; + + if (PageDirty(page)) { + /* Page is dirty, try to write it out here */ + switch(pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_retry; + + case PAGE_SUCCESS: + goto retry; + + case PAGE_CLEAN: + ; /* try to free the page below */ + } + } + + if (PagePrivate(page)) { + if (!try_to_release_page(page, GFP_KERNEL) || + (!mapping && page_count(page) == 1)) + goto unlock_retry; + } + + if (remove_mapping(mapping, page)) { + /* Success */ + unlock_page(page); + return 0; + } + +unlock_retry: + unlock_page(page); + +retry: + return -EAGAIN; +} +EXPORT_SYMBOL(swap_page); + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. + */ +int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return -EAGAIN; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + if (try_to_unmap(page, 1) == SWAP_FAIL) + /* A vma has VM_LOCKED set -> permanent failure */ + return -EPERM; + + /* + * Give up if we were unable to remove all mappings. + */ + if (page_mapcount(page)) + return -EAGAIN; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. + */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} +EXPORT_SYMBOL(migrate_page_remove_references); + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} +EXPORT_SYMBOL(migrate_page_copy); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. + */ +int migrate_page(struct page *newpage, struct page *page) +{ + int rc; + + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + rc = migrate_page_remove_references(newpage, page, 2); + + if (rc) + return rc; + + migrate_page_copy(newpage, page); + + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +/* + * migrate_pages + * + * Two lists are passed to this function. The first list + * contains the pages isolated from the LRU to be migrated. + * The second list contains new pages that the pages isolated + * can be moved to. If the second list is NULL then all + * pages are swapped out. + * + * The function returns after 10 attempts or if no pages + * are movable anymore because to has become empty + * or no retryable pages exist anymore. + * + * Return: Number of pages not migrated when "to" ran empty. + */ +int migrate_pages(struct list_head *from, struct list_head *to, + struct list_head *moved, struct list_head *failed) +{ + int retry; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int swapwrite = current->flags & PF_SWAPWRITE; + int rc; + + if (!swapwrite) + current->flags |= PF_SWAPWRITE; + +redo: + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + + cond_resched(); + + rc = 0; + if (page_count(page) == 1) + /* page was freed from under us. So we are done. */ + goto next; + + if (to && list_empty(to)) + break; + + /* + * Skip locked pages during the first two passes to give the + * functions holding the lock time to release the page. Later we + * use lock_page() to have a higher chance of acquiring the + * lock. + */ + rc = -EAGAIN; + if (pass > 2) + lock_page(page); + else + if (TestSetPageLocked(page)) + goto next; + + /* + * Only wait on writeback if we have already done a pass where + * we we may have triggered writeouts for lots of pages. + */ + if (pass > 0) { + wait_on_page_writeback(page); + } else { + if (PageWriteback(page)) + goto unlock_page; + } + + /* + * Anonymous pages must have swap cache references otherwise + * the information contained in the page maps cannot be + * preserved. + */ + if (PageAnon(page) && !PageSwapCache(page)) { + if (!add_to_swap(page, GFP_KERNEL)) { + rc = -ENOMEM; + goto unlock_page; + } + } + + if (!to) { + rc = swap_page(page); + goto next; + } + + newpage = lru_to_page(to); + lock_page(newpage); + + /* + * Pages are properly locked and writeback is complete. + * Try to migrate the page. + */ + mapping = page_mapping(page); + if (!mapping) + goto unlock_both; + + if (mapping->a_ops->migratepage) { + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(newpage, page); + goto unlock_both; + } + + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_both; + + case PAGE_SUCCESS: + unlock_page(newpage); + goto next; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + + /* + * Buffers are managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (!page_has_buffers(page) || + try_to_release_page(page, GFP_KERNEL)) { + rc = migrate_page(newpage, page); + goto unlock_both; + } + + /* + * On early passes with mapped pages simply + * retry. There may be a lock held for some + * buffers that may go away. Later + * swap them out. + */ + if (pass > 4) { + /* + * Persistently unable to drop buffers..... As a + * measure of last resort we fall back to + * swap_page(). + */ + unlock_page(newpage); + newpage = NULL; + rc = swap_page(page); + goto next; + } + +unlock_both: + unlock_page(newpage); + +unlock_page: + unlock_page(page); + +next: + if (rc == -EAGAIN) { + retry++; + } else if (rc) { + /* Permanent failure */ + list_move(&page->lru, failed); + nr_failed++; + } else { + if (newpage) { + /* Successful migration. Return page to LRU */ + move_to_lru(newpage); + } + list_move(&page->lru, moved); + } + } + if (retry && pass++ < 10) + goto redo; + + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; + + return nr_failed + retry; +} + +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); + +/* + * Migrate the list 'pagelist' of pages to a certain destination. + * + * Specify destination with either non-NULL vma or dest_node >= 0 + * Return the number of pages not migrated or error code + */ +int migrate_pages_to(struct list_head *pagelist, + struct vm_area_struct *vma, int dest) +{ + LIST_HEAD(newlist); + LIST_HEAD(moved); + LIST_HEAD(failed); + int err = 0; + unsigned long offset = 0; + int nr_pages; + struct page *page; + struct list_head *p; + +redo: + nr_pages = 0; + list_for_each(p, pagelist) { + if (vma) { + /* + * The address passed to alloc_page_vma is used to + * generate the proper interleave behavior. We fake + * the address here by an increasing offset in order + * to get the proper distribution of pages. + * + * No decision has been made as to which page + * a certain old page is moved to so we cannot + * specify the correct address. + */ + page = alloc_page_vma(GFP_HIGHUSER, vma, + offset + vma->vm_start); + offset += PAGE_SIZE; + } + else + page = alloc_pages_node(dest, GFP_HIGHUSER, 0); + + if (!page) { + err = -ENOMEM; + goto out; + } + list_add_tail(&page->lru, &newlist); + nr_pages++; + if (nr_pages > MIGRATE_CHUNK_SIZE) + break; + } + err = migrate_pages(pagelist, &newlist, &moved, &failed); + + putback_lru_pages(&moved); /* Call release pages instead ?? */ + + if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) + goto redo; +out: + /* Return leftover allocated pages */ + while (!list_empty(&newlist)) { + page = list_entry(newlist.next, struct page, lru); + list_del(&page->lru); + __free_page(page); + } + list_splice(&failed, pagelist); + if (err < 0) + return err; + + /* Calculate number of leftover pages */ + nr_pages = 0; + list_for_each(p, pagelist) + nr_pages++; + return nr_pages; +} diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3e1636..d7af296833fc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -15,6 +15,7 @@ #include #include #include +#include #include diff --git a/mm/vmscan.c b/mm/vmscan.c index 548e023c193b..fd572bbdc9f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -42,18 +42,6 @@ #include "internal.h" -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; - struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -304,7 +292,7 @@ static void handle_write_error(struct address_space *mapping, * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -372,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } -static int remove_mapping(struct address_space *mapping, struct page *page) +int remove_mapping(struct address_space *mapping, struct page *page) { if (!mapping) return 0; /* truncate got there first */ @@ -570,481 +558,6 @@ keep: return nr_reclaimed; } -#ifdef CONFIG_MIGRATION -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU. - * - * returns the number of pages put back. - */ -unsigned long putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - unsigned long count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} -EXPORT_SYMBOL(swap_page); - -/* - * Page migration was first developed in the context of the memory hotplug - * project. The main authors of the migration code are: - * - * IWAMOTO Toshihiro - * Hirokazu Takahashi - * Dave Hansen - * Christoph Lameter - */ - -/* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. - */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) -{ - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; - - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; - - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> Permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. - */ - if (page_mapcount(page)) - return -EAGAIN; - - write_lock_irq(&mapping->tree_lock); - - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); - - if (!page_mapping(page) || page_count(page) != nr_refs || - *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); - return -EAGAIN; - } - - /* - * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. - */ - get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); - } - - *radix_pointer = newpage; - __put_page(page); - write_unlock_irq(&mapping->tree_lock); - - return 0; -} -EXPORT_SYMBOL(migrate_page_remove_references); - -/* - * Copy the page to its new location - */ -void migrate_page_copy(struct page *newpage, struct page *page) -{ - copy_highpage(newpage, page); - - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (PageActive(page)) - SetPageActive(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); - - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - set_page_dirty(newpage); - } - - ClearPageSwapCache(page); - ClearPageActive(page); - ClearPagePrivate(page); - set_page_private(page, 0); - page->mapping = NULL; - - /* - * If any waiters have accumulated on the new page then - * wake them up. - */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); -} -EXPORT_SYMBOL(migrate_page_copy); - -/* - * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. - * - * Pages are locked upon entry and exit. - */ -int migrate_page(struct page *newpage, struct page *page) -{ - int rc; - - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - - rc = migrate_page_remove_references(newpage, page, 2); - - if (rc) - return rc; - - migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); - return 0; -} -EXPORT_SYMBOL(migrate_page); - -/* - * migrate_pages - * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. - * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * - * Return: Number of pages not migrated when "to" ran empty. - */ -unsigned long migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) -{ - unsigned long retry; - unsigned long nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; - int rc; - - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } - - newpage = lru_to_page(to); - lock_page(newpage); - - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; - - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). - */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); - } - } - if (retry && pass++ < 10) - goto redo; - - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - - return nr_failed + retry; -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list with elevated refcount. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page) -{ - int ret = 0; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - ret = 1; - get_page(page); - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - } - - return ret; -} -#endif - /* * zone->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages