mm, THP, swap: support to reclaim swap space for THP swapped out

A normal swap slot can be reclaimed when its swap count reaches
SWAP_HAS_CACHE.  But the swap slots backing a THP must all be reclaimed
together, because any of those slots may be used again when the THP is
swapped out again later.  So the swap slots backing one THP can only be
reclaimed together, once the swap count of every swap slot for the THP
has reached SWAP_HAS_CACHE.  This patch implements functions that check
whether the swap count of all swap slots backing one THP has reached
SWAP_HAS_CACHE, and uses them when checking whether a swap slot can be
reclaimed.

To make it easier to determine whether a swap slot is backing a THP, a
new swap cluster flag named CLUSTER_FLAG_HUGE is added to mark a swap
cluster which is backing a THP (Transparent Huge Page).  Because
swapping in a THP as a whole isn't supported yet, the CLUSTER_FLAG_HUGE
flag is cleared after the THP is deleted from the swap cache (for
example, once swap-out has finished), so that the normal pages inside
the THP can be swapped in individually.

[ying.huang@intel.com: fix swap_page_trans_huge_swapped on HDD]
  Link: http://lkml.kernel.org/r/874ltsm0bi.fsf@yhuang-dev.intel.com
Link: http://lkml.kernel.org/r/20170724051840.2309-3-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Ross Zwisler <ross.zwisler@intel.com> [for brd.c, zram_drv.c, pmem.c]
Cc: Vishal L Verma <vishal.l.verma@intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Huang Ying 2017-09-06 16:22:16 -07:00 коммит произвёл Linus Torvalds
Родитель a3aea839e4
Коммит e07098294a
2 изменённых файлов: 72 добавлений и 7 удалений

Просмотреть файл

@ -188,6 +188,7 @@ struct swap_cluster_info {
}; };
#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
/* /*
* We assign a cluster to each CPU, so each CPU can allocate swap entry from * We assign a cluster to each CPU, so each CPU can allocate swap entry from

Просмотреть файл

@ -265,6 +265,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
info->data = 0; info->data = 0;
} }
/*
 * Report whether CLUSTER_FLAG_HUGE is set in the cluster's flags, i.e.
 * whether the cluster is marked as backing a transparent huge page.
 */
static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
	return (info->flags & CLUSTER_FLAG_HUGE) != 0;
}
/*
 * Drop CLUSTER_FLAG_HUGE from the cluster's flags, leaving every other
 * flag bit untouched.
 */
static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
	info->flags = info->flags & ~CLUSTER_FLAG_HUGE;
}
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
unsigned long offset) unsigned long offset)
{ {
@ -846,7 +856,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
offset = idx * SWAPFILE_CLUSTER; offset = idx * SWAPFILE_CLUSTER;
ci = lock_cluster(si, offset); ci = lock_cluster(si, offset);
alloc_cluster(si, idx); alloc_cluster(si, idx);
cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
map = si->swap_map + offset; map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) for (i = 0; i < SWAPFILE_CLUSTER; i++)
@ -1176,6 +1186,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
return; return;
ci = lock_cluster(si, offset); ci = lock_cluster(si, offset);
VM_BUG_ON(!cluster_is_huge(ci));
map = si->swap_map + offset; map = si->swap_map + offset;
for (i = 0; i < SWAPFILE_CLUSTER; i++) { for (i = 0; i < SWAPFILE_CLUSTER; i++) {
val = map[i]; val = map[i];
@ -1187,6 +1198,7 @@ static void swapcache_free_cluster(swp_entry_t entry)
for (i = 0; i < SWAPFILE_CLUSTER; i++) for (i = 0; i < SWAPFILE_CLUSTER; i++)
map[i] &= ~SWAP_HAS_CACHE; map[i] &= ~SWAP_HAS_CACHE;
} }
cluster_clear_huge(ci);
unlock_cluster(ci); unlock_cluster(ci);
if (free_entries == SWAPFILE_CLUSTER) { if (free_entries == SWAPFILE_CLUSTER) {
spin_lock(&si->lock); spin_lock(&si->lock);
@ -1350,6 +1362,54 @@ out:
return count; return count;
} }
#ifdef CONFIG_THP_SWAP
/*
 * Check whether a swap entry still counts as "swapped" for reclaim
 * purposes.
 *
 * If the SWAPFILE_CLUSTER-aligned cluster containing @entry is marked
 * huge, every slot of that cluster is inspected and the result is true
 * as soon as one swap_map value differs from SWAP_HAS_CACHE.  Otherwise
 * only the slot of @entry itself is inspected.
 *
 * Returns true if an inspected swap_map value is not exactly
 * SWAP_HAS_CACHE, false otherwise.
 */
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
					 swp_entry_t entry)
{
	unsigned long slot = swp_offset(entry);
	unsigned long first = round_down(slot, SWAPFILE_CLUSTER);
	unsigned char *counts = si->swap_map;
	struct swap_cluster_info *cluster;
	bool swapped = false;
	int idx;

	cluster = lock_cluster_or_swap_info(si, first);
	if (cluster && cluster_is_huge(cluster)) {
		/* Huge cluster: stop scanning at the first mismatching slot. */
		for (idx = 0; idx < SWAPFILE_CLUSTER && !swapped; idx++)
			swapped = counts[first + idx] != SWAP_HAS_CACHE;
	} else {
		/*
		 * No cluster returned by the lock helper, or the cluster is
		 * not huge: only the entry's own slot matters.
		 */
		swapped = counts[slot] != SWAP_HAS_CACHE;
	}
	unlock_cluster_or_swap_info(si, cluster);

	return swapped;
}
/*
 * Page-level counterpart of swap_page_trans_huge_swapped().
 *
 * For a page that is not part of a transparent compound page the answer
 * is simply whether page_swapcount() is non-zero.  Otherwise the swap
 * entry is taken from the private field of the compound head and the
 * cluster-aware check is used; false is returned when no swap_info can
 * be obtained for that entry.
 */
static bool page_swapped(struct page *page)
{
	struct swap_info_struct *si;
	swp_entry_t entry;

	if (unlikely(PageTransCompound(page))) {
		page = compound_head(page);
		entry.val = page_private(page);
		si = _swap_info_get(entry);
		return si ? swap_page_trans_huge_swapped(si, entry) : false;
	}

	return page_swapcount(page) != 0;
}
#else
/*
 * CONFIG_THP_SWAP is not set: both checks fall back to the swap count
 * helpers, swap_swapcount() for an entry and page_swapcount() for a page.
 */
#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
#define page_swapped(page) (page_swapcount(page) != 0)
#endif
/* /*
* We can write to an anon page without COW if there are no other references * We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content * to it. And as a side-effect, free up its swap: because the old content
@ -1404,7 +1464,7 @@ int try_to_free_swap(struct page *page)
return 0; return 0;
if (PageWriteback(page)) if (PageWriteback(page))
return 0; return 0;
if (page_swapcount(page)) if (page_swapped(page))
return 0; return 0;
/* /*
@ -1425,6 +1485,7 @@ int try_to_free_swap(struct page *page)
if (pm_suspended_storage()) if (pm_suspended_storage())
return 0; return 0;
page = compound_head(page);
delete_from_swap_cache(page); delete_from_swap_cache(page);
SetPageDirty(page); SetPageDirty(page);
return 1; return 1;
@ -1446,7 +1507,8 @@ int free_swap_and_cache(swp_entry_t entry)
p = _swap_info_get(entry); p = _swap_info_get(entry);
if (p) { if (p) {
count = __swap_entry_free(p, entry, 1); count = __swap_entry_free(p, entry, 1);
if (count == SWAP_HAS_CACHE) { if (count == SWAP_HAS_CACHE &&
!swap_page_trans_huge_swapped(p, entry)) {
page = find_get_page(swap_address_space(entry), page = find_get_page(swap_address_space(entry),
swp_offset(entry)); swp_offset(entry));
if (page && !trylock_page(page)) { if (page && !trylock_page(page)) {
@ -1463,7 +1525,8 @@ int free_swap_and_cache(swp_entry_t entry)
*/ */
if (PageSwapCache(page) && !PageWriteback(page) && if (PageSwapCache(page) && !PageWriteback(page) &&
(!page_mapped(page) || mem_cgroup_swap_full(page)) && (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
!swap_swapcount(p, entry)) { !swap_page_trans_huge_swapped(p, entry)) {
page = compound_head(page);
delete_from_swap_cache(page); delete_from_swap_cache(page);
SetPageDirty(page); SetPageDirty(page);
} }
@ -2017,7 +2080,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
.sync_mode = WB_SYNC_NONE, .sync_mode = WB_SYNC_NONE,
}; };
swap_writepage(page, &wbc); swap_writepage(compound_head(page), &wbc);
lock_page(page); lock_page(page);
wait_on_page_writeback(page); wait_on_page_writeback(page);
} }
@ -2030,8 +2093,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
* delete, since it may not have been written out to swap yet. * delete, since it may not have been written out to swap yet.
*/ */
if (PageSwapCache(page) && if (PageSwapCache(page) &&
likely(page_private(page) == entry.val)) likely(page_private(page) == entry.val) &&
delete_from_swap_cache(page); !page_swapped(page))
delete_from_swap_cache(compound_head(page));
/* /*
* So we could skip searching mms once swap count went * So we could skip searching mms once swap count went