mm,thp: add read-only THP support for (non-shmem) FS

This patch is (hopefully) the first step to enable THP for non-shmem
filesystems.

This patch enables an application to put part of its text sections to THP
via madvise, for example:

    madvise((void *)0x600000, 0x200000, MADV_HUGEPAGE);

We tried to reuse the logic for THP on tmpfs.

Currently, write is not supported for non-shmem THP.  khugepaged will only
process vmas with VM_DENYWRITE.  sys_mmap() ignores VM_DENYWRITE requests
(see ksys_mmap_pgoff).  The only way to create a vma with VM_DENYWRITE is
execve().  This requirement limits non-shmem THP to text sections.

The next patch will handle writes, which would only happen when all the
vmas with VM_DENYWRITE are unmapped.

An EXPERIMENTAL config, READ_ONLY_THP_FOR_FS, is added to gate this
feature.

[songliubraving@fb.com: fix build without CONFIG_SHMEM]
  Link: http://lkml.kernel.org/r/F53407FB-96CC-42E8-9862-105C92CC2B98@fb.com
[songliubraving@fb.com: fix double unlock in collapse_file()]
  Link: http://lkml.kernel.org/r/B960CBFA-8EFC-4DA4-ABC5-1977FFF2CA57@fb.com
Link: http://lkml.kernel.org/r/20190801184244.3169074-7-songliubraving@fb.com
Signed-off-by: Song Liu <songliubraving@fb.com>
Acked-by: Rik van Riel <riel@surriel.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Song Liu 2019-09-23 15:38:00 -07:00 коммит произвёл Linus Torvalds
Родитель 579c571e2e
Коммит 99cb0dbd47
4 изменённых файлов: 128 добавлений и 48 удалений

Просмотреть файл

@ -712,6 +712,17 @@ config GUP_BENCHMARK
config GUP_GET_PTE_LOW_HIGH
bool
config READ_ONLY_THP_FOR_FS
bool "Read-only THP for filesystems (EXPERIMENTAL)"
depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
help
Allow khugepaged to put read-only file-backed pages in THP.
This is marked experimental because it is a new feature. Write
support of file THPs will be developed in the next few release
cycles.
config ARCH_HAS_PTE_SPECIAL
bool

Просмотреть файл

@ -203,8 +203,8 @@ static void unaccount_page_cache_page(struct address_space *mapping,
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
} else {
VM_BUG_ON_PAGE(PageTransHuge(page), page);
} else if (PageTransHuge(page)) {
__dec_node_page_state(page, NR_FILE_THPS);
}
/*

Просмотреть файл

@ -48,6 +48,7 @@ enum scan_result {
SCAN_CGROUP_CHARGE_FAIL,
SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE,
};
#define CREATE_TRACE_POINTS
@ -404,7 +405,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
(vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
if (shmem_file(vma->vm_file)) {
if (shmem_file(vma->vm_file) ||
(IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
vma->vm_file &&
(vm_flags & VM_DENYWRITE))) {
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return false;
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@ -456,8 +461,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long hstart, hend;
/*
* khugepaged does not yet work on non-shmem files or special
* mappings. And file-private shmem THP is not supported.
* khugepaged only supports read-only files for non-shmem files.
* khugepaged does not yet work on special mappings. And
* file-private shmem THP is not supported.
*/
if (!hugepage_vma_check(vma, vm_flags))
return 0;
@ -1287,12 +1293,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
}
/**
* collapse_file - collapse small tmpfs/shmem pages into huge one.
* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
* + swap in pages if necessary;
* + swap/gup in pages if necessary;
* + fill in gaps;
* + keep old pages around in case rollback is required;
* - if replacing succeeds:
@ -1316,7 +1322,9 @@ static void collapse_file(struct mm_struct *mm,
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
@ -1348,7 +1356,8 @@ static void collapse_file(struct mm_struct *mm,
} while (1);
__SetPageLocked(new_page);
__SetPageSwapBacked(new_page);
if (is_shmem)
__SetPageSwapBacked(new_page);
new_page->index = start;
new_page->mapping = mapping;
@ -1363,41 +1372,75 @@ static void collapse_file(struct mm_struct *mm,
struct page *page = xas_next(&xas);
VM_BUG_ON(index != xas.xa_index);
if (!page) {
/*
* Stop if extent has been truncated or hole-punched,
* and is now completely empty.
*/
if (index == start) {
if (!xas_next_entry(&xas, end - 1)) {
result = SCAN_TRUNCATED;
if (is_shmem) {
if (!page) {
/*
* Stop if extent has been truncated or
* hole-punched, and is now completely
* empty.
*/
if (index == start) {
if (!xas_next_entry(&xas, end - 1)) {
result = SCAN_TRUNCATED;
goto xa_locked;
}
xas_set(&xas, index);
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
goto xa_locked;
}
xas_set(&xas, index);
xas_store(&xas, new_page);
nr_none++;
continue;
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
if (xa_is_value(page) || !PageUptodate(page)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
SGP_NOHUGE)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}
xas_store(&xas, new_page);
nr_none++;
continue;
}
if (xa_is_value(page) || !PageUptodate(page)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
if (shmem_getpage(mapping->host, index, &page,
SGP_NOHUGE)) {
} else { /* !is_shmem */
if (!page || xa_is_value(page)) {
xas_unlock_irq(&xas);
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
PAGE_SIZE);
/* drain pagevecs to help isolate_lru_page() */
lru_add_drain();
page = find_lock_page(mapping, index);
if (unlikely(page == NULL)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
} else if (!PageUptodate(page)) {
xas_unlock_irq(&xas);
wait_on_page_locked(page);
if (!trylock_page(page)) {
result = SCAN_PAGE_LOCK;
goto xa_unlocked;
}
get_page(page);
} else if (PageDirty(page)) {
result = SCAN_FAIL;
goto xa_unlocked;
goto xa_locked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
} else {
result = SCAN_PAGE_LOCK;
goto xa_locked;
}
/*
@ -1426,6 +1469,12 @@ static void collapse_file(struct mm_struct *mm,
goto out_unlock;
}
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
goto out_unlock;
}
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
@ -1463,12 +1512,18 @@ out_unlock:
goto xa_unlocked;
}
__inc_node_page_state(new_page, NR_SHMEM_THPS);
if (is_shmem)
__inc_node_page_state(new_page, NR_SHMEM_THPS);
else
__inc_node_page_state(new_page, NR_FILE_THPS);
if (nr_none) {
struct zone *zone = page_zone(new_page);
__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
__mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
if (is_shmem)
__mod_node_page_state(zone->zone_pgdat,
NR_SHMEM, nr_none);
}
xa_locked:
@ -1506,10 +1561,15 @@ xa_unlocked:
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
if (is_shmem) {
set_page_dirty(new_page);
lru_cache_add_anon(new_page);
} else {
lru_cache_add_file(new_page);
}
count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_anon(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@ -1524,7 +1584,9 @@ xa_unlocked:
/* Something went wrong: roll back page cache changes */
xas_lock_irq(&xas);
mapping->nrpages -= nr_none;
shmem_uncharge(mapping->host, nr_none);
if (is_shmem)
shmem_uncharge(mapping->host, nr_none);
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
@ -1607,7 +1669,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
break;
}
if (page_count(page) != 1 + page_mapcount(page)) {
if (page_count(page) !=
1 + page_mapcount(page) + page_has_private(page)) {
result = SCAN_PAGE_COUNT;
break;
}
@ -1713,11 +1776,13 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
if (shmem_file(vma->vm_file)) {
if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
struct file *file;
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
if (!shmem_huge_enabled(vma))
if (shmem_file(vma->vm_file)
&& !shmem_huge_enabled(vma))
goto skip;
file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);

Просмотреть файл

@ -1189,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound)
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
if (PageSwapBacked(page))
__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
else
__inc_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
@ -1229,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound)
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
goto out;
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
if (PageSwapBacked(page))
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
else
__dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;