Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "13 patches.

  Subsystems affected by this patch series: mm (memory-failure, memcg,
  userfaultfd, hugetlbfs, mremap, oom-kill, kasan, hmm), and kcov"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove()
  kcov: don't generate a warning on vm_insert_page()'s failure
  MAINTAINERS: add Vincenzo Frascino to KASAN reviewers
  oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup
  selftest/vm: add skip support to mremap_test
  selftest/vm: support xfail in mremap_test
  selftest/vm: verify remap destination address in mremap_test
  selftest/vm: verify mmap addr in mremap_test
  mm, hugetlb: allow for "high" userspace addresses
  userfaultfd: mark uffd_wp regardless of VM_WRITE flag
  memcg: sync flush only if periodic flush is delayed
  mm/memory-failure.c: skip huge_zero_page in memory_failure()
  mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()
Linus Torvalds 2022-04-22 10:10:43 -07:00
Parents: 3b8000ae18 319561669a
Commit: 281b9d9a4b
18 changed files with 329 additions and 89 deletions

MAINTAINERS

@@ -10547,6 +10547,7 @@ M:	Andrey Ryabinin <ryabinin.a.a@gmail.com>
 R:	Alexander Potapenko <glider@google.com>
 R:	Andrey Konovalov <andreyknvl@gmail.com>
 R:	Dmitry Vyukov <dvyukov@google.com>
+R:	Vincenzo Frascino <vincenzo.frascino@arm.com>
 L:	kasan-dev@googlegroups.com
 S:	Maintained
 F:	Documentation/dev-tools/kasan.rst

fs/hugetlbfs/inode.c

@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = current->mm->mmap_base;
-	info.high_limit = TASK_SIZE;
+	info.high_limit = arch_get_mmap_end(addr);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	return vm_unmapped_area(&info);
@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
-	info.high_limit = current->mm->mmap_base;
+	info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 	addr = vm_unmapped_area(&info);
@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
 		VM_BUG_ON(addr != -ENOMEM);
 		info.flags = 0;
 		info.low_limit = current->mm->mmap_base;
-		info.high_limit = TASK_SIZE;
+		info.high_limit = arch_get_mmap_end(addr);
 		addr = vm_unmapped_area(&info);
 	}
 
@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct hstate *h = hstate_file(file);
+	const unsigned long mmap_end = arch_get_mmap_end(addr);
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr) {
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
+		if (mmap_end - len >= addr &&
 		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
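The point of replacing TASK_SIZE with arch_get_mmap_end()/arch_get_mmap_base() is that an explicit high address hint is now honoured for hugetlb mappings, matching what the regular mmap() path already does on architectures with an extended user address space (arm64 with 52-bit VA, x86-64 with 5-level paging). A hedged userspace sketch of that scenario follows; the hint value is illustrative, and it assumes a kernel with this fix plus preallocated hugepages, otherwise the mmap() simply fails.

/* Hypothetical sketch of the case this fixes: an explicit hugetlb hint
 * above the default user VA limit. Assumes a 64-bit kernel with this
 * patch and preallocated hugepages (vm.nr_hugepages); without them the
 * mmap() fails and the error path runs. */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL * 1024 * 1024;		/* one 2MB hugepage */
	void *hint = (void *)(1UL << 50);	/* illustrative high hint */
	void *p;

	p = mmap(hint, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		fprintf(stderr, "mmap(MAP_HUGETLB): %s\n", strerror(errno));
		return 1;
	}
	printf("hugetlb region at %p (hint %p)\n", p, hint);
	munmap(p, len);
	return 0;
}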

include/linux/hugetlb.h

@@ -169,6 +169,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 int get_hwpoison_huge_page(struct page *page, bool *hugetlb);
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags);
 void putback_active_hugepage(struct page *page);
 void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
@@ -378,6 +379,11 @@ static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
 	return 0;
 }
 
+static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+	return 0;
+}
+
 static inline void putback_active_hugepage(struct page *page)
 {
 }

include/linux/memcontrol.h

@@ -1012,6 +1012,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 }
 
 void mem_cgroup_flush_stats(void);
+void mem_cgroup_flush_stats_delayed(void);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			      int val);
@@ -1455,6 +1456,10 @@ static inline void mem_cgroup_flush_stats(void)
 {
 }
 
+static inline void mem_cgroup_flush_stats_delayed(void)
+{
+}
+
 static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
 					    enum node_stat_item idx, int val)
 {

include/linux/mm.h

@@ -3197,6 +3197,14 @@ extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags);
+#else
+static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+	return 0;
+}
+#endif
 
 #ifndef arch_memory_failure
 static inline int arch_memory_failure(unsigned long pfn, int flags)

include/linux/sched.h

@@ -1443,6 +1443,7 @@ struct task_struct {
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
+	struct timer_list		oom_reaper_timer;
 #endif
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;

include/linux/sched/mm.h

@@ -136,6 +136,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm)
 #endif /* CONFIG_MEMCG */
 
 #ifdef CONFIG_MMU
+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr)	(TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
 extern void arch_pick_mmap_layout(struct mm_struct *mm,
 				  struct rlimit *rlim_stack);
 extern unsigned long

kernel/kcov.c

@@ -475,8 +475,11 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 	vma->vm_flags |= VM_DONTEXPAND;
 	for (off = 0; off < size; off += PAGE_SIZE) {
 		page = vmalloc_to_page(kcov->area + off);
-		if (vm_insert_page(vma, vma->vm_start + off, page))
-			WARN_ONCE(1, "vm_insert_page() failed");
+		res = vm_insert_page(vma, vma->vm_start + off, page);
+		if (res) {
+			pr_warn_once("kcov: vm_insert_page() failed\n");
+			return res;
+		}
 	}
 	return 0;
 exit:
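Previously a failed vm_insert_page() only produced a WARN_ONCE() and the mmap() still appeared to succeed, leaving the caller with a partially populated coverage buffer; with this change the error reaches userspace. Below is a minimal kcov client adapted from Documentation/dev-tools/kcov.rst (the ioctl numbers mirror the kcov UAPI header; CONFIG_KCOV and a mounted debugfs are assumed), showing where that failure now becomes visible.

/* Minimal kcov client, adapted from Documentation/dev-tools/kcov.rst. */
#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

#define KCOV_INIT_TRACE	_IOR('c', 1, unsigned long)
#define KCOV_ENABLE	_IO('c', 100)
#define KCOV_DISABLE	_IO('c', 101)
#define KCOV_TRACE_PC	0
#define COVER_SIZE	(64 << 10)	/* buffer size in unsigned longs */

int main(void)
{
	unsigned long *cover, n;
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);

	if (fd == -1) {
		perror("open /sys/kernel/debug/kcov");
		return 1;
	}
	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) {
		perror("KCOV_INIT_TRACE");
		return 1;
	}
	/* With this fix, a failing vm_insert_page() in kcov_mmap() is
	 * reported here as an mmap() error instead of a one-off WARN. */
	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (cover == MAP_FAILED) {
		perror("mmap kcov buffer");
		return 1;
	}
	if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC)) {
		perror("KCOV_ENABLE");
		return 1;
	}
	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);

	read(-1, NULL, 0);	/* the syscall whose coverage we want */

	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
	printf("collected %lu covered PCs\n", n);

	ioctl(fd, KCOV_DISABLE, 0);
	munmap(cover, COVER_SIZE * sizeof(unsigned long));
	close(fd);
	return 0;
}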

mm/hugetlb.c

@@ -6785,6 +6785,16 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
 	return ret;
 }
 
+int get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+	int ret;
+
+	spin_lock_irq(&hugetlb_lock);
+	ret = __get_huge_page_for_hwpoison(pfn, flags);
+	spin_unlock_irq(&hugetlb_lock);
+	return ret;
+}
+
 void putback_active_hugepage(struct page *page)
 {
 	spin_lock_irq(&hugetlb_lock);

mm/memcontrol.c

@@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
 static DEFINE_SPINLOCK(stats_flush_lock);
 static DEFINE_PER_CPU(unsigned int, stats_updates);
 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)
 
 /*
  * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
@@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(void)
 	if (!spin_trylock_irqsave(&stats_flush_lock, flag))
 		return;
 
+	flush_next_time = jiffies_64 + 2*FLUSH_TIME;
 	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
 	atomic_set(&stats_flush_threshold, 0);
 	spin_unlock_irqrestore(&stats_flush_lock, flag);
@@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void)
 		__mem_cgroup_flush_stats();
 }
 
+void mem_cgroup_flush_stats_delayed(void)
+{
+	if (time_after64(jiffies_64, flush_next_time))
+		mem_cgroup_flush_stats();
+}
+
 static void flush_memcg_stats_dwork(struct work_struct *w)
 {
 	__mem_cgroup_flush_stats();
-	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
 
 /**
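mem_cgroup_flush_stats_delayed() forces a synchronous flush only when the periodic worker looks overdue: __mem_cgroup_flush_stats() always pushes flush_next_time two FLUSH_TIME periods ahead, so callers that can tolerate slightly stale statistics (the refault path further below) skip the flush in the common case. A small userspace analogue of that rate-limiting pattern, with CLOCK_MONOTONIC standing in for jiffies_64; the names and periods are illustrative, not the kernel code.

/* Userspace analogue of "only flush synchronously when the periodic
 * flush is overdue". Illustrative only. */
#include <stdio.h>
#include <time.h>

#define FLUSH_PERIOD_NS (2ULL * 1000000000ULL)	/* ~FLUSH_TIME (2*HZ) */

static unsigned long long flush_next_time;

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void do_flush(void)
{
	/* Like __mem_cgroup_flush_stats(): push the deadline out 2 periods. */
	flush_next_time = now_ns() + 2 * FLUSH_PERIOD_NS;
	printf("flushed stats\n");
}

/* Callers that tolerate ~one period of staleness use this instead of an
 * unconditional flush; they only pay when the periodic worker is late. */
static void flush_if_delayed(void)
{
	if (now_ns() > flush_next_time)
		do_flush();
	else
		printf("stats fresh enough, skipping flush\n");
}

int main(void)
{
	do_flush();		/* what the periodic worker would do */
	flush_if_delayed();	/* skipped: the deadline is in the future */
	flush_next_time = 0;	/* pretend the periodic worker stalled */
	flush_if_delayed();	/* now the caller flushes synchronously */
	return 0;
}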

mm/memory-failure.c

@@ -1498,50 +1498,113 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
 	return 0;
 }
 
-static int memory_failure_hugetlb(unsigned long pfn, int flags)
-{
-	struct page *p = pfn_to_page(pfn);
-	struct page *head = compound_head(p);
-	int res;
-	unsigned long page_flags;
-
-	if (TestSetPageHWPoison(head)) {
-		pr_err("Memory failure: %#lx: already hardware poisoned\n",
-		       pfn);
-		res = -EHWPOISON;
-		if (flags & MF_ACTION_REQUIRED)
-			res = kill_accessing_process(current, page_to_pfn(head), flags);
-		return res;
-	}
-
-	num_poisoned_pages_inc();
-
-	if (!(flags & MF_COUNT_INCREASED)) {
-		res = get_hwpoison_page(p, flags);
-		if (!res) {
-			lock_page(head);
-			if (hwpoison_filter(p)) {
-				if (TestClearPageHWPoison(head))
-					num_poisoned_pages_dec();
-				unlock_page(head);
-				return -EOPNOTSUPP;
-			}
-			unlock_page(head);
-			res = MF_FAILED;
-			if (__page_handle_poison(p)) {
-				page_ref_inc(p);
-				res = MF_RECOVERED;
-			}
-			action_result(pfn, MF_MSG_FREE_HUGE, res);
-			return res == MF_RECOVERED ? 0 : -EBUSY;
-		} else if (res < 0) {
-			action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
-			return -EBUSY;
-		}
-	}
-
-	lock_page(head);
-
+/*
+ * Called from hugetlb code with hugetlb_lock held.
+ *
+ * Return values:
+ *   0 - free hugepage
+ *   1 - in-use hugepage
+ *   2 - not a hugepage
+ *   -EBUSY - the hugepage is busy (try to retry)
+ *   -EHWPOISON - the hugepage is already hwpoisoned
+ */
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+	struct page *page = pfn_to_page(pfn);
+	struct page *head = compound_head(page);
+	int ret = 2;	/* fallback to normal page handling */
+	bool count_increased = false;
+
+	if (!PageHeadHuge(head))
+		goto out;
+
+	if (flags & MF_COUNT_INCREASED) {
+		ret = 1;
+		count_increased = true;
+	} else if (HPageFreed(head) || HPageMigratable(head)) {
+		ret = get_page_unless_zero(head);
+		if (ret)
+			count_increased = true;
+	} else {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	if (TestSetPageHWPoison(head)) {
+		ret = -EHWPOISON;
+		goto out;
+	}
+
+	return ret;
+out:
+	if (count_increased)
+		put_page(head);
+	return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Taking refcount of hugetlb pages needs extra care about race conditions
+ * with basic operations like hugepage allocation/free/demotion.
+ * So some of prechecks for hwpoison (pinning, and testing/setting
+ * PageHWPoison) should be done in single hugetlb_lock range.
+ */
+static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+	int res;
+	struct page *p = pfn_to_page(pfn);
+	struct page *head;
+	unsigned long page_flags;
+	bool retry = true;
+
+	*hugetlb = 1;
+retry:
+	res = get_huge_page_for_hwpoison(pfn, flags);
+	if (res == 2) { /* fallback to normal page handling */
+		*hugetlb = 0;
+		return 0;
+	} else if (res == -EHWPOISON) {
+		pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+		if (flags & MF_ACTION_REQUIRED) {
+			head = compound_head(p);
+			res = kill_accessing_process(current, page_to_pfn(head), flags);
+		}
+		return res;
+	} else if (res == -EBUSY) {
+		if (retry) {
+			retry = false;
+			goto retry;
+		}
+		action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+		return res;
+	}
+
+	head = compound_head(p);
+	lock_page(head);
+
+	if (hwpoison_filter(p)) {
+		ClearPageHWPoison(head);
+		res = -EOPNOTSUPP;
+		goto out;
+	}
+
+	num_poisoned_pages_inc();
+
+	/*
+	 * Handling free hugepage.  The possible race with hugepage allocation
+	 * or demotion can be prevented by PageHWPoison flag.
+	 */
+	if (res == 0) {
+		unlock_page(head);
+		res = MF_FAILED;
+		if (__page_handle_poison(p)) {
+			page_ref_inc(p);
+			res = MF_RECOVERED;
+		}
+		action_result(pfn, MF_MSG_FREE_HUGE, res);
+		return res == MF_RECOVERED ? 0 : -EBUSY;
+	}
+
 	/*
 	 * The page could have changed compound pages due to race window.
 	 * If this happens just bail out.
@@ -1554,14 +1617,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
 
 	page_flags = head->flags;
 
-	if (hwpoison_filter(p)) {
-		if (TestClearPageHWPoison(head))
-			num_poisoned_pages_dec();
-		put_page(p);
-		res = -EOPNOTSUPP;
-		goto out;
-	}
-
 	/*
 	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
 	 * simply disable it. In order to make it work properly, we need
@@ -1588,6 +1643,12 @@ out:
 	unlock_page(head);
 	return res;
 }
+#else
+static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+	return 0;
+}
+#endif
 
 static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 		struct dev_pagemap *pgmap)
@@ -1712,6 +1773,7 @@ int memory_failure(unsigned long pfn, int flags)
 	int res = 0;
 	unsigned long page_flags;
 	bool retry = true;
+	int hugetlb = 0;
 
 	if (!sysctl_memory_failure_recovery)
 		panic("Memory failure on page %lx", pfn);
@@ -1739,10 +1801,9 @@ int memory_failure(unsigned long pfn, int flags)
 	}
 
 try_again:
-	if (PageHuge(p)) {
-		res = memory_failure_hugetlb(pfn, flags);
-		goto unlock_mutex;
-	}
+	res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
+	if (hugetlb)
+		goto unlock_mutex;
 
 	if (TestSetPageHWPoison(p)) {
 		pr_err("Memory failure: %#lx: already hardware poisoned\n",
@@ -1799,6 +1860,19 @@ try_again:
 	}
 
 	if (PageTransHuge(hpage)) {
+		/*
+		 * Bail out before SetPageHasHWPoisoned() if hpage is
+		 * huge_zero_page, although PG_has_hwpoisoned is not
+		 * checked in set_huge_zero_page().
+		 *
+		 * TODO: Handle memory failure of huge_zero_page thoroughly.
+		 */
+		if (is_huge_zero_page(hpage)) {
+			action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
+			res = -EBUSY;
+			goto unlock_mutex;
+		}
+
 		/*
 		 * The flag must be set after the refcount is bumped
 		 * otherwise it may race with THP split.

mm/mmap.c

@@ -2117,14 +2117,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
 	return addr;
 }
 
-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr)	(TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *

mm/mmu_notifier.c

@@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked(
 }
 EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
 
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+			  unsigned long seq)
+{
+	bool ret;
+
+	spin_lock(&subscriptions->lock);
+	ret = subscriptions->invalidate_seq != seq;
+	spin_unlock(&subscriptions->lock);
+	return ret;
+}
+
 /**
  * mmu_interval_notifier_remove - Remove a interval notifier
  * @interval_sub: Interval subscription to unregister
@@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
 	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 	if (seq)
 		wait_event(subscriptions->wq,
-			   READ_ONCE(subscriptions->invalidate_seq) != seq);
+			   mmu_interval_seq_released(subscriptions, seq));
 
 	/* pairs with mmgrab in mmu_interval_notifier_insert() */
 	mmdrop(mm);
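The race was that mmu_interval_notifier_remove() sampled invalidate_seq with a bare READ_ONCE() while the updater modifies the sequence and issues the wakeup under subscriptions->lock; evaluating the wait predicate under that same lock means the teardown can no longer act on a stale sample. A pthread sketch of the same discipline, purely illustrative and not the kernel code:

/* The waiter's predicate is evaluated under the same lock the updater
 * holds while bumping the sequence and waking, so the final update
 * cannot be missed. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static unsigned long invalidate_seq = 1;

/* Waiter side: analogue of wait_event(wq, mmu_interval_seq_released()). */
static void wait_for_seq_release(unsigned long seq)
{
	pthread_mutex_lock(&lock);
	while (invalidate_seq == seq)		/* predicate checked under the lock */
		pthread_cond_wait(&wq, &lock);
	pthread_mutex_unlock(&lock);
}

/* Updater side: bump the sequence and wake waiters under the same lock. */
static void *invalidation_end(void *arg)
{
	(void)arg;
	usleep(100 * 1000);			/* pretend an invalidation finishes */
	pthread_mutex_lock(&lock);
	invalidate_seq++;
	pthread_cond_broadcast(&wq);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long seq = invalidate_seq;

	pthread_create(&t, NULL, invalidation_end, NULL);
	wait_for_seq_release(seq);		/* cannot miss the bump above */
	printf("sequence %lu released, safe to tear down\n", seq);
	pthread_join(t, NULL);
	return 0;
}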

mm/oom_kill.c

@@ -632,7 +632,7 @@ done:
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 
-	/* Drop a reference taken by wake_oom_reaper */
+	/* Drop a reference taken by queue_oom_reaper */
 	put_task_struct(tsk);
 }
 
@@ -644,12 +644,12 @@ static int oom_reaper(void *unused)
 		struct task_struct *tsk = NULL;
 
 		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		spin_lock(&oom_reaper_lock);
+		spin_lock_irq(&oom_reaper_lock);
 		if (oom_reaper_list != NULL) {
 			tsk = oom_reaper_list;
 			oom_reaper_list = tsk->oom_reaper_list;
 		}
-		spin_unlock(&oom_reaper_lock);
+		spin_unlock_irq(&oom_reaper_lock);
 
 		if (tsk)
 			oom_reap_task(tsk);
@@ -658,20 +658,46 @@ static int oom_reaper(void *unused)
 	return 0;
 }
 
-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
+{
+	struct task_struct *tsk = container_of(timer, struct task_struct,
+			oom_reaper_timer);
+	struct mm_struct *mm = tsk->signal->oom_mm;
+	unsigned long flags;
+
+	/* The victim managed to terminate on its own - see exit_mmap */
+	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+		put_task_struct(tsk);
+		return;
+	}
+
+	spin_lock_irqsave(&oom_reaper_lock, flags);
+	tsk->oom_reaper_list = oom_reaper_list;
+	oom_reaper_list = tsk;
+	spin_unlock_irqrestore(&oom_reaper_lock, flags);
+	trace_wake_reaper(tsk->pid);
+	wake_up(&oom_reaper_wait);
+}
+
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
+ * The timers timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path.
+ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
 {
 	/* mm is already queued? */
 	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
 		return;
 
 	get_task_struct(tsk);
-
-	spin_lock(&oom_reaper_lock);
-	tsk->oom_reaper_list = oom_reaper_list;
-	oom_reaper_list = tsk;
-	spin_unlock(&oom_reaper_lock);
-	trace_wake_reaper(tsk->pid);
-	wake_up(&oom_reaper_wait);
+	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+	add_timer(&tsk->oom_reaper_timer);
 }
 
 static int __init oom_init(void)
@@ -681,7 +707,7 @@ static int __init oom_init(void)
 }
 subsys_initcall(oom_init)
 #else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
 {
 }
 #endif /* CONFIG_MMU */
@@ -932,7 +958,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
 	rcu_read_unlock();
 
 	if (can_oom_reap)
-		wake_oom_reaper(victim);
+		queue_oom_reaper(victim);
 
 	mmdrop(mm);
 	put_task_struct(victim);
@@ -968,7 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	task_lock(victim);
 	if (task_will_free_mem(victim)) {
 		mark_oom_victim(victim);
-		wake_oom_reaper(victim);
+		queue_oom_reaper(victim);
 		task_unlock(victim);
 		put_task_struct(victim);
 		return;
@@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc)
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
+		queue_oom_reaper(current);
 		return true;
 	}
 
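queue_oom_reaper() now arms a per-task timer instead of handing the victim to the reaper right away, and when the timer fires wake_oom_reaper() checks MMF_OOM_SKIP first, dropping its reference if the victim already exited; normal exit, including futex robust-list cleanup, therefore gets a two-second head start. A userspace analogue of that delay-then-skip-if-done pattern using a POSIX timer; it is illustrative only and not the kernel mechanism.

/* Give the child a grace period to exit on its own; only "reap" it if it
 * is still around when the timer fires. Link with -lrt on older glibc. */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define REAP_DELAY_SEC 2	/* mirrors OOM_REAPER_DELAY (2*HZ) */

static pid_t victim;

static void reap_cb(union sigval sv)
{
	(void)sv;
	/* Analogue of the MMF_OOM_SKIP check: did the victim exit already? */
	if (waitpid(victim, NULL, WNOHANG) == victim) {
		printf("victim exited on its own, nothing to reap\n");
		return;
	}
	printf("victim still alive after %d s, reaping it\n", REAP_DELAY_SEC);
	kill(victim, SIGKILL);
}

int main(void)
{
	struct sigevent sev;
	struct itimerspec its;
	timer_t tid;

	victim = fork();
	if (victim == 0) {	/* child exits well within the grace period */
		usleep(200 * 1000);
		_exit(0);
	}

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_THREAD;
	sev.sigev_notify_function = reap_cb;
	memset(&its, 0, sizeof(its));
	its.it_value.tv_sec = REAP_DELAY_SEC;

	timer_create(CLOCK_MONOTONIC, &sev, &tid);
	timer_settime(tid, 0, &its, NULL);

	sleep(REAP_DELAY_SEC + 1);	/* keep the demo alive until the timer fires */
	return 0;
}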

mm/userfaultfd.c

@@ -72,12 +72,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
 	_dst_pte = pte_mkdirty(_dst_pte);
 	if (page_in_cache && !vm_shared)
 		writable = false;
-	if (writable) {
-		if (wp_copy)
-			_dst_pte = pte_mkuffd_wp(_dst_pte);
-		else
-			_dst_pte = pte_mkwrite(_dst_pte);
-	}
+
+	/*
+	 * Always mark a PTE as write-protected when needed, regardless of
+	 * VM_WRITE, which the user might change.
+	 */
+	if (wp_copy)
+		_dst_pte = pte_mkuffd_wp(_dst_pte);
+	else if (writable)
+		_dst_pte = pte_mkwrite(_dst_pte);
 
 	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
 

mm/workingset.c

@@ -355,7 +355,7 @@ void workingset_refault(struct folio *folio, void *shadow)
 
 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
 
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats_delayed();
 	/*
 	 * Compare the distance to the existing workingset size. We
 	 * don't activate pages that couldn't stay resident even if

tools/testing/selftests/vm/mremap_test.c

@@ -6,9 +6,11 @@
 #include <errno.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <time.h>
+#include <stdbool.h>
 
 #include "../kselftest.h"
 
@@ -63,6 +65,59 @@ enum {
 	.expect_failure = should_fail \
 }
 
+/*
+ * Returns false if the requested remap region overlaps with an
+ * existing mapping (e.g text, stack) else returns true.
+ */
+static bool is_remap_region_valid(void *addr, unsigned long long size)
+{
+	void *remap_addr = NULL;
+	bool ret = true;
+
+	/* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */
+	remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE,
+					 MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+					 -1, 0);
+
+	if (remap_addr == MAP_FAILED) {
+		if (errno == EEXIST)
+			ret = false;
+	} else {
+		munmap(remap_addr, size);
+	}
+
+	return ret;
+}
+
+/* Returns mmap_min_addr sysctl tunable from procfs */
+static unsigned long long get_mmap_min_addr(void)
+{
+	FILE *fp;
+	int n_matched;
+	static unsigned long long addr;
+
+	if (addr)
+		return addr;
+
+	fp = fopen("/proc/sys/vm/mmap_min_addr", "r");
+	if (fp == NULL) {
+		ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n",
+			strerror(errno));
+		exit(KSFT_SKIP);
+	}
+
+	n_matched = fscanf(fp, "%llu", &addr);
+	if (n_matched != 1) {
+		ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n",
+			strerror(errno));
+		fclose(fp);
+		exit(KSFT_SKIP);
+	}
+
+	fclose(fp);
+	return addr;
+}
+
 /*
  * Returns the start address of the mapping on success, else returns
  * NULL on failure.
@@ -71,11 +126,18 @@ static void *get_source_mapping(struct config c)
 {
 	unsigned long long addr = 0ULL;
 	void *src_addr = NULL;
+	unsigned long long mmap_min_addr;
+
+	mmap_min_addr = get_mmap_min_addr();
+
 retry:
 	addr += c.src_alignment;
+	if (addr < mmap_min_addr)
+		goto retry;
+
 	src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
 			MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
 			-1, 0);
 	if (src_addr == MAP_FAILED) {
 		if (errno == EPERM || errno == EEXIST)
 			goto retry;
@@ -90,8 +152,10 @@ retry:
 	 * alignment in the tests.
 	 */
 	if (((unsigned long long) src_addr & (c.src_alignment - 1)) ||
-	    !((unsigned long long) src_addr & c.src_alignment))
+	    !((unsigned long long) src_addr & c.src_alignment)) {
+		munmap(src_addr, c.region_size);
 		goto retry;
+	}
 
 	if (!src_addr)
 		goto error;
@@ -140,9 +204,20 @@ static long long remap_region(struct config c, unsigned int threshold_mb,
 	if (!((unsigned long long) addr & c.dest_alignment))
 		addr = (void *) ((unsigned long long) addr | c.dest_alignment);
 
+	/* Don't destroy existing mappings unless expected to overlap */
+	while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
+		/* Check for unsigned overflow */
+		if (addr + c.dest_alignment < addr) {
+			ksft_print_msg("Couldn't find a valid region to remap to\n");
+			ret = -1;
+			goto out;
+		}
+		addr += c.dest_alignment;
+	}
+
 	clock_gettime(CLOCK_MONOTONIC, &t_start);
 	dest_addr = mremap(src_addr, c.region_size, c.region_size,
 			   MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
 	clock_gettime(CLOCK_MONOTONIC, &t_end);
 
 	if (dest_addr == MAP_FAILED) {
@@ -193,7 +268,7 @@ static void run_mremap_test_case(struct test test_case, int *failures,
 
 	if (remap_time < 0) {
 		if (test_case.expect_failure)
-			ksft_test_result_pass("%s\n\tExpected mremap failure\n",
+			ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
 					      test_case.name);
 		else {
 			ksft_test_result_fail("%s\n", test_case.name);
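is_remap_region_valid() relies on MAP_FIXED_NOREPLACE failing with EEXIST when the candidate destination overlaps an existing mapping, so the test can probe for a free window without clobbering its own text, stack, or source region. A standalone sketch of that probing idiom, assuming Linux 4.17+ and a libc that exposes MAP_FIXED_NOREPLACE; the addresses are arbitrary examples.

/* Probe whether an address range is free without disturbing existing
 * mappings: MAP_FIXED_NOREPLACE fails with EEXIST if anything is there. */
#define _GNU_SOURCE
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/mman.h>

static bool region_is_free(void *addr, size_t size)
{
	void *p = mmap(addr, size, PROT_READ | PROT_WRITE,
		       MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_PRIVATE,
		       -1, 0);

	if (p == MAP_FAILED)
		return errno != EEXIST;	/* EEXIST: occupied; other errors treated as free, like the selftest */
	munmap(p, size);		/* it was only a probe */
	return true;
}

int main(void)
{
	size_t sz = 4096;
	void *candidate = (void *)(1UL << 30);	/* arbitrary example address */
	void *busy;

	printf("%p: %s\n", candidate, region_is_free(candidate, sz) ? "free" : "occupied");

	/* Probe something we know is mapped: a fresh anonymous page. */
	busy = mmap(NULL, sz, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	printf("%p: %s\n", busy, region_is_free(busy, sz) ? "free" : "occupied");
	munmap(busy, sz);
	return 0;
}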

tools/testing/selftests/vm/run_vmtests.sh

@@ -291,11 +291,16 @@ echo "-------------------"
 echo "running mremap_test"
 echo "-------------------"
 ./mremap_test
-if [ $? -ne 0 ]; then
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+	echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+	echo "[SKIP]"
+	exitcode=$ksft_skip
+else
 	echo "[FAIL]"
 	exitcode=1
-else
-	echo "[PASS]"
 fi
 
 echo "-----------------"