memcg: oom kill disable and oom status
This adds a feature to disable oom-killer for memcg; if disabled, of course, tasks under memcg will stop. But now, we have oom-notifier for memcg. And the world around memcg is not under out-of-memory. memcg's out-of-memory just shows memcg hits limit. Then, administrator or management daemon can recover the situation by - kill some process - enlarge limit, add more swap. - migrate some tasks - remove file cache on tmpfs (difficult ?) Unlike oom-killer, you can take enough information before killing tasks. (by gcore, or, ps etc.) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Родитель
9490ff2756
Коммит
3c11ecf448
|
@ -493,6 +493,8 @@ It's applicable for root and non-root cgroup.
|
|||
|
||||
10. OOM Control
|
||||
|
||||
memory.oom_control file is for OOM notification and other controls.
|
||||
|
||||
The memory controller implements an oom notifier using the cgroup notification
|
||||
API (see cgroups.txt). It allows registering multiple oom notification
|
||||
deliveries and getting notified when oom happens.
|
||||
|
@ -505,6 +507,27 @@ To register a notifier, application need:
|
|||
Application will be notified through eventfd when oom happens.
|
||||
OOM notification doesn't work for root cgroup.
|
||||
|
||||
You can disable the oom-killer by writing "1" to the memory.oom_control file,
|
||||
as:
|
||||
#echo 1 > memory.oom_control
|
||||
|
||||
This operation is only allowed on the top cgroup of a subhierarchy.
|
||||
If oom-killer is disabled, tasks under cgroup will hang/sleep
|
||||
in memcg's oom-waitq when they request accountable memory.
|
||||
|
||||
To let them run again, you have to relax the memcg's oom situation by
|
||||
* enlarge limit or reduce usage.
|
||||
To reduce usage,
|
||||
* kill some tasks.
|
||||
* move some tasks to other group with account migration.
|
||||
* remove some files (on tmpfs?)
|
||||
|
||||
Then, stopped tasks will work again.
|
||||
|
||||
At reading, current status of OOM is shown.
|
||||
oom_kill_disable 0 or 1 (if 1, oom-killer is disabled)
|
||||
under_oom 0 or 1 (if 1, the memcg is under OOM, tasks may
|
||||
be stopped.)
|
||||
|
||||
11. TODO
|
||||
|
||||
|
|
111
mm/memcontrol.c
111
mm/memcontrol.c
|
@ -214,6 +214,8 @@ struct mem_cgroup {
|
|||
atomic_t refcnt;
|
||||
|
||||
unsigned int swappiness;
|
||||
/* OOM-Killer disable */
|
||||
int oom_kill_disable;
|
||||
|
||||
/* set when res.limit == memsw.limit */
|
||||
bool memsw_is_minimum;
|
||||
|
@ -235,7 +237,6 @@ struct mem_cgroup {
|
|||
* mem_cgroup ? And what type of charges should we move ?
|
||||
*/
|
||||
unsigned long move_charge_at_immigrate;
|
||||
|
||||
/*
|
||||
* percpu counter.
|
||||
*/
|
||||
|
@ -1342,20 +1343,26 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
|
|||
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
|
||||
}
|
||||
|
||||
/*
 * Wake up waiters of @mem via memcg_wakeup_oom(), but only when the OOM
 * killer is disabled for @mem (oom_kill_disable is set) and its oom_lock
 * counter is non-zero. In every other case this is a no-op.
 * NOTE(review): presumably meant to be called after usage dropped or a
 * limit was raised, so that sleeping tasks can retry - confirm with callers.
 */
static void memcg_oom_recover(struct mem_cgroup *mem)
|
||||
{
|
||||
if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
|
||||
memcg_wakeup_oom(mem);
|
||||
}
|
||||
|
||||
/*
|
||||
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
|
||||
*/
|
||||
bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
|
||||
{
|
||||
struct oom_wait_info owait;
|
||||
bool locked;
|
||||
bool locked, need_to_kill;
|
||||
|
||||
owait.mem = mem;
|
||||
owait.wait.flags = 0;
|
||||
owait.wait.func = memcg_oom_wake_function;
|
||||
owait.wait.private = current;
|
||||
INIT_LIST_HEAD(&owait.wait.task_list);
|
||||
|
||||
need_to_kill = true;
|
||||
/* At first, try to OOM lock hierarchy under mem.*/
|
||||
mutex_lock(&memcg_oom_mutex);
|
||||
locked = mem_cgroup_oom_lock(mem);
|
||||
|
@ -1364,15 +1371,17 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
|
|||
* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
|
||||
* under OOM is always welcomed, use TASK_KILLABLE here.
|
||||
*/
|
||||
if (!locked)
|
||||
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
|
||||
else
|
||||
if (!locked || mem->oom_kill_disable)
|
||||
need_to_kill = false;
|
||||
if (locked)
|
||||
mem_cgroup_oom_notify(mem);
|
||||
mutex_unlock(&memcg_oom_mutex);
|
||||
|
||||
if (locked)
|
||||
if (need_to_kill) {
|
||||
finish_wait(&memcg_oom_waitq, &owait.wait);
|
||||
mem_cgroup_out_of_memory(mem, mask);
|
||||
else {
|
||||
} else {
|
||||
schedule();
|
||||
finish_wait(&memcg_oom_waitq, &owait.wait);
|
||||
}
|
||||
|
@ -2162,15 +2171,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
|
|||
/* If swapout, usage of swap doesn't decrease */
|
||||
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
|
||||
uncharge_memsw = false;
|
||||
/*
|
||||
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
|
||||
* In those cases, all pages freed continuously can be expected to be in
|
||||
* the same cgroup and we have chance to coalesce uncharges.
|
||||
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
|
||||
* because we want to do uncharge as soon as possible.
|
||||
*/
|
||||
if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
|
||||
goto direct_uncharge;
|
||||
|
||||
batch = &current->memcg_batch;
|
||||
/*
|
||||
|
@ -2180,6 +2180,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
|
|||
*/
|
||||
if (!batch->memcg)
|
||||
batch->memcg = mem;
|
||||
/*
|
||||
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
|
||||
* In those cases, all pages freed continuously can be expected to be in
|
||||
* the same cgroup and we have chance to coalesce uncharges.
|
||||
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
|
||||
* because we want to do uncharge as soon as possible.
|
||||
*/
|
||||
|
||||
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
|
||||
goto direct_uncharge;
|
||||
|
||||
/*
|
||||
* In typical case, batch->memcg == mem. This means we can
|
||||
* merge a series of uncharges to an uncharge of res_counter.
|
||||
|
@ -2196,6 +2207,8 @@ direct_uncharge:
|
|||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||
if (uncharge_memsw)
|
||||
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
|
||||
if (unlikely(batch->memcg != mem))
|
||||
memcg_oom_recover(mem);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -2332,6 +2345,7 @@ void mem_cgroup_uncharge_end(void)
|
|||
res_counter_uncharge(&batch->memcg->res, batch->bytes);
|
||||
if (batch->memsw_bytes)
|
||||
res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
|
||||
memcg_oom_recover(batch->memcg);
|
||||
/* forget this pointer (for sanity check) */
|
||||
batch->memcg = NULL;
|
||||
}
|
||||
|
@ -2568,10 +2582,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
|
|||
unsigned long long val)
|
||||
{
|
||||
int retry_count;
|
||||
u64 memswlimit;
|
||||
u64 memswlimit, memlimit;
|
||||
int ret = 0;
|
||||
int children = mem_cgroup_count_children(memcg);
|
||||
u64 curusage, oldusage;
|
||||
int enlarge;
|
||||
|
||||
/*
|
||||
* For keeping hierarchical_reclaim simple, how long we should retry
|
||||
|
@ -2582,6 +2597,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
|
|||
|
||||
oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
|
||||
|
||||
enlarge = 0;
|
||||
while (retry_count) {
|
||||
if (signal_pending(current)) {
|
||||
ret = -EINTR;
|
||||
|
@ -2599,6 +2615,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
|
|||
mutex_unlock(&set_limit_mutex);
|
||||
break;
|
||||
}
|
||||
|
||||
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
|
||||
if (memlimit < val)
|
||||
enlarge = 1;
|
||||
|
||||
ret = res_counter_set_limit(&memcg->res, val);
|
||||
if (!ret) {
|
||||
if (memswlimit == val)
|
||||
|
@ -2620,6 +2641,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
|
|||
else
|
||||
oldusage = curusage;
|
||||
}
|
||||
if (!ret && enlarge)
|
||||
memcg_oom_recover(memcg);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -2628,9 +2651,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
|
|||
unsigned long long val)
|
||||
{
|
||||
int retry_count;
|
||||
u64 memlimit, oldusage, curusage;
|
||||
u64 memlimit, memswlimit, oldusage, curusage;
|
||||
int children = mem_cgroup_count_children(memcg);
|
||||
int ret = -EBUSY;
|
||||
int enlarge = 0;
|
||||
|
||||
/* see mem_cgroup_resize_limit */
|
||||
retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
|
||||
|
@ -2652,6 +2676,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
|
|||
mutex_unlock(&set_limit_mutex);
|
||||
break;
|
||||
}
|
||||
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
|
||||
if (memswlimit < val)
|
||||
enlarge = 1;
|
||||
ret = res_counter_set_limit(&memcg->memsw, val);
|
||||
if (!ret) {
|
||||
if (memlimit == val)
|
||||
|
@ -2674,6 +2701,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
|
|||
else
|
||||
oldusage = curusage;
|
||||
}
|
||||
if (!ret && enlarge)
|
||||
memcg_oom_recover(memcg);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2865,6 +2894,7 @@ move_account:
|
|||
if (ret)
|
||||
break;
|
||||
}
|
||||
memcg_oom_recover(mem);
|
||||
/* it seems parent cgroup doesn't have enough mem */
|
||||
if (ret == -ENOMEM)
|
||||
goto try_to_free;
|
||||
|
@ -3645,6 +3675,46 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * read_map handler for the "oom_control" file.
 * Reports two key/value pairs through @cb:
 *   "oom_kill_disable" - the current value of mem->oom_kill_disable
 *   "under_oom"        - 1 if mem->oom_lock is non-zero, otherwise 0
 * Always returns 0.
 */
static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
|
||||
struct cftype *cft, struct cgroup_map_cb *cb)
|
||||
{
|
||||
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
|
||||

||||
cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
|
||||

||||
/* a non-zero oom_lock is reported to userspace as "under_oom 1" */
if (atomic_read(&mem->oom_lock))
|
||||
cb->fill(cb, "under_oom", 1);
|
||||
else
|
||||
cb->fill(cb, "under_oom", 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
 * write_u64 handler for the "oom_control" file: stores @val into
 * mem->oom_kill_disable.
 *
 * Returns 0 on success, or -EINVAL when
 *  - @cgrp has no parent (root cgroup), or @val is neither 0 nor 1;
 *  - the parent memcg has use_hierarchy set; or
 *  - this memcg has use_hierarchy set and already has child cgroups.
 *
 * The hierarchy checks and the store are done under cgroup_lock().
*/
|
||||
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
|
||||
struct cftype *cft, u64 val)
|
||||
{
|
||||
struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
|
||||
struct mem_cgroup *parent;
|
||||

||||
/* cannot set to root cgroup and only 0 and 1 are allowed */
|
||||
if (!cgrp->parent || !((val == 0) || (val == 1)))
|
||||
return -EINVAL;
|
||||

||||
parent = mem_cgroup_from_cont(cgrp->parent);
|
||||

||||
cgroup_lock();
|
||||
/*
 * oom_kill_disable is a per-subhierarchy flag: refuse the write when
 * the parent uses hierarchy, or when this memcg uses hierarchy and
 * already has children.
 */
|
||||
if ((parent->use_hierarchy) ||
|
||||
(mem->use_hierarchy && !list_empty(&cgrp->children))) {
|
||||
cgroup_unlock();
|
||||
return -EINVAL;
|
||||
}
|
||||
mem->oom_kill_disable = val;
|
||||
cgroup_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct cftype mem_cgroup_files[] = {
|
||||
{
|
||||
.name = "usage_in_bytes",
|
||||
|
@ -3702,6 +3772,8 @@ static struct cftype mem_cgroup_files[] = {
|
|||
},
|
||||
{
|
||||
.name = "oom_control",
|
||||
.read_map = mem_cgroup_oom_control_read,
|
||||
.write_u64 = mem_cgroup_oom_control_write,
|
||||
.register_event = mem_cgroup_oom_register_event,
|
||||
.unregister_event = mem_cgroup_oom_unregister_event,
|
||||
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
|
||||
|
@ -3943,6 +4015,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
|
|||
} else {
|
||||
parent = mem_cgroup_from_cont(cont->parent);
|
||||
mem->use_hierarchy = parent->use_hierarchy;
|
||||
mem->oom_kill_disable = parent->oom_kill_disable;
|
||||
}
|
||||
|
||||
if (parent && parent->use_hierarchy) {
|
||||
|
@ -4215,6 +4288,7 @@ static void mem_cgroup_clear_mc(void)
|
|||
if (mc.precharge) {
|
||||
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
|
||||
mc.precharge = 0;
|
||||
memcg_oom_recover(mc.to);
|
||||
}
|
||||
/*
|
||||
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
|
||||
|
@ -4223,6 +4297,7 @@ static void mem_cgroup_clear_mc(void)
|
|||
if (mc.moved_charge) {
|
||||
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
|
||||
mc.moved_charge = 0;
|
||||
memcg_oom_recover(mc.from);
|
||||
}
|
||||
/* we must fixup refcnts and charges */
|
||||
if (mc.moved_swap) {
|
||||
|
|
Загрузка…
Ссылка в новой задаче