diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e3fdbd809c7..dd845d25827a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -90,6 +90,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ + MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ MEM_CGROUP_STAT_NSTATS, }; @@ -1051,7 +1052,46 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) return swappiness; } -/* A routine for testing mem is not under move_account */ +static void mem_cgroup_start_move(struct mem_cgroup *mem) +{ + int cpu; + /* Because this is for moving account, reuse mc.lock */ + spin_lock(&mc.lock); + for_each_possible_cpu(cpu) + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; + spin_unlock(&mc.lock); + + synchronize_rcu(); +} + +static void mem_cgroup_end_move(struct mem_cgroup *mem) +{ + int cpu; + + if (!mem) + return; + spin_lock(&mc.lock); + for_each_possible_cpu(cpu) + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; + spin_unlock(&mc.lock); +} +/* + * 2 routines for checking "mem" is under move_account() or not. + * + * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used + * for avoiding race in accounting. If true, + * pc->mem_cgroup may be overwritten. + * + * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or + * under hierarchy of moving cgroups. This is for + * waiting at hith-memory prressure caused by "move". + */ + +static bool mem_cgroup_stealed(struct mem_cgroup *mem) +{ + VM_BUG_ON(!rcu_read_lock_held()); + return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; +} static bool mem_cgroup_under_move(struct mem_cgroup *mem) { @@ -1462,35 +1502,62 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) /* * Currently used to update mapped file statistics, but the routine can be * generalized to update other statistics as well. + * + * Notes: Race condition + * + * We usually use page_cgroup_lock() for accessing page_cgroup member but + * it tends to be costly. But considering some conditions, we doesn't need + * to do so _always_. + * + * Considering "charge", lock_page_cgroup() is not required because all + * file-stat operations happen after a page is attached to radix-tree. There + * are no race with "charge". + * + * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup + * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even + * if there are race with "uncharge". Statistics itself is properly handled + * by flags. + * + * Considering "move", this is an only case we see a race. To make the race + * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are + * possibility of race condition. If there is, we take a lock. */ void mem_cgroup_update_file_mapped(struct page *page, int val) { struct mem_cgroup *mem; - struct page_cgroup *pc; + struct page_cgroup *pc = lookup_page_cgroup(page); + bool need_unlock = false; - pc = lookup_page_cgroup(page); if (unlikely(!pc)) return; - lock_page_cgroup(pc); + rcu_read_lock(); mem = pc->mem_cgroup; - if (!mem || !PageCgroupUsed(pc)) - goto done; - - /* - * Preemption is already disabled. We can use __this_cpu_xxx - */ + if (unlikely(!mem || !PageCgroupUsed(pc))) + goto out; + /* pc->mem_cgroup is unstable ? */ + if (unlikely(mem_cgroup_stealed(mem))) { + /* take a lock against to access pc->mem_cgroup */ + lock_page_cgroup(pc); + need_unlock = true; + mem = pc->mem_cgroup; + if (!mem || !PageCgroupUsed(pc)) + goto out; + } if (val > 0) { - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); SetPageCgroupFileMapped(pc); } else { - __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); if (!page_mapped(page)) /* for race between dec->inc counter */ ClearPageCgroupFileMapped(pc); } -done: - unlock_page_cgroup(pc); +out: + if (unlikely(need_unlock)) + unlock_page_cgroup(pc); + rcu_read_unlock(); + return; } /* @@ -3039,6 +3106,7 @@ move_account: lru_add_drain_all(); drain_all_stock_sync(); ret = 0; + mem_cgroup_start_move(mem); for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; @@ -3052,6 +3120,7 @@ move_account: if (ret) break; } + mem_cgroup_end_move(mem); memcg_oom_recover(mem); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) @@ -4514,6 +4583,7 @@ static void mem_cgroup_clear_mc(void) mc.to = NULL; mc.moving_task = NULL; spin_unlock(&mc.lock); + mem_cgroup_end_move(from); memcg_oom_recover(from); memcg_oom_recover(to); wake_up_all(&mc.waitq); @@ -4544,6 +4614,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); VM_BUG_ON(mc.moving_task); + mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem;