diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 44e7ded33448..5e028870ee8a 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -454,21 +454,27 @@ And if you want disable it again: 8.2 Type of charges which can be move Each bits of move_charge_at_immigrate has its own meaning about what type of -charges should be moved. +charges should be moved. But in any cases, it must be noted that an account of +a page or a swap can be moved only when it is charged to the task's current(old) +memory cgroup. bit | what type of charges would be moved ? -----+------------------------------------------------------------------------ 0 | A charge of an anonymous page(or swap of it) used by the target task. | Those pages and swaps must be used only by the target task. You must | enable Swap Extension(see 2.4) to enable move of swap charges. - -Note: Those pages and swaps must be charged to the old cgroup. -Note: More type of pages(e.g. file cache, shmem,) will be supported by other - bits in future. + -----+------------------------------------------------------------------------ + 1 | A charge of file pages(normal file, tmpfs file(e.g. ipc shared memory) + | and swaps of tmpfs file) mmaped by the target task. Unlike the case of + | anonymous pages, file pages(and swaps) in the range mmapped by the task + | will be moved even if the task hasn't done page fault, i.e. they might + | not be the task's "RSS", but other task's "RSS" that maps the same file. + | And mapcount of the page is ignored(the page can be moved even if + | page_mapcount(page) > 1). You must enable Swap Extension(see 2.4) to + | enable move of swap charges. 8.3 TODO -- Add support for other types of pages(e.g. file cache, shmem, etc.). - Implement madvise(2) to let users decide the vma to be moved or not to be moved. - All of moving charge operations are done under cgroup_mutex. It's not good diff --git a/include/linux/swap.h b/include/linux/swap.h index b6b614364dd8..ff4acea9bbdb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -282,6 +282,11 @@ extern void kswapd_stop(int nid); extern int shmem_unuse(swp_entry_t entry, struct page *page); #endif /* CONFIG_MMU */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent); +#endif + extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *); #ifdef CONFIG_SWAP diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e5277e8a42a8..be5f478351bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -250,6 +250,7 @@ struct mem_cgroup { */ enum move_type { MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ + MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ NR_MOVE_TYPE, }; @@ -272,6 +273,12 @@ static bool move_anon(void) &mc.to->move_charge_at_immigrate); } +static bool move_file(void) +{ + return test_bit(MOVE_CHARGE_TYPE_FILE, + &mc.to->move_charge_at_immigrate); +} + /* * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. @@ -4179,11 +4186,8 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, /* we don't move shared anon */ if (!move_anon() || page_mapcount(page) > 2) return NULL; - } else - /* - * TODO: We don't move charges of file(including shmem/tmpfs) - * pages for now. - */ + } else if (!move_file()) + /* we ignore mapcount for file pages */ return NULL; if (!get_page_unless_zero(page)) return NULL; @@ -4212,6 +4216,39 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return page; } +static struct page *mc_handle_file_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + struct page *page = NULL; + struct inode *inode; + struct address_space *mapping; + pgoff_t pgoff; + + if (!vma->vm_file) /* anonymous vma */ + return NULL; + if (!move_file()) + return NULL; + + inode = vma->vm_file->f_path.dentry->d_inode; + mapping = vma->vm_file->f_mapping; + if (pte_none(ptent)) + pgoff = linear_page_index(vma, addr); + else /* pte_file(ptent) is true */ + pgoff = pte_to_pgoff(ptent); + + /* page is moved even if it's not RSS of this task(page-faulted). */ + if (!mapping_cap_swap_backed(mapping)) { /* normal file */ + page = find_get_page(mapping, pgoff); + } else { /* shmem/tmpfs file. we should take account of swap too. */ + swp_entry_t ent; + mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + if (do_swap_account) + entry->val = ent.val; + } + + return page; +} + static int is_target_pte_for_mc(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { @@ -4224,7 +4261,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, addr, ptent, &ent); - /* TODO: handle swap of shmes/tmpfs */ + else if (pte_none(ptent) || pte_file(ptent)) + page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) return 0; @@ -4285,9 +4323,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) }; if (is_vm_hugetlb_page(vma)) continue; - /* TODO: We don't move charges of shmem/tmpfs pages for now. */ - if (vma->vm_flags & VM_SHARED) - continue; walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } @@ -4484,9 +4519,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) }; if (is_vm_hugetlb_page(vma)) continue; - /* TODO: We don't move charges of shmem/tmpfs pages for now. */ - if (vma->vm_flags & VM_SHARED) - continue; ret = walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_move_charge_walk); if (ret) diff --git a/mm/shmem.c b/mm/shmem.c index 4ef9797bd430..855eaf5b8d5b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2559,6 +2559,45 @@ out4: return error; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. + */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + swp_entry_t entry = { .val = 0 }, *ptr; + struct page *page = NULL; + struct shmem_inode_info *info = SHMEM_I(inode); + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + + spin_lock(&info->lock); + ptr = shmem_swp_entry(info, pgoff, NULL); +#ifdef CONFIG_SWAP + if (ptr && ptr->val) { + entry.val = ptr->val; + page = find_get_page(&swapper_space, entry.val); + } else +#endif + page = find_get_page(inode->i_mapping, pgoff); + if (ptr) + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); +out: + *pagep = page; + *ent = entry; +} +#endif + #else /* !CONFIG_SHMEM */ /* @@ -2598,6 +2637,31 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. + */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + struct page *page = NULL; + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + page = find_get_page(inode->i_mapping, pgoff); +out: + *pagep = page; + *ent = (swp_entry_t){ .val = 0 }; +} +#endif + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)