From d57456753787ab158f906f1f8eb58d54a2ccd9f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2015 11:43:05 +0900 Subject: [PATCH] cgroup: fix race condition around termination check in css_task_iter_next() css_task_iter_next() checked @it->cur_task before grabbing css_set_lock and assumed that the result won't change afterwards; however, tasks could leave the cgroup being iterated terminating the iterator before css_task_lock is acquired. If this happens, css_task_iter_next() tries to calculate the current task from NULL cg_list pointer leading to the following oops. BUG: unable to handle kernel paging request at fffffffffffff7d0 IP: [] css_task_iter_next+0x42/0x80 ... CPU: 4 PID: 6391 Comm: JobQDisp2 Not tainted 4.0.9-22_fbk4_rc3_81616_ge8d9cb6 #1 Hardware name: Quanta Freedom/Winterfell, BIOS F03_3B08 03/04/2014 task: ffff880868e46400 ti: ffff88083404c000 task.ti: ffff88083404c000 RIP: 0010:[] [] css_task_iter_next+0x42/0x80 RSP: 0018:ffff88083404fd28 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88083404fd68 RCX: ffff8804697fb8b0 RDX: fffffffffffff7c0 RSI: ffff8803b7dff800 RDI: ffffffff822c0278 RBP: ffff88083404fd38 R08: 0000000000017160 R09: ffff88046f4070c0 R10: ffffffff810d61f7 R11: 0000000000000293 R12: ffff880863bf8400 R13: ffff88046b87fd80 R14: 0000000000000000 R15: ffff88083404fe58 FS: 00007fa0567e2700(0000) GS:ffff88046f900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffffffffff7d0 CR3: 0000000469568000 CR4: 00000000001406e0 Stack: 0000000000000246 0000000000000000 ffff88083404fde8 ffffffff810d6248 ffff88083404fd68 0000000000000000 ffff8803b7dff800 000001ef000001ee 0000000000000000 0000000000000000 ffff880863bf8568 0000000000000000 Call Trace: [] cgroup_pidlist_start+0x258/0x550 [] cgroup_seqfile_start+0x1d/0x20 [] kernfs_seq_start+0x5f/0xa0 [] seq_read+0x166/0x380 [] kernfs_fop_read+0x11d/0x180 [] __vfs_read+0x18/0x50 [] vfs_read+0x8d/0x150 [] SyS_read+0x4f/0xb0 [] system_call_fastpath+0x12/0x17 Fix it by moving the termination condition check inside css_set_lock. @it->cur_task is now cleared after being put and @it->task_pos is tested for termination instead of @it->cset_pos as they indicate the same condition and @it->task_pos is what's being dereferenced. Signed-off-by: Tejun Heo Reported-by: Calvin Owens Fixes: ed27b9f7a17d ("cgroup: don't hold css_set_rwsem across css task iteration") Acked-by: Zefan Li --- kernel/cgroup.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4f4fc538cb1b..b9d0cce3f9ce 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3930,18 +3930,19 @@ void css_task_iter_start(struct cgroup_subsys_state *css, */ struct task_struct *css_task_iter_next(struct css_task_iter *it) { - if (!it->cset_pos) - return NULL; - - if (it->cur_task) + if (it->cur_task) { put_task_struct(it->cur_task); + it->cur_task = NULL; + } spin_lock_bh(&css_set_lock); - it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); - get_task_struct(it->cur_task); - - css_task_iter_advance(it); + if (it->task_pos) { + it->cur_task = list_entry(it->task_pos, struct task_struct, + cg_list); + get_task_struct(it->cur_task); + css_task_iter_advance(it); + } spin_unlock_bh(&css_set_lock);