pidns: Make the pidns proc mount/umount logic obvious.
Track the number of pids in the proc hash table. When the number of pids goes to 0, schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release_proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details, and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because detach_pid is always called with the rtnl_lock held, free_pid is not allowed to sleep, so the work of unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious, this fixes a bug reported by Gao feng <gaofeng@cn.fujitsu.com> where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" <serge@hallyn.com> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
This commit is contained in:
Родитель
17cf22c33e
Коммит
0a01f2cc39
|
@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task)
|
||||||
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
|
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
|
||||||
tgid->numbers[i].nr);
|
tgid->numbers[i].nr);
|
||||||
}
|
}
|
||||||
|
|
||||||
upid = &pid->numbers[pid->level];
|
|
||||||
if (upid->nr == 1)
|
|
||||||
pid_ns_release_proc(upid->ns);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct dentry *proc_pid_instantiate(struct inode *dir,
|
static struct dentry *proc_pid_instantiate(struct inode *dir,
|
||||||
|
|
|
@ -155,11 +155,6 @@ void __init proc_root_init(void)
|
||||||
err = register_filesystem(&proc_fs_type);
|
err = register_filesystem(&proc_fs_type);
|
||||||
if (err)
|
if (err)
|
||||||
return;
|
return;
|
||||||
err = pid_ns_prepare_proc(&init_pid_ns);
|
|
||||||
if (err) {
|
|
||||||
unregister_filesystem(&proc_fs_type);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
proc_self_init();
|
proc_self_init();
|
||||||
proc_symlink("mounts", NULL, "self/mounts");
|
proc_symlink("mounts", NULL, "self/mounts");
|
||||||
|
|
|
@ -21,6 +21,7 @@ struct pid_namespace {
|
||||||
struct kref kref;
|
struct kref kref;
|
||||||
struct pidmap pidmap[PIDMAP_ENTRIES];
|
struct pidmap pidmap[PIDMAP_ENTRIES];
|
||||||
int last_pid;
|
int last_pid;
|
||||||
|
int nr_hashed;
|
||||||
struct task_struct *child_reaper;
|
struct task_struct *child_reaper;
|
||||||
struct kmem_cache *pid_cachep;
|
struct kmem_cache *pid_cachep;
|
||||||
unsigned int level;
|
unsigned int level;
|
||||||
|
@ -32,6 +33,7 @@ struct pid_namespace {
|
||||||
struct bsd_acct_struct *bacct;
|
struct bsd_acct_struct *bacct;
|
||||||
#endif
|
#endif
|
||||||
struct user_namespace *user_ns;
|
struct user_namespace *user_ns;
|
||||||
|
struct work_struct proc_work;
|
||||||
kgid_t pid_gid;
|
kgid_t pid_gid;
|
||||||
int hide_pid;
|
int hide_pid;
|
||||||
int reboot; /* group exit code if this pidns was rebooted */
|
int reboot; /* group exit code if this pidns was rebooted */
|
||||||
|
|
|
@ -1476,8 +1476,6 @@ bad_fork_cleanup_io:
|
||||||
if (p->io_context)
|
if (p->io_context)
|
||||||
exit_io_context(p);
|
exit_io_context(p);
|
||||||
bad_fork_cleanup_namespaces:
|
bad_fork_cleanup_namespaces:
|
||||||
if (unlikely(clone_flags & CLONE_NEWPID))
|
|
||||||
pid_ns_release_proc(p->nsproxy->pid_ns);
|
|
||||||
exit_task_namespaces(p);
|
exit_task_namespaces(p);
|
||||||
bad_fork_cleanup_mm:
|
bad_fork_cleanup_mm:
|
||||||
if (p->mm)
|
if (p->mm)
|
||||||
|
|
21
kernel/pid.c
21
kernel/pid.c
|
@ -36,6 +36,7 @@
|
||||||
#include <linux/pid_namespace.h>
|
#include <linux/pid_namespace.h>
|
||||||
#include <linux/init_task.h>
|
#include <linux/init_task.h>
|
||||||
#include <linux/syscalls.h>
|
#include <linux/syscalls.h>
|
||||||
|
#include <linux/proc_fs.h>
|
||||||
|
|
||||||
#define pid_hashfn(nr, ns) \
|
#define pid_hashfn(nr, ns) \
|
||||||
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
|
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
|
||||||
|
@ -270,8 +271,12 @@ void free_pid(struct pid *pid)
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
|
|
||||||
spin_lock_irqsave(&pidmap_lock, flags);
|
spin_lock_irqsave(&pidmap_lock, flags);
|
||||||
for (i = 0; i <= pid->level; i++)
|
for (i = 0; i <= pid->level; i++) {
|
||||||
hlist_del_rcu(&pid->numbers[i].pid_chain);
|
struct upid *upid = pid->numbers + i;
|
||||||
|
hlist_del_rcu(&upid->pid_chain);
|
||||||
|
if (--upid->ns->nr_hashed == 0)
|
||||||
|
schedule_work(&upid->ns->proc_work);
|
||||||
|
}
|
||||||
spin_unlock_irqrestore(&pidmap_lock, flags);
|
spin_unlock_irqrestore(&pidmap_lock, flags);
|
||||||
|
|
||||||
for (i = 0; i <= pid->level; i++)
|
for (i = 0; i <= pid->level; i++)
|
||||||
|
@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
tmp = ns;
|
tmp = ns;
|
||||||
|
pid->level = ns->level;
|
||||||
for (i = ns->level; i >= 0; i--) {
|
for (i = ns->level; i >= 0; i--) {
|
||||||
nr = alloc_pidmap(tmp);
|
nr = alloc_pidmap(tmp);
|
||||||
if (nr < 0)
|
if (nr < 0)
|
||||||
|
@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns)
|
||||||
tmp = tmp->parent;
|
tmp = tmp->parent;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (unlikely(is_child_reaper(pid))) {
|
||||||
|
if (pid_ns_prepare_proc(ns))
|
||||||
|
goto out_free;
|
||||||
|
}
|
||||||
|
|
||||||
get_pid_ns(ns);
|
get_pid_ns(ns);
|
||||||
pid->level = ns->level;
|
|
||||||
atomic_set(&pid->count, 1);
|
atomic_set(&pid->count, 1);
|
||||||
for (type = 0; type < PIDTYPE_MAX; ++type)
|
for (type = 0; type < PIDTYPE_MAX; ++type)
|
||||||
INIT_HLIST_HEAD(&pid->tasks[type]);
|
INIT_HLIST_HEAD(&pid->tasks[type]);
|
||||||
|
|
||||||
upid = pid->numbers + ns->level;
|
upid = pid->numbers + ns->level;
|
||||||
spin_lock_irq(&pidmap_lock);
|
spin_lock_irq(&pidmap_lock);
|
||||||
for ( ; upid >= pid->numbers; --upid)
|
for ( ; upid >= pid->numbers; --upid) {
|
||||||
hlist_add_head_rcu(&upid->pid_chain,
|
hlist_add_head_rcu(&upid->pid_chain,
|
||||||
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
|
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
|
||||||
|
upid->ns->nr_hashed++;
|
||||||
|
}
|
||||||
spin_unlock_irq(&pidmap_lock);
|
spin_unlock_irq(&pidmap_lock);
|
||||||
|
|
||||||
out:
|
out:
|
||||||
|
@ -570,6 +582,7 @@ void __init pidmap_init(void)
|
||||||
/* Reserve PID 0. We never call free_pidmap(0) */
|
/* Reserve PID 0. We never call free_pidmap(0) */
|
||||||
set_bit(0, init_pid_ns.pidmap[0].page);
|
set_bit(0, init_pid_ns.pidmap[0].page);
|
||||||
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
|
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
|
||||||
|
init_pid_ns.nr_hashed = 1;
|
||||||
|
|
||||||
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
|
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
|
||||||
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
|
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
|
||||||
|
|
|
@ -72,6 +72,12 @@ err_alloc:
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void proc_cleanup_work(struct work_struct *work)
|
||||||
|
{
|
||||||
|
struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
|
||||||
|
pid_ns_release_proc(ns);
|
||||||
|
}
|
||||||
|
|
||||||
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
|
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
|
||||||
#define MAX_PID_NS_LEVEL 32
|
#define MAX_PID_NS_LEVEL 32
|
||||||
|
|
||||||
|
@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
|
||||||
ns->level = level;
|
ns->level = level;
|
||||||
ns->parent = get_pid_ns(parent_pid_ns);
|
ns->parent = get_pid_ns(parent_pid_ns);
|
||||||
ns->user_ns = get_user_ns(user_ns);
|
ns->user_ns = get_user_ns(user_ns);
|
||||||
|
INIT_WORK(&ns->proc_work, proc_cleanup_work);
|
||||||
|
|
||||||
set_bit(0, ns->pidmap[0].page);
|
set_bit(0, ns->pidmap[0].page);
|
||||||
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
|
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
|
||||||
|
@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
|
||||||
for (i = 1; i < PIDMAP_ENTRIES; i++)
|
for (i = 1; i < PIDMAP_ENTRIES; i++)
|
||||||
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
|
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
|
||||||
|
|
||||||
err = pid_ns_prepare_proc(ns);
|
|
||||||
if (err)
|
|
||||||
goto out_put_parent_pid_ns;
|
|
||||||
|
|
||||||
return ns;
|
return ns;
|
||||||
|
|
||||||
out_put_parent_pid_ns:
|
|
||||||
put_pid_ns(parent_pid_ns);
|
|
||||||
put_user_ns(user_ns);
|
|
||||||
out_free_map:
|
out_free_map:
|
||||||
kfree(ns->pidmap[0].page);
|
kfree(ns->pidmap[0].page);
|
||||||
out_free:
|
out_free:
|
||||||
|
|
Загрузка…
Ссылка в новой задаче