2015-10-29 16:58:09 +03:00
|
|
|
/*
 * Minimal file system backend for holding eBPF maps and programs,
 * used by bpf(2) object pinning.
 *
 * Authors:
 *
 *	Daniel Borkmann <daniel@iogearbox.net>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 */
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/magic.h>
|
|
|
|
#include <linux/major.h>
|
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/namei.h>
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/kdev_t.h>
|
|
|
|
#include <linux/filter.h>
|
|
|
|
#include <linux/bpf.h>
|
|
|
|
|
|
|
|
/*
 * Kind of object that a pinned inode refers to. When pinning, the value
 * is also packed into the minor number of the dev_t passed to mknod and
 * read back from it in bpf_mkobj().
 */
enum bpf_type {
	BPF_TYPE_UNSPEC	= 0,	/* inode is neither a pinned prog nor a map */
	BPF_TYPE_PROG,		/* object is a struct bpf_prog */
	BPF_TYPE_MAP,		/* object is a BPF map */
};
|
|
|
|
|
|
|
|
static void *bpf_any_get(void *raw, enum bpf_type type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case BPF_TYPE_PROG:
|
|
|
|
atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
|
|
|
|
break;
|
|
|
|
case BPF_TYPE_MAP:
|
bpf: fix clearing on persistent program array maps
Currently, when having map file descriptors pointing to program arrays,
there's still the issue that we unconditionally flush program array
contents via bpf_fd_array_map_clear() in bpf_map_release(). This happens
when such a file descriptor is released and is independent of the map's
refcount.
Having this flush independent of the refcount is for a reason: there
can be arbitrary complex dependency chains among tail calls, also circular
ones (direct or indirect, nesting limit determined during runtime), and
we need to make sure that the map drops all references to eBPF programs
it holds, so that the map's refcount can eventually drop to zero and
initiate its freeing. Btw, a walk of the whole dependency graph would
not be possible for various reasons, one being complexity and another
one inconsistency, i.e. new programs can be added to parts of the graph
at any time, so there's no guaranteed consistent state for the time of
such a walk.
Now, the program array pinning itself works, but the issue is that each
derived file descriptor on close would nevertheless call unconditionally
into bpf_fd_array_map_clear(). Instead, keep track of users and postpone
this flush until the last reference to a user is dropped. As this only
concerns a subset of references (f.e. a prog array could hold a program
that itself has reference on the prog array holding it, etc), we need to
track them separately.
Short analysis on the refcounting: on map creation time usercnt will be
one, so there's no change in behaviour for bpf_map_release(), if unpinned.
If we already fail in map_create(), we are immediately freed, and no
file descriptor has been made public yet. In bpf_obj_pin_user(), we need
to probe for a possible map in bpf_fd_probe_obj() already with a usercnt
reference, so before we drop the reference on the fd with fdput().
Therefore, if actual pinning fails, we need to drop that reference again
in bpf_any_put(), otherwise we keep holding it. When last reference
drops on the inode, the bpf_any_put() in bpf_evict_inode() will take
care of dropping the usercnt again. In the bpf_obj_get_user() case, the
bpf_any_get() will grab a reference on the usercnt, still at a time when
we have the reference on the path. Should we later on fail to grab a new
file descriptor, bpf_any_put() will drop it, otherwise we hold it until
bpf_map_release() time.
Joint work with Alexei.
Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-24 23:28:15 +03:00
|
|
|
bpf_map_inc(raw, true);
|
2015-10-29 16:58:09 +03:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
WARN_ON_ONCE(1);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return raw;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void bpf_any_put(void *raw, enum bpf_type type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case BPF_TYPE_PROG:
|
|
|
|
bpf_prog_put(raw);
|
|
|
|
break;
|
|
|
|
case BPF_TYPE_MAP:
|
bpf: fix clearing on persistent program array maps
Currently, when having map file descriptors pointing to program arrays,
there's still the issue that we unconditionally flush program array
contents via bpf_fd_array_map_clear() in bpf_map_release(). This happens
when such a file descriptor is released and is independent of the map's
refcount.
Having this flush independent of the refcount is for a reason: there
can be arbitrary complex dependency chains among tail calls, also circular
ones (direct or indirect, nesting limit determined during runtime), and
we need to make sure that the map drops all references to eBPF programs
it holds, so that the map's refcount can eventually drop to zero and
initiate its freeing. Btw, a walk of the whole dependency graph would
not be possible for various reasons, one being complexity and another
one inconsistency, i.e. new programs can be added to parts of the graph
at any time, so there's no guaranteed consistent state for the time of
such a walk.
Now, the program array pinning itself works, but the issue is that each
derived file descriptor on close would nevertheless call unconditionally
into bpf_fd_array_map_clear(). Instead, keep track of users and postpone
this flush until the last reference to a user is dropped. As this only
concerns a subset of references (f.e. a prog array could hold a program
that itself has reference on the prog array holding it, etc), we need to
track them separately.
Short analysis on the refcounting: on map creation time usercnt will be
one, so there's no change in behaviour for bpf_map_release(), if unpinned.
If we already fail in map_create(), we are immediately freed, and no
file descriptor has been made public yet. In bpf_obj_pin_user(), we need
to probe for a possible map in bpf_fd_probe_obj() already with a usercnt
reference, so before we drop the reference on the fd with fdput().
Therefore, if actual pinning fails, we need to drop that reference again
in bpf_any_put(), otherwise we keep holding it. When last reference
drops on the inode, the bpf_any_put() in bpf_evict_inode() will take
care of dropping the usercnt again. In the bpf_obj_get_user() case, the
bpf_any_get() will grab a reference on the usercnt, still at a time when
we have the reference on the path. Should we later on fail to grab a new
file descriptor, bpf_any_put() will drop it, otherwise we hold it until
bpf_map_release() time.
Joint work with Alexei.
Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-24 23:28:15 +03:00
|
|
|
bpf_map_put_with_uref(raw);
|
2015-10-29 16:58:09 +03:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
WARN_ON_ONCE(1);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Resolve the file descriptor @ufd to a BPF object.
 *
 * The descriptor is first tried as a map (taking a usercnt reference as
 * well, since the object is about to be pinned); if that fails it is
 * retried as a program.
 *
 * @type is always written: BPF_TYPE_MAP when the map lookup succeeded,
 * BPF_TYPE_PROG otherwise (including when the program lookup fails too).
 *
 * Returns the referenced object, or the ERR_PTR() produced by
 * bpf_prog_get() when neither lookup succeeds.
 */
static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
{
	void *obj = bpf_map_get_with_uref(ufd);

	if (!IS_ERR(obj)) {
		*type = BPF_TYPE_MAP;
		return obj;
	}

	*type = BPF_TYPE_PROG;
	return bpf_prog_get(ufd);
}
|
|
|
|
|
|
|
|
static const struct inode_operations bpf_dir_iops;
|
|
|
|
|
|
|
|
static const struct inode_operations bpf_prog_iops = { };
|
|
|
|
static const struct inode_operations bpf_map_iops = { };
|
|
|
|
|
|
|
|
static struct inode *bpf_get_inode(struct super_block *sb,
|
|
|
|
const struct inode *dir,
|
|
|
|
umode_t mode)
|
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
switch (mode & S_IFMT) {
|
|
|
|
case S_IFDIR:
|
|
|
|
case S_IFREG:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
inode = new_inode(sb);
|
|
|
|
if (!inode)
|
|
|
|
return ERR_PTR(-ENOSPC);
|
|
|
|
|
|
|
|
inode->i_ino = get_next_ino();
|
|
|
|
inode->i_atime = CURRENT_TIME;
|
|
|
|
inode->i_mtime = inode->i_atime;
|
|
|
|
inode->i_ctime = inode->i_atime;
|
|
|
|
|
|
|
|
inode_init_owner(inode, dir, mode);
|
|
|
|
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
|
|
|
|
{
|
|
|
|
*type = BPF_TYPE_UNSPEC;
|
|
|
|
if (inode->i_op == &bpf_prog_iops)
|
|
|
|
*type = BPF_TYPE_PROG;
|
|
|
|
else if (inode->i_op == &bpf_map_iops)
|
|
|
|
*type = BPF_TYPE_MAP;
|
|
|
|
else
|
|
|
|
return -EACCES;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool bpf_dname_reserved(const struct dentry *dentry)
|
|
|
|
{
|
|
|
|
return strchr(dentry->d_name.name, '.');
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * ->mkdir() handler: create a sub-directory @dentry below @dir.
 *
 * Returns 0 on success, -EPERM when the name is reserved, or the error
 * reported by bpf_get_inode().
 */
static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *subdir;

	if (bpf_dname_reserved(dentry))
		return -EPERM;

	subdir = bpf_get_inode(dir->i_sb, dir, S_IFDIR | mode);
	if (IS_ERR(subdir))
		return PTR_ERR(subdir);

	subdir->i_op = &bpf_dir_iops;
	subdir->i_fop = &simple_dir_operations;

	/* One extra link on the new directory, one on its parent. */
	inc_nlink(subdir);
	inc_nlink(dir);

	/* Bind the inode to the dentry and take an additional dentry reference. */
	d_instantiate(dentry, subdir);
	dget(dentry);

	return 0;
}
|
|
|
|
|
|
|
|
static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
|
|
|
|
umode_t mode, const struct inode_operations *iops)
|
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
if (bpf_dname_reserved(dentry))
|
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
|
|
|
|
if (IS_ERR(inode))
|
|
|
|
return PTR_ERR(inode);
|
|
|
|
|
|
|
|
inode->i_op = iops;
|
|
|
|
inode->i_private = dentry->d_fsdata;
|
|
|
|
|
|
|
|
d_instantiate(dentry, inode);
|
|
|
|
dget(dentry);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
|
|
|
|
dev_t devt)
|
|
|
|
{
|
|
|
|
enum bpf_type type = MINOR(devt);
|
|
|
|
|
|
|
|
if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
|
|
|
|
dentry->d_fsdata == NULL)
|
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
case BPF_TYPE_PROG:
|
|
|
|
return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
|
|
|
|
case BPF_TYPE_MAP:
|
|
|
|
return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
|
|
|
|
default:
|
|
|
|
return -EPERM;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct inode_operations bpf_dir_iops = {
|
|
|
|
.lookup = simple_lookup,
|
|
|
|
.mknod = bpf_mkobj,
|
|
|
|
.mkdir = bpf_mkdir,
|
|
|
|
.rmdir = simple_rmdir,
|
|
|
|
.unlink = simple_unlink,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
|
|
|
|
enum bpf_type type)
|
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
struct inode *dir;
|
|
|
|
struct path path;
|
|
|
|
umode_t mode;
|
|
|
|
dev_t devt;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
|
|
|
|
if (IS_ERR(dentry))
|
|
|
|
return PTR_ERR(dentry);
|
|
|
|
|
|
|
|
mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
|
|
|
|
devt = MKDEV(UNNAMED_MAJOR, type);
|
|
|
|
|
|
|
|
ret = security_path_mknod(&path, dentry, mode, devt);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
dir = d_inode(path.dentry);
|
|
|
|
if (dir->i_op != &bpf_dir_iops) {
|
|
|
|
ret = -EPERM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
dentry->d_fsdata = raw;
|
|
|
|
ret = vfs_mknod(dir, dentry, mode, devt);
|
|
|
|
dentry->d_fsdata = NULL;
|
|
|
|
out:
|
|
|
|
done_path_create(&path, dentry);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Pin the BPF object behind file descriptor @ufd at user-supplied
 * @pathname.
 *
 * The reference obtained from bpf_fd_probe_obj() is kept on success and
 * dropped again through bpf_any_put() when pinning fails.
 *
 * Returns 0 on success or a negative errno from getname(), the
 * descriptor lookup, or bpf_obj_do_pin().
 */
int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
{
	struct filename *pname;
	enum bpf_type type;
	void *obj;
	int err;

	pname = getname(pathname);
	if (IS_ERR(pname))
		return PTR_ERR(pname);

	obj = bpf_fd_probe_obj(ufd, &type);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
	} else {
		err = bpf_obj_do_pin(pname, obj, type);
		if (err != 0)
			bpf_any_put(obj, type);
	}

	putname(pname);
	return err;
}
|
|
|
|
|
|
|
|
static void *bpf_obj_do_get(const struct filename *pathname,
|
|
|
|
enum bpf_type *type)
|
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
struct path path;
|
|
|
|
void *raw;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
|
|
|
inode = d_backing_inode(path.dentry);
|
|
|
|
ret = inode_permission(inode, MAY_WRITE);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = bpf_inode_type(inode, type);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
raw = bpf_any_get(inode->i_private, *type);
|
|
|
|
touch_atime(&path);
|
|
|
|
|
|
|
|
path_put(&path);
|
|
|
|
return raw;
|
|
|
|
out:
|
|
|
|
path_put(&path);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Open the object pinned at user-supplied @pathname and return a new file
 * descriptor for it.
 *
 * The reference obtained from bpf_obj_do_get() is handed over to the new
 * descriptor; if no descriptor could be created it is dropped again via
 * bpf_any_put().
 *
 * Returns the new descriptor, a negative errno from getname(),
 * bpf_obj_do_get() or descriptor creation, or -ENOENT when the object
 * type is neither program nor map.
 */
int bpf_obj_get_user(const char __user *pathname)
{
	enum bpf_type type = BPF_TYPE_UNSPEC;
	struct filename *pname;
	void *obj;
	int fd;

	pname = getname(pathname);
	if (IS_ERR(pname))
		return PTR_ERR(pname);

	obj = bpf_obj_do_get(pname, &type);
	if (IS_ERR(obj)) {
		fd = PTR_ERR(obj);
		goto out_putname;
	}

	switch (type) {
	case BPF_TYPE_PROG:
		fd = bpf_prog_new_fd(obj);
		break;
	case BPF_TYPE_MAP:
		fd = bpf_map_new_fd(obj);
		break;
	default:
		fd = -ENOENT;
		goto out_putname;
	}

	if (fd < 0)
		bpf_any_put(obj, type);

out_putname:
	putname(pname);
	return fd;
}
|
|
|
|
|
|
|
|
static void bpf_evict_inode(struct inode *inode)
|
|
|
|
{
|
|
|
|
enum bpf_type type;
|
|
|
|
|
|
|
|
truncate_inode_pages_final(&inode->i_data);
|
|
|
|
clear_inode(inode);
|
|
|
|
|
|
|
|
if (!bpf_inode_type(inode, &type))
|
|
|
|
bpf_any_put(inode->i_private, type);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct super_operations bpf_super_ops = {
|
|
|
|
.statfs = simple_statfs,
|
|
|
|
.drop_inode = generic_delete_inode,
|
|
|
|
.evict_inode = bpf_evict_inode,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int bpf_fill_super(struct super_block *sb, void *data, int silent)
|
|
|
|
{
|
|
|
|
static struct tree_descr bpf_rfiles[] = { { "" } };
|
|
|
|
struct inode *inode;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
sb->s_op = &bpf_super_ops;
|
|
|
|
|
|
|
|
inode = sb->s_root->d_inode;
|
|
|
|
inode->i_op = &bpf_dir_iops;
|
|
|
|
inode->i_mode &= ~S_IALLUGO;
|
|
|
|
inode->i_mode |= S_ISVTX | S_IRWXUGO;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct dentry *bpf_mount(struct file_system_type *type, int flags,
|
|
|
|
const char *dev_name, void *data)
|
|
|
|
{
|
|
|
|
return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct file_system_type bpf_fs_type = {
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.name = "bpf",
|
|
|
|
.mount = bpf_mount,
|
|
|
|
.kill_sb = kill_litter_super,
|
|
|
|
.fs_flags = FS_USERNS_MOUNT,
|
|
|
|
};
|
|
|
|
|
|
|
|
MODULE_ALIAS_FS("bpf");
|
|
|
|
|
|
|
|
/*
 * Boot-time setup: create the "bpf" mount point below fs_kobj in sysfs
 * and register the file system type. The mount point is removed again if
 * registration fails.
 *
 * Returns 0 on success or the error from whichever step failed.
 */
static int __init bpf_init(void)
{
	int err;

	err = sysfs_create_mount_point(fs_kobj, "bpf");
	if (err)
		return err;

	err = register_filesystem(&bpf_fs_type);
	if (!err)
		return 0;

	sysfs_remove_mount_point(fs_kobj, "bpf");
	return err;
}
fs_initcall(bpf_init);
|