2020-05-09 20:58:59 +03:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
/* Copyright (c) 2020 Facebook */
|
|
|
|
|
|
|
|
#include <linux/fs.h>
|
2020-05-09 20:59:05 +03:00
|
|
|
#include <linux/anon_inodes.h>
|
2020-05-09 20:58:59 +03:00
|
|
|
#include <linux/filter.h>
|
|
|
|
#include <linux/bpf.h>
|
2022-01-24 21:54:00 +03:00
|
|
|
#include <linux/rcupdate_trace.h>
|
2020-05-09 20:58:59 +03:00
|
|
|
|
|
|
|
struct bpf_iter_target_info {
|
|
|
|
struct list_head list;
|
2020-05-13 21:02:19 +03:00
|
|
|
const struct bpf_iter_reg *reg_info;
|
2020-05-09 20:59:00 +03:00
|
|
|
u32 btf_id; /* cached value */
|
2020-05-09 20:58:59 +03:00
|
|
|
};
|
|
|
|
|
2020-05-09 20:59:01 +03:00
|
|
|
struct bpf_iter_link {
|
|
|
|
struct bpf_link link;
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
struct bpf_iter_aux_info aux;
|
2020-05-09 20:59:01 +03:00
|
|
|
struct bpf_iter_target_info *tinfo;
|
|
|
|
};
|
|
|
|
|
2020-05-09 20:59:05 +03:00
|
|
|
struct bpf_iter_priv_data {
|
|
|
|
struct bpf_iter_target_info *tinfo;
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
const struct bpf_iter_seq_info *seq_info;
|
2020-05-09 20:59:05 +03:00
|
|
|
struct bpf_prog *prog;
|
|
|
|
u64 session_id;
|
|
|
|
u64 seq_num;
|
|
|
|
bool done_stop;
|
|
|
|
u8 target_private[] __aligned(8);
|
|
|
|
};
|
|
|
|
|
2020-05-09 20:58:59 +03:00
|
|
|
static struct list_head targets = LIST_HEAD_INIT(targets);
|
|
|
|
static DEFINE_MUTEX(targets_mutex);
|
|
|
|
|
2020-05-09 20:59:02 +03:00
|
|
|
/* protect bpf_iter_link changes */
|
|
|
|
static DEFINE_MUTEX(link_mutex);
|
|
|
|
|
2020-05-09 20:59:05 +03:00
|
|
|
/* incremented on every opened seq_file */
|
|
|
|
static atomic64_t session_id;
|
|
|
|
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
|
|
|
|
const struct bpf_iter_seq_info *seq_info);
|
2020-05-09 20:59:06 +03:00
|
|
|
|
2020-05-09 20:59:07 +03:00
|
|
|
static void bpf_iter_inc_seq_num(struct seq_file *seq)
|
|
|
|
{
|
|
|
|
struct bpf_iter_priv_data *iter_priv;
|
|
|
|
|
|
|
|
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
|
|
|
|
target_private);
|
|
|
|
iter_priv->seq_num++;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void bpf_iter_dec_seq_num(struct seq_file *seq)
|
|
|
|
{
|
|
|
|
struct bpf_iter_priv_data *iter_priv;
|
|
|
|
|
|
|
|
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
|
|
|
|
target_private);
|
|
|
|
iter_priv->seq_num--;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void bpf_iter_done_stop(struct seq_file *seq)
|
|
|
|
{
|
|
|
|
struct bpf_iter_priv_data *iter_priv;
|
|
|
|
|
|
|
|
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
|
|
|
|
target_private);
|
|
|
|
iter_priv->done_stop = true;
|
|
|
|
}
|
|
|
|
|
bpf: Permit cond_resched for some iterators
Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with
bpf task/task_file iterator") tries to fix rcu stalls warning
which is caused by bpf task_file iterator when running
"bpftool prog".
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
...
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
The fix is to limit the number of bpf program runs to be
one million. This fixed the program in most cases. But
we also found under heavy load, which can increase the wallclock
time for bpf_seq_read(), the warning may still be possible.
For example, calling bpf_delay() in the "while" loop of
bpf_seq_read(), which will introduce artificial delay,
the warning will show up in my qemu run.
static unsigned q;
volatile unsigned *p = &q;
volatile unsigned long long ll;
static void bpf_delay(void)
{
int i, j;
for (i = 0; i < 10000; i++)
for (j = 0; j < 10000; j++)
ll += *p;
}
There are two ways to fix this issue. One is to reduce the above
one million threshold to say 100,000 and hopefully rcu warning will
not show up any more. Another is to introduce a target feature
which enables bpf_seq_read() calling cond_resched().
This patch took second approach as the first approach may cause
more -EAGAIN failures for read() syscalls. Note that not all bpf_iter
targets can permit cond_resched() in bpf_seq_read() as some, e.g.,
netlink seq iterator, rcu read lock critical section spans through
seq_ops->next() -> seq_ops->show() -> seq_ops->next().
For the kernel code with the above hack, "bpftool p" roughly takes
38 seconds to finish on my VM with 184 bpf program runs.
Using the following command, I am able to collect the number of
context switches:
perf stat -e context-switches -- ./bpftool p >& log
Without this patch,
69 context-switches
With this patch,
75 context-switches
This patch added additional 6 context switches, roughly every 6 seconds
to reschedule, to avoid lengthy no-rescheduling which may cause the
above RCU warnings.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com
2020-10-28 09:10:54 +03:00
|
|
|
static bool bpf_iter_support_resched(struct seq_file *seq)
|
|
|
|
{
|
|
|
|
struct bpf_iter_priv_data *iter_priv;
|
|
|
|
|
|
|
|
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
|
|
|
|
target_private);
|
|
|
|
return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED;
|
|
|
|
}
|
|
|
|
|
bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator
In our production system, we observed rcu stalls when
'bpftool prog` is running.
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
asm_sysvec_apic_timer_interrupt+0x12/0x20
RIP: 0010:task_file_seq_get_next+0x71/0x220
Code: 00 00 8b 53 1c 49 8b 7d 00 89 d6 48 8b 47 20 44 8b 18 41 39 d3 76 75 48 8b 4f 20 8b 01 39 d0 76 61 41 89 d1 49 39 c1 48 19 c0 <48> 8b 49 08 21 d0 48 8d 04 c1 4c 8b 08 4d 85 c9 74 46 49 8b 41 38
RSP: 0018:ffffc90006223e10 EFLAGS: 00000297
RAX: ffffffffffffffff RBX: ffff888f0d172388 RCX: ffff888c8c07c1c0
RDX: 00000000000f017b RSI: 00000000000f017b RDI: ffff888c254702c0
RBP: ffffc90006223e68 R08: ffff888be2a1c140 R09: 00000000000f017b
R10: 0000000000000002 R11: 0000000000100000 R12: ffff888f23c24118
R13: ffffc90006223e60 R14: ffffffff828509a0 R15: 00000000ffffffff
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f8815f4f76e
Code: c0 e9 f6 fe ff ff 55 48 8d 3d 76 70 0a 00 48 89 e5 e8 36 06 02 00 66 0f 1f 44 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 52 c3 66 0f 1f 84 00 00 00 00 00 55 48 89 e5
RSP: 002b:00007fff8f9df578 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 000000000170b9c0 RCX: 00007f8815f4f76e
RDX: 0000000000001000 RSI: 00007fff8f9df5b0 RDI: 0000000000000007
RBP: 00007fff8f9e05f0 R08: 0000000000000049 R09: 0000000000000010
R10: 00007f881601fa40 R11: 0000000000000246 R12: 00007fff8f9e05a8
R13: 00007fff8f9e05a8 R14: 0000000001917f90 R15: 000000000000e22e
Note that `bpftool prog` actually calls a task_file bpf iterator
program to establish an association between prog/map/link/btf anon
files and processes.
In the case where the above rcu stall occured, we had a process
having 1587 tasks and each task having roughly 81305 files.
This implied 129 million bpf prog invocations. Unfortunwtely none of
these files are prog/map/link/btf files so bpf iterator/prog needs
to traverse all these files and not able to return to user space
since there are no seq_file buffer overflow.
This patch fixed the issue in bpf_seq_read() to limit the number
of visited objects. If the maximum number of visited objects is
reached, no more objects will be visited in the current syscall.
If there is nothing written in the seq_file buffer, -EAGAIN will
return to the user so user can try again.
The maximum number of visited objects is set at 1 million.
In our Intel Xeon D-2191 2.3GHZ 18-core server, bpf_seq_read()
visiting 1 million files takes around 0.18 seconds.
We did not use cond_resched() since for some iterators, e.g.,
netlink iterator, where rcu read_lock critical section spans between
consecutive seq_ops->next(), which makes impossible to do cond_resched()
in the key while loop of function bpf_seq_read().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/bpf/20200818222309.2181348-1-yhs@fb.com
2020-08-19 01:23:09 +03:00
|
|
|
/* maximum visited objects before bailing out */
|
|
|
|
#define MAX_ITER_OBJECTS 1000000
|
|
|
|
|
2020-05-09 20:59:04 +03:00
|
|
|
/* bpf_seq_read, a customized and simpler version for bpf iterator.
|
|
|
|
* no_llseek is assumed for this file.
|
|
|
|
* The following are differences from seq_read():
|
|
|
|
* . fixed buffer size (PAGE_SIZE)
|
|
|
|
* . assuming no_llseek
|
|
|
|
* . stop() may call bpf program, handling potential overflow there
|
|
|
|
*/
|
|
|
|
static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
struct seq_file *seq = file->private_data;
|
|
|
|
size_t n, offs, copied = 0;
|
bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator
In our production system, we observed rcu stalls when
'bpftool prog` is running.
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
asm_sysvec_apic_timer_interrupt+0x12/0x20
RIP: 0010:task_file_seq_get_next+0x71/0x220
Code: 00 00 8b 53 1c 49 8b 7d 00 89 d6 48 8b 47 20 44 8b 18 41 39 d3 76 75 48 8b 4f 20 8b 01 39 d0 76 61 41 89 d1 49 39 c1 48 19 c0 <48> 8b 49 08 21 d0 48 8d 04 c1 4c 8b 08 4d 85 c9 74 46 49 8b 41 38
RSP: 0018:ffffc90006223e10 EFLAGS: 00000297
RAX: ffffffffffffffff RBX: ffff888f0d172388 RCX: ffff888c8c07c1c0
RDX: 00000000000f017b RSI: 00000000000f017b RDI: ffff888c254702c0
RBP: ffffc90006223e68 R08: ffff888be2a1c140 R09: 00000000000f017b
R10: 0000000000000002 R11: 0000000000100000 R12: ffff888f23c24118
R13: ffffc90006223e60 R14: ffffffff828509a0 R15: 00000000ffffffff
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f8815f4f76e
Code: c0 e9 f6 fe ff ff 55 48 8d 3d 76 70 0a 00 48 89 e5 e8 36 06 02 00 66 0f 1f 44 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 52 c3 66 0f 1f 84 00 00 00 00 00 55 48 89 e5
RSP: 002b:00007fff8f9df578 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 000000000170b9c0 RCX: 00007f8815f4f76e
RDX: 0000000000001000 RSI: 00007fff8f9df5b0 RDI: 0000000000000007
RBP: 00007fff8f9e05f0 R08: 0000000000000049 R09: 0000000000000010
R10: 00007f881601fa40 R11: 0000000000000246 R12: 00007fff8f9e05a8
R13: 00007fff8f9e05a8 R14: 0000000001917f90 R15: 000000000000e22e
Note that `bpftool prog` actually calls a task_file bpf iterator
program to establish an association between prog/map/link/btf anon
files and processes.
In the case where the above rcu stall occured, we had a process
having 1587 tasks and each task having roughly 81305 files.
This implied 129 million bpf prog invocations. Unfortunwtely none of
these files are prog/map/link/btf files so bpf iterator/prog needs
to traverse all these files and not able to return to user space
since there are no seq_file buffer overflow.
This patch fixed the issue in bpf_seq_read() to limit the number
of visited objects. If the maximum number of visited objects is
reached, no more objects will be visited in the current syscall.
If there is nothing written in the seq_file buffer, -EAGAIN will
return to the user so user can try again.
The maximum number of visited objects is set at 1 million.
In our Intel Xeon D-2191 2.3GHZ 18-core server, bpf_seq_read()
visiting 1 million files takes around 0.18 seconds.
We did not use cond_resched() since for some iterators, e.g.,
netlink iterator, where rcu read_lock critical section spans between
consecutive seq_ops->next(), which makes impossible to do cond_resched()
in the key while loop of function bpf_seq_read().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/bpf/20200818222309.2181348-1-yhs@fb.com
2020-08-19 01:23:09 +03:00
|
|
|
int err = 0, num_objs = 0;
|
bpf: Permit cond_resched for some iterators
Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with
bpf task/task_file iterator") tries to fix rcu stalls warning
which is caused by bpf task_file iterator when running
"bpftool prog".
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
...
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
The fix is to limit the number of bpf program runs to be
one million. This fixed the program in most cases. But
we also found under heavy load, which can increase the wallclock
time for bpf_seq_read(), the warning may still be possible.
For example, calling bpf_delay() in the "while" loop of
bpf_seq_read(), which will introduce artificial delay,
the warning will show up in my qemu run.
static unsigned q;
volatile unsigned *p = &q;
volatile unsigned long long ll;
static void bpf_delay(void)
{
int i, j;
for (i = 0; i < 10000; i++)
for (j = 0; j < 10000; j++)
ll += *p;
}
There are two ways to fix this issue. One is to reduce the above
one million threshold to say 100,000 and hopefully rcu warning will
not show up any more. Another is to introduce a target feature
which enables bpf_seq_read() calling cond_resched().
This patch took second approach as the first approach may cause
more -EAGAIN failures for read() syscalls. Note that not all bpf_iter
targets can permit cond_resched() in bpf_seq_read() as some, e.g.,
netlink seq iterator, rcu read lock critical section spans through
seq_ops->next() -> seq_ops->show() -> seq_ops->next().
For the kernel code with the above hack, "bpftool p" roughly takes
38 seconds to finish on my VM with 184 bpf program runs.
Using the following command, I am able to collect the number of
context switches:
perf stat -e context-switches -- ./bpftool p >& log
Without this patch,
69 context-switches
With this patch,
75 context-switches
This patch added additional 6 context switches, roughly every 6 seconds
to reschedule, to avoid lengthy no-rescheduling which may cause the
above RCU warnings.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com
2020-10-28 09:10:54 +03:00
|
|
|
bool can_resched;
|
2020-05-09 20:59:04 +03:00
|
|
|
void *p;
|
|
|
|
|
|
|
|
mutex_lock(&seq->lock);
|
|
|
|
|
|
|
|
if (!seq->buf) {
|
2020-09-28 14:31:07 +03:00
|
|
|
seq->size = PAGE_SIZE << 3;
|
|
|
|
seq->buf = kvmalloc(seq->size, GFP_KERNEL);
|
2020-05-09 20:59:04 +03:00
|
|
|
if (!seq->buf) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (seq->count) {
|
|
|
|
n = min(seq->count, size);
|
|
|
|
err = copy_to_user(buf, seq->buf + seq->from, n);
|
|
|
|
if (err) {
|
|
|
|
err = -EFAULT;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
seq->count -= n;
|
|
|
|
seq->from += n;
|
|
|
|
copied = n;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
seq->from = 0;
|
|
|
|
p = seq->op->start(seq, &seq->index);
|
|
|
|
if (!p)
|
|
|
|
goto stop;
|
|
|
|
if (IS_ERR(p)) {
|
|
|
|
err = PTR_ERR(p);
|
|
|
|
seq->op->stop(seq, p);
|
|
|
|
seq->count = 0;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = seq->op->show(seq, p);
|
|
|
|
if (err > 0) {
|
2020-05-09 20:59:07 +03:00
|
|
|
/* object is skipped, decrease seq_num, so next
|
|
|
|
* valid object can reuse the same seq_num.
|
|
|
|
*/
|
|
|
|
bpf_iter_dec_seq_num(seq);
|
2020-05-09 20:59:04 +03:00
|
|
|
seq->count = 0;
|
|
|
|
} else if (err < 0 || seq_has_overflowed(seq)) {
|
|
|
|
if (!err)
|
|
|
|
err = -E2BIG;
|
|
|
|
seq->op->stop(seq, p);
|
|
|
|
seq->count = 0;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
bpf: Permit cond_resched for some iterators
Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with
bpf task/task_file iterator") tries to fix rcu stalls warning
which is caused by bpf task_file iterator when running
"bpftool prog".
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
...
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
The fix is to limit the number of bpf program runs to be
one million. This fixed the program in most cases. But
we also found under heavy load, which can increase the wallclock
time for bpf_seq_read(), the warning may still be possible.
For example, calling bpf_delay() in the "while" loop of
bpf_seq_read(), which will introduce artificial delay,
the warning will show up in my qemu run.
static unsigned q;
volatile unsigned *p = &q;
volatile unsigned long long ll;
static void bpf_delay(void)
{
int i, j;
for (i = 0; i < 10000; i++)
for (j = 0; j < 10000; j++)
ll += *p;
}
There are two ways to fix this issue. One is to reduce the above
one million threshold to say 100,000 and hopefully rcu warning will
not show up any more. Another is to introduce a target feature
which enables bpf_seq_read() calling cond_resched().
This patch took second approach as the first approach may cause
more -EAGAIN failures for read() syscalls. Note that not all bpf_iter
targets can permit cond_resched() in bpf_seq_read() as some, e.g.,
netlink seq iterator, rcu read lock critical section spans through
seq_ops->next() -> seq_ops->show() -> seq_ops->next().
For the kernel code with the above hack, "bpftool p" roughly takes
38 seconds to finish on my VM with 184 bpf program runs.
Using the following command, I am able to collect the number of
context switches:
perf stat -e context-switches -- ./bpftool p >& log
Without this patch,
69 context-switches
With this patch,
75 context-switches
This patch added additional 6 context switches, roughly every 6 seconds
to reschedule, to avoid lengthy no-rescheduling which may cause the
above RCU warnings.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com
2020-10-28 09:10:54 +03:00
|
|
|
can_resched = bpf_iter_support_resched(seq);
|
2020-05-09 20:59:04 +03:00
|
|
|
while (1) {
|
|
|
|
loff_t pos = seq->index;
|
|
|
|
|
bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator
In our production system, we observed rcu stalls when
'bpftool prog` is running.
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
asm_sysvec_apic_timer_interrupt+0x12/0x20
RIP: 0010:task_file_seq_get_next+0x71/0x220
Code: 00 00 8b 53 1c 49 8b 7d 00 89 d6 48 8b 47 20 44 8b 18 41 39 d3 76 75 48 8b 4f 20 8b 01 39 d0 76 61 41 89 d1 49 39 c1 48 19 c0 <48> 8b 49 08 21 d0 48 8d 04 c1 4c 8b 08 4d 85 c9 74 46 49 8b 41 38
RSP: 0018:ffffc90006223e10 EFLAGS: 00000297
RAX: ffffffffffffffff RBX: ffff888f0d172388 RCX: ffff888c8c07c1c0
RDX: 00000000000f017b RSI: 00000000000f017b RDI: ffff888c254702c0
RBP: ffffc90006223e68 R08: ffff888be2a1c140 R09: 00000000000f017b
R10: 0000000000000002 R11: 0000000000100000 R12: ffff888f23c24118
R13: ffffc90006223e60 R14: ffffffff828509a0 R15: 00000000ffffffff
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f8815f4f76e
Code: c0 e9 f6 fe ff ff 55 48 8d 3d 76 70 0a 00 48 89 e5 e8 36 06 02 00 66 0f 1f 44 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 52 c3 66 0f 1f 84 00 00 00 00 00 55 48 89 e5
RSP: 002b:00007fff8f9df578 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 000000000170b9c0 RCX: 00007f8815f4f76e
RDX: 0000000000001000 RSI: 00007fff8f9df5b0 RDI: 0000000000000007
RBP: 00007fff8f9e05f0 R08: 0000000000000049 R09: 0000000000000010
R10: 00007f881601fa40 R11: 0000000000000246 R12: 00007fff8f9e05a8
R13: 00007fff8f9e05a8 R14: 0000000001917f90 R15: 000000000000e22e
Note that `bpftool prog` actually calls a task_file bpf iterator
program to establish an association between prog/map/link/btf anon
files and processes.
In the case where the above rcu stall occured, we had a process
having 1587 tasks and each task having roughly 81305 files.
This implied 129 million bpf prog invocations. Unfortunwtely none of
these files are prog/map/link/btf files so bpf iterator/prog needs
to traverse all these files and not able to return to user space
since there are no seq_file buffer overflow.
This patch fixed the issue in bpf_seq_read() to limit the number
of visited objects. If the maximum number of visited objects is
reached, no more objects will be visited in the current syscall.
If there is nothing written in the seq_file buffer, -EAGAIN will
return to the user so user can try again.
The maximum number of visited objects is set at 1 million.
In our Intel Xeon D-2191 2.3GHZ 18-core server, bpf_seq_read()
visiting 1 million files takes around 0.18 seconds.
We did not use cond_resched() since for some iterators, e.g.,
netlink iterator, where rcu read_lock critical section spans between
consecutive seq_ops->next(), which makes impossible to do cond_resched()
in the key while loop of function bpf_seq_read().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/bpf/20200818222309.2181348-1-yhs@fb.com
2020-08-19 01:23:09 +03:00
|
|
|
num_objs++;
|
2020-05-09 20:59:04 +03:00
|
|
|
offs = seq->count;
|
|
|
|
p = seq->op->next(seq, p, &seq->index);
|
|
|
|
if (pos == seq->index) {
|
|
|
|
pr_info_ratelimited("buggy seq_file .next function %ps "
|
|
|
|
"did not updated position index\n",
|
|
|
|
seq->op->next);
|
|
|
|
seq->index++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (IS_ERR_OR_NULL(p))
|
|
|
|
break;
|
|
|
|
|
2020-05-09 20:59:07 +03:00
|
|
|
/* got a valid next object, increase seq_num */
|
|
|
|
bpf_iter_inc_seq_num(seq);
|
|
|
|
|
2020-05-09 20:59:04 +03:00
|
|
|
if (seq->count >= size)
|
|
|
|
break;
|
|
|
|
|
bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator
In our production system, we observed rcu stalls when
'bpftool prog` is running.
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
asm_sysvec_apic_timer_interrupt+0x12/0x20
RIP: 0010:task_file_seq_get_next+0x71/0x220
Code: 00 00 8b 53 1c 49 8b 7d 00 89 d6 48 8b 47 20 44 8b 18 41 39 d3 76 75 48 8b 4f 20 8b 01 39 d0 76 61 41 89 d1 49 39 c1 48 19 c0 <48> 8b 49 08 21 d0 48 8d 04 c1 4c 8b 08 4d 85 c9 74 46 49 8b 41 38
RSP: 0018:ffffc90006223e10 EFLAGS: 00000297
RAX: ffffffffffffffff RBX: ffff888f0d172388 RCX: ffff888c8c07c1c0
RDX: 00000000000f017b RSI: 00000000000f017b RDI: ffff888c254702c0
RBP: ffffc90006223e68 R08: ffff888be2a1c140 R09: 00000000000f017b
R10: 0000000000000002 R11: 0000000000100000 R12: ffff888f23c24118
R13: ffffc90006223e60 R14: ffffffff828509a0 R15: 00000000ffffffff
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f8815f4f76e
Code: c0 e9 f6 fe ff ff 55 48 8d 3d 76 70 0a 00 48 89 e5 e8 36 06 02 00 66 0f 1f 44 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 52 c3 66 0f 1f 84 00 00 00 00 00 55 48 89 e5
RSP: 002b:00007fff8f9df578 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 000000000170b9c0 RCX: 00007f8815f4f76e
RDX: 0000000000001000 RSI: 00007fff8f9df5b0 RDI: 0000000000000007
RBP: 00007fff8f9e05f0 R08: 0000000000000049 R09: 0000000000000010
R10: 00007f881601fa40 R11: 0000000000000246 R12: 00007fff8f9e05a8
R13: 00007fff8f9e05a8 R14: 0000000001917f90 R15: 000000000000e22e
Note that `bpftool prog` actually calls a task_file bpf iterator
program to establish an association between prog/map/link/btf anon
files and processes.
In the case where the above rcu stall occured, we had a process
having 1587 tasks and each task having roughly 81305 files.
This implied 129 million bpf prog invocations. Unfortunwtely none of
these files are prog/map/link/btf files so bpf iterator/prog needs
to traverse all these files and not able to return to user space
since there are no seq_file buffer overflow.
This patch fixed the issue in bpf_seq_read() to limit the number
of visited objects. If the maximum number of visited objects is
reached, no more objects will be visited in the current syscall.
If there is nothing written in the seq_file buffer, -EAGAIN will
return to the user so user can try again.
The maximum number of visited objects is set at 1 million.
In our Intel Xeon D-2191 2.3GHZ 18-core server, bpf_seq_read()
visiting 1 million files takes around 0.18 seconds.
We did not use cond_resched() since for some iterators, e.g.,
netlink iterator, where rcu read_lock critical section spans between
consecutive seq_ops->next(), which makes impossible to do cond_resched()
in the key while loop of function bpf_seq_read().
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/bpf/20200818222309.2181348-1-yhs@fb.com
2020-08-19 01:23:09 +03:00
|
|
|
if (num_objs >= MAX_ITER_OBJECTS) {
|
|
|
|
if (offs == 0) {
|
|
|
|
err = -EAGAIN;
|
|
|
|
seq->op->stop(seq, p);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:04 +03:00
|
|
|
err = seq->op->show(seq, p);
|
|
|
|
if (err > 0) {
|
2020-05-09 20:59:07 +03:00
|
|
|
bpf_iter_dec_seq_num(seq);
|
2020-05-09 20:59:04 +03:00
|
|
|
seq->count = offs;
|
|
|
|
} else if (err < 0 || seq_has_overflowed(seq)) {
|
|
|
|
seq->count = offs;
|
|
|
|
if (offs == 0) {
|
|
|
|
if (!err)
|
|
|
|
err = -E2BIG;
|
|
|
|
seq->op->stop(seq, p);
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
bpf: Permit cond_resched for some iterators
Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with
bpf task/task_file iterator") tries to fix rcu stalls warning
which is caused by bpf task_file iterator when running
"bpftool prog".
rcu: INFO: rcu_sched self-detected stall on CPU
rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913
\x09(t=21031 jiffies g=2534773 q=179750)
NMI backtrace for cpu 7
CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6
Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019
Call Trace:
<IRQ>
dump_stack+0x57/0x70
nmi_cpu_backtrace.cold+0x14/0x53
? lapic_can_unplug_cpu.cold+0x39/0x39
nmi_trigger_cpumask_backtrace+0xb7/0xc7
rcu_dump_cpu_stacks+0xa2/0xd0
rcu_sched_clock_irq.cold+0x1ff/0x3d9
? tick_nohz_handler+0x100/0x100
update_process_times+0x5b/0x90
tick_sched_timer+0x5e/0xf0
__hrtimer_run_queues+0x12a/0x2a0
hrtimer_interrupt+0x10e/0x280
__sysvec_apic_timer_interrupt+0x51/0xe0
asm_call_on_stack+0xf/0x20
</IRQ>
sysvec_apic_timer_interrupt+0x6f/0x80
...
task_file_seq_next+0x52/0xa0
bpf_seq_read+0xb9/0x320
vfs_read+0x9d/0x180
ksys_read+0x5f/0xe0
do_syscall_64+0x38/0x60
entry_SYSCALL_64_after_hwframe+0x44/0xa9
The fix is to limit the number of bpf program runs to be
one million. This fixed the program in most cases. But
we also found under heavy load, which can increase the wallclock
time for bpf_seq_read(), the warning may still be possible.
For example, calling bpf_delay() in the "while" loop of
bpf_seq_read(), which will introduce artificial delay,
the warning will show up in my qemu run.
static unsigned q;
volatile unsigned *p = &q;
volatile unsigned long long ll;
static void bpf_delay(void)
{
int i, j;
for (i = 0; i < 10000; i++)
for (j = 0; j < 10000; j++)
ll += *p;
}
There are two ways to fix this issue. One is to reduce the above
one million threshold to say 100,000 and hopefully rcu warning will
not show up any more. Another is to introduce a target feature
which enables bpf_seq_read() calling cond_resched().
This patch took second approach as the first approach may cause
more -EAGAIN failures for read() syscalls. Note that not all bpf_iter
targets can permit cond_resched() in bpf_seq_read() as some, e.g.,
netlink seq iterator, rcu read lock critical section spans through
seq_ops->next() -> seq_ops->show() -> seq_ops->next().
For the kernel code with the above hack, "bpftool p" roughly takes
38 seconds to finish on my VM with 184 bpf program runs.
Using the following command, I am able to collect the number of
context switches:
perf stat -e context-switches -- ./bpftool p >& log
Without this patch,
69 context-switches
With this patch,
75 context-switches
This patch added additional 6 context switches, roughly every 6 seconds
to reschedule, to avoid lengthy no-rescheduling which may cause the
above RCU warnings.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com
2020-10-28 09:10:54 +03:00
|
|
|
|
|
|
|
if (can_resched)
|
|
|
|
cond_resched();
|
2020-05-09 20:59:04 +03:00
|
|
|
}
|
|
|
|
stop:
|
|
|
|
offs = seq->count;
|
|
|
|
/* bpf program called if !p */
|
|
|
|
seq->op->stop(seq, p);
|
2020-05-09 20:59:07 +03:00
|
|
|
if (!p) {
|
|
|
|
if (!seq_has_overflowed(seq)) {
|
|
|
|
bpf_iter_done_stop(seq);
|
|
|
|
} else {
|
|
|
|
seq->count = offs;
|
|
|
|
if (offs == 0) {
|
|
|
|
err = -E2BIG;
|
|
|
|
goto done;
|
|
|
|
}
|
2020-05-09 20:59:04 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
n = min(seq->count, size);
|
|
|
|
err = copy_to_user(buf, seq->buf, n);
|
|
|
|
if (err) {
|
|
|
|
err = -EFAULT;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
copied = n;
|
|
|
|
seq->count -= n;
|
|
|
|
seq->from = n;
|
|
|
|
done:
|
|
|
|
if (!copied)
|
|
|
|
copied = err;
|
|
|
|
else
|
|
|
|
*ppos += copied;
|
|
|
|
mutex_unlock(&seq->lock);
|
|
|
|
return copied;
|
|
|
|
}
|
|
|
|
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
static const struct bpf_iter_seq_info *
|
|
|
|
__get_seq_info(struct bpf_iter_link *link)
|
|
|
|
{
|
|
|
|
const struct bpf_iter_seq_info *seq_info;
|
|
|
|
|
|
|
|
if (link->aux.map) {
|
|
|
|
seq_info = link->aux.map->ops->iter_seq_info;
|
|
|
|
if (seq_info)
|
|
|
|
return seq_info;
|
|
|
|
}
|
|
|
|
|
|
|
|
return link->tinfo->reg_info->seq_info;
|
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:06 +03:00
|
|
|
static int iter_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
struct bpf_iter_link *link = inode->i_private;
|
|
|
|
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
return prepare_seq_file(file, link, __get_seq_info(link));
|
2020-05-09 20:59:06 +03:00
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:05 +03:00
|
|
|
static int iter_release(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
struct bpf_iter_priv_data *iter_priv;
|
|
|
|
struct seq_file *seq;
|
|
|
|
|
|
|
|
seq = file->private_data;
|
|
|
|
if (!seq)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
|
|
|
|
target_private);
|
|
|
|
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
if (iter_priv->seq_info->fini_seq_private)
|
|
|
|
iter_priv->seq_info->fini_seq_private(seq->private);
|
2020-05-09 20:59:05 +03:00
|
|
|
|
|
|
|
bpf_prog_put(iter_priv->prog);
|
|
|
|
seq->private = iter_priv;
|
|
|
|
|
|
|
|
return seq_release_private(inode, file);
|
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:06 +03:00
|
|
|
const struct file_operations bpf_iter_fops = {
|
|
|
|
.open = iter_open,
|
2020-05-09 20:59:05 +03:00
|
|
|
.llseek = no_llseek,
|
|
|
|
.read = bpf_seq_read,
|
|
|
|
.release = iter_release,
|
|
|
|
};
|
|
|
|
|
2020-05-13 21:02:19 +03:00
|
|
|
/* The argument reg_info will be cached in bpf_iter_target_info.
|
|
|
|
* The common practice is to declare target reg_info as
|
|
|
|
* a const static variable and passed as an argument to
|
|
|
|
* bpf_iter_reg_target().
|
|
|
|
*/
|
|
|
|
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
|
2020-05-09 20:58:59 +03:00
|
|
|
{
|
|
|
|
struct bpf_iter_target_info *tinfo;
|
|
|
|
|
2021-02-12 03:59:26 +03:00
|
|
|
tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
|
2020-05-09 20:58:59 +03:00
|
|
|
if (!tinfo)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2020-05-13 21:02:19 +03:00
|
|
|
tinfo->reg_info = reg_info;
|
2020-05-09 20:58:59 +03:00
|
|
|
INIT_LIST_HEAD(&tinfo->list);
|
|
|
|
|
|
|
|
mutex_lock(&targets_mutex);
|
|
|
|
list_add(&tinfo->list, &targets);
|
|
|
|
mutex_unlock(&targets_mutex);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-05-13 21:02:20 +03:00
|
|
|
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
|
2020-05-09 20:58:59 +03:00
|
|
|
{
|
|
|
|
struct bpf_iter_target_info *tinfo;
|
|
|
|
bool found = false;
|
|
|
|
|
|
|
|
mutex_lock(&targets_mutex);
|
|
|
|
list_for_each_entry(tinfo, &targets, list) {
|
2020-05-13 21:02:20 +03:00
|
|
|
if (reg_info == tinfo->reg_info) {
|
2020-05-09 20:58:59 +03:00
|
|
|
list_del(&tinfo->list);
|
|
|
|
kfree(tinfo);
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&targets_mutex);
|
|
|
|
|
|
|
|
WARN_ON(found == false);
|
|
|
|
}
|
2020-05-09 20:59:00 +03:00
|
|
|
|
|
|
|
static void cache_btf_id(struct bpf_iter_target_info *tinfo,
|
|
|
|
struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
tinfo->btf_id = prog->aux->attach_btf_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool bpf_iter_prog_supported(struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
const char *attach_fname = prog->aux->attach_func_name;
|
2022-03-31 12:19:29 +03:00
|
|
|
struct bpf_iter_target_info *tinfo = NULL, *iter;
|
2020-05-09 20:59:00 +03:00
|
|
|
u32 prog_btf_id = prog->aux->attach_btf_id;
|
|
|
|
const char *prefix = BPF_ITER_FUNC_PREFIX;
|
|
|
|
int prefix_len = strlen(prefix);
|
|
|
|
|
|
|
|
if (strncmp(attach_fname, prefix, prefix_len))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
mutex_lock(&targets_mutex);
|
2022-03-31 12:19:29 +03:00
|
|
|
list_for_each_entry(iter, &targets, list) {
|
|
|
|
if (iter->btf_id && iter->btf_id == prog_btf_id) {
|
|
|
|
tinfo = iter;
|
2020-05-09 20:59:00 +03:00
|
|
|
break;
|
|
|
|
}
|
2022-03-31 12:19:29 +03:00
|
|
|
if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) {
|
|
|
|
cache_btf_id(iter, prog);
|
|
|
|
tinfo = iter;
|
2020-05-09 20:59:00 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&targets_mutex);
|
|
|
|
|
2022-03-31 12:19:29 +03:00
|
|
|
if (tinfo) {
|
2020-05-13 21:02:21 +03:00
|
|
|
prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
|
|
|
|
prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
|
|
|
|
}
|
|
|
|
|
2022-03-31 12:19:29 +03:00
|
|
|
return tinfo != NULL;
|
2020-05-09 20:59:00 +03:00
|
|
|
}
|
2020-05-09 20:59:01 +03:00
|
|
|
|
2021-07-01 23:06:19 +03:00
|
|
|
const struct bpf_func_proto *
|
|
|
|
bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
const struct bpf_iter_target_info *tinfo;
|
|
|
|
const struct bpf_func_proto *fn = NULL;
|
|
|
|
|
|
|
|
mutex_lock(&targets_mutex);
|
|
|
|
list_for_each_entry(tinfo, &targets, list) {
|
|
|
|
if (tinfo->btf_id == prog->aux->attach_btf_id) {
|
|
|
|
const struct bpf_iter_reg *reg_info;
|
|
|
|
|
|
|
|
reg_info = tinfo->reg_info;
|
|
|
|
if (reg_info->get_func_proto)
|
|
|
|
fn = reg_info->get_func_proto(func_id, prog);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&targets_mutex);
|
|
|
|
|
|
|
|
return fn;
|
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:01 +03:00
|
|
|
static void bpf_iter_link_release(struct bpf_link *link)
|
|
|
|
{
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
struct bpf_iter_link *iter_link =
|
|
|
|
container_of(link, struct bpf_iter_link, link);
|
|
|
|
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
if (iter_link->tinfo->reg_info->detach_target)
|
|
|
|
iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
|
2020-05-09 20:59:01 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void bpf_iter_link_dealloc(struct bpf_link *link)
|
|
|
|
{
|
|
|
|
struct bpf_iter_link *iter_link =
|
|
|
|
container_of(link, struct bpf_iter_link, link);
|
|
|
|
|
|
|
|
kfree(iter_link);
|
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:02 +03:00
|
|
|
static int bpf_iter_link_replace(struct bpf_link *link,
|
|
|
|
struct bpf_prog *new_prog,
|
|
|
|
struct bpf_prog *old_prog)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
mutex_lock(&link_mutex);
|
|
|
|
if (old_prog && link->prog != old_prog) {
|
|
|
|
ret = -EPERM;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (link->prog->type != new_prog->type ||
|
|
|
|
link->prog->expected_attach_type != new_prog->expected_attach_type ||
|
|
|
|
link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
old_prog = xchg(&link->prog, new_prog);
|
|
|
|
bpf_prog_put(old_prog);
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&link_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-08-21 21:44:18 +03:00
|
|
|
static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
|
|
|
|
struct seq_file *seq)
|
|
|
|
{
|
|
|
|
struct bpf_iter_link *iter_link =
|
|
|
|
container_of(link, struct bpf_iter_link, link);
|
|
|
|
bpf_iter_show_fdinfo_t show_fdinfo;
|
|
|
|
|
|
|
|
seq_printf(seq,
|
|
|
|
"target_name:\t%s\n",
|
|
|
|
iter_link->tinfo->reg_info->target);
|
|
|
|
|
|
|
|
show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo;
|
|
|
|
if (show_fdinfo)
|
|
|
|
show_fdinfo(&iter_link->aux, seq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int bpf_iter_link_fill_link_info(const struct bpf_link *link,
|
|
|
|
struct bpf_link_info *info)
|
|
|
|
{
|
|
|
|
struct bpf_iter_link *iter_link =
|
|
|
|
container_of(link, struct bpf_iter_link, link);
|
|
|
|
char __user *ubuf = u64_to_user_ptr(info->iter.target_name);
|
|
|
|
bpf_iter_fill_link_info_t fill_link_info;
|
|
|
|
u32 ulen = info->iter.target_name_len;
|
|
|
|
const char *target_name;
|
|
|
|
u32 target_len;
|
|
|
|
|
|
|
|
if (!ulen ^ !ubuf)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
target_name = iter_link->tinfo->reg_info->target;
|
|
|
|
target_len = strlen(target_name);
|
|
|
|
info->iter.target_name_len = target_len + 1;
|
|
|
|
|
|
|
|
if (ubuf) {
|
|
|
|
if (ulen >= target_len + 1) {
|
|
|
|
if (copy_to_user(ubuf, target_name, target_len + 1))
|
|
|
|
return -EFAULT;
|
|
|
|
} else {
|
|
|
|
char zero = '\0';
|
|
|
|
|
|
|
|
if (copy_to_user(ubuf, target_name, ulen - 1))
|
|
|
|
return -EFAULT;
|
|
|
|
if (put_user(zero, ubuf + ulen - 1))
|
|
|
|
return -EFAULT;
|
|
|
|
return -ENOSPC;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
fill_link_info = iter_link->tinfo->reg_info->fill_link_info;
|
|
|
|
if (fill_link_info)
|
|
|
|
return fill_link_info(&iter_link->aux, info);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:01 +03:00
|
|
|
static const struct bpf_link_ops bpf_iter_link_lops = {
|
|
|
|
.release = bpf_iter_link_release,
|
|
|
|
.dealloc = bpf_iter_link_dealloc,
|
2020-05-09 20:59:02 +03:00
|
|
|
.update_prog = bpf_iter_link_replace,
|
2020-08-21 21:44:18 +03:00
|
|
|
.show_fdinfo = bpf_iter_link_show_fdinfo,
|
|
|
|
.fill_link_info = bpf_iter_link_fill_link_info,
|
2020-05-09 20:59:01 +03:00
|
|
|
};
|
|
|
|
|
2020-05-09 20:59:06 +03:00
|
|
|
bool bpf_link_is_iter(struct bpf_link *link)
|
|
|
|
{
|
|
|
|
return link->ops == &bpf_iter_link_lops;
|
|
|
|
}
|
|
|
|
|
2021-05-14 03:36:05 +03:00
|
|
|
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
|
|
|
|
struct bpf_prog *prog)
|
2020-05-09 20:59:01 +03:00
|
|
|
{
|
2022-03-31 12:19:29 +03:00
|
|
|
struct bpf_iter_target_info *tinfo = NULL, *iter;
|
2020-05-09 20:59:01 +03:00
|
|
|
struct bpf_link_primer link_primer;
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
union bpf_iter_link_info linfo;
|
2020-05-09 20:59:01 +03:00
|
|
|
struct bpf_iter_link *link;
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
u32 prog_btf_id, linfo_len;
|
2021-05-14 03:36:05 +03:00
|
|
|
bpfptr_t ulinfo;
|
2020-05-09 20:59:01 +03:00
|
|
|
int err;
|
|
|
|
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
if (attr->link_create.target_fd || attr->link_create.flags)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
memset(&linfo, 0, sizeof(union bpf_iter_link_info));
|
|
|
|
|
2021-05-14 03:36:05 +03:00
|
|
|
ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
linfo_len = attr->link_create.iter_info_len;
|
2021-05-14 03:36:05 +03:00
|
|
|
if (bpfptr_is_null(ulinfo) ^ !linfo_len)
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
return -EINVAL;
|
|
|
|
|
2021-05-14 03:36:05 +03:00
|
|
|
if (!bpfptr_is_null(ulinfo)) {
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
|
|
|
|
linfo_len);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
linfo_len = min_t(u32, linfo_len, sizeof(linfo));
|
2021-05-14 03:36:05 +03:00
|
|
|
if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
return -EFAULT;
|
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:01 +03:00
|
|
|
prog_btf_id = prog->aux->attach_btf_id;
|
|
|
|
mutex_lock(&targets_mutex);
|
2022-03-31 12:19:29 +03:00
|
|
|
list_for_each_entry(iter, &targets, list) {
|
|
|
|
if (iter->btf_id == prog_btf_id) {
|
|
|
|
tinfo = iter;
|
2020-05-09 20:59:01 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&targets_mutex);
|
2022-03-31 12:19:29 +03:00
|
|
|
if (!tinfo)
|
2020-05-09 20:59:01 +03:00
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
|
|
|
|
if (!link)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
|
|
|
|
link->tinfo = tinfo;
|
|
|
|
|
2022-04-10 09:00:19 +03:00
|
|
|
err = bpf_link_prime(&link->link, &link_primer);
|
2020-05-09 20:59:01 +03:00
|
|
|
if (err) {
|
|
|
|
kfree(link);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
if (tinfo->reg_info->attach_target) {
|
|
|
|
err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
if (err) {
|
bpf: Change uapi for bpf iterator map elements
Commit a5cbe05a6673 ("bpf: Implement bpf iterator for
map elements") added bpf iterator support for
map elements. The map element bpf iterator requires
info to identify a particular map. In the above
commit, the attr->link_create.target_fd is used
to carry map_fd and an enum bpf_iter_link_info
is added to uapi to specify the target_fd actually
representing a map_fd:
enum bpf_iter_link_info {
BPF_ITER_LINK_UNSPEC = 0,
BPF_ITER_LINK_MAP_FD = 1,
MAX_BPF_ITER_LINK_INFO,
};
This is an extensible approach as we can grow
enumerator for pid, cgroup_id, etc. and we can
unionize target_fd for pid, cgroup_id, etc.
But in the future, there are chances that
more complex customization may happen, e.g.,
for tasks, it could be filtered based on
both cgroup_id and user_id.
This patch changed the uapi to have fields
__aligned_u64 iter_info;
__u32 iter_info_len;
for additional iter_info for link_create.
The iter_info is defined as
union bpf_iter_link_info {
struct {
__u32 map_fd;
} map;
};
So future extension for additional customization
will be easier. The bpf_iter_link_info will be
passed to target callback to validate and generic
bpf_iter framework does not need to deal it any
more.
Note that map_fd = 0 will be considered invalid
and -EBADF will be returned to user space.
Fixes: a5cbe05a6673 ("bpf: Implement bpf iterator for map elements")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20200805055056.1457463-1-yhs@fb.com
2020-08-05 08:50:56 +03:00
|
|
|
bpf_link_cleanup(&link_primer);
|
|
|
|
return err;
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-09 20:59:01 +03:00
|
|
|
return bpf_link_settle(&link_primer);
|
|
|
|
}
|
2020-05-09 20:59:05 +03:00
|
|
|
|
|
|
|
static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
|
|
|
|
struct bpf_iter_target_info *tinfo,
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
const struct bpf_iter_seq_info *seq_info,
|
2020-05-09 20:59:05 +03:00
|
|
|
struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
priv_data->tinfo = tinfo;
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
priv_data->seq_info = seq_info;
|
2020-05-09 20:59:05 +03:00
|
|
|
priv_data->prog = prog;
|
|
|
|
priv_data->session_id = atomic64_inc_return(&session_id);
|
|
|
|
priv_data->seq_num = 0;
|
|
|
|
priv_data->done_stop = false;
|
|
|
|
}
|
|
|
|
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
|
|
|
|
const struct bpf_iter_seq_info *seq_info)
|
2020-05-09 20:59:05 +03:00
|
|
|
{
|
|
|
|
struct bpf_iter_priv_data *priv_data;
|
|
|
|
struct bpf_iter_target_info *tinfo;
|
|
|
|
struct bpf_prog *prog;
|
|
|
|
u32 total_priv_dsize;
|
|
|
|
struct seq_file *seq;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
mutex_lock(&link_mutex);
|
|
|
|
prog = link->link.prog;
|
|
|
|
bpf_prog_inc(prog);
|
|
|
|
mutex_unlock(&link_mutex);
|
|
|
|
|
|
|
|
tinfo = link->tinfo;
|
|
|
|
total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
seq_info->seq_priv_size;
|
|
|
|
priv_data = __seq_open_private(file, seq_info->seq_ops,
|
2020-05-13 21:02:19 +03:00
|
|
|
total_priv_dsize);
|
2020-05-09 20:59:05 +03:00
|
|
|
if (!priv_data) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto release_prog;
|
|
|
|
}
|
|
|
|
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
if (seq_info->init_seq_private) {
|
|
|
|
err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
|
2020-05-09 20:59:05 +03:00
|
|
|
if (err)
|
|
|
|
goto release_seq_file;
|
|
|
|
}
|
|
|
|
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
init_seq_meta(priv_data, tinfo, seq_info, prog);
|
2020-05-09 20:59:05 +03:00
|
|
|
seq = file->private_data;
|
|
|
|
seq->private = priv_data->target_private;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
release_seq_file:
|
|
|
|
seq_release_private(file->f_inode, file);
|
|
|
|
file->private_data = NULL;
|
|
|
|
release_prog:
|
|
|
|
bpf_prog_put(prog);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
int bpf_iter_new_fd(struct bpf_link *link)
|
|
|
|
{
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
struct bpf_iter_link *iter_link;
|
2020-05-09 20:59:05 +03:00
|
|
|
struct file *file;
|
|
|
|
unsigned int flags;
|
|
|
|
int err, fd;
|
|
|
|
|
|
|
|
if (link->ops != &bpf_iter_link_lops)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
flags = O_RDONLY | O_CLOEXEC;
|
|
|
|
fd = get_unused_fd_flags(flags);
|
|
|
|
if (fd < 0)
|
|
|
|
return fd;
|
|
|
|
|
|
|
|
file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
|
|
|
|
if (IS_ERR(file)) {
|
|
|
|
err = PTR_ERR(file);
|
|
|
|
goto free_fd;
|
|
|
|
}
|
|
|
|
|
bpf: Implement bpf iterator for map elements
The bpf iterator for map elements are implemented.
The bpf program will receive four parameters:
bpf_iter_meta *meta: the meta data
bpf_map *map: the bpf_map whose elements are traversed
void *key: the key of one element
void *value: the value of the same element
Here, meta and map pointers are always valid, and
key has register type PTR_TO_RDONLY_BUF_OR_NULL and
value has register type PTR_TO_RDWR_BUF_OR_NULL.
The kernel will track the access range of key and value
during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses
are within range.
A new field iter_seq_info is added to bpf_map_ops which
is used to add map type specific information, i.e., seq_ops,
init/fini seq_file func and seq_file private data size.
Subsequent patches will have actual implementation
for bpf_map_ops->iter_seq_info.
In user space, BPF_ITER_LINK_MAP_FD needs to be
specified in prog attr->link_create.flags, which indicates
that attr->link_create.target_fd is a map_fd.
The reason for such an explicit flag is for possible
future cases where one bpf iterator may allow more than
one possible customization, e.g., pid and cgroup id for
task_file.
Current kernel internal implementation only allows
the target to register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's
are needed, the target can be extended to register such link
infos, and user provided link_info needs to match one of
target supported ones.
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com
2020-07-23 21:41:12 +03:00
|
|
|
iter_link = container_of(link, struct bpf_iter_link, link);
|
|
|
|
err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
|
2020-05-09 20:59:05 +03:00
|
|
|
if (err)
|
|
|
|
goto free_file;
|
|
|
|
|
|
|
|
fd_install(fd, file);
|
|
|
|
return fd;
|
|
|
|
|
|
|
|
free_file:
|
|
|
|
fput(file);
|
|
|
|
free_fd:
|
|
|
|
put_unused_fd(fd);
|
|
|
|
return err;
|
|
|
|
}
|
2020-05-09 20:59:07 +03:00
|
|
|
|
|
|
|
struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
|
|
|
|
{
|
|
|
|
struct bpf_iter_priv_data *iter_priv;
|
|
|
|
struct seq_file *seq;
|
|
|
|
void *seq_priv;
|
|
|
|
|
|
|
|
seq = meta->seq;
|
|
|
|
if (seq->file->f_op != &bpf_iter_fops)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
seq_priv = seq->private;
|
|
|
|
iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
|
|
|
|
target_private);
|
|
|
|
|
|
|
|
if (in_stop && iter_priv->done_stop)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
meta->session_id = iter_priv->session_id;
|
|
|
|
meta->seq_num = iter_priv->seq_num;
|
|
|
|
|
|
|
|
return iter_priv->prog;
|
|
|
|
}
|
|
|
|
|
|
|
|
int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2022-01-24 21:54:00 +03:00
|
|
|
if (prog->aux->sleepable) {
|
|
|
|
rcu_read_lock_trace();
|
|
|
|
migrate_disable();
|
|
|
|
might_fault();
|
|
|
|
ret = bpf_prog_run(prog, ctx);
|
|
|
|
migrate_enable();
|
|
|
|
rcu_read_unlock_trace();
|
|
|
|
} else {
|
|
|
|
rcu_read_lock();
|
|
|
|
migrate_disable();
|
|
|
|
ret = bpf_prog_run(prog, ctx);
|
|
|
|
migrate_enable();
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
2020-05-09 20:59:07 +03:00
|
|
|
|
2020-05-13 21:02:18 +03:00
|
|
|
/* bpf program can only return 0 or 1:
|
|
|
|
* 0 : okay
|
|
|
|
* 1 : retry the same object
|
|
|
|
* The bpf_iter_run_prog() return value
|
|
|
|
* will be seq_ops->show() return value.
|
|
|
|
*/
|
2020-05-09 20:59:07 +03:00
|
|
|
return ret == 0 ? 0 : -EAGAIN;
|
|
|
|
}
|
bpf: Add bpf_for_each_map_elem() helper
The bpf_for_each_map_elem() helper is introduced which
iterates all map elements with a callback function. The
helper signature looks like
long bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags)
and for each map element, the callback_fn will be called. For example,
like hashmap, the callback signature may look like
long callback_fn(map, key, val, callback_ctx)
There are two known use cases for this. One is from upstream ([1]) where
a for_each_map_elem helper may help implement a timeout mechanism
in a more generic way. Another is from our internal discussion
for a firewall use case where a map contains all the rules. The packet
data can be compared to all these rules to decide allow or deny
the packet.
For array maps, users can already use a bounded loop to traverse
elements. Using this helper can avoid using bounded loop. For other
type of maps (e.g., hash maps) where bounded loop is hard or
impossible to use, this helper provides a convenient way to
operate on all elements.
For callback_fn, besides map and map element, a callback_ctx,
allocated on caller stack, is also passed to the callback
function. This callback_ctx argument can provide additional
input and allow to write to caller stack for output.
If the callback_fn returns 0, the helper will iterate through next
element if available. If the callback_fn returns 1, the helper
will stop iterating and returns to the bpf program. Other return
values are not used for now.
Currently, this helper is only available with jit. It is possible
to make it work with interpreter with so effort but I leave it
as the future work.
[1]: https://lore.kernel.org/bpf/20210122205415.113822-1-xiyou.wangcong@gmail.com/
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210226204925.3884923-1-yhs@fb.com
2021-02-26 23:49:25 +03:00
|
|
|
|
|
|
|
BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
|
|
|
|
void *, callback_ctx, u64, flags)
|
|
|
|
{
|
|
|
|
return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
const struct bpf_func_proto bpf_for_each_map_elem_proto = {
|
|
|
|
.func = bpf_for_each_map_elem,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_CONST_MAP_PTR,
|
|
|
|
.arg2_type = ARG_PTR_TO_FUNC,
|
|
|
|
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
|
|
|
|
.arg4_type = ARG_ANYTHING,
|
|
|
|
};
|
2021-11-30 06:06:19 +03:00
|
|
|
|
|
|
|
BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
|
|
|
|
u64, flags)
|
|
|
|
{
|
|
|
|
bpf_callback_t callback = (bpf_callback_t)callback_fn;
|
|
|
|
u64 ret;
|
|
|
|
u32 i;
|
|
|
|
|
2022-06-21 02:53:42 +03:00
|
|
|
/* Note: these safety checks are also verified when bpf_loop
|
|
|
|
* is inlined, be careful to modify this code in sync. See
|
|
|
|
* function verifier.c:inline_bpf_loop.
|
|
|
|
*/
|
2021-11-30 06:06:19 +03:00
|
|
|
if (flags)
|
|
|
|
return -EINVAL;
|
2022-06-21 02:53:42 +03:00
|
|
|
if (nr_loops > BPF_MAX_LOOPS)
|
2021-11-30 06:06:19 +03:00
|
|
|
return -E2BIG;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_loops; i++) {
|
|
|
|
ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
|
|
|
|
/* return value: 0 - continue, 1 - stop and return */
|
|
|
|
if (ret)
|
|
|
|
return i + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
const struct bpf_func_proto bpf_loop_proto = {
|
|
|
|
.func = bpf_loop,
|
|
|
|
.gpl_only = false,
|
|
|
|
.ret_type = RET_INTEGER,
|
|
|
|
.arg1_type = ARG_ANYTHING,
|
|
|
|
.arg2_type = ARG_PTR_TO_FUNC,
|
|
|
|
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
|
|
|
|
.arg4_type = ARG_ANYTHING,
|
|
|
|
};
|