selftests/bpf: Add benchmark for local_storage RCU Tasks Trace usage
This benchmark measures grace period latency and kthread CPU usage of
RCU Tasks Trace when many processes are creating/deleting BPF
local_storage. The intent is to quantify the improvement on these
metrics from Paul's recent RCU Tasks patches [0].

Specifically, fork 15k tasks, each of which calls a bpf prog that
creates/destroys task local_storage and sleeps in a loop, resulting in
many call_rcu_tasks_trace calls.

To determine grace period latency, trace the time elapsed between
rcu_tasks_trace_pregp_step and rcu_tasks_trace_postgp; for CPU usage,
look at rcu_tasks_trace_kthread's stime in /proc/PID/stat.

On my virtualized test environment (Skylake, 8 cpus) the benchmark
results demonstrate a significant improvement:

BEFORE Paul's patches:

SUMMARY tasks_trace grace period latency	avg 22298.551 us	stddev 1302.165 us
SUMMARY ticks per tasks_trace grace period	avg 2.291	stddev 0.324

AFTER Paul's patches:

SUMMARY tasks_trace grace period latency	avg 16969.197 us	stddev 2525.053 us
SUMMARY ticks per tasks_trace grace period	avg 1.146	stddev 0.178

Note that since these patches are not yet in bpf-next, benchmarking was
done by cherry-picking this patch onto the rcu tree.

  [0] https://lore.kernel.org/rcu/20220620225402.GA3842369@paulmck-ThinkPad-P17-Gen-1/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220705190018.3239050-1-davemarchevsky@fb.com
Parent: 935dc35c75
Commit: 2b4b2621fd
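For context on the CPU-usage measurement: stime is the 15th
whitespace-separated field of /proc/PID/stat, counted in clock ticks. A
minimal standalone sketch of that read, mirroring the kthread_pid_ticks()
helper the patch adds below (read_stime() is an illustrative name, not
part of the patch):

    #include <stdio.h>

    /* Read stime (field 15 of /proc/PID/stat, in clock ticks) for a pid.
     * Same field-skipping approach as kthread_pid_ticks() below; assumes
     * the task's comm contains no spaces, which holds for kernel threads
     * like rcu_tasks_trace_kthread. Returns -1 on error. */
    static long read_stime(unsigned int pid)
    {
            char path[64];
            long stime = -1;
            FILE *f;

            snprintf(path, sizeof(path), "/proc/%u/stat", pid);
            f = fopen(path, "r");
            if (!f)
                    return -1;
            if (fscanf(f, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %ld",
                       &stime) != 1)
                    stime = -1;
            fclose(f);
            return stime;
    }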
@@ -574,6 +574,7 @@ $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
 $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
 $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h
 $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h
+$(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h
 $(OUTPUT)/bench.o: bench.h testing_helpers.h $(BPFOBJ)
 $(OUTPUT)/bench: LDLIBS += -lm
 $(OUTPUT)/bench: $(OUTPUT)/bench.o \
@@ -587,7 +588,8 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_bpf_loop.o \
 		 $(OUTPUT)/bench_strncmp.o \
 		 $(OUTPUT)/bench_bpf_hashmap_full_update.o \
-		 $(OUTPUT)/bench_local_storage.o
+		 $(OUTPUT)/bench_local_storage.o \
+		 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
@@ -79,6 +79,43 @@ void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns)
 	       hits_per_sec, hits_per_prod, drops_per_sec, hits_per_sec + drops_per_sec);
 }
 
+void
+grace_period_latency_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat)
+{
+	int i;
+
+	memset(gp_stat, 0, sizeof(struct basic_stats));
+
+	for (i = 0; i < res_cnt; i++)
+		gp_stat->mean += res[i].gp_ns / 1000.0 / (double)res[i].gp_ct / (0.0 + res_cnt);
+
+#define IT_MEAN_DIFF (res[i].gp_ns / 1000.0 / (double)res[i].gp_ct - gp_stat->mean)
+	if (res_cnt > 1) {
+		for (i = 0; i < res_cnt; i++)
+			gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0);
+	}
+	gp_stat->stddev = sqrt(gp_stat->stddev);
+#undef IT_MEAN_DIFF
+}
+
+void
+grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt, struct basic_stats *gp_stat)
+{
+	int i;
+
+	memset(gp_stat, 0, sizeof(struct basic_stats));
+	for (i = 0; i < res_cnt; i++)
+		gp_stat->mean += res[i].stime / (double)res[i].gp_ct / (0.0 + res_cnt);
+
+#define IT_MEAN_DIFF (res[i].stime / (double)res[i].gp_ct - gp_stat->mean)
+	if (res_cnt > 1) {
+		for (i = 0; i < res_cnt; i++)
+			gp_stat->stddev += (IT_MEAN_DIFF * IT_MEAN_DIFF) / (res_cnt - 1.0);
+	}
+	gp_stat->stddev = sqrt(gp_stat->stddev);
+#undef IT_MEAN_DIFF
+}
+
 void hits_drops_report_final(struct bench_res res[], int res_cnt)
 {
 	int i;
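In LaTeX notation, the per-result sample and two-pass statistics computed
by the grace_period_latency_basic_stats() helper added above are:

    x_i = \frac{\mathrm{gp\_ns}_i / 1000}{\mathrm{gp\_ct}_i}, \qquad
    \bar{x} = \frac{1}{N}\sum_{i=1}^{N} x_i, \qquad
    s = \sqrt{\frac{1}{N-1}\sum_{i=1}^{N}\left(x_i - \bar{x}\right)^2}

that is, average microseconds of grace period latency per grace period in
result i, averaged over the N collected results, with the sample standard
deviation accumulated only when N > 1. grace_period_ticks_basic_stats()
is identical except that x_i = stime_i / gp_ct_i (kthread ticks per
grace period).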
@@ -236,6 +273,7 @@ extern struct argp bench_ringbufs_argp;
 extern struct argp bench_bloom_map_argp;
 extern struct argp bench_bpf_loop_argp;
 extern struct argp bench_local_storage_argp;
+extern struct argp bench_local_storage_rcu_tasks_trace_argp;
 extern struct argp bench_strncmp_argp;
 
 static const struct argp_child bench_parsers[] = {
@@ -244,6 +282,8 @@ static const struct argp_child bench_parsers[] = {
 	{ &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 },
 	{ &bench_local_storage_argp, 0, "local_storage benchmark", 0 },
 	{ &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 },
+	{ &bench_local_storage_rcu_tasks_trace_argp, 0,
+		"local_storage RCU Tasks Trace slowdown benchmark", 0 },
 	{},
 };
 
@@ -449,6 +489,7 @@ extern const struct bench bench_bpf_hashmap_full_update;
 extern const struct bench bench_local_storage_cache_seq_get;
 extern const struct bench bench_local_storage_cache_interleaved_get;
 extern const struct bench bench_local_storage_cache_hashmap_control;
+extern const struct bench bench_local_storage_tasks_trace;
 
 static const struct bench *benchs[] = {
 	&bench_count_global,
@@ -487,6 +528,7 @@ static const struct bench *benchs[] = {
 	&bench_local_storage_cache_seq_get,
 	&bench_local_storage_cache_interleaved_get,
 	&bench_local_storage_cache_hashmap_control,
+	&bench_local_storage_tasks_trace,
 };
 
 static void setup_benchmark()
@@ -30,11 +30,19 @@ struct env {
 	struct cpu_set cons_cpus;
 };
 
+struct basic_stats {
+	double mean;
+	double stddev;
+};
+
 struct bench_res {
 	long hits;
 	long drops;
 	long false_hits;
 	long important_hits;
+	unsigned long gp_ns;
+	unsigned long gp_ct;
+	unsigned int stime;
 };
 
 struct bench {

@@ -65,6 +73,10 @@ void ops_report_final(struct bench_res res[], int res_cnt);
 void local_storage_report_progress(int iter, struct bench_res *res,
 				   long delta_ns);
 void local_storage_report_final(struct bench_res res[], int res_cnt);
+void grace_period_latency_basic_stats(struct bench_res res[], int res_cnt,
+				      struct basic_stats *gp_stat);
+void grace_period_ticks_basic_stats(struct bench_res res[], int res_cnt,
+				    struct basic_stats *gp_stat);
 
 static inline __u64 get_time_ns(void)
 {
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <argp.h>
+
+#include <sys/prctl.h>
+#include "local_storage_rcu_tasks_trace_bench.skel.h"
+#include "bench.h"
+
+#include <signal.h>
+
+static struct {
+	__u32 nr_procs;
+	__u32 kthread_pid;
+	bool quiet;
+} args = {
+	.nr_procs = 1000,
+	.kthread_pid = 0,
+	.quiet = false,
+};
+
+enum {
+	ARG_NR_PROCS = 7000,
+	ARG_KTHREAD_PID = 7001,
+	ARG_QUIET = 7002,
+};
+
+static const struct argp_option opts[] = {
+	{ "nr_procs", ARG_NR_PROCS, "NR_PROCS", 0,
+		"Set number of user processes to spin up"},
+	{ "kthread_pid", ARG_KTHREAD_PID, "PID", 0,
+		"Pid of rcu_tasks_trace kthread for ticks tracking"},
+	{ "quiet", ARG_QUIET, "{0,1}", 0,
+		"If true, don't report progress"},
+	{},
+};
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	long ret;
+
+	switch (key) {
+	case ARG_NR_PROCS:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1 || ret > UINT_MAX) {
+			fprintf(stderr, "invalid nr_procs\n");
+			argp_usage(state);
+		}
+		args.nr_procs = ret;
+		break;
+	case ARG_KTHREAD_PID:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 1) {
+			fprintf(stderr, "invalid kthread_pid\n");
+			argp_usage(state);
+		}
+		args.kthread_pid = ret;
+		break;
+	case ARG_QUIET:
+		ret = strtol(arg, NULL, 10);
+		if (ret < 0 || ret > 1) {
+			fprintf(stderr, "invalid quiet %ld\n", ret);
+			argp_usage(state);
+		}
+		args.quiet = ret;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_local_storage_rcu_tasks_trace_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+#define MAX_SLEEP_PROCS 150000
+
+static void validate(void)
+{
+	if (env.producer_cnt != 1) {
+		fprintf(stderr, "benchmark doesn't support multi-producer!\n");
+		exit(1);
+	}
+	if (env.consumer_cnt != 1) {
+		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+
+	if (args.nr_procs > MAX_SLEEP_PROCS) {
+		fprintf(stderr, "benchmark supports up to %u sleeper procs!\n",
+			MAX_SLEEP_PROCS);
+		exit(1);
+	}
+}
+
+static long kthread_pid_ticks(void)
+{
+	char procfs_path[100];
+	long stime;
+	FILE *f;
+
+	if (!args.kthread_pid)
+		return -1;
+
+	sprintf(procfs_path, "/proc/%u/stat", args.kthread_pid);
+	f = fopen(procfs_path, "r");
+	if (!f) {
+		fprintf(stderr, "couldn't open %s, exiting\n", procfs_path);
+		goto err_out;
+	}
+	if (fscanf(f, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %ld", &stime) != 1) {
+		fprintf(stderr, "fscanf of %s failed, exiting\n", procfs_path);
+		goto err_out;
+	}
+	fclose(f);
+	return stime;
+
+err_out:
+	if (f)
+		fclose(f);
+	exit(1);
+	return 0;
+}
+
+static struct {
+	struct local_storage_rcu_tasks_trace_bench *skel;
+	long prev_kthread_stime;
+} ctx;
+
+static void sleep_and_loop(void)
+{
+	while (true) {
+		sleep(rand() % 4);
+		syscall(__NR_getpgid);
+	}
+}
+
+static void local_storage_tasks_trace_setup(void)
+{
+	int i, err, forkret, runner_pid;
+
+	runner_pid = getpid();
+
+	for (i = 0; i < args.nr_procs; i++) {
+		forkret = fork();
+		if (forkret < 0) {
+			fprintf(stderr, "Error forking sleeper proc %u of %u, exiting\n", i,
+				args.nr_procs);
+			goto err_out;
+		}
+
+		if (!forkret) {
+			err = prctl(PR_SET_PDEATHSIG, SIGKILL);
+			if (err < 0) {
+				fprintf(stderr, "prctl failed with err %d, exiting\n", errno);
+				goto err_out;
+			}
+
+			if (getppid() != runner_pid) {
+				fprintf(stderr, "Runner died while spinning up procs, exiting\n");
+				goto err_out;
+			}
+			sleep_and_loop();
+		}
+	}
+	printf("Spun up %u procs (our pid %d)\n", args.nr_procs, runner_pid);
+
+	setup_libbpf();
+
+	ctx.skel = local_storage_rcu_tasks_trace_bench__open_and_load();
+	if (!ctx.skel) {
+		fprintf(stderr, "Error doing open_and_load, exiting\n");
+		goto err_out;
+	}
+
+	ctx.prev_kthread_stime = kthread_pid_ticks();
+
+	if (!bpf_program__attach(ctx.skel->progs.get_local)) {
+		fprintf(stderr, "Error attaching bpf program\n");
+		goto err_out;
+	}
+
+	if (!bpf_program__attach(ctx.skel->progs.pregp_step)) {
+		fprintf(stderr, "Error attaching bpf program\n");
+		goto err_out;
+	}
+
+	if (!bpf_program__attach(ctx.skel->progs.postgp)) {
+		fprintf(stderr, "Error attaching bpf program\n");
+		goto err_out;
+	}
+
+	return;
+err_out:
+	exit(1);
+}
+
+static void measure(struct bench_res *res)
+{
+	long ticks;
+
+	res->gp_ct = atomic_swap(&ctx.skel->bss->gp_hits, 0);
+	res->gp_ns = atomic_swap(&ctx.skel->bss->gp_times, 0);
+	ticks = kthread_pid_ticks();
+	res->stime = ticks - ctx.prev_kthread_stime;
+	ctx.prev_kthread_stime = ticks;
+}
+
+static void *consumer(void *input)
+{
+	return NULL;
+}
+
+static void *producer(void *input)
+{
+	while (true)
+		syscall(__NR_getpgid);
+	return NULL;
+}
+
+static void report_progress(int iter, struct bench_res *res, long delta_ns)
+{
+	if (ctx.skel->bss->unexpected) {
+		fprintf(stderr, "Error: Unexpected order of bpf prog calls (postgp after pregp).");
+		fprintf(stderr, "Data can't be trusted, exiting\n");
+		exit(1);
+	}
+
+	if (args.quiet)
+		return;
+
+	printf("Iter %d\t avg tasks_trace grace period latency\t%lf ns\n",
+	       iter, res->gp_ns / (double)res->gp_ct);
+	printf("Iter %d\t avg ticks per tasks_trace grace period\t%lf\n",
+	       iter, res->stime / (double)res->gp_ct);
+}
+
+static void report_final(struct bench_res res[], int res_cnt)
+{
+	struct basic_stats gp_stat;
+
+	grace_period_latency_basic_stats(res, res_cnt, &gp_stat);
+	printf("SUMMARY tasks_trace grace period latency");
+	printf("\tavg %.3lf us\tstddev %.3lf us\n", gp_stat.mean, gp_stat.stddev);
+	grace_period_ticks_basic_stats(res, res_cnt, &gp_stat);
+	printf("SUMMARY ticks per tasks_trace grace period");
+	printf("\tavg %.3lf\tstddev %.3lf\n", gp_stat.mean, gp_stat.stddev);
+}
+
+/* local-storage-tasks-trace: Benchmark performance of BPF local_storage's use
+ * of RCU Tasks-Trace.
+ *
+ * Stress RCU Tasks Trace by forking many tasks, all of which do no work aside
+ * from sleep() loop, and creating/destroying BPF task-local storage on wakeup.
+ * The number of forked tasks is configurable.
+ *
+ * Exercising code paths which call call_rcu_tasks_trace while there are many
+ * thousands of tasks on the system should result in RCU Tasks-Trace having to
+ * do a noticeable amount of work.
+ *
+ * This should be observable by measuring rcu_tasks_trace_kthread CPU usage
+ * after the grace period has ended, or by measuring grace period latency.
+ *
+ * This benchmark uses both approaches, attaching to rcu_tasks_trace_pregp_step
+ * and rcu_tasks_trace_postgp functions to measure grace period latency and
+ * using /proc/PID/stat to measure rcu_tasks_trace_kthread kernel ticks.
+ */
+const struct bench bench_local_storage_tasks_trace = {
+	.name = "local-storage-tasks-trace",
+	.validate = validate,
+	.setup = local_storage_tasks_trace_setup,
+	.producer_thread = producer,
+	.consumer_thread = consumer,
+	.measure = measure,
+	.report_progress = report_progress,
+	.report_final = report_final,
+};
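The measure() callback above drains the gp_hits/gp_times counters that
the BPF programs maintain via atomic_swap(), a bench harness helper. As
a sketch of the assumed semantics (an atomic read-and-reset so each
measurement interval starts from zero; this assumes GCC/Clang __atomic
builtins and is not copied from bench.h):

    static inline long atomic_swap(long *value, long n)
    {
            /* Atomically replace *value with n, returning the old value. */
            return __atomic_exchange_n(value, n, __ATOMIC_RELAXED);
    }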
@@ -0,0 +1,11 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+kthread_pid=`pgrep rcu_tasks_trace_kthread`
+
+if [ -z $kthread_pid ]; then
+	echo "error: Couldn't find rcu_tasks_trace_kthread"
+	exit 1
+fi
+
+./bench --nr_procs 15000 --kthread_pid $kthread_pid -d 600 --quiet 1 local-storage-tasks-trace
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, int);
+} task_storage SEC(".maps");
+
+long hits;
+long gp_hits;
+long gp_times;
+long current_gp_start;
+long unexpected;
+bool postgp_seen;
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int get_local(void *ctx)
+{
+	struct task_struct *task;
+	int idx;
+	int *s;
+
+	idx = 0;
+	task = bpf_get_current_task_btf();
+	s = bpf_task_storage_get(&task_storage, task, &idx,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!s)
+		return 0;
+
+	*s = 3;
+	bpf_task_storage_delete(&task_storage, task);
+	__sync_add_and_fetch(&hits, 1);
+	return 0;
+}
+
+SEC("fentry/rcu_tasks_trace_pregp_step")
+int pregp_step(struct pt_regs *ctx)
+{
+	current_gp_start = bpf_ktime_get_ns();
+	return 0;
+}
+
+SEC("fentry/rcu_tasks_trace_postgp")
+int postgp(struct pt_regs *ctx)
+{
+	if (!current_gp_start && postgp_seen) {
+		/* Will only happen if prog tracing rcu_tasks_trace_pregp_step doesn't
+		 * execute before this prog
+		 */
+		__sync_add_and_fetch(&unexpected, 1);
+		return 0;
+	}
+
+	__sync_add_and_fetch(&gp_times, bpf_ktime_get_ns() - current_gp_start);
+	__sync_add_and_fetch(&gp_hits, 1);
+	current_gp_start = 0;
+	postgp_seen = true;
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";