Merge branch 'bpf: tcp: Allow bpf-tcp-cc to call bpf_(get|set)sockopt'
Martin KaFai says:

====================
This set allows a bpf-tcp-cc to call bpf_setsockopt(). One use case is to let a bpf-tcp-cc switch to another cc during init(). For example, when the tcp flow is not ECN-ready, bpf_dctcp can switch to another cc by calling setsockopt(TCP_CONGESTION).

bpf_getsockopt() is also added to keep the API symmetrical, so there are fewer usage surprises.

v2:
- Do not allow switching to the kernel's tcp_cdg because it is the only kernel tcp-cc that stores a pointer to icsk_ca_priv. Please see the commit log in patch 1 for details. A test is added in patch 4 to check switching to tcp_cdg.
- Refactor the logic that finds the offset of a func ptr in "struct tcp_congestion_ops" into prog_ops_moff() in patch 1.
- bpf_setsockopt() has been disabled in release() since v1 (please see the commit log in patch 1 for the reason). bpf_getsockopt() is also disabled in release() in v2 to avoid usage surprises, because both of them are usually expected to be available together. A bpf-tcp-cc can already use PTR_TO_BTF_ID to read from tcp_sock.
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Коммит
0584e965fb
|
@ -28,6 +28,7 @@ struct bpf_struct_ops_value {
|
|||
|
||||
struct bpf_struct_ops_map {
|
||||
struct bpf_map map;
|
||||
struct rcu_head rcu;
|
||||
const struct bpf_struct_ops *st_ops;
|
||||
/* protect map_update */
|
||||
struct mutex lock;
|
||||
|
@ -622,6 +623,14 @@ bool bpf_struct_ops_get(const void *kdata)
|
|||
return refcount_inc_not_zero(&kvalue->refcnt);
|
||||
}
|
||||
|
||||
static void bpf_struct_ops_put_rcu(struct rcu_head *head)
|
||||
{
|
||||
struct bpf_struct_ops_map *st_map;
|
||||
|
||||
st_map = container_of(head, struct bpf_struct_ops_map, rcu);
|
||||
bpf_map_put(&st_map->map);
|
||||
}
|
||||
|
||||
void bpf_struct_ops_put(const void *kdata)
|
||||
{
|
||||
struct bpf_struct_ops_value *kvalue;
|
||||
|
@ -632,6 +641,17 @@ void bpf_struct_ops_put(const void *kdata)
|
|||
|
||||
st_map = container_of(kvalue, struct bpf_struct_ops_map,
|
||||
kvalue);
|
||||
bpf_map_put(&st_map->map);
|
||||
/* The struct_ops's function may switch to another struct_ops.
|
||||
*
|
||||
* For example, bpf_tcp_cc_x->init() may switch to
|
||||
* another tcp_cc_y by calling
|
||||
* setsockopt(TCP_CONGESTION, "tcp_cc_y").
|
||||
* During the switch, bpf_struct_ops_put(tcp_cc_x) is called
|
||||
* and its map->refcnt may reach 0 which then free its
|
||||
* trampoline image while tcp_cc_x is still running.
|
||||
*
|
||||
* Thus, a rcu grace period is needed here.
|
||||
*/
|
||||
call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5051,6 +5051,12 @@ err_clear:
|
|||
/* bpf_setsockopt() flavour that takes a "struct sock *" directly.
 *
 * Rejects a switch to the "cdg" congestion control with -ENOTSUPP and
 * forwards every other request unchanged to _bpf_setsockopt().
 *
 * The merge description of this series gives the reason: tcp_cdg is the
 * only kernel tcp-cc that stores a pointer to icsk_ca_priv.
 */
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	const bool sets_tcp_cc = level == SOL_TCP && optname == TCP_CONGESTION;

	/* The compare is bounded by optlen, so optval does not have to be
	 * NUL-terminated for the check itself.
	 */
	if (sets_tcp_cc && optlen >= sizeof("cdg") - 1 &&
	    !strncmp("cdg", optval, optlen))
		return -ENOTSUPP;

	return _bpf_setsockopt(sk, level, optname, optval, optlen);
}
|
||||
|
||||
|
|
|
@ -10,6 +10,9 @@
|
|||
#include <net/tcp.h>
|
||||
#include <net/bpf_sk_storage.h>
|
||||
|
||||
/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */
|
||||
extern struct bpf_struct_ops bpf_tcp_congestion_ops;
|
||||
|
||||
static u32 optional_ops[] = {
|
||||
offsetof(struct tcp_congestion_ops, init),
|
||||
offsetof(struct tcp_congestion_ops, release),
|
||||
|
@ -163,6 +166,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
|
|||
.arg2_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
static u32 prog_ops_moff(const struct bpf_prog *prog)
|
||||
{
|
||||
const struct btf_member *m;
|
||||
const struct btf_type *t;
|
||||
u32 midx;
|
||||
|
||||
midx = prog->expected_attach_type;
|
||||
t = bpf_tcp_congestion_ops.type;
|
||||
m = &btf_type_member(t)[midx];
|
||||
|
||||
return btf_member_bit_offset(t, m) / 8;
|
||||
}
|
||||
|
||||
static const struct bpf_func_proto *
|
||||
bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
|
||||
const struct bpf_prog *prog)
|
||||
|
@ -174,6 +190,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
|
|||
return &bpf_sk_storage_get_proto;
|
||||
case BPF_FUNC_sk_storage_delete:
|
||||
return &bpf_sk_storage_delete_proto;
|
||||
case BPF_FUNC_setsockopt:
|
||||
/* Does not allow release() to call setsockopt.
|
||||
* release() is called when the current bpf-tcp-cc
|
||||
* is retiring. It is not allowed to call
|
||||
* setsockopt() to make further changes which
|
||||
* may potentially allocate new resources.
|
||||
*/
|
||||
if (prog_ops_moff(prog) !=
|
||||
offsetof(struct tcp_congestion_ops, release))
|
||||
return &bpf_sk_setsockopt_proto;
|
||||
return NULL;
|
||||
case BPF_FUNC_getsockopt:
|
||||
/* Since get/setsockopt is usually expected to
|
||||
* be available together, disable getsockopt for
|
||||
* release also to avoid usage surprise.
|
||||
* The bpf-tcp-cc already has a more powerful way
|
||||
* to read tcp_sock from the PTR_TO_BTF_ID.
|
||||
*/
|
||||
if (prog_ops_moff(prog) !=
|
||||
offsetof(struct tcp_congestion_ops, release))
|
||||
return &bpf_sk_getsockopt_proto;
|
||||
return NULL;
|
||||
default:
|
||||
return bpf_base_func_proto(func_id);
|
||||
}
|
||||
|
@ -286,9 +324,6 @@ static void bpf_tcp_ca_unreg(void *kdata)
|
|||
tcp_unregister_congestion_control(kdata);
|
||||
}
|
||||
|
||||
/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */
|
||||
extern struct bpf_struct_ops bpf_tcp_congestion_ops;
|
||||
|
||||
struct bpf_struct_ops bpf_tcp_congestion_ops = {
|
||||
.verifier_ops = &bpf_tcp_ca_verifier_ops,
|
||||
.reg = bpf_tcp_ca_reg,
|
||||
|
|
|
@ -31,6 +31,7 @@ enum sk_pacing {
|
|||
|
||||
struct sock {
|
||||
struct sock_common __sk_common;
|
||||
#define sk_state __sk_common.skc_state
|
||||
unsigned long sk_pacing_rate;
|
||||
__u32 sk_pacing_status; /* see enum sk_pacing */
|
||||
} __attribute__((preserve_access_index));
|
||||
|
|
|
@ -218,13 +218,18 @@ static int connect_fd_to_addr(int fd,
|
|||
return 0;
|
||||
}
|
||||
|
||||
int connect_to_fd(int server_fd, int timeout_ms)
|
||||
static const struct network_helper_opts default_opts;
|
||||
|
||||
int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts)
|
||||
{
|
||||
struct sockaddr_storage addr;
|
||||
struct sockaddr_in *addr_in;
|
||||
socklen_t addrlen, optlen;
|
||||
int fd, type;
|
||||
|
||||
if (!opts)
|
||||
opts = &default_opts;
|
||||
|
||||
optlen = sizeof(type);
|
||||
if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) {
|
||||
log_err("getsockopt(SOL_TYPE)");
|
||||
|
@ -244,7 +249,12 @@ int connect_to_fd(int server_fd, int timeout_ms)
|
|||
return -1;
|
||||
}
|
||||
|
||||
if (settimeo(fd, timeout_ms))
|
||||
if (settimeo(fd, opts->timeout_ms))
|
||||
goto error_close;
|
||||
|
||||
if (opts->cc && opts->cc[0] &&
|
||||
setsockopt(fd, SOL_TCP, TCP_CONGESTION, opts->cc,
|
||||
strlen(opts->cc) + 1))
|
||||
goto error_close;
|
||||
|
||||
if (connect_fd_to_addr(fd, &addr, addrlen))
|
||||
|
@ -257,6 +267,15 @@ error_close:
|
|||
return -1;
|
||||
}
|
||||
|
||||
int connect_to_fd(int server_fd, int timeout_ms)
|
||||
{
|
||||
struct network_helper_opts opts = {
|
||||
.timeout_ms = timeout_ms,
|
||||
};
|
||||
|
||||
return connect_to_fd_opts(server_fd, &opts);
|
||||
}
|
||||
|
||||
int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms)
|
||||
{
|
||||
struct sockaddr_storage addr;
|
||||
|
|
|
@ -17,6 +17,11 @@ typedef __u16 __sum16;
|
|||
#define VIP_NUM 5
|
||||
#define MAGIC_BYTES 123
|
||||
|
||||
/* Optional knobs for connect_to_fd_opts(). A zero-initialized instance is
 * what that function falls back to when it is given a NULL pointer.
 */
struct network_helper_opts {
	/* Congestion control name applied to the new socket with
	 * setsockopt(TCP_CONGESTION); skipped when NULL or empty.
	 */
	const char *cc;
	/* Timeout in milliseconds, handed to settimeo() for the new socket. */
	int timeout_ms;
};
|
||||
|
||||
/* ipv4 test vector */
|
||||
struct ipv4_packet {
|
||||
struct ethhdr eth;
|
||||
|
@ -41,6 +46,7 @@ int *start_reuseport_server(int family, int type, const char *addr_str,
|
|||
unsigned int nr_listens);
|
||||
void free_fds(int *fds, unsigned int nr_close_fds);
|
||||
int connect_to_fd(int server_fd, int timeout_ms);
|
||||
int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts);
|
||||
int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms);
|
||||
int fastopen_connect(int server_fd, const char *data, unsigned int data_len,
|
||||
int timeout_ms);
|
||||
|
|
|
@ -4,37 +4,22 @@
|
|||
#include <linux/err.h>
|
||||
#include <netinet/tcp.h>
|
||||
#include <test_progs.h>
|
||||
#include "network_helpers.h"
|
||||
#include "bpf_dctcp.skel.h"
|
||||
#include "bpf_cubic.skel.h"
|
||||
#include "bpf_tcp_nogpl.skel.h"
|
||||
#include "bpf_dctcp_release.skel.h"
|
||||
|
||||
#define min(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
#ifndef ENOTSUPP
|
||||
#define ENOTSUPP 524
|
||||
#endif
|
||||
|
||||
static const unsigned int total_bytes = 10 * 1024 * 1024;
|
||||
static const struct timeval timeo_sec = { .tv_sec = 10 };
|
||||
static const size_t timeo_optlen = sizeof(timeo_sec);
|
||||
static int expected_stg = 0xeB9F;
|
||||
static int stop, duration;
|
||||
|
||||
static int settimeo(int fd)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
|
||||
timeo_optlen);
|
||||
if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n",
|
||||
errno))
|
||||
return -1;
|
||||
|
||||
err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
|
||||
timeo_optlen);
|
||||
if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n",
|
||||
errno))
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int settcpca(int fd, const char *tcp_ca)
|
||||
{
|
||||
int err;
|
||||
|
@ -61,7 +46,7 @@ static void *server(void *arg)
|
|||
goto done;
|
||||
}
|
||||
|
||||
if (settimeo(fd)) {
|
||||
if (settimeo(fd, 0)) {
|
||||
err = -errno;
|
||||
goto done;
|
||||
}
|
||||
|
@ -114,7 +99,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map)
|
|||
}
|
||||
|
||||
if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) ||
|
||||
settimeo(lfd) || settimeo(fd))
|
||||
settimeo(lfd, 0) || settimeo(fd, 0))
|
||||
goto done;
|
||||
|
||||
/* bind, listen and start server thread to accept */
|
||||
|
@ -267,6 +252,77 @@ static void test_invalid_license(void)
|
|||
libbpf_set_print(old_print_fn);
|
||||
}
|
||||
|
||||
static void test_dctcp_fallback(void)
|
||||
{
|
||||
int err, lfd = -1, cli_fd = -1, srv_fd = -1;
|
||||
struct network_helper_opts opts = {
|
||||
.cc = "cubic",
|
||||
};
|
||||
struct bpf_dctcp *dctcp_skel;
|
||||
struct bpf_link *link = NULL;
|
||||
char srv_cc[16];
|
||||
socklen_t cc_len = sizeof(srv_cc);
|
||||
|
||||
dctcp_skel = bpf_dctcp__open();
|
||||
if (!ASSERT_OK_PTR(dctcp_skel, "dctcp_skel"))
|
||||
return;
|
||||
strcpy(dctcp_skel->rodata->fallback, "cubic");
|
||||
if (!ASSERT_OK(bpf_dctcp__load(dctcp_skel), "bpf_dctcp__load"))
|
||||
goto done;
|
||||
|
||||
link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp);
|
||||
if (!ASSERT_OK_PTR(link, "dctcp link"))
|
||||
goto done;
|
||||
|
||||
lfd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
|
||||
if (!ASSERT_GE(lfd, 0, "lfd") ||
|
||||
!ASSERT_OK(settcpca(lfd, "bpf_dctcp"), "lfd=>bpf_dctcp"))
|
||||
goto done;
|
||||
|
||||
cli_fd = connect_to_fd_opts(lfd, &opts);
|
||||
if (!ASSERT_GE(cli_fd, 0, "cli_fd"))
|
||||
goto done;
|
||||
|
||||
srv_fd = accept(lfd, NULL, 0);
|
||||
if (!ASSERT_GE(srv_fd, 0, "srv_fd"))
|
||||
goto done;
|
||||
ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res");
|
||||
ASSERT_EQ(dctcp_skel->bss->tcp_cdg_res, -ENOTSUPP, "tcp_cdg_res");
|
||||
|
||||
err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len);
|
||||
if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)"))
|
||||
goto done;
|
||||
ASSERT_STREQ(srv_cc, "cubic", "srv_fd cc");
|
||||
|
||||
done:
|
||||
bpf_link__destroy(link);
|
||||
bpf_dctcp__destroy(dctcp_skel);
|
||||
if (lfd != -1)
|
||||
close(lfd);
|
||||
if (srv_fd != -1)
|
||||
close(srv_fd);
|
||||
if (cli_fd != -1)
|
||||
close(cli_fd);
|
||||
}
|
||||
|
||||
static void test_rel_setsockopt(void)
|
||||
{
|
||||
struct bpf_dctcp_release *rel_skel;
|
||||
libbpf_print_fn_t old_print_fn;
|
||||
|
||||
err_str = "unknown func bpf_setsockopt";
|
||||
found = false;
|
||||
|
||||
old_print_fn = libbpf_set_print(libbpf_debug_print);
|
||||
rel_skel = bpf_dctcp_release__open_and_load();
|
||||
libbpf_set_print(old_print_fn);
|
||||
|
||||
ASSERT_ERR_PTR(rel_skel, "rel_skel");
|
||||
ASSERT_TRUE(found, "expected_err_msg");
|
||||
|
||||
bpf_dctcp_release__destroy(rel_skel);
|
||||
}
|
||||
|
||||
void test_bpf_tcp_ca(void)
|
||||
{
|
||||
if (test__start_subtest("dctcp"))
|
||||
|
@ -275,4 +331,8 @@ void test_bpf_tcp_ca(void)
|
|||
test_cubic();
|
||||
if (test__start_subtest("invalid_license"))
|
||||
test_invalid_license();
|
||||
if (test__start_subtest("dctcp_fallback"))
|
||||
test_dctcp_fallback();
|
||||
if (test__start_subtest("rel_setsockopt"))
|
||||
test_rel_setsockopt();
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ static void test_subprog(void)
|
|||
ASSERT_OK(err, "bpf_prog_test_run(test1)");
|
||||
ASSERT_EQ(retval, 10, "test1-retval");
|
||||
ASSERT_NEQ(skel->data->active_res, -1, "active_res");
|
||||
ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state");
|
||||
ASSERT_EQ(skel->data->sk_state_res, BPF_TCP_CLOSE, "sk_state_res");
|
||||
|
||||
kfunc_call_test_subprog__destroy(skel);
|
||||
}
|
||||
|
|
|
@ -17,6 +17,11 @@
|
|||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
volatile const char fallback[TCP_CA_NAME_MAX];
|
||||
const char bpf_dctcp[] = "bpf_dctcp";
|
||||
const char tcp_cdg[] = "cdg";
|
||||
char cc_res[TCP_CA_NAME_MAX];
|
||||
int tcp_cdg_res = 0;
|
||||
int stg_result = 0;
|
||||
|
||||
struct {
|
||||
|
@ -57,6 +62,26 @@ void BPF_PROG(dctcp_init, struct sock *sk)
|
|||
struct dctcp *ca = inet_csk_ca(sk);
|
||||
int *stg;
|
||||
|
||||
if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
|
||||
/* Switch to fallback */
|
||||
bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
|
||||
(void *)fallback, sizeof(fallback));
|
||||
/* Switch back to myself which the bpf trampoline
|
||||
* stopped calling dctcp_init recursively.
|
||||
*/
|
||||
bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
|
||||
(void *)bpf_dctcp, sizeof(bpf_dctcp));
|
||||
/* Switch back to fallback */
|
||||
bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
|
||||
(void *)fallback, sizeof(fallback));
|
||||
/* Expecting -ENOTSUPP for tcp_cdg_res */
|
||||
tcp_cdg_res = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
|
||||
(void *)tcp_cdg, sizeof(tcp_cdg));
|
||||
bpf_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
|
||||
(void *)cc_res, sizeof(cc_res));
|
||||
return;
|
||||
}
|
||||
|
||||
ca->prior_rcv_nxt = tp->rcv_nxt;
|
||||
ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
|
||||
ca->loss_cwnd = 0;
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (c) 2021 Facebook */
|
||||
|
||||
#include <stddef.h>
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/stddef.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include "bpf_tcp_helpers.h"
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
const char cubic[] = "cubic";
|
||||
|
||||
/* release() callback whose only action is a bpf_setsockopt() call asking
 * for the "cubic" congestion control. The return value is ignored.
 *
 * The user-space test in this series expects loading this object to fail
 * with "unknown func bpf_setsockopt".
 */
void BPF_STRUCT_OPS(dctcp_nouse_release, struct sock *sk)
{
	void *cc_name = (void *)cubic;

	bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION, cc_name, sizeof(cubic));
}
|
||||
|
||||
/* struct_ops instance named "bpf_dctcp_rel"; it sets only the release
 * callback and the name.
 */
SEC(".struct_ops")
struct tcp_congestion_ops dctcp_rel = {
	.release	= (void *)dctcp_nouse_release,
	.name		= "bpf_dctcp_rel",
};
|
|
@ -9,7 +9,7 @@ extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b,
|
|||
__u32 c, __u64 d) __ksym;
|
||||
extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym;
|
||||
int active_res = -1;
|
||||
int sk_state = -1;
|
||||
int sk_state_res = -1;
|
||||
|
||||
int __noinline f1(struct __sk_buff *skb)
|
||||
{
|
||||
|
@ -28,7 +28,7 @@ int __noinline f1(struct __sk_buff *skb)
|
|||
if (active)
|
||||
active_res = *active;
|
||||
|
||||
sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state;
|
||||
sk_state_res = bpf_kfunc_call_test3((struct sock *)sk)->sk_state;
|
||||
|
||||
return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4);
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче