bpf: Support socket migration by eBPF.
This patch introduces a new bpf_attach_type for BPF_PROG_TYPE_SK_REUSEPORT
to check if the attached eBPF program is capable of migrating sockets. When
the eBPF program is attached, we run it for socket migration if the
expected_attach_type is BPF_SK_REUSEPORT_SELECT_OR_MIGRATE or
net.ipv4.tcp_migrate_req is enabled.
Currently, the expected_attach_type is not enforced for the
BPF_PROG_TYPE_SK_REUSEPORT type of program. Thus, this commit follows the
earlier idea in the commit aac3fc320d
("bpf: Post-hooks for sys_bind") to
fix up the zero expected_attach_type in bpf_prog_load_fixup_attach_type().
Moreover, this patch adds a new field (migrating_sk) to sk_reuseport_md to
select a new listener based on the child socket. migrating_sk varies
depending on whether it is migrating a request in the accept queue or during
3WHS.
- accept_queue : sock (ESTABLISHED/SYN_RECV)
- 3WHS : request_sock (NEW_SYN_RECV)
In the eBPF program, we can select a new listener by
BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning
SK_DROP. This feature is useful when listeners have different settings at
the socket API level or when we want to free resources as soon as possible.
- SK_PASS with selected_sk, select it as a new listener
- SK_PASS with selected_sk NULL, falls back to the random selection
- SK_DROP, cancel the migration.
There is a noteworthy point. We select a listening socket in three places,
but we do not have struct skb at closing a listener or retransmitting a
SYN+ACK. On the other hand, some helper functions do not expect skb to be NULL
(e.g. skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer()
in BPF_FUNC_skb_load_bytes_relative()). So we allocate an empty skb
temporarily before running the eBPF program.
Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/netdev/20201123003828.xjpjdtk4ygl6tg6h@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/netdev/20201203042402.6cskdlit5f3mw4ru@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/bpf/20210612123224.12525-10-kuniyu@amazon.co.jp
This commit is contained in:
Родитель
e061047684
Коммит
d5e4ddaeb6
|
@ -2048,6 +2048,7 @@ struct sk_reuseport_kern {
|
||||||
struct sk_buff *skb;
|
struct sk_buff *skb;
|
||||||
struct sock *sk;
|
struct sock *sk;
|
||||||
struct sock *selected_sk;
|
struct sock *selected_sk;
|
||||||
|
struct sock *migrating_sk;
|
||||||
void *data_end;
|
void *data_end;
|
||||||
u32 hash;
|
u32 hash;
|
||||||
u32 reuseport_id;
|
u32 reuseport_id;
|
||||||
|
|
|
@ -996,11 +996,13 @@ void bpf_warn_invalid_xdp_action(u32 act);
|
||||||
#ifdef CONFIG_INET
|
#ifdef CONFIG_INET
|
||||||
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
|
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
|
||||||
struct bpf_prog *prog, struct sk_buff *skb,
|
struct bpf_prog *prog, struct sk_buff *skb,
|
||||||
|
struct sock *migrating_sk,
|
||||||
u32 hash);
|
u32 hash);
|
||||||
#else
|
#else
|
||||||
static inline struct sock *
|
static inline struct sock *
|
||||||
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
|
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
|
||||||
struct bpf_prog *prog, struct sk_buff *skb,
|
struct bpf_prog *prog, struct sk_buff *skb,
|
||||||
|
struct sock *migrating_sk,
|
||||||
u32 hash)
|
u32 hash)
|
||||||
{
|
{
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
|
@ -994,6 +994,8 @@ enum bpf_attach_type {
|
||||||
BPF_SK_LOOKUP,
|
BPF_SK_LOOKUP,
|
||||||
BPF_XDP,
|
BPF_XDP,
|
||||||
BPF_SK_SKB_VERDICT,
|
BPF_SK_SKB_VERDICT,
|
||||||
|
BPF_SK_REUSEPORT_SELECT,
|
||||||
|
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
|
||||||
__MAX_BPF_ATTACH_TYPE
|
__MAX_BPF_ATTACH_TYPE
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -5416,7 +5418,20 @@ struct sk_reuseport_md {
|
||||||
__u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
|
__u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
|
||||||
__u32 bind_inany; /* Is sock bound to an INANY address? */
|
__u32 bind_inany; /* Is sock bound to an INANY address? */
|
||||||
__u32 hash; /* A hash of the packet 4 tuples */
|
__u32 hash; /* A hash of the packet 4 tuples */
|
||||||
|
/* When reuse->migrating_sk is NULL, it is selecting a sk for the
|
||||||
|
* new incoming connection request (e.g. selecting a listen sk for
|
||||||
|
* the received SYN in the TCP case). reuse->sk is one of the sk
|
||||||
|
* in the reuseport group. The bpf prog can use reuse->sk to learn
|
||||||
|
* the local listening ip/port without looking into the skb.
|
||||||
|
*
|
||||||
|
* When reuse->migrating_sk is not NULL, reuse->sk is closed and
|
||||||
|
* reuse->migrating_sk is the socket that needs to be migrated
|
||||||
|
* to another listening socket. migrating_sk could be a fullsock
|
||||||
|
* sk that is fully established or a reqsk that is in-the-middle
|
||||||
|
* of 3-way handshake.
|
||||||
|
*/
|
||||||
__bpf_md_ptr(struct bpf_sock *, sk);
|
__bpf_md_ptr(struct bpf_sock *, sk);
|
||||||
|
__bpf_md_ptr(struct bpf_sock *, migrating_sk);
|
||||||
};
|
};
|
||||||
|
|
||||||
#define BPF_TAG_SIZE 8
|
#define BPF_TAG_SIZE 8
|
||||||
|
|
|
@ -1972,6 +1972,11 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
|
||||||
attr->expected_attach_type =
|
attr->expected_attach_type =
|
||||||
BPF_CGROUP_INET_SOCK_CREATE;
|
BPF_CGROUP_INET_SOCK_CREATE;
|
||||||
break;
|
break;
|
||||||
|
case BPF_PROG_TYPE_SK_REUSEPORT:
|
||||||
|
if (!attr->expected_attach_type)
|
||||||
|
attr->expected_attach_type =
|
||||||
|
BPF_SK_REUSEPORT_SELECT;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2055,6 +2060,14 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
|
||||||
if (expected_attach_type == BPF_SK_LOOKUP)
|
if (expected_attach_type == BPF_SK_LOOKUP)
|
||||||
return 0;
|
return 0;
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
case BPF_PROG_TYPE_SK_REUSEPORT:
|
||||||
|
switch (expected_attach_type) {
|
||||||
|
case BPF_SK_REUSEPORT_SELECT:
|
||||||
|
case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
|
||||||
|
return 0;
|
||||||
|
default:
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
case BPF_PROG_TYPE_SYSCALL:
|
case BPF_PROG_TYPE_SYSCALL:
|
||||||
case BPF_PROG_TYPE_EXT:
|
case BPF_PROG_TYPE_EXT:
|
||||||
if (expected_attach_type)
|
if (expected_attach_type)
|
||||||
|
|
|
@ -10044,11 +10044,13 @@ out:
|
||||||
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
|
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
|
||||||
struct sock_reuseport *reuse,
|
struct sock_reuseport *reuse,
|
||||||
struct sock *sk, struct sk_buff *skb,
|
struct sock *sk, struct sk_buff *skb,
|
||||||
|
struct sock *migrating_sk,
|
||||||
u32 hash)
|
u32 hash)
|
||||||
{
|
{
|
||||||
reuse_kern->skb = skb;
|
reuse_kern->skb = skb;
|
||||||
reuse_kern->sk = sk;
|
reuse_kern->sk = sk;
|
||||||
reuse_kern->selected_sk = NULL;
|
reuse_kern->selected_sk = NULL;
|
||||||
|
reuse_kern->migrating_sk = migrating_sk;
|
||||||
reuse_kern->data_end = skb->data + skb_headlen(skb);
|
reuse_kern->data_end = skb->data + skb_headlen(skb);
|
||||||
reuse_kern->hash = hash;
|
reuse_kern->hash = hash;
|
||||||
reuse_kern->reuseport_id = reuse->reuseport_id;
|
reuse_kern->reuseport_id = reuse->reuseport_id;
|
||||||
|
@ -10057,12 +10059,13 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
|
||||||
|
|
||||||
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
|
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
|
||||||
struct bpf_prog *prog, struct sk_buff *skb,
|
struct bpf_prog *prog, struct sk_buff *skb,
|
||||||
|
struct sock *migrating_sk,
|
||||||
u32 hash)
|
u32 hash)
|
||||||
{
|
{
|
||||||
struct sk_reuseport_kern reuse_kern;
|
struct sk_reuseport_kern reuse_kern;
|
||||||
enum sk_action action;
|
enum sk_action action;
|
||||||
|
|
||||||
bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
|
bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
|
||||||
action = BPF_PROG_RUN(prog, &reuse_kern);
|
action = BPF_PROG_RUN(prog, &reuse_kern);
|
||||||
|
|
||||||
if (action == SK_PASS)
|
if (action == SK_PASS)
|
||||||
|
@ -10207,6 +10210,10 @@ sk_reuseport_is_valid_access(int off, int size,
|
||||||
info->reg_type = PTR_TO_SOCKET;
|
info->reg_type = PTR_TO_SOCKET;
|
||||||
return size == sizeof(__u64);
|
return size == sizeof(__u64);
|
||||||
|
|
||||||
|
case offsetof(struct sk_reuseport_md, migrating_sk):
|
||||||
|
info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
|
||||||
|
return size == sizeof(__u64);
|
||||||
|
|
||||||
/* Fields that allow narrowing */
|
/* Fields that allow narrowing */
|
||||||
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
|
case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
|
||||||
if (size < sizeof_field(struct sk_buff, protocol))
|
if (size < sizeof_field(struct sk_buff, protocol))
|
||||||
|
@ -10283,6 +10290,10 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
|
||||||
case offsetof(struct sk_reuseport_md, sk):
|
case offsetof(struct sk_reuseport_md, sk):
|
||||||
SK_REUSEPORT_LOAD_FIELD(sk);
|
SK_REUSEPORT_LOAD_FIELD(sk);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case offsetof(struct sk_reuseport_md, migrating_sk):
|
||||||
|
SK_REUSEPORT_LOAD_FIELD(migrating_sk);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return insn - insn_buf;
|
return insn - insn_buf;
|
||||||
|
|
|
@ -377,13 +377,17 @@ void reuseport_stop_listen_sock(struct sock *sk)
|
||||||
{
|
{
|
||||||
if (sk->sk_protocol == IPPROTO_TCP) {
|
if (sk->sk_protocol == IPPROTO_TCP) {
|
||||||
struct sock_reuseport *reuse;
|
struct sock_reuseport *reuse;
|
||||||
|
struct bpf_prog *prog;
|
||||||
|
|
||||||
spin_lock_bh(&reuseport_lock);
|
spin_lock_bh(&reuseport_lock);
|
||||||
|
|
||||||
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
|
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
|
||||||
lockdep_is_held(&reuseport_lock));
|
lockdep_is_held(&reuseport_lock));
|
||||||
|
prog = rcu_dereference_protected(reuse->prog,
|
||||||
|
lockdep_is_held(&reuseport_lock));
|
||||||
|
|
||||||
if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req) {
|
if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req ||
|
||||||
|
(prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
|
||||||
/* Migration capable, move sk from the listening section
|
/* Migration capable, move sk from the listening section
|
||||||
* to the closed section.
|
* to the closed section.
|
||||||
*/
|
*/
|
||||||
|
@ -488,7 +492,7 @@ struct sock *reuseport_select_sock(struct sock *sk,
|
||||||
goto select_by_hash;
|
goto select_by_hash;
|
||||||
|
|
||||||
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
|
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
|
||||||
sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
|
sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
|
||||||
else
|
else
|
||||||
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
|
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
|
||||||
|
|
||||||
|
@ -519,6 +523,8 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
|
||||||
{
|
{
|
||||||
struct sock_reuseport *reuse;
|
struct sock_reuseport *reuse;
|
||||||
struct sock *nsk = NULL;
|
struct sock *nsk = NULL;
|
||||||
|
bool allocated = false;
|
||||||
|
struct bpf_prog *prog;
|
||||||
u16 socks;
|
u16 socks;
|
||||||
u32 hash;
|
u32 hash;
|
||||||
|
|
||||||
|
@ -536,10 +542,30 @@ struct sock *reuseport_migrate_sock(struct sock *sk,
|
||||||
smp_rmb();
|
smp_rmb();
|
||||||
|
|
||||||
hash = migrating_sk->sk_hash;
|
hash = migrating_sk->sk_hash;
|
||||||
if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
|
prog = rcu_dereference(reuse->prog);
|
||||||
|
if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
|
||||||
|
if (sock_net(sk)->ipv4.sysctl_tcp_migrate_req)
|
||||||
|
goto select_by_hash;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!skb) {
|
||||||
|
skb = alloc_skb(0, GFP_ATOMIC);
|
||||||
|
if (!skb)
|
||||||
|
goto out;
|
||||||
|
allocated = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
|
||||||
|
|
||||||
|
if (allocated)
|
||||||
|
kfree_skb(skb);
|
||||||
|
|
||||||
|
select_by_hash:
|
||||||
|
if (!nsk)
|
||||||
nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
|
nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
|
||||||
|
|
||||||
if (nsk && unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
|
if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt)))
|
||||||
nsk = NULL;
|
nsk = NULL;
|
||||||
|
|
||||||
out:
|
out:
|
||||||
|
|
|
@ -994,6 +994,8 @@ enum bpf_attach_type {
|
||||||
BPF_SK_LOOKUP,
|
BPF_SK_LOOKUP,
|
||||||
BPF_XDP,
|
BPF_XDP,
|
||||||
BPF_SK_SKB_VERDICT,
|
BPF_SK_SKB_VERDICT,
|
||||||
|
BPF_SK_REUSEPORT_SELECT,
|
||||||
|
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
|
||||||
__MAX_BPF_ATTACH_TYPE
|
__MAX_BPF_ATTACH_TYPE
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -5416,7 +5418,20 @@ struct sk_reuseport_md {
|
||||||
__u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
|
__u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
|
||||||
__u32 bind_inany; /* Is sock bound to an INANY address? */
|
__u32 bind_inany; /* Is sock bound to an INANY address? */
|
||||||
__u32 hash; /* A hash of the packet 4 tuples */
|
__u32 hash; /* A hash of the packet 4 tuples */
|
||||||
|
/* When reuse->migrating_sk is NULL, it is selecting a sk for the
|
||||||
|
* new incoming connection request (e.g. selecting a listen sk for
|
||||||
|
* the received SYN in the TCP case). reuse->sk is one of the sk
|
||||||
|
* in the reuseport group. The bpf prog can use reuse->sk to learn
|
||||||
|
* the local listening ip/port without looking into the skb.
|
||||||
|
*
|
||||||
|
* When reuse->migrating_sk is not NULL, reuse->sk is closed and
|
||||||
|
* reuse->migrating_sk is the socket that needs to be migrated
|
||||||
|
* to another listening socket. migrating_sk could be a fullsock
|
||||||
|
* sk that is fully established or a reqsk that is in-the-middle
|
||||||
|
* of 3-way handshake.
|
||||||
|
*/
|
||||||
__bpf_md_ptr(struct bpf_sock *, sk);
|
__bpf_md_ptr(struct bpf_sock *, sk);
|
||||||
|
__bpf_md_ptr(struct bpf_sock *, migrating_sk);
|
||||||
};
|
};
|
||||||
|
|
||||||
#define BPF_TAG_SIZE 8
|
#define BPF_TAG_SIZE 8
|
||||||
|
|
Загрузка…
Ссылка в новой задаче