2020-05-31 11:28:36 +03:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
#ifndef _BPF_NETNS_H
|
|
|
|
#define _BPF_NETNS_H
|
|
|
|
|
|
|
|
#include <linux/mutex.h>
|
|
|
|
#include <uapi/linux/bpf.h>
|
|
|
|
|
|
|
|
enum netns_bpf_attach_type {
|
|
|
|
NETNS_BPF_INVALID = -1,
|
|
|
|
NETNS_BPF_FLOW_DISSECTOR = 0,
|
bpf: Introduce SK_LOOKUP program type with a dedicated attach point
Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type
BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer
when looking up a listening socket for a new connection request for
connection oriented protocols, or when looking up an unconnected socket for
a packet for connection-less protocols.
When called, SK_LOOKUP BPF program can select a socket that will receive
the packet. This serves as a mechanism to overcome the limits of what
bind() API allows to express. Two use-cases driving this work are:
(1) steer packets destined to an IP range, on fixed port to a socket
192.0.2.0/24, port 80 -> NGINX socket
(2) steer packets destined to an IP address, on any port to a socket
198.51.100.1, any port -> L7 proxy socket
In its run-time context program receives information about the packet that
triggered the socket lookup. Namely IP version, L4 protocol identifier, and
address 4-tuple. Context can be further extended to include ingress
interface identifier.
To select a socket BPF program fetches it from a map holding socket
references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...)
helper to record the selection. Transport layer then uses the selected
socket as a result of socket lookup.
In its basic form, SK_LOOKUP acts as a filter and hence must return either
SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should
look for a socket to receive the packet, or use the one selected by the
program if available, while SK_DROP informs the transport layer that the
lookup should fail.
This patch only enables the user to attach an SK_LOOKUP program to a
network namespace. Subsequent patches hook it up to run on local delivery
path in ipv4 and ipv6 stacks.
Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com
2020-07-17 13:35:23 +03:00
|
|
|
NETNS_BPF_SK_LOOKUP,
|
2020-05-31 11:28:36 +03:00
|
|
|
MAX_NETNS_BPF_ATTACH_TYPE
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline enum netns_bpf_attach_type
|
|
|
|
to_netns_bpf_attach_type(enum bpf_attach_type attach_type)
|
|
|
|
{
|
|
|
|
switch (attach_type) {
|
|
|
|
case BPF_FLOW_DISSECTOR:
|
|
|
|
return NETNS_BPF_FLOW_DISSECTOR;
|
bpf: Introduce SK_LOOKUP program type with a dedicated attach point
Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type
BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer
when looking up a listening socket for a new connection request for
connection oriented protocols, or when looking up an unconnected socket for
a packet for connection-less protocols.
When called, SK_LOOKUP BPF program can select a socket that will receive
the packet. This serves as a mechanism to overcome the limits of what
bind() API allows to express. Two use-cases driving this work are:
(1) steer packets destined to an IP range, on fixed port to a socket
192.0.2.0/24, port 80 -> NGINX socket
(2) steer packets destined to an IP address, on any port to a socket
198.51.100.1, any port -> L7 proxy socket
In its run-time context program receives information about the packet that
triggered the socket lookup. Namely IP version, L4 protocol identifier, and
address 4-tuple. Context can be further extended to include ingress
interface identifier.
To select a socket BPF program fetches it from a map holding socket
references, like SOCKMAP or SOCKHASH, and calls bpf_sk_assign(ctx, sk, ...)
helper to record the selection. Transport layer then uses the selected
socket as a result of socket lookup.
In its basic form, SK_LOOKUP acts as a filter and hence must return either
SK_PASS or SK_DROP. If the program returns with SK_PASS, transport should
look for a socket to receive the packet, or use the one selected by the
program if available, while SK_DROP informs the transport layer that the
lookup should fail.
This patch only enables the user to attach an SK_LOOKUP program to a
network namespace. Subsequent patches hook it up to run on local delivery
path in ipv4 and ipv6 stacks.
Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com
2020-07-17 13:35:23 +03:00
|
|
|
case BPF_SK_LOOKUP:
|
|
|
|
return NETNS_BPF_SK_LOOKUP;
|
2020-05-31 11:28:36 +03:00
|
|
|
default:
|
|
|
|
return NETNS_BPF_INVALID;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Protects updates to netns_bpf */
|
|
|
|
extern struct mutex netns_bpf_mutex;
|
|
|
|
|
|
|
|
union bpf_attr;
|
|
|
|
struct bpf_prog;
|
|
|
|
|
|
|
|
#ifdef CONFIG_NET
|
|
|
|
int netns_bpf_prog_query(const union bpf_attr *attr,
|
|
|
|
union bpf_attr __user *uattr);
|
|
|
|
int netns_bpf_prog_attach(const union bpf_attr *attr,
|
|
|
|
struct bpf_prog *prog);
|
2020-06-29 12:56:26 +03:00
|
|
|
int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
|
bpf: Add link-based BPF program attachment to network namespace
Extend bpf() syscall subcommands that operate on bpf_link, that is
LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO, to accept attach types tied to
network namespaces (only flow dissector at the moment).
Link-based and prog-based attachment can be used interchangeably, but only
one can exist at a time. Attempts to attach a link when a prog is already
attached directly, and the other way around, will be met with -EEXIST.
Attempts to detach a program when link exists result in -EINVAL.
Attachment of multiple links of same attach type to one netns is not
supported with the intention to lift the restriction when a use-case
presents itself. Because of that link create returns -E2BIG when trying to
create another netns link, when one already exists.
Link-based attachments to netns don't keep a netns alive by holding a ref
to it. Instead links get auto-detached from netns when the latter is being
destroyed, using a pernet pre_exit callback.
When auto-detached, link lives in defunct state as long there are open FDs
for it. -ENOLINK is returned if a user tries to update a defunct link.
Because bpf_link to netns doesn't hold a ref to struct net, special care is
taken when releasing, updating, or filling link info. The netns might be
getting torn down when any of these link operations are in progress. That
is why auto-detach and update/release/fill_info are synchronized by the
same mutex. Also, link ops have to always check if auto-detach has not
happened yet and if netns is still alive (refcnt > 0).
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200531082846.2117903-5-jakub@cloudflare.com
2020-05-31 11:28:38 +03:00
|
|
|
int netns_bpf_link_create(const union bpf_attr *attr,
|
|
|
|
struct bpf_prog *prog);
|
2020-05-31 11:28:36 +03:00
|
|
|
#else
|
|
|
|
static inline int netns_bpf_prog_query(const union bpf_attr *attr,
|
|
|
|
union bpf_attr __user *uattr)
|
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int netns_bpf_prog_attach(const union bpf_attr *attr,
|
|
|
|
struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
2020-06-29 12:56:26 +03:00
|
|
|
static inline int netns_bpf_prog_detach(const union bpf_attr *attr,
|
|
|
|
enum bpf_prog_type ptype)
|
2020-05-31 11:28:36 +03:00
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
bpf: Add link-based BPF program attachment to network namespace
Extend bpf() syscall subcommands that operate on bpf_link, that is
LINK_CREATE, LINK_UPDATE, OBJ_GET_INFO, to accept attach types tied to
network namespaces (only flow dissector at the moment).
Link-based and prog-based attachment can be used interchangeably, but only
one can exist at a time. Attempts to attach a link when a prog is already
attached directly, and the other way around, will be met with -EEXIST.
Attempts to detach a program when link exists result in -EINVAL.
Attachment of multiple links of same attach type to one netns is not
supported with the intention to lift the restriction when a use-case
presents itself. Because of that link create returns -E2BIG when trying to
create another netns link, when one already exists.
Link-based attachments to netns don't keep a netns alive by holding a ref
to it. Instead links get auto-detached from netns when the latter is being
destroyed, using a pernet pre_exit callback.
When auto-detached, link lives in defunct state as long there are open FDs
for it. -ENOLINK is returned if a user tries to update a defunct link.
Because bpf_link to netns doesn't hold a ref to struct net, special care is
taken when releasing, updating, or filling link info. The netns might be
getting torn down when any of these link operations are in progress. That
is why auto-detach and update/release/fill_info are synchronized by the
same mutex. Also, link ops have to always check if auto-detach has not
happened yet and if netns is still alive (refcnt > 0).
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200531082846.2117903-5-jakub@cloudflare.com
2020-05-31 11:28:38 +03:00
|
|
|
|
|
|
|
static inline int netns_bpf_link_create(const union bpf_attr *attr,
|
|
|
|
struct bpf_prog *prog)
|
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
2020-05-31 11:28:36 +03:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#endif /* _BPF_NETNS_H */
|