From 69495d2a52957c415d11312fe37844062a48fd32 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Mon, 3 Sep 2018 08:30:07 +0900 Subject: [PATCH 01/29] libbpf: Remove the duplicate checking of function storage After the commit eac7d84519a3 ("tools: libbpf: don't return '.text' as a program for multi-function programs"), bpf_program__next() in bpf_object__for_each_program skips the function storage such as .text, so eliminate the duplicate checking. Cc: Jakub Kicinski Signed-off-by: Taeung Song Acked-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2abd0f112627..8476da7f2720 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2336,7 +2336,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, bpf_program__set_expected_attach_type(prog, expected_attach_type); - if (!bpf_program__is_function_storage(prog, obj) && !first_prog) + if (!first_prog) first_prog = prog; } From a9c676bc8fc58d00eea9836fb14ee43c0346416a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 4 Sep 2018 19:13:44 -0700 Subject: [PATCH 02/29] bpf/verifier: fix verifier instability Edward Cree says: In check_mem_access(), for the PTR_TO_CTX case, after check_ctx_access() has supplied a reg_type, the other members of the register state are set appropriately. Previously reg.range was set to 0, but as it is in a union with reg.map_ptr, which is larger, upper bytes of the latter were left in place. This then caused the memcmp() in regsafe() to fail, preventing some branches from being pruned (and occasionally causing the same program to take a varying number of processed insns on repeated verifier runs). Fix the instability by clearing bpf_reg_state in __mark_reg_[un]known() Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Debugged-by: Edward Cree Acked-by: Edward Cree Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f4ff0c569e54..6ff1bac1795d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -570,7 +570,9 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg); */ static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) { - reg->id = 0; + /* Clear id, off, and union(map_ptr, range) */ + memset(((u8 *)reg) + sizeof(reg->type), 0, + offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); reg->var_off = tnum_const(imm); reg->smin_value = (s64)imm; reg->smax_value = (s64)imm; @@ -589,7 +591,6 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) static void __mark_reg_const_zero(struct bpf_reg_state *reg) { __mark_reg_known(reg, 0); - reg->off = 0; reg->type = SCALAR_VALUE; } @@ -700,9 +701,12 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) /* Mark a register as having a completely unknown (scalar) value. */ static void __mark_reg_unknown(struct bpf_reg_state *reg) { + /* + * Clear type, id, off, and union(map_ptr, range) and + * padding between 'type' and union + */ + memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); reg->type = SCALAR_VALUE; - reg->id = 0; - reg->off = 0; reg->var_off = tnum_unknown; reg->frameno = 0; __mark_reg_unbounded(reg); @@ -1640,9 +1644,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn else mark_reg_known_zero(env, regs, value_regno); - regs[value_regno].id = 0; - regs[value_regno].off = 0; - regs[value_regno].range = 0; regs[value_regno].type = reg_type; } @@ -2495,7 +2496,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() From e1302542e37ebbc1de70982e93bd777397f90712 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Sep 2018 09:54:57 +0200 Subject: [PATCH 03/29] xdp: unlikely instrumentation for xdp map redirect Notice the compiler generated ASM code layout was suboptimal. It assumed map enqueue errors as the likely case, which is shouldn't. It assumed that xdp_do_flush_map() was a likely case, due to maps changing between packets, which should be very unlikely. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index d301134bca3a..80200f719028 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3187,7 +3187,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); - if (err) + if (unlikely(err)) return err; __dev_map_insert_ctx(map, index); break; @@ -3196,7 +3196,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); - if (err) + if (unlikely(err)) return err; __cpu_map_insert_ctx(map, index); break; @@ -3284,7 +3284,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, err = -EINVAL; goto err; } - if (ri->map_to_flush && ri->map_to_flush != map) + if (ri->map_to_flush && unlikely(ri->map_to_flush != map)) xdp_do_flush_map(); err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); From 2a68d85fe1b7511731dedf73c4003fec2880f1ce Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Sep 2018 09:55:02 +0200 Subject: [PATCH 04/29] xdp: explicit inline __xdp_map_lookup_elem The compiler chooses to not-inline the function __xdp_map_lookup_elem, because it can see that it is used by both Generic-XDP and native-XDP do redirect calls (xdp_do_generic_redirect_map and xdp_do_redirect_map). The compiler cannot know that this is a bad choice, as it cannot know that a net device cannot run both XDP modes (Generic or Native) at the same time. Thus, mark this function inline, even-though we normally leave this up-to the compiler. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 80200f719028..d7dbe412cb9e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3237,7 +3237,7 @@ void xdp_do_flush_map(void) } EXPORT_SYMBOL_GPL(xdp_do_flush_map); -static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) +static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: @@ -3280,7 +3280,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); - if (!fwd) { + if (unlikely(!fwd)) { err = -EINVAL; goto err; } @@ -3308,7 +3308,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, u32 index = ri->ifindex; int err; - if (map) + if (likely(map)) return xdp_do_redirect_map(dev, xdp, xdp_prog, map); fwd = dev_get_by_index_rcu(dev_net(dev), index); From 47b123ed9e99b064dd2b250e1b62e1d91dd876ee Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 3 Sep 2018 09:55:07 +0200 Subject: [PATCH 05/29] xdp: split code for map vs non-map redirect The compiler does an efficient job of inlining static C functions. Perf top clearly shows that almost everything gets inlined into the function call xdp_do_redirect. The function xdp_do_redirect end-up containing and interleaving the map and non-map redirect code. This is sub-optimal, as it would be strange for an XDP program to use both types of redirect in the same program. The two use-cases are separate, and interleaving the code just cause more instruction-cache pressure. I would like to stress (again) that the non-map variant bpf_redirect is very slow compared to the bpf_redirect_map variant, approx half the speed. Measured with driver i40e the difference is: - map redirect: 13,250,350 pps - non-map redirect: 7,491,425 pps For this reason, the function name of the non-map variant of redirect have been called xdp_do_redirect_slow. This hopefully gives a hint when using perf, that this is not the optimal XDP redirect operating mode. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 52 +++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index d7dbe412cb9e..8cb242b4400f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3175,6 +3175,32 @@ static int __bpf_tx_xdp(struct net_device *dev, return 0; } +static noinline int +xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri) +{ + struct net_device *fwd; + u32 index = ri->ifindex; + int err; + + fwd = dev_get_by_index_rcu(dev_net(dev), index); + ri->ifindex = 0; + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = __bpf_tx_xdp(fwd, NULL, xdp, 0); + if (unlikely(err)) + goto err; + + _trace_xdp_redirect(dev, xdp_prog, index); + return 0; +err: + _trace_xdp_redirect_err(dev, xdp_prog, index, err); + return err; +} + static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, struct xdp_buff *xdp, @@ -3269,9 +3295,9 @@ void bpf_clear_redirect_map(struct bpf_map *map) } static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, struct bpf_map *map) + struct bpf_prog *xdp_prog, struct bpf_map *map, + struct bpf_redirect_info *ri) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; void *fwd = NULL; int err; @@ -3304,29 +3330,11 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); - struct net_device *fwd; - u32 index = ri->ifindex; - int err; if (likely(map)) - return xdp_do_redirect_map(dev, xdp, xdp_prog, map); + return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri); - fwd = dev_get_by_index_rcu(dev_net(dev), index); - ri->ifindex = 0; - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } - - err = __bpf_tx_xdp(fwd, NULL, xdp, 0); - if (unlikely(err)) - goto err; - - _trace_xdp_redirect(dev, xdp_prog, index); - return 0; -err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); - return err; + return xdp_do_redirect_slow(dev, xdp, xdp_prog, ri); } EXPORT_SYMBOL_GPL(xdp_do_redirect); From ad1242d8a063ceb8c6e1b9c1a63b73ec94fa0295 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Mon, 3 Sep 2018 18:05:27 +0200 Subject: [PATCH 06/29] selftests/bpf: add missing executables to .gitignore Signed-off-by: Mauricio Vasquez B Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 49938d72cf63..4d789c1e5167 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -19,3 +19,7 @@ test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user +test_skb_cgroup_id_user +test_socket_cookie +test_cgroup_storage +test_select_reuseport From f5bd3948eb07e76fcd73d0b8ab7b3265be226038 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Mon, 3 Sep 2018 19:01:59 +0200 Subject: [PATCH 07/29] selftests/bpf/test_progs: do not check errno == 0 The errno man page states: "The value in errno is significant only when the return value of the call indicated an error..." then it is not correct to check it, it could be different than zero even if the function succeeded. It causes some false positives if errno is set by a previous function. Signed-off-by: Mauricio Vasquez B Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_progs.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 0ef68204c84b..63a671803ed6 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -112,13 +112,13 @@ static void test_pkt_access(void) err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4), NULL, NULL, &retval, &duration); - CHECK(err || errno || retval, "ipv4", + CHECK(err || retval, "ipv4", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); err = bpf_prog_test_run(prog_fd, 100000, &pkt_v6, sizeof(pkt_v6), NULL, NULL, &retval, &duration); - CHECK(err || errno || retval, "ipv6", + CHECK(err || retval, "ipv6", "err %d errno %d retval %d duration %d\n", err, errno, retval, duration); bpf_object__close(obj); @@ -153,14 +153,14 @@ static void test_xdp(void) err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); - CHECK(err || errno || retval != XDP_TX || size != 74 || + CHECK(err || retval != XDP_TX || size != 74 || iph->protocol != IPPROTO_IPIP, "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || errno || retval != XDP_TX || size != 114 || + CHECK(err || retval != XDP_TX || size != 114 || iph6->nexthdr != IPPROTO_IPV6, "ipv6", "err %d errno %d retval %d size %d\n", err, errno, retval, size); @@ -185,13 +185,13 @@ static void test_xdp_adjust_tail(void) err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); - CHECK(err || errno || retval != XDP_DROP, + CHECK(err || retval != XDP_DROP, "ipv4", "err %d errno %d retval %d size %d\n", err, errno, retval, size); err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || errno || retval != XDP_TX || size != 54, + CHECK(err || retval != XDP_TX || size != 54, "ipv6", "err %d errno %d retval %d size %d\n", err, errno, retval, size); bpf_object__close(obj); @@ -254,14 +254,14 @@ static void test_l4lb(const char *file) err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); - CHECK(err || errno || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 || + CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 || *magic != MAGIC_VAL, "ipv4", "err %d errno %d retval %d size %d magic %x\n", err, errno, retval, size, *magic); err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || errno || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 || + CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 || *magic != MAGIC_VAL, "ipv6", "err %d errno %d retval %d size %d magic %x\n", err, errno, retval, size, *magic); @@ -343,14 +343,14 @@ static void test_xdp_noinline(void) err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4), buf, &size, &retval, &duration); - CHECK(err || errno || retval != 1 || size != 54 || + CHECK(err || retval != 1 || size != 54 || *magic != MAGIC_VAL, "ipv4", "err %d errno %d retval %d size %d magic %x\n", err, errno, retval, size, *magic); err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6), buf, &size, &retval, &duration); - CHECK(err || errno || retval != 1 || size != 74 || + CHECK(err || retval != 1 || size != 74 || *magic != MAGIC_VAL, "ipv6", "err %d errno %d retval %d size %d magic %x\n", err, errno, retval, size, *magic); From 52b7b7843d9523ebc3c60c51c7afc4a45cc10aad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 5 Sep 2018 16:58:03 -0700 Subject: [PATCH 08/29] tools/bpf: sync kernel uapi header if_link.h to tools Among others, this header will be used later for bpftool net support. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/if_link.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cf01b6824244..43391e2d1153 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -164,6 +164,8 @@ enum { IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, __IFLA_MAX }; @@ -334,6 +336,7 @@ enum { IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) @@ -459,6 +462,16 @@ enum { #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + enum macsec_validation_type { MACSEC_VALIDATE_DISABLED = 0, MACSEC_VALIDATE_CHECK = 1, @@ -920,6 +933,7 @@ enum { XDP_ATTACHED_DRV, XDP_ATTACHED_SKB, XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, }; enum { @@ -928,6 +942,9 @@ enum { IFLA_XDP_ATTACHED, IFLA_XDP_FLAGS, IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, __IFLA_XDP_MAX, }; From f7010770fbac47b1fc9fb723b1d2019eb23c04f2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 5 Sep 2018 16:58:04 -0700 Subject: [PATCH 09/29] tools/bpf: move bpf/lib netlink related functions into a new file There are no functionality change for this patch. In the subsequent patches, more netlink related library functions will be added and a separate file is better than cluttering bpf.c. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/Build | 2 +- tools/lib/bpf/bpf.c | 129 ------------------------------- tools/lib/bpf/netlink.c | 165 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 130 deletions(-) create mode 100644 tools/lib/bpf/netlink.c diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 13a861135127..512b2c0ba0d2 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o netlink.o diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 60aa4ca8b2c5..3878a26a2071 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -28,16 +28,8 @@ #include #include "bpf.h" #include "libbpf.h" -#include "nlattr.h" -#include -#include -#include #include -#ifndef SOL_NETLINK -#define SOL_NETLINK 270 -#endif - /* * When building perf, unistd.h is overridden. __NR_bpf is * required to be defined explicitly. @@ -499,127 +491,6 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); } -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - struct sockaddr_nl sa; - int sock, seq = 0, len, ret = -1; - char buf[4096]; - struct nlattr *nla, *nla_xdp; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; - struct nlmsghdr *nh; - struct nlmsgerr *err; - socklen_t addrlen; - int one = 1; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (sock < 0) { - return -errno; - } - - if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, - &one, sizeof(one)) < 0) { - fprintf(stderr, "Netlink error reporting not supported\n"); - } - - if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - ret = -errno; - goto cleanup; - } - - addrlen = sizeof(sa); - if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { - ret = -errno; - goto cleanup; - } - - if (addrlen != sizeof(sa)) { - ret = -LIBBPF_ERRNO__INTERNAL; - goto cleanup; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - req.nh.nlmsg_type = RTM_SETLINK; - req.nh.nlmsg_pid = 0; - req.nh.nlmsg_seq = ++seq; - req.ifinfo.ifi_family = AF_UNSPEC; - req.ifinfo.ifi_index = ifindex; - - /* started nested attribute for XDP */ - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | IFLA_XDP; - nla->nla_len = NLA_HDRLEN; - - /* add XDP fd */ - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_FD; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); - memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); - nla->nla_len += nla_xdp->nla_len; - - /* if user passed in any flags, add those too */ - if (flags) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = IFLA_XDP_FLAGS; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); - memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); - nla->nla_len += nla_xdp->nla_len; - } - - req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); - - if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { - ret = -errno; - goto cleanup; - } - - len = recv(sock, buf, sizeof(buf), 0); - if (len < 0) { - ret = -errno; - goto cleanup; - } - - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); - nh = NLMSG_NEXT(nh, len)) { - if (nh->nlmsg_pid != sa.nl_pid) { - ret = -LIBBPF_ERRNO__WRNGPID; - goto cleanup; - } - if (nh->nlmsg_seq != seq) { - ret = -LIBBPF_ERRNO__INVSEQ; - goto cleanup; - } - switch (nh->nlmsg_type) { - case NLMSG_ERROR: - err = (struct nlmsgerr *)NLMSG_DATA(nh); - if (!err->error) - continue; - ret = err->error; - nla_dump_errormsg(nh); - goto cleanup; - case NLMSG_DONE: - break; - default: - break; - } - } - - ret = 0; - -cleanup: - close(sock); - return ret; -} - int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log) { diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c new file mode 100644 index 000000000000..ccaa991fe9d8 --- /dev/null +++ b/tools/lib/bpf/netlink.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: LGPL-2.1 +/* Copyright (c) 2018 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bpf.h" +#include "libbpf.h" +#include "nlattr.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +static int bpf_netlink_open(__u32 *nl_pid) +{ + struct sockaddr_nl sa; + socklen_t addrlen; + int one = 1, ret; + int sock; + + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) + return -errno; + + if (setsockopt(sock, SOL_NETLINK, NETLINK_EXT_ACK, + &one, sizeof(one)) < 0) { + fprintf(stderr, "Netlink error reporting not supported\n"); + } + + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + ret = -errno; + goto cleanup; + } + + addrlen = sizeof(sa); + if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { + ret = -errno; + goto cleanup; + } + + if (addrlen != sizeof(sa)) { + ret = -LIBBPF_ERRNO__INTERNAL; + goto cleanup; + } + + *nl_pid = sa.nl_pid; + return sock; + +cleanup: + close(sock); + return ret; +} + +static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq) +{ + struct nlmsgerr *err; + struct nlmsghdr *nh; + char buf[4096]; + int len, ret; + + while (1) { + len = recv(sock, buf, sizeof(buf), 0); + if (len < 0) { + ret = -errno; + goto done; + } + + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); + nh = NLMSG_NEXT(nh, len)) { + if (nh->nlmsg_pid != nl_pid) { + ret = -LIBBPF_ERRNO__WRNGPID; + goto done; + } + if (nh->nlmsg_seq != seq) { + ret = -LIBBPF_ERRNO__INVSEQ; + goto done; + } + switch (nh->nlmsg_type) { + case NLMSG_ERROR: + err = (struct nlmsgerr *)NLMSG_DATA(nh); + if (!err->error) + continue; + ret = err->error; + nla_dump_errormsg(nh); + goto done; + case NLMSG_DONE: + return 0; + default: + break; + } + } + } + ret = 0; +done: + return ret; +} + +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) +{ + int sock, seq = 0, ret; + struct nlattr *nla, *nla_xdp; + struct { + struct nlmsghdr nh; + struct ifinfomsg ifinfo; + char attrbuf[64]; + } req; + __u32 nl_pid; + + sock = bpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + req.nh.nlmsg_type = RTM_SETLINK; + req.nh.nlmsg_pid = 0; + req.nh.nlmsg_seq = ++seq; + req.ifinfo.ifi_family = AF_UNSPEC; + req.ifinfo.ifi_index = ifindex; + + /* started nested attribute for XDP */ + nla = (struct nlattr *)(((char *)&req) + + NLMSG_ALIGN(req.nh.nlmsg_len)); + nla->nla_type = NLA_F_NESTED | IFLA_XDP; + nla->nla_len = NLA_HDRLEN; + + /* add XDP fd */ + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FD; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); + memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); + nla->nla_len += nla_xdp->nla_len; + + /* if user passed in any flags, add those too */ + if (flags) { + nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); + nla_xdp->nla_type = IFLA_XDP_FLAGS; + nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); + memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); + nla->nla_len += nla_xdp->nla_len; + } + + req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + + if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { + ret = -errno; + goto cleanup; + } + ret = bpf_netlink_recv(sock, nl_pid, seq); + +cleanup: + close(sock); + return ret; +} From 36f1678d9e0b5d2e0236046d9659e0348b4719a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 5 Sep 2018 16:58:05 -0700 Subject: [PATCH 10/29] tools/bpf: add more netlink functionalities in lib/bpf This patch added a few netlink attribute parsing functions and the netlink API functions to query networking links, tc classes, tc qdiscs and tc filters. For example, the following API is to get networking links: int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg, void *cookie); Note that when the API is called, the user also provided a callback function with the following signature: int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); The "cookie" is the parameter the user passed to the API and will be available for the callback function. The "msg" is the information about the result, e.g., ifinfomsg or tcmsg. The "tb" is the parsed netlink attributes. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/libbpf.h | 16 ++++ tools/lib/bpf/libbpf_errno.c | 1 + tools/lib/bpf/netlink.c | 165 ++++++++++++++++++++++++++++++++++- tools/lib/bpf/nlattr.c | 33 ++++--- tools/lib/bpf/nlattr.h | 38 ++++++++ 5 files changed, 238 insertions(+), 15 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 96c55fac54c3..e3b00e23e181 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -46,6 +46,7 @@ enum libbpf_errno { LIBBPF_ERRNO__PROGTYPE, /* Kernel doesn't support this program type */ LIBBPF_ERRNO__WRNGPID, /* Wrong pid in netlink message */ LIBBPF_ERRNO__INVSEQ, /* Invalid netlink sequence */ + LIBBPF_ERRNO__NLPARSE, /* netlink parsing error */ __LIBBPF_ERRNO__END, }; @@ -297,4 +298,19 @@ int bpf_perf_event_read_simple(void *mem, unsigned long size, unsigned long page_size, void **buf, size_t *buf_len, bpf_perf_event_print_t fn, void *priv); + +struct nlmsghdr; +struct nlattr; +typedef int (*dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); +typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, dump_nlmsg_t, + void *cookie); +int bpf_netlink_open(unsigned int *nl_pid); +int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg, + void *cookie); +int nl_get_class(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_class_nlmsg, void *cookie); +int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_qdisc_nlmsg, void *cookie); +int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + dump_nlmsg_t dump_filter_nlmsg, void *cookie); #endif diff --git a/tools/lib/bpf/libbpf_errno.c b/tools/lib/bpf/libbpf_errno.c index d9ba851bd7f9..2464ade3b326 100644 --- a/tools/lib/bpf/libbpf_errno.c +++ b/tools/lib/bpf/libbpf_errno.c @@ -42,6 +42,7 @@ static const char *libbpf_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(PROGTYPE)] = "Kernel doesn't support this program type", [ERRCODE_OFFSET(WRNGPID)] = "Wrong pid in netlink message", [ERRCODE_OFFSET(INVSEQ)] = "Invalid netlink sequence", + [ERRCODE_OFFSET(NLPARSE)] = "Incorrect netlink message parsing", }; int libbpf_strerror(int err, char *buf, size_t size) diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index ccaa991fe9d8..469e068dd0c5 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -18,7 +18,7 @@ #define SOL_NETLINK 270 #endif -static int bpf_netlink_open(__u32 *nl_pid) +int bpf_netlink_open(__u32 *nl_pid) { struct sockaddr_nl sa; socklen_t addrlen; @@ -61,7 +61,9 @@ cleanup: return ret; } -static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq) +static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq, + __dump_nlmsg_t _fn, dump_nlmsg_t fn, + void *cookie) { struct nlmsgerr *err; struct nlmsghdr *nh; @@ -98,6 +100,11 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq) default: break; } + if (_fn) { + ret = _fn(nh, fn, cookie); + if (ret) + return ret; + } } } ret = 0; @@ -157,9 +164,161 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) ret = -errno; goto cleanup; } - ret = bpf_netlink_recv(sock, nl_pid, seq); + ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL); cleanup: close(sock); return ret; } + +static int __dump_link_nlmsg(struct nlmsghdr *nlh, dump_nlmsg_t dump_link_nlmsg, + void *cookie) +{ + struct nlattr *tb[IFLA_MAX + 1], *attr; + struct ifinfomsg *ifi = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi)); + attr = (struct nlattr *) ((void *) ifi + NLMSG_ALIGN(sizeof(*ifi))); + if (nla_parse(tb, IFLA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_link_nlmsg(cookie, ifi, tb); +} + +int nl_get_link(int sock, unsigned int nl_pid, dump_nlmsg_t dump_link_nlmsg, + void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct ifinfomsg ifm; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .nlh.nlmsg_type = RTM_GETLINK, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .ifm.ifi_family = AF_PACKET, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg, + dump_link_nlmsg, cookie); +} + +static int __dump_class_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_class_nlmsg, void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_class_nlmsg(cookie, t, tb); +} + +int nl_get_class(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_class_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETTCLASS, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_class_nlmsg, + dump_class_nlmsg, cookie); +} + +static int __dump_qdisc_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_qdisc_nlmsg, void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_qdisc_nlmsg(cookie, t, tb); +} + +int nl_get_qdisc(int sock, unsigned int nl_pid, int ifindex, + dump_nlmsg_t dump_qdisc_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETQDISC, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_qdisc_nlmsg, + dump_qdisc_nlmsg, cookie); +} + +static int __dump_filter_nlmsg(struct nlmsghdr *nlh, + dump_nlmsg_t dump_filter_nlmsg, void *cookie) +{ + struct nlattr *tb[TCA_MAX + 1], *attr; + struct tcmsg *t = NLMSG_DATA(nlh); + int len; + + len = nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*t)); + attr = (struct nlattr *) ((void *) t + NLMSG_ALIGN(sizeof(*t))); + if (nla_parse(tb, TCA_MAX, attr, len, NULL) != 0) + return -LIBBPF_ERRNO__NLPARSE; + + return dump_filter_nlmsg(cookie, t, tb); +} + +int nl_get_filter(int sock, unsigned int nl_pid, int ifindex, int handle, + dump_nlmsg_t dump_filter_nlmsg, void *cookie) +{ + struct { + struct nlmsghdr nlh; + struct tcmsg t; + } req = { + .nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)), + .nlh.nlmsg_type = RTM_GETTFILTER, + .nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST, + .t.tcm_family = AF_UNSPEC, + .t.tcm_ifindex = ifindex, + .t.tcm_parent = handle, + }; + int seq = time(NULL); + + req.nlh.nlmsg_seq = seq; + if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0) + return -errno; + + return bpf_netlink_recv(sock, nl_pid, seq, __dump_filter_nlmsg, + dump_filter_nlmsg, cookie); +} diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c index 4719434278b2..49f514119bdb 100644 --- a/tools/lib/bpf/nlattr.c +++ b/tools/lib/bpf/nlattr.c @@ -26,11 +26,6 @@ static uint16_t nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_FLAG] = 0, }; -static int nla_len(const struct nlattr *nla) -{ - return nla->nla_len - NLA_HDRLEN; -} - static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { int totlen = NLA_ALIGN(nla->nla_len); @@ -46,11 +41,6 @@ static int nla_ok(const struct nlattr *nla, int remaining) nla->nla_len <= remaining; } -static void *nla_data(const struct nlattr *nla) -{ - return (char *) nla + NLA_HDRLEN; -} - static int nla_type(const struct nlattr *nla) { return nla->nla_type & NLA_TYPE_MASK; @@ -114,8 +104,8 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) * @see nla_validate * @return 0 on success or a negative error code. */ -static int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, - struct nla_policy *policy) +int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy) { struct nlattr *nla; int rem, err; @@ -146,6 +136,25 @@ errout: return err; } +/** + * Create attribute index based on nested attribute + * @arg tb Index array to be filled (maxtype+1 elements). + * @arg maxtype Maximum attribute type expected and accepted. + * @arg nla Nested Attribute. + * @arg policy Attribute validation policy. + * + * Feeds the stream of attributes nested into the specified attribute + * to nla_parse(). + * + * @see nla_parse + * @return 0 on success or a negative error code. + */ +int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + struct nla_policy *policy) +{ + return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy); +} + /* dump netlink extended ack error message */ int nla_dump_errormsg(struct nlmsghdr *nlh) { diff --git a/tools/lib/bpf/nlattr.h b/tools/lib/bpf/nlattr.h index 931a71f68f93..a6e2396bce7c 100644 --- a/tools/lib/bpf/nlattr.h +++ b/tools/lib/bpf/nlattr.h @@ -67,6 +67,44 @@ struct nla_policy { nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) +/** + * nla_data - head of payload + * @nla: netlink attribute + */ +static inline void *nla_data(const struct nlattr *nla) +{ + return (char *) nla + NLA_HDRLEN; +} + +static inline uint8_t nla_getattr_u8(const struct nlattr *nla) +{ + return *(uint8_t *)nla_data(nla); +} + +static inline uint32_t nla_getattr_u32(const struct nlattr *nla) +{ + return *(uint32_t *)nla_data(nla); +} + +static inline const char *nla_getattr_str(const struct nlattr *nla) +{ + return (const char *)nla_data(nla); +} + +/** + * nla_len - length of payload + * @nla: netlink attribute + */ +static inline int nla_len(const struct nlattr *nla) +{ + return nla->nla_len - NLA_HDRLEN; +} + +int nla_parse(struct nlattr *tb[], int maxtype, struct nlattr *head, int len, + struct nla_policy *policy); +int nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + struct nla_policy *policy); + int nla_dump_errormsg(struct nlmsghdr *nlh); #endif /* __NLATTR_H */ From f6f3bac08ff9855d803081a353a1fafaa8845739 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 5 Sep 2018 16:58:06 -0700 Subject: [PATCH 11/29] tools/bpf: bpftool: add net support Add "bpftool net" support. Networking devices are enumerated to dump device index/name associated with xdp progs. For each networking device, tc classes and qdiscs are enumerated in order to check their bpf filters. In addition, root handle and clsact ingress/egress are also checked for bpf filters. Not all filter information is printed out. Only ifindex, kind, filter name, prog_id and tag are printed out, which are good enough to show attachment information. If the filter action is a bpf action, its bpf program id, bpf name and tag will be printed out as well. For example, $ ./bpftool net xdp [ ifindex 2 devname eth0 prog_id 198 ] tc_filters [ ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb] prog_id 111727 tag d08fe3b4319bc2fd act [] ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp prog_id 130246 tag 3f265c7f26db62c9 act [] ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact] prog_id 111726 tag 99a197826974c876 ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp prog_id 108619 tag dc4630674fd72dcc act [] ifindex 2 kind qdisc_clsact_egress name fbflow_egress prog_id 130245 tag 72d2d830d6888d2c ] $ ./bpftool -jp net [{ "xdp": [{ "ifindex": 2, "devname": "eth0", "prog_id": 198 } ], "tc_filters": [{ "ifindex": 2, "kind": "qdisc_htb", "name": "prefix_matcher.o:[cls_prefix_matcher_htb]", "prog_id": 111727, "tag": "d08fe3b4319bc2fd", "act": [] },{ "ifindex": 2, "kind": "qdisc_clsact_ingress", "name": "fbflow_icmp", "prog_id": 130246, "tag": "3f265c7f26db62c9", "act": [] },{ "ifindex": 2, "kind": "qdisc_clsact_egress", "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]", "prog_id": 111726, "tag": "99a197826974c876" },{ "ifindex": 2, "kind": "qdisc_clsact_egress", "name": "cls_fg_dscp", "prog_id": 108619, "tag": "dc4630674fd72dcc", "act": [] },{ "ifindex": 2, "kind": "qdisc_clsact_egress", "name": "fbflow_egress", "prog_id": 130245, "tag": "72d2d830d6888d2c" } ] } ] Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- .../bpf/bpftool/Documentation/bpftool-net.rst | 133 ++++++++++ tools/bpf/bpftool/Documentation/bpftool.rst | 6 +- tools/bpf/bpftool/bash-completion/bpftool | 17 +- tools/bpf/bpftool/main.c | 3 +- tools/bpf/bpftool/main.h | 7 + tools/bpf/bpftool/net.c | 233 ++++++++++++++++++ tools/bpf/bpftool/netlink_dumper.c | 181 ++++++++++++++ tools/bpf/bpftool/netlink_dumper.h | 103 ++++++++ 8 files changed, 676 insertions(+), 7 deletions(-) create mode 100644 tools/bpf/bpftool/Documentation/bpftool-net.rst create mode 100644 tools/bpf/bpftool/net.c create mode 100644 tools/bpf/bpftool/netlink_dumper.c create mode 100644 tools/bpf/bpftool/netlink_dumper.h diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst new file mode 100644 index 000000000000..48a61837a264 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -0,0 +1,133 @@ +================ +bpftool-net +================ +------------------------------------------------------------------------------- +tool for inspection of netdev/tc related bpf prog attachments +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **net** *COMMAND* + + *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] } + + *COMMANDS* := + { **show** | **list** } [ **dev** name ] | **help** + +NET COMMANDS +============ + +| **bpftool** **net { show | list } [ dev name ]** +| **bpftool** **net help** + +DESCRIPTION +=========== + **bpftool net { show | list } [ dev name ]** + List all networking device driver and tc attachment in the system. + + Output will start with all xdp program attachment, followed by + all tc class/qdisc bpf program attachments. Both xdp programs and + tc programs are ordered based on ifindex number. If multiple bpf + programs attached to the same networking device through **tc filter**, + the order will be first all bpf programs attached to tc classes, then + all bpf programs attached to non clsact qdiscs, and finally all + bpf programs attached to root and clsact qdisc. + + **bpftool net help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -v, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + +EXAMPLES +======== + +| **# bpftool net** + +:: + + xdp [ + ifindex 2 devname eth0 prog_id 198 + ] + tc_filters [ + ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb] + prog_id 111727 tag d08fe3b4319bc2fd act [] + ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp + prog_id 130246 tag 3f265c7f26db62c9 act [] + ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact] + prog_id 111726 tag 99a197826974c876 + ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp + prog_id 108619 tag dc4630674fd72dcc act [] + ifindex 2 kind qdisc_clsact_egress name fbflow_egress + prog_id 130245 tag 72d2d830d6888d2c + ] + +| +| **# bpftool -jp net** + +:: + + [{ + "xdp": [{ + "ifindex": 2, + "devname": "eth0", + "prog_id": 198 + } + ], + "tc_filters": [{ + "ifindex": 2, + "kind": "qdisc_htb", + "name": "prefix_matcher.o:[cls_prefix_matcher_htb]", + "prog_id": 111727, + "tag": "d08fe3b4319bc2fd", + "act": [] + },{ + "ifindex": 2, + "kind": "qdisc_clsact_ingress", + "name": "fbflow_icmp", + "prog_id": 130246, + "tag": "3f265c7f26db62c9", + "act": [] + },{ + "ifindex": 2, + "kind": "qdisc_clsact_egress", + "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]", + "prog_id": 111726, + "tag": "99a197826974c876" + },{ + "ifindex": 2, + "kind": "qdisc_clsact_egress", + "name": "cls_fg_dscp", + "prog_id": 108619, + "tag": "dc4630674fd72dcc", + "act": [] + },{ + "ifindex": 2, + "kind": "qdisc_clsact_egress", + "name": "fbflow_egress", + "prog_id": 130245, + "tag": "72d2d830d6888d2c" + } + ] + } + ] + + +SEE ALSO +======== + **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index b6f5d560460d..8dda77daeda9 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -16,7 +16,7 @@ SYNOPSIS **bpftool** **version** - *OBJECT* := { **map** | **program** | **cgroup** | **perf** } + *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** } *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } | { **-j** | **--json** } [{ **-p** | **--pretty** }] } @@ -32,6 +32,8 @@ SYNOPSIS *PERF-COMMANDS* := { **show** | **list** | **help** } + *NET-COMMANDS* := { **show** | **list** | **help** } + DESCRIPTION =========== *bpftool* allows for inspection and simple modification of BPF objects @@ -58,4 +60,4 @@ OPTIONS SEE ALSO ======== **bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8) - **bpftool-perf**\ (8) + **bpftool-perf**\ (8), **bpftool-net**\ (8) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 598066c40191..df1060b852c1 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -494,10 +494,10 @@ _bpftool() _filedir return 0 ;; - tree) - _filedir - return 0 - ;; + tree) + _filedir + return 0 + ;; attach|detach) local ATTACH_TYPES='ingress egress sock_create sock_ops \ device bind4 bind6 post_bind4 post_bind6 connect4 \ @@ -552,6 +552,15 @@ _bpftool() ;; esac ;; + net) + case $command in + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help \ + show list' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index d15a62be6cf0..79dc3f193547 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -85,7 +85,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf }\n" + " OBJECT := { prog | map | cgroup | perf | net }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -215,6 +215,7 @@ static const struct cmd cmds[] = { { "map", do_map }, { "cgroup", do_cgroup }, { "perf", do_perf }, + { "net", do_net }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 238e734d75b3..02dfbcb92a23 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -136,6 +136,7 @@ int do_map(int argc, char **arg); int do_event_pipe(int argc, char **argv); int do_cgroup(int argc, char **arg); int do_perf(int argc, char **arg); +int do_net(int argc, char **arg); int prog_parse_fd(int *argc, char ***argv); int map_parse_fd(int *argc, char ***argv); @@ -165,4 +166,10 @@ struct btf_dumper { */ int btf_dumper_type(const struct btf_dumper *d, __u32 type_id, const void *data); + +struct nlattr; +struct ifinfomsg; +struct tcmsg; +int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb); +int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind); #endif diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c new file mode 100644 index 000000000000..77dd73dd9ade --- /dev/null +++ b/tools/bpf/bpftool/net.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "main.h" +#include "netlink_dumper.h" + +struct bpf_netdev_t { + int *ifindex_array; + int used_len; + int array_len; + int filter_idx; +}; + +struct tc_kind_handle { + char kind[64]; + int handle; +}; + +struct bpf_tcinfo_t { + struct tc_kind_handle *handle_array; + int used_len; + int array_len; + bool is_qdisc; +}; + +static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + struct bpf_netdev_t *netinfo = cookie; + struct ifinfomsg *ifinfo = msg; + + if (netinfo->filter_idx > 0 && netinfo->filter_idx != ifinfo->ifi_index) + return 0; + + if (netinfo->used_len == netinfo->array_len) { + netinfo->ifindex_array = realloc(netinfo->ifindex_array, + (netinfo->array_len + 16) * sizeof(int)); + netinfo->array_len += 16; + } + netinfo->ifindex_array[netinfo->used_len++] = ifinfo->ifi_index; + + return do_xdp_dump(ifinfo, tb); +} + +static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + struct bpf_tcinfo_t *tcinfo = cookie; + struct tcmsg *info = msg; + + if (tcinfo->is_qdisc) { + /* skip clsact qdisc */ + if (tb[TCA_KIND] && + strcmp(nla_data(tb[TCA_KIND]), "clsact") == 0) + return 0; + if (info->tcm_handle == 0) + return 0; + } + + if (tcinfo->used_len == tcinfo->array_len) { + tcinfo->handle_array = realloc(tcinfo->handle_array, + (tcinfo->array_len + 16) * sizeof(struct tc_kind_handle)); + tcinfo->array_len += 16; + } + tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle; + snprintf(tcinfo->handle_array[tcinfo->used_len].kind, + sizeof(tcinfo->handle_array[tcinfo->used_len].kind), + "%s_%s", + tcinfo->is_qdisc ? "qdisc" : "class", + tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown"); + tcinfo->used_len++; + + return 0; +} + +static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb) +{ + const char *kind = cookie; + + return do_filter_dump((struct tcmsg *)msg, tb, kind); +} + +static int show_dev_tc_bpf(int sock, unsigned int nl_pid, int ifindex) +{ + struct bpf_tcinfo_t tcinfo; + int i, handle, ret; + + tcinfo.handle_array = NULL; + tcinfo.used_len = 0; + tcinfo.array_len = 0; + + tcinfo.is_qdisc = false; + ret = nl_get_class(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg, + &tcinfo); + if (ret) + return ret; + + tcinfo.is_qdisc = true; + ret = nl_get_qdisc(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg, + &tcinfo); + if (ret) + return ret; + + for (i = 0; i < tcinfo.used_len; i++) { + ret = nl_get_filter(sock, nl_pid, ifindex, + tcinfo.handle_array[i].handle, + dump_filter_nlmsg, + tcinfo.handle_array[i].kind); + if (ret) + return ret; + } + + /* root, ingress and egress handle */ + handle = TC_H_ROOT; + ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, + "root"); + if (ret) + return ret; + + handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); + ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, + "qdisc_clsact_ingress"); + if (ret) + return ret; + + handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS); + ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, + "qdisc_clsact_egress"); + if (ret) + return ret; + + return 0; +} + +static int do_show(int argc, char **argv) +{ + int i, sock, ret, filter_idx = -1; + struct bpf_netdev_t dev_array; + unsigned int nl_pid; + char err_buf[256]; + + if (argc == 2) { + if (strcmp(argv[0], "dev") != 0) + usage(); + filter_idx = if_nametoindex(argv[1]); + if (filter_idx == 0) { + fprintf(stderr, "invalid dev name %s\n", argv[1]); + return -1; + } + } else if (argc != 0) { + usage(); + } + + sock = bpf_netlink_open(&nl_pid); + if (sock < 0) { + fprintf(stderr, "failed to open netlink sock\n"); + return -1; + } + + dev_array.ifindex_array = NULL; + dev_array.used_len = 0; + dev_array.array_len = 0; + dev_array.filter_idx = filter_idx; + + if (json_output) + jsonw_start_array(json_wtr); + NET_START_OBJECT; + NET_START_ARRAY("xdp", "\n"); + ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array); + NET_END_ARRAY("\n"); + + if (!ret) { + NET_START_ARRAY("tc_filters", "\n"); + for (i = 0; i < dev_array.used_len; i++) { + ret = show_dev_tc_bpf(sock, nl_pid, + dev_array.ifindex_array[i]); + if (ret) + break; + } + NET_END_ARRAY("\n"); + } + NET_END_OBJECT; + if (json_output) + jsonw_end_array(json_wtr); + + if (ret) { + if (json_output) + jsonw_null(json_wtr); + libbpf_strerror(ret, err_buf, sizeof(err_buf)); + fprintf(stderr, "Error: %s\n", err_buf); + } + free(dev_array.ifindex_array); + close(sock); + return ret; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s %s { show | list } [dev ]\n" + " %s %s help\n", + bin_name, argv[-2], bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "show", do_show }, + { "list", do_show }, + { "help", do_help }, + { 0 } +}; + +int do_net(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c new file mode 100644 index 000000000000..e12494fd1d2e --- /dev/null +++ b/tools/bpf/bpftool/netlink_dumper.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#include +#include +#include +#include +#include + +#include +#include "main.h" +#include "netlink_dumper.h" + +static void xdp_dump_prog_id(struct nlattr **tb, int attr, + const char *type) +{ + if (!tb[attr]) + return; + + NET_DUMP_UINT(type, nla_getattr_u32(tb[attr])) +} + +static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex, + const char *name) +{ + struct nlattr *tb[IFLA_XDP_MAX + 1]; + unsigned char mode; + + if (nla_parse_nested(tb, IFLA_XDP_MAX, attr, NULL) < 0) + return -1; + + if (!tb[IFLA_XDP_ATTACHED]) + return 0; + + mode = nla_getattr_u8(tb[IFLA_XDP_ATTACHED]); + if (mode == XDP_ATTACHED_NONE) + return 0; + + NET_START_OBJECT; + NET_DUMP_UINT("ifindex", ifindex); + + if (name) + NET_DUMP_STR("devname", name); + + if (tb[IFLA_XDP_PROG_ID]) + NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[IFLA_XDP_PROG_ID])); + + if (mode == XDP_ATTACHED_MULTI) { + xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic_prog_id"); + xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "drv_prog_id"); + xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload_prog_id"); + } + + NET_END_OBJECT_FINAL; + return 0; +} + +int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb) +{ + if (!tb[IFLA_XDP]) + return 0; + + return do_xdp_dump_one(tb[IFLA_XDP], ifinfo->ifi_index, + nla_getattr_str(tb[IFLA_IFNAME])); +} + +static char *hexstring_n2a(const unsigned char *str, int len, + char *buf, int blen) +{ + char *ptr = buf; + int i; + + for (i = 0; i < len; i++) { + if (blen < 3) + break; + sprintf(ptr, "%02x", str[i]); + ptr += 2; + blen -= 2; + } + return buf; +} + +static int do_bpf_dump_one_act(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; + char buf[256]; + + if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (!tb[TCA_ACT_BPF_PARMS]) + return -LIBBPF_ERRNO__NLPARSE; + + NET_START_OBJECT_NESTED2; + if (tb[TCA_ACT_BPF_NAME]) + NET_DUMP_STR("name", nla_getattr_str(tb[TCA_ACT_BPF_NAME])); + if (tb[TCA_ACT_BPF_ID]) + NET_DUMP_UINT("bpf_id", nla_getattr_u32(tb[TCA_ACT_BPF_ID])); + if (tb[TCA_ACT_BPF_TAG]) + NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_ACT_BPF_TAG]), + nla_len(tb[TCA_ACT_BPF_TAG]), + buf, sizeof(buf))); + NET_END_OBJECT_NESTED; + return 0; +} + +static int do_dump_one_act(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_MAX + 1]; + + if (!attr) + return 0; + + if (nla_parse_nested(tb, TCA_ACT_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (tb[TCA_ACT_KIND] && strcmp(nla_data(tb[TCA_ACT_KIND]), "bpf") == 0) + return do_bpf_dump_one_act(tb[TCA_ACT_OPTIONS]); + + return 0; +} + +static int do_bpf_act_dump(struct nlattr *attr) +{ + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + int act, ret; + + if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + NET_START_ARRAY("act", ""); + for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) { + ret = do_dump_one_act(tb[act]); + if (ret) + break; + } + NET_END_ARRAY(" "); + + return ret; +} + +static int do_bpf_filter_dump(struct nlattr *attr) +{ + struct nlattr *tb[TCA_BPF_MAX + 1]; + char buf[256]; + int ret; + + if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0) + return -LIBBPF_ERRNO__NLPARSE; + + if (tb[TCA_BPF_NAME]) + NET_DUMP_STR("name", nla_getattr_str(tb[TCA_BPF_NAME])); + if (tb[TCA_BPF_ID]) + NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[TCA_BPF_ID])); + if (tb[TCA_BPF_TAG]) + NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_BPF_TAG]), + nla_len(tb[TCA_BPF_TAG]), + buf, sizeof(buf))); + if (tb[TCA_BPF_ACT]) { + ret = do_bpf_act_dump(tb[TCA_BPF_ACT]); + if (ret) + return ret; + } + + return 0; +} + +int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind) +{ + int ret = 0; + + if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) { + NET_START_OBJECT; + NET_DUMP_UINT("ifindex", info->tcm_ifindex); + NET_DUMP_STR("kind", kind); + ret = do_bpf_filter_dump(tb[TCA_OPTIONS]); + NET_END_OBJECT_FINAL; + } + + return ret; +} diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h new file mode 100644 index 000000000000..552d8851ac06 --- /dev/null +++ b/tools/bpf/bpftool/netlink_dumper.h @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0+ +// Copyright (C) 2018 Facebook + +#ifndef _NETLINK_DUMPER_H_ +#define _NETLINK_DUMPER_H_ + +#define NET_START_OBJECT \ +{ \ + if (json_output) \ + jsonw_start_object(json_wtr); \ +} + +#define NET_START_OBJECT_NESTED(name) \ +{ \ + if (json_output) { \ + jsonw_name(json_wtr, name); \ + jsonw_start_object(json_wtr); \ + } else { \ + fprintf(stderr, "%s {", name); \ + } \ +} + +#define NET_START_OBJECT_NESTED2 \ +{ \ + if (json_output) \ + jsonw_start_object(json_wtr); \ + else \ + fprintf(stderr, "{"); \ +} + +#define NET_END_OBJECT_NESTED \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ + else \ + fprintf(stderr, "}"); \ +} + +#define NET_END_OBJECT \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ +} + +#define NET_END_OBJECT_FINAL \ +{ \ + if (json_output) \ + jsonw_end_object(json_wtr); \ + else \ + fprintf(stderr, "\n"); \ +} + +#define NET_START_ARRAY(name, newline) \ +{ \ + if (json_output) { \ + jsonw_name(json_wtr, name); \ + jsonw_start_array(json_wtr); \ + } else { \ + fprintf(stderr, "%s [%s", name, newline);\ + } \ +} + +#define NET_END_ARRAY(endstr) \ +{ \ + if (json_output) \ + jsonw_end_array(json_wtr); \ + else \ + fprintf(stderr, "]%s", endstr); \ +} + +#define NET_DUMP_UINT(name, val) \ +{ \ + if (json_output) \ + jsonw_uint_field(json_wtr, name, val); \ + else \ + fprintf(stderr, "%s %d ", name, val); \ +} + +#define NET_DUMP_LLUINT(name, val) \ +{ \ + if (json_output) \ + jsonw_lluint_field(json_wtr, name, val);\ + else \ + fprintf(stderr, "%s %lld ", name, val); \ +} + +#define NET_DUMP_STR(name, str) \ +{ \ + if (json_output) \ + jsonw_string_field(json_wtr, name, str);\ + else \ + fprintf(stderr, "%s %s ", name, str); \ +} + +#define NET_DUMP_STR_ONLY(str) \ +{ \ + if (json_output) \ + jsonw_string(json_wtr, str); \ + else \ + fprintf(stderr, "%s ", str); \ +} + +#endif From a7c19db38d62fc1ce797dba19936e9f81cf2b9fb Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 6 Sep 2018 17:26:04 -0700 Subject: [PATCH 12/29] bpf: add bpffs pretty print for program array map Added bpffs pretty print for program array map. For a particular array index, if the program array points to a valid program, the ": " will be printed out like 0: 6 which means bpf program with id "6" is installed at index "0". Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f9d24121be99..dded84cbe814 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -553,6 +553,29 @@ static void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } +static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, + struct seq_file *m) +{ + void **elem, *ptr; + u32 prog_id; + + rcu_read_lock(); + + elem = array_map_lookup_elem(map, key); + if (elem) { + ptr = READ_ONCE(*elem); + if (ptr) { + seq_printf(m, "%u: ", *(u32 *)key); + prog_id = prog_fd_array_sys_lookup_elem(ptr); + btf_type_seq_show(map->btf, map->btf_value_type_id, + &prog_id, m); + seq_puts(m, "\n"); + } + } + + rcu_read_unlock(); +} + const struct bpf_map_ops prog_array_map_ops = { .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_map_alloc, @@ -564,7 +587,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, .map_release_uref = bpf_fd_array_map_clear, - .map_check_btf = map_check_no_btf, + .map_seq_show_elem = prog_array_map_seq_show_elem, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, From ad3338d2508cd7752accdd39881deced1ec2b8a1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 6 Sep 2018 17:26:05 -0700 Subject: [PATCH 13/29] tools/bpf: bpftool: support prog array map and map of maps Currently, prog array map and map of maps are not supported in bpftool. This patch added the support. Different from other map types, for prog array map and map of maps, the key returned bpf_get_next_key() may not point to a valid value. So for these two map types, no error will be printed out when such a scenario happens. The following is the plain and json dump if btf is not available: $ ./bpftool map dump id 10 key: 08 00 00 00 value: 5c 01 00 00 Found 1 element $ ./bpftool -jp map dump id 10 [{ "key": ["0x08","0x00","0x00","0x00" ], "value": ["0x5c","0x01","0x00","0x00" ] }] If the BTF is available, the dump looks below: $ ./bpftool map dump id 2 [{ "key": 0, "value": 7 } ] $ ./bpftool -jp map dump id 2 [{ "key": ["0x00","0x00","0x00","0x00" ], "value": ["0x07","0x00","0x00","0x00" ], "formatted": { "key": 0, "value": 7 } }] Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/map.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 9c55077ca5dd..af8ad32fa6e9 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -673,12 +673,6 @@ static int do_dump(int argc, char **argv) if (fd < 0) return -1; - if (map_is_map_of_maps(info.type) || map_is_map_of_progs(info.type)) { - p_err("Dumping maps of maps and program maps not supported"); - close(fd); - return -1; - } - key = malloc(info.key_size); value = alloc_value(&info); if (!key || !value) { @@ -732,7 +726,9 @@ static int do_dump(int argc, char **argv) } else { print_entry_plain(&info, key, value); } - } else { + num_elems++; + } else if (!map_is_map_of_maps(info.type) && + !map_is_map_of_progs(info.type)) { if (json_output) { jsonw_name(json_wtr, "key"); print_hex_data_json(key, info.key_size); @@ -749,7 +745,6 @@ static int do_dump(int argc, char **argv) } prev_key = key; - num_elems++; } if (json_output) From 9d0b3c1f1451d1b9a33de3c70ae3d50ccd77db1a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 11 Sep 2018 14:09:11 -0700 Subject: [PATCH 14/29] tools/bpf: fix a netlink recv issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f7010770fbac ("tools/bpf: move bpf/lib netlink related functions into a new file") introduced a while loop for the netlink recv path. This while loop is needed since the buffer in recv syscall may not be enough to hold all the information and in such cases multiple recv calls are needed. There is a bug introduced by the above commit as the while loop may block on recv syscall if there is no more messages are expected. The netlink message header flag NLM_F_MULTI is used to indicate that more messages are expected and this patch fixed the bug by doing further recv syscall only if multipart message is expected. The patch added another fix regarding to message length of 0. When netlink recv returns message length of 0, there will be no more messages for returning data so the while loop can end. Fixes: f7010770fbac ("tools/bpf: move bpf/lib netlink related functions into a new file") Reported-by: Björn Töpel Tested-by: Björn Töpel Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/netlink.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 469e068dd0c5..fde1d7bf8199 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -65,18 +65,23 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq, __dump_nlmsg_t _fn, dump_nlmsg_t fn, void *cookie) { + bool multipart = true; struct nlmsgerr *err; struct nlmsghdr *nh; char buf[4096]; int len, ret; - while (1) { + while (multipart) { + multipart = false; len = recv(sock, buf, sizeof(buf), 0); if (len < 0) { ret = -errno; goto done; } + if (len == 0) + break; + for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); nh = NLMSG_NEXT(nh, len)) { if (nh->nlmsg_pid != nl_pid) { @@ -87,6 +92,8 @@ static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq, ret = -LIBBPF_ERRNO__INVSEQ; goto done; } + if (nh->nlmsg_flags & NLM_F_MULTI) + multipart = true; switch (nh->nlmsg_type) { case NLMSG_ERROR: err = (struct nlmsgerr *)NLMSG_DATA(nh); From 1edb6e035eb72a17462ba275fe2db36c37a62909 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Fri, 7 Sep 2018 14:50:05 +0200 Subject: [PATCH 15/29] net/core/filter: fix unused-variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building with CONFIG_INET=n will show the warning below: net/core/filter.c: In function ‘____bpf_getsockopt’: net/core/filter.c:4048:19: warning: unused variable ‘tp’ [-Wunused-variable] struct tcp_sock *tp; ^~ net/core/filter.c:4046:31: warning: unused variable ‘icsk’ [-Wunused-variable] struct inet_connection_sock *icsk; ^~~~ Move the variable declarations inside the {} block where they are used. Fixes: 1e215300f138 ("bpf: add TCP_SAVE_SYN/TCP_SAVED_SYN options for bpf_(set|get)sockopt") Signed-off-by: Anders Roxell Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 8cb242b4400f..bf5b6efd369a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4051,14 +4051,15 @@ static const struct bpf_func_proto bpf_setsockopt_proto = { BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { - struct inet_connection_sock *icsk; struct sock *sk = bpf_sock->sk; - struct tcp_sock *tp; if (!sk_fullsock(sk)) goto err_clear; #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { + struct inet_connection_sock *icsk; + struct tcp_sock *tp; + switch (optname) { case TCP_CONGESTION: icsk = inet_csk(sk); From d58e468b1112dcd1d5193c0a89ff9f98b5a3e8b9 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:18 -0700 Subject: [PATCH 16/29] flow_dissector: implements flow dissector BPF hook Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector path. The BPF program is per-network namespace. Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/skbuff.h | 7 ++ include/net/net_namespace.h | 3 + include/net/sch_generic.h | 12 +++- include/uapi/linux/bpf.h | 26 +++++++ kernel/bpf/syscall.c | 8 +++ kernel/bpf/verifier.c | 32 +++++++++ net/core/filter.c | 70 +++++++++++++++++++ net/core/flow_dissector.c | 134 ++++++++++++++++++++++++++++++++++++ 10 files changed, 291 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 523481a3471b..988a00797bcd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -212,6 +212,7 @@ enum bpf_reg_type { PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ + PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ }; /* The information passed from prog-specific *_is_valid_access diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index cd26c090e7c0..22083712dd18 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) #endif +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 17a13e4785fc..ce0e863f02a2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -243,6 +243,8 @@ struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; +struct bpf_prog; +union bpf_attr; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); + +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); + bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 9b5fdc50519a..99d4148e0f90 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -43,6 +43,7 @@ struct ctl_table_header; struct net_generic; struct uevent_sock; struct netns_ipvs; +struct bpf_prog; #define NETDEV_HASHBITS 8 @@ -145,6 +146,8 @@ struct net { #endif struct net_generic __rcu *gen; + struct bpf_prog __rcu *flow_dissector_prog; + /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM struct netns_xfrm xfrm; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a6d00093f35e..1b81ba85fd2d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -19,6 +19,7 @@ struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; +struct bpf_flow_keys; typedef int tc_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); @@ -307,9 +308,14 @@ struct tcf_proto { }; struct qdisc_skb_cb { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; + union { + struct { + unsigned int pkt_len; + u16 slave_dev_queue_mapping; + u16 tc_classid; + }; + struct bpf_flow_keys *flow_keys; + }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 66917a4eba27..aa5ccd2385ed 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -152,6 +152,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, }; enum bpf_attach_type { @@ -172,6 +173,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, __MAX_BPF_ATTACH_TYPE }; @@ -2333,6 +2335,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; + struct bpf_flow_keys *flow_keys; }; struct bpf_tunnel_key { @@ -2778,4 +2781,27 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c9636f03bb2..b3c2d09bcf7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_LIRC_MODE2: ptype = BPF_PROG_TYPE_LIRC_MODE2; break; + case BPF_FLOW_DISSECTOR: + ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; + break; default: return -EINVAL; } @@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + break; default: ret = cgroup_bpf_prog_attach(attr, ptype, prog); } @@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_bpf_prog_detach(attr); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ff1bac1795d..8ccbff4fff93 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -261,6 +261,7 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", }; static char slot_type_char[] = { @@ -965,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: return true; default: @@ -1238,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; @@ -1321,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } +static int check_flow_keys_access(struct bpf_verifier_env *env, int off, + int size) +{ + if (size < 0 || off < 0 || + (u64)off + size > sizeof(struct bpf_flow_keys)) { + verbose(env, "invalid access to flow keys off=%d size=%d\n", + off, size); + return -EACCES; + } + return 0; +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -1422,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * right in front, treat it the very same way. */ return check_pkt_ptr_alignment(env, reg, off, size, strict); + case PTR_TO_FLOW_KEYS: + pointer_desc = "flow keys "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1692,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_FLOW_KEYS) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into flow keys\n", + value_regno); + return -EACCES; + } + + err = check_flow_keys_access(env, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1839,6 +1868,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_FLOW_KEYS: + return check_flow_keys_access(env, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); @@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_CTX: case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: /* Only valid matches are exact, which memcmp() above * would have accepted */ diff --git a/net/core/filter.c b/net/core/filter.c index bf5b6efd369a..9cc76f134ddb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5123,6 +5123,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5241,6 +5252,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; + case bpf_ctx_range(struct __sk_buff, flow_keys): + if (size != sizeof(struct bpf_flow_keys *)) + return false; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5266,6 +5281,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5291,6 +5307,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): return false; } @@ -5501,6 +5518,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; + case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5702,6 +5720,7 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): return false; } @@ -5761,6 +5780,39 @@ static bool sk_msg_is_valid_access(int off, int size, return true; } +static bool flow_dissector_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + case bpf_ctx_range(struct __sk_buff, flow_keys): + info->reg_type = PTR_TO_FLOW_KEYS; + break; + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -6055,6 +6107,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; + + case offsetof(struct __sk_buff, flow_keys): + off = si->off; + off -= offsetof(struct __sk_buff, flow_keys); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, flow_keys); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; } return insn - insn_buf; @@ -7018,6 +7079,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { const struct bpf_prog_ops sk_msg_prog_ops = { }; +const struct bpf_verifier_ops flow_dissector_verifier_ops = { + .get_func_proto = flow_dissector_func_proto, + .is_valid_access = flow_dissector_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops flow_dissector_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ce9eeeb7c024..5c5dd74b5b3b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -25,6 +25,9 @@ #include #include #include +#include + +static DEFINE_MUTEX(flow_dissector_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + struct bpf_prog *attached; + struct net *net; + + net = current->nsproxy->net_ns; + mutex_lock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->flow_dissector_prog, + lockdep_is_held(&flow_dissector_mutex)); + if (attached) { + /* Only one BPF program can be attached at a time */ + mutex_unlock(&flow_dissector_mutex); + return -EEXIST; + } + rcu_assign_pointer(net->flow_dissector_prog, prog); + mutex_unlock(&flow_dissector_mutex); + return 0; +} + +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +{ + struct bpf_prog *attached; + struct net *net; + + net = current->nsproxy->net_ns; + mutex_lock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->flow_dissector_prog, + lockdep_is_held(&flow_dissector_mutex)); + if (!attached) { + mutex_unlock(&flow_dissector_mutex); + return -ENOENT; + } + bpf_prog_put(attached); + RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + mutex_unlock(&flow_dissector_mutex); + return 0; +} /** * skb_flow_get_be16 - extract be16 entity * @skb: sk_buff to extract from @@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs) return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } +static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; + + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + key_control->thoff = flow_keys->thoff; + if (flow_keys->is_frag) + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + if (flow_keys->is_first_frag) + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flow_keys->is_encap) + key_control->flags |= FLOW_DIS_ENCAPSULATION; + + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = flow_keys->n_proto; + key_basic->ip_proto = flow_keys->ip_proto; + + if (flow_keys->addr_proto == ETH_P_IP && + dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + key_addrs->v4addrs.src = flow_keys->ipv4_src; + key_addrs->v4addrs.dst = flow_keys->ipv4_dst; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else if (flow_keys->addr_proto == ETH_P_IPV6 && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->src = flow_keys->sport; + key_ports->dst = flow_keys->dport; + } +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + struct bpf_prog *attached; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -658,6 +754,44 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog) + : NULL; + if (attached) { + /* Note that even though the const qualifier is discarded + * throughout the execution of the BPF program, all changes(the + * control block) are reverted after the BPF program returns. + * Therefore, __skb_flow_dissect does not alter the skb. + */ + struct bpf_flow_keys flow_keys = {}; + struct bpf_skb_data_end cb_saved; + struct bpf_skb_data_end *cb; + u32 result; + + cb = (struct bpf_skb_data_end *)skb->cb; + + /* Save Control Block */ + memcpy(&cb_saved, cb, sizeof(cb_saved)); + memset(cb, 0, sizeof(cb_saved)); + + /* Pass parameters to the BPF program */ + cb->qdisc_cb.flow_keys = &flow_keys; + flow_keys.nhoff = nhoff; + + bpf_compute_data_pointers((struct sk_buff *)skb); + result = BPF_PROG_RUN(attached, skb); + + /* Restore state */ + memcpy(cb, &cb_saved, sizeof(cb_saved)); + + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + key_control->thoff = min_t(u16, key_control->thoff, skb->len); + rcu_read_unlock(); + return result == BPF_OK; + } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); From 2f965e3fcd4b4159a5ea832d4b5e9ff2550fa571 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:19 -0700 Subject: [PATCH 17/29] bpf: sync bpf.h uapi with tools/ This patch syncs tools/include/uapi/linux/bpf.h with the flow dissector definitions from include/uapi/linux/bpf.h Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 66917a4eba27..aa5ccd2385ed 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -152,6 +152,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, }; enum bpf_attach_type { @@ -172,6 +173,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, __MAX_BPF_ATTACH_TYPE }; @@ -2333,6 +2335,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; + struct bpf_flow_keys *flow_keys; }; struct bpf_tunnel_key { @@ -2778,4 +2781,27 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ From c22fbae76c9fdef5b7406b27754db1758e041991 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:20 -0700 Subject: [PATCH 18/29] bpf: support flow dissector in libbpf and bpftool This patch extends libbpf and bpftool to work with programs of type BPF_PROG_TYPE_FLOW_DISSECTOR. Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/prog.c | 1 + tools/lib/bpf/libbpf.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index dce960d22106..b1cd3bc8db70 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -74,6 +74,7 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", + [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", }; static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8476da7f2720..9ca8e0e624d8 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1502,6 +1502,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type) case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_LIRC_MODE2: case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: return false; case BPF_PROG_TYPE_UNSPEC: case BPF_PROG_TYPE_KPROBE: @@ -2121,6 +2122,7 @@ static const struct { BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB), BPF_PROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG), BPF_PROG_SEC("lirc_mode2", BPF_PROG_TYPE_LIRC_MODE2), + BPF_PROG_SEC("flow_dissector", BPF_PROG_TYPE_FLOW_DISSECTOR), BPF_SA_PROG_SEC("cgroup/bind4", BPF_CGROUP_INET4_BIND), BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND), BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT), From 9c98b13cc3bb5d90ee2ec047d591272b382468fd Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:21 -0700 Subject: [PATCH 19/29] flow_dissector: implements eBPF parser This eBPF program extracts basic/control/ip address/ports keys from incoming packets. It supports recursive parsing for IP encapsulation, and VLAN, along with IPv4/IPv6 and extension headers. This program is meant to show how flow dissection and key extraction can be done in eBPF. Link: http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/bpf_flow.c | 373 +++++++++++++++++++++++++ 2 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/bpf_flow.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index fff7fb1285fc..e65f50f9185e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o + test_skb_cgroup_id_kern.o bpf_flow.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c new file mode 100644 index 000000000000..5fb809d95867 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_flow.c @@ -0,0 +1,373 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +int _version SEC("version") = 1; +#define PROG(F) SEC(#F) int bpf_func_##F + +/* These are the identifiers of the BPF programs that will be used in tail + * calls. Name is limited to 16 characters, with the terminating character and + * bpf_func_ above, we have only 6 to work with, anything after will be cropped. + */ +enum { + IP, + IPV6, + IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */ + IPV6FR, /* Fragmentation IPv6 Extension Header */ + MPLS, + VLAN, +}; + +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF +#define IP6_MF 0x0001 +#define IP6_OFFSET 0xFFF8 + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +struct gre_hdr { + __be16 flags; + __be16 proto; +}; + +struct frag_hdr { + __u8 nexthdr; + __u8 reserved; + __be16 frag_off; + __be32 identification; +}; + +struct bpf_map_def SEC("maps") jmp_table = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 8 +}; + +static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb, + __u16 hdr_size, + void *buffer) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + __u16 nhoff = skb->flow_keys->nhoff; + __u8 *hdr; + + /* Verifies this variable offset does not overflow */ + if (nhoff > (USHRT_MAX - hdr_size)) + return NULL; + + hdr = data + nhoff; + if (hdr + hdr_size <= data_end) + return hdr; + + if (bpf_skb_load_bytes(skb, nhoff, buffer, hdr_size)) + return NULL; + + return buffer; +} + +/* Dispatches on ETHERTYPE */ +static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto) +{ + struct bpf_flow_keys *keys = skb->flow_keys; + + keys->n_proto = proto; + switch (proto) { + case bpf_htons(ETH_P_IP): + bpf_tail_call(skb, &jmp_table, IP); + break; + case bpf_htons(ETH_P_IPV6): + bpf_tail_call(skb, &jmp_table, IPV6); + break; + case bpf_htons(ETH_P_MPLS_MC): + case bpf_htons(ETH_P_MPLS_UC): + bpf_tail_call(skb, &jmp_table, MPLS); + break; + case bpf_htons(ETH_P_8021Q): + case bpf_htons(ETH_P_8021AD): + bpf_tail_call(skb, &jmp_table, VLAN); + break; + default: + /* Protocol not supported */ + return BPF_DROP; + } + + return BPF_DROP; +} + +SEC("dissect") +int dissect(struct __sk_buff *skb) +{ + if (!skb->vlan_present) + return parse_eth_proto(skb, skb->protocol); + else + return parse_eth_proto(skb, skb->vlan_proto); +} + +/* Parses on IPPROTO_* */ +static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto) +{ + struct bpf_flow_keys *keys = skb->flow_keys; + void *data_end = (void *)(long)skb->data_end; + struct icmphdr *icmp, _icmp; + struct gre_hdr *gre, _gre; + struct ethhdr *eth, _eth; + struct tcphdr *tcp, _tcp; + struct udphdr *udp, _udp; + + keys->ip_proto = proto; + switch (proto) { + case IPPROTO_ICMP: + icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp); + if (!icmp) + return BPF_DROP; + return BPF_OK; + case IPPROTO_IPIP: + keys->is_encap = true; + return parse_eth_proto(skb, bpf_htons(ETH_P_IP)); + case IPPROTO_IPV6: + keys->is_encap = true; + return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6)); + case IPPROTO_GRE: + gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre); + if (!gre) + return BPF_DROP; + + if (bpf_htons(gre->flags & GRE_VERSION)) + /* Only inspect standard GRE packets with version 0 */ + return BPF_OK; + + keys->nhoff += sizeof(*gre); /* Step over GRE Flags and Proto */ + if (GRE_IS_CSUM(gre->flags)) + keys->nhoff += 4; /* Step over chksum and Padding */ + if (GRE_IS_KEY(gre->flags)) + keys->nhoff += 4; /* Step over key */ + if (GRE_IS_SEQ(gre->flags)) + keys->nhoff += 4; /* Step over sequence number */ + + keys->is_encap = true; + + if (gre->proto == bpf_htons(ETH_P_TEB)) { + eth = bpf_flow_dissect_get_header(skb, sizeof(*eth), + &_eth); + if (!eth) + return BPF_DROP; + + keys->nhoff += sizeof(*eth); + + return parse_eth_proto(skb, eth->h_proto); + } else { + return parse_eth_proto(skb, gre->proto); + } + case IPPROTO_TCP: + tcp = bpf_flow_dissect_get_header(skb, sizeof(*tcp), &_tcp); + if (!tcp) + return BPF_DROP; + + if (tcp->doff < 5) + return BPF_DROP; + + if ((__u8 *)tcp + (tcp->doff << 2) > data_end) + return BPF_DROP; + + keys->thoff = keys->nhoff; + keys->sport = tcp->source; + keys->dport = tcp->dest; + return BPF_OK; + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + udp = bpf_flow_dissect_get_header(skb, sizeof(*udp), &_udp); + if (!udp) + return BPF_DROP; + + keys->thoff = keys->nhoff; + keys->sport = udp->source; + keys->dport = udp->dest; + return BPF_OK; + default: + return BPF_DROP; + } + + return BPF_DROP; +} + +static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr) +{ + struct bpf_flow_keys *keys = skb->flow_keys; + + keys->ip_proto = nexthdr; + switch (nexthdr) { + case IPPROTO_HOPOPTS: + case IPPROTO_DSTOPTS: + bpf_tail_call(skb, &jmp_table, IPV6OP); + break; + case IPPROTO_FRAGMENT: + bpf_tail_call(skb, &jmp_table, IPV6FR); + break; + default: + return parse_ip_proto(skb, nexthdr); + } + + return BPF_DROP; +} + +PROG(IP)(struct __sk_buff *skb) +{ + void *data_end = (void *)(long)skb->data_end; + struct bpf_flow_keys *keys = skb->flow_keys; + void *data = (void *)(long)skb->data; + struct iphdr *iph, _iph; + bool done = false; + + iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph); + if (!iph) + return BPF_DROP; + + /* IP header cannot be smaller than 20 bytes */ + if (iph->ihl < 5) + return BPF_DROP; + + keys->addr_proto = ETH_P_IP; + keys->ipv4_src = iph->saddr; + keys->ipv4_dst = iph->daddr; + + keys->nhoff += iph->ihl << 2; + if (data + keys->nhoff > data_end) + return BPF_DROP; + + if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) { + keys->is_frag = true; + if (iph->frag_off & bpf_htons(IP_OFFSET)) + /* From second fragment on, packets do not have headers + * we can parse. + */ + done = true; + else + keys->is_first_frag = true; + } + + if (done) + return BPF_OK; + + return parse_ip_proto(skb, iph->protocol); +} + +PROG(IPV6)(struct __sk_buff *skb) +{ + struct bpf_flow_keys *keys = skb->flow_keys; + struct ipv6hdr *ip6h, _ip6h; + + ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h); + if (!ip6h) + return BPF_DROP; + + keys->addr_proto = ETH_P_IPV6; + memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr)); + + keys->nhoff += sizeof(struct ipv6hdr); + + return parse_ipv6_proto(skb, ip6h->nexthdr); +} + +PROG(IPV6OP)(struct __sk_buff *skb) +{ + struct ipv6_opt_hdr *ip6h, _ip6h; + + ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h); + if (!ip6h) + return BPF_DROP; + + /* hlen is in 8-octets and does not include the first 8 bytes + * of the header + */ + skb->flow_keys->nhoff += (1 + ip6h->hdrlen) << 3; + + return parse_ipv6_proto(skb, ip6h->nexthdr); +} + +PROG(IPV6FR)(struct __sk_buff *skb) +{ + struct bpf_flow_keys *keys = skb->flow_keys; + struct frag_hdr *fragh, _fragh; + + fragh = bpf_flow_dissect_get_header(skb, sizeof(*fragh), &_fragh); + if (!fragh) + return BPF_DROP; + + keys->nhoff += sizeof(*fragh); + keys->is_frag = true; + if (!(fragh->frag_off & bpf_htons(IP6_OFFSET))) + keys->is_first_frag = true; + + return parse_ipv6_proto(skb, fragh->nexthdr); +} + +PROG(MPLS)(struct __sk_buff *skb) +{ + struct mpls_label *mpls, _mpls; + + mpls = bpf_flow_dissect_get_header(skb, sizeof(*mpls), &_mpls); + if (!mpls) + return BPF_DROP; + + return BPF_OK; +} + +PROG(VLAN)(struct __sk_buff *skb) +{ + struct bpf_flow_keys *keys = skb->flow_keys; + struct vlan_hdr *vlan, _vlan; + __be16 proto; + + /* Peek back to see if single or double-tagging */ + if (bpf_skb_load_bytes(skb, keys->nhoff - sizeof(proto), &proto, + sizeof(proto))) + return BPF_DROP; + + /* Account for double-tagging */ + if (proto == bpf_htons(ETH_P_8021AD)) { + vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan); + if (!vlan) + return BPF_DROP; + + if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q)) + return BPF_DROP; + + keys->nhoff += sizeof(*vlan); + } + + vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan); + if (!vlan) + return BPF_DROP; + + keys->nhoff += sizeof(*vlan); + /* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/ + if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) || + vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q)) + return BPF_DROP; + + return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto); +} + +char __license[] SEC("license") = "GPL"; From 50b3ed57dee9cd0e06c59826cec8af14b51bab3e Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:22 -0700 Subject: [PATCH 20/29] selftests/bpf: test bpf flow dissection Adds a test that sends different types of packets over multiple tunnels and verifies that valid packets are dissected correctly. To do so, a tc-flower rule is added to drop packets on UDP src port 9, and packets are sent from ports 8, 9, and 10. Only the packets on port 9 should be dropped. Because tc-flower relies on the flow dissector to match flows, correct classification demonstrates correct dissection. Also add support logic to load the BPF program and to inject the test packets. Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 2 + tools/testing/selftests/bpf/Makefile | 6 +- tools/testing/selftests/bpf/config | 1 + .../selftests/bpf/flow_dissector_load.c | 140 ++++ .../selftests/bpf/test_flow_dissector.c | 782 ++++++++++++++++++ .../selftests/bpf/test_flow_dissector.sh | 115 +++ tools/testing/selftests/bpf/with_addr.sh | 54 ++ tools/testing/selftests/bpf/with_tunnels.sh | 36 + 8 files changed, 1134 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/flow_dissector_load.c create mode 100644 tools/testing/selftests/bpf/test_flow_dissector.c create mode 100755 tools/testing/selftests/bpf/test_flow_dissector.sh create mode 100755 tools/testing/selftests/bpf/with_addr.sh create mode 100755 tools/testing/selftests/bpf/with_tunnels.sh diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 4d789c1e5167..8a60c9b9892d 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,3 +23,5 @@ test_skb_cgroup_id_user test_socket_cookie test_cgroup_storage test_select_reuseport +test_flow_dissector +flow_dissector_load diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e65f50f9185e..fd3851d5c079 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -47,10 +47,12 @@ TEST_PROGS := test_kmod.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ - test_skb_cgroup_id.sh + test_skb_cgroup_id.sh \ + test_flow_dissector.sh # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user +TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \ + flow_dissector_load test_flow_dissector include ../lib.mk diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index b4994a94968b..3655508f95fd 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -18,3 +18,4 @@ CONFIG_CRYPTO_HMAC=m CONFIG_CRYPTO_SHA256=m CONFIG_VXLAN=y CONFIG_GENEVE=y +CONFIG_NET_CLS_FLOWER=m diff --git a/tools/testing/selftests/bpf/flow_dissector_load.c b/tools/testing/selftests/bpf/flow_dissector_load.c new file mode 100644 index 000000000000..d3273b5b3173 --- /dev/null +++ b/tools/testing/selftests/bpf/flow_dissector_load.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector"; +const char *cfg_map_name = "jmp_table"; +bool cfg_attach = true; +char *cfg_section_name; +char *cfg_path_name; + +static void load_and_attach_program(void) +{ + struct bpf_program *prog, *main_prog; + struct bpf_map *prog_array; + int i, fd, prog_fd, ret; + struct bpf_object *obj; + int prog_array_fd; + + ret = bpf_prog_load(cfg_path_name, BPF_PROG_TYPE_FLOW_DISSECTOR, &obj, + &prog_fd); + if (ret) + error(1, 0, "bpf_prog_load %s", cfg_path_name); + + main_prog = bpf_object__find_program_by_title(obj, cfg_section_name); + if (!main_prog) + error(1, 0, "bpf_object__find_program_by_title %s", + cfg_section_name); + + prog_fd = bpf_program__fd(main_prog); + if (prog_fd < 0) + error(1, 0, "bpf_program__fd"); + + prog_array = bpf_object__find_map_by_name(obj, cfg_map_name); + if (!prog_array) + error(1, 0, "bpf_object__find_map_by_name %s", cfg_map_name); + + prog_array_fd = bpf_map__fd(prog_array); + if (prog_array_fd < 0) + error(1, 0, "bpf_map__fd %s", cfg_map_name); + + i = 0; + bpf_object__for_each_program(prog, obj) { + fd = bpf_program__fd(prog); + if (fd < 0) + error(1, 0, "bpf_program__fd"); + + if (fd != prog_fd) { + printf("%d: %s\n", i, bpf_program__title(prog, false)); + bpf_map_update_elem(prog_array_fd, &i, &fd, BPF_ANY); + ++i; + } + } + + ret = bpf_prog_attach(prog_fd, 0 /* Ignore */, BPF_FLOW_DISSECTOR, 0); + if (ret) + error(1, 0, "bpf_prog_attach %s", cfg_path_name); + + ret = bpf_object__pin(obj, cfg_pin_path); + if (ret) + error(1, 0, "bpf_object__pin %s", cfg_pin_path); + +} + +static void detach_program(void) +{ + char command[64]; + int ret; + + ret = bpf_prog_detach(0, BPF_FLOW_DISSECTOR); + if (ret) + error(1, 0, "bpf_prog_detach"); + + /* To unpin, it is necessary and sufficient to just remove this dir */ + sprintf(command, "rm -r %s", cfg_pin_path); + ret = system(command); + if (ret) + error(1, errno, command); +} + +static void parse_opts(int argc, char **argv) +{ + bool attach = false; + bool detach = false; + int c; + + while ((c = getopt(argc, argv, "adp:s:")) != -1) { + switch (c) { + case 'a': + if (detach) + error(1, 0, "attach/detach are exclusive"); + attach = true; + break; + case 'd': + if (attach) + error(1, 0, "attach/detach are exclusive"); + detach = true; + break; + case 'p': + if (cfg_path_name) + error(1, 0, "only one prog name can be given"); + + cfg_path_name = optarg; + break; + case 's': + if (cfg_section_name) + error(1, 0, "only one section can be given"); + + cfg_section_name = optarg; + break; + } + } + + if (detach) + cfg_attach = false; + + if (cfg_attach && !cfg_path_name) + error(1, 0, "must provide a path to the BPF program"); + + if (cfg_attach && !cfg_section_name) + error(1, 0, "must provide a section name"); +} + +int main(int argc, char **argv) +{ + parse_opts(argc, argv); + if (cfg_attach) + load_and_attach_program(); + else + detach_program(); + return 0; +} diff --git a/tools/testing/selftests/bpf/test_flow_dissector.c b/tools/testing/selftests/bpf/test_flow_dissector.c new file mode 100644 index 000000000000..12b784afba31 --- /dev/null +++ b/tools/testing/selftests/bpf/test_flow_dissector.c @@ -0,0 +1,782 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Inject packets with all sorts of encapsulation into the kernel. + * + * IPv4/IPv6 outer layer 3 + * GRE/GUE/BARE outer layer 4, where bare is IPIP/SIT/IPv4-in-IPv6/.. + * IPv4/IPv6 inner layer 3 + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CFG_PORT_INNER 8000 + +/* Add some protocol definitions that do not exist in userspace */ + +struct grehdr { + uint16_t unused; + uint16_t protocol; +} __attribute__((packed)); + +struct guehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 hlen:5, + control:1, + version:2; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:2, + control:1, + hlen:5; +#else +#error "Please fix " +#endif + __u8 proto_ctype; + __be16 flags; + }; + __be32 word; + }; +}; + +static uint8_t cfg_dsfield_inner; +static uint8_t cfg_dsfield_outer; +static uint8_t cfg_encap_proto; +static bool cfg_expect_failure = false; +static int cfg_l3_extra = AF_UNSPEC; /* optional SIT prefix */ +static int cfg_l3_inner = AF_UNSPEC; +static int cfg_l3_outer = AF_UNSPEC; +static int cfg_num_pkt = 10; +static int cfg_num_secs = 0; +static char cfg_payload_char = 'a'; +static int cfg_payload_len = 100; +static int cfg_port_gue = 6080; +static bool cfg_only_rx; +static bool cfg_only_tx; +static int cfg_src_port = 9; + +static char buf[ETH_DATA_LEN]; + +#define INIT_ADDR4(name, addr4, port) \ + static struct sockaddr_in name = { \ + .sin_family = AF_INET, \ + .sin_port = __constant_htons(port), \ + .sin_addr.s_addr = __constant_htonl(addr4), \ + }; + +#define INIT_ADDR6(name, addr6, port) \ + static struct sockaddr_in6 name = { \ + .sin6_family = AF_INET6, \ + .sin6_port = __constant_htons(port), \ + .sin6_addr = addr6, \ + }; + +INIT_ADDR4(in_daddr4, INADDR_LOOPBACK, CFG_PORT_INNER) +INIT_ADDR4(in_saddr4, INADDR_LOOPBACK + 2, 0) +INIT_ADDR4(out_daddr4, INADDR_LOOPBACK, 0) +INIT_ADDR4(out_saddr4, INADDR_LOOPBACK + 1, 0) +INIT_ADDR4(extra_daddr4, INADDR_LOOPBACK, 0) +INIT_ADDR4(extra_saddr4, INADDR_LOOPBACK + 1, 0) + +INIT_ADDR6(in_daddr6, IN6ADDR_LOOPBACK_INIT, CFG_PORT_INNER) +INIT_ADDR6(in_saddr6, IN6ADDR_LOOPBACK_INIT, 0) +INIT_ADDR6(out_daddr6, IN6ADDR_LOOPBACK_INIT, 0) +INIT_ADDR6(out_saddr6, IN6ADDR_LOOPBACK_INIT, 0) +INIT_ADDR6(extra_daddr6, IN6ADDR_LOOPBACK_INIT, 0) +INIT_ADDR6(extra_saddr6, IN6ADDR_LOOPBACK_INIT, 0) + +static unsigned long util_gettime(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static void util_printaddr(const char *msg, struct sockaddr *addr) +{ + unsigned long off = 0; + char nbuf[INET6_ADDRSTRLEN]; + + switch (addr->sa_family) { + case PF_INET: + off = __builtin_offsetof(struct sockaddr_in, sin_addr); + break; + case PF_INET6: + off = __builtin_offsetof(struct sockaddr_in6, sin6_addr); + break; + default: + error(1, 0, "printaddr: unsupported family %u\n", + addr->sa_family); + } + + if (!inet_ntop(addr->sa_family, ((void *) addr) + off, nbuf, + sizeof(nbuf))) + error(1, errno, "inet_ntop"); + + fprintf(stderr, "%s: %s\n", msg, nbuf); +} + +static unsigned long add_csum_hword(const uint16_t *start, int num_u16) +{ + unsigned long sum = 0; + int i; + + for (i = 0; i < num_u16; i++) + sum += start[i]; + + return sum; +} + +static uint16_t build_ip_csum(const uint16_t *start, int num_u16, + unsigned long sum) +{ + sum += add_csum_hword(start, num_u16); + + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + + return ~sum; +} + +static void build_ipv4_header(void *header, uint8_t proto, + uint32_t src, uint32_t dst, + int payload_len, uint8_t tos) +{ + struct iphdr *iph = header; + + iph->ihl = 5; + iph->version = 4; + iph->tos = tos; + iph->ttl = 8; + iph->tot_len = htons(sizeof(*iph) + payload_len); + iph->id = htons(1337); + iph->protocol = proto; + iph->saddr = src; + iph->daddr = dst; + iph->check = build_ip_csum((void *) iph, iph->ihl << 1, 0); +} + +static void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield) +{ + uint16_t val, *ptr = (uint16_t *)ip6h; + + val = ntohs(*ptr); + val &= 0xF00F; + val |= ((uint16_t) dsfield) << 4; + *ptr = htons(val); +} + +static void build_ipv6_header(void *header, uint8_t proto, + struct sockaddr_in6 *src, + struct sockaddr_in6 *dst, + int payload_len, uint8_t dsfield) +{ + struct ipv6hdr *ip6h = header; + + ip6h->version = 6; + ip6h->payload_len = htons(payload_len); + ip6h->nexthdr = proto; + ip6h->hop_limit = 8; + ipv6_set_dsfield(ip6h, dsfield); + + memcpy(&ip6h->saddr, &src->sin6_addr, sizeof(ip6h->saddr)); + memcpy(&ip6h->daddr, &dst->sin6_addr, sizeof(ip6h->daddr)); +} + +static uint16_t build_udp_v4_csum(const struct iphdr *iph, + const struct udphdr *udph, + int num_words) +{ + unsigned long pseudo_sum; + int num_u16 = sizeof(iph->saddr); /* halfwords: twice byte len */ + + pseudo_sum = add_csum_hword((void *) &iph->saddr, num_u16); + pseudo_sum += htons(IPPROTO_UDP); + pseudo_sum += udph->len; + return build_ip_csum((void *) udph, num_words, pseudo_sum); +} + +static uint16_t build_udp_v6_csum(const struct ipv6hdr *ip6h, + const struct udphdr *udph, + int num_words) +{ + unsigned long pseudo_sum; + int num_u16 = sizeof(ip6h->saddr); /* halfwords: twice byte len */ + + pseudo_sum = add_csum_hword((void *) &ip6h->saddr, num_u16); + pseudo_sum += htons(ip6h->nexthdr); + pseudo_sum += ip6h->payload_len; + return build_ip_csum((void *) udph, num_words, pseudo_sum); +} + +static void build_udp_header(void *header, int payload_len, + uint16_t dport, int family) +{ + struct udphdr *udph = header; + int len = sizeof(*udph) + payload_len; + + udph->source = htons(cfg_src_port); + udph->dest = htons(dport); + udph->len = htons(len); + udph->check = 0; + if (family == AF_INET) + udph->check = build_udp_v4_csum(header - sizeof(struct iphdr), + udph, len >> 1); + else + udph->check = build_udp_v6_csum(header - sizeof(struct ipv6hdr), + udph, len >> 1); +} + +static void build_gue_header(void *header, uint8_t proto) +{ + struct guehdr *gueh = header; + + gueh->proto_ctype = proto; +} + +static void build_gre_header(void *header, uint16_t proto) +{ + struct grehdr *greh = header; + + greh->protocol = htons(proto); +} + +static int l3_length(int family) +{ + if (family == AF_INET) + return sizeof(struct iphdr); + else + return sizeof(struct ipv6hdr); +} + +static int build_packet(void) +{ + int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0; + int el3_len = 0; + + if (cfg_l3_extra) + el3_len = l3_length(cfg_l3_extra); + + /* calculate header offsets */ + if (cfg_encap_proto) { + ol3_len = l3_length(cfg_l3_outer); + + if (cfg_encap_proto == IPPROTO_GRE) + ol4_len = sizeof(struct grehdr); + else if (cfg_encap_proto == IPPROTO_UDP) + ol4_len = sizeof(struct udphdr) + sizeof(struct guehdr); + } + + il3_len = l3_length(cfg_l3_inner); + il4_len = sizeof(struct udphdr); + + if (el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len >= + sizeof(buf)) + error(1, 0, "packet too large\n"); + + /* + * Fill packet from inside out, to calculate correct checksums. + * But create ip before udp headers, as udp uses ip for pseudo-sum. + */ + memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len, + cfg_payload_char, cfg_payload_len); + + /* add zero byte for udp csum padding */ + buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len] = 0; + + switch (cfg_l3_inner) { + case PF_INET: + build_ipv4_header(buf + el3_len + ol3_len + ol4_len, + IPPROTO_UDP, + in_saddr4.sin_addr.s_addr, + in_daddr4.sin_addr.s_addr, + il4_len + cfg_payload_len, + cfg_dsfield_inner); + break; + case PF_INET6: + build_ipv6_header(buf + el3_len + ol3_len + ol4_len, + IPPROTO_UDP, + &in_saddr6, &in_daddr6, + il4_len + cfg_payload_len, + cfg_dsfield_inner); + break; + } + + build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len, + cfg_payload_len, CFG_PORT_INNER, cfg_l3_inner); + + if (!cfg_encap_proto) + return il3_len + il4_len + cfg_payload_len; + + switch (cfg_l3_outer) { + case PF_INET: + build_ipv4_header(buf + el3_len, cfg_encap_proto, + out_saddr4.sin_addr.s_addr, + out_daddr4.sin_addr.s_addr, + ol4_len + il3_len + il4_len + cfg_payload_len, + cfg_dsfield_outer); + break; + case PF_INET6: + build_ipv6_header(buf + el3_len, cfg_encap_proto, + &out_saddr6, &out_daddr6, + ol4_len + il3_len + il4_len + cfg_payload_len, + cfg_dsfield_outer); + break; + } + + switch (cfg_encap_proto) { + case IPPROTO_UDP: + build_gue_header(buf + el3_len + ol3_len + ol4_len - + sizeof(struct guehdr), + cfg_l3_inner == PF_INET ? IPPROTO_IPIP + : IPPROTO_IPV6); + build_udp_header(buf + el3_len + ol3_len, + sizeof(struct guehdr) + il3_len + il4_len + + cfg_payload_len, + cfg_port_gue, cfg_l3_outer); + break; + case IPPROTO_GRE: + build_gre_header(buf + el3_len + ol3_len, + cfg_l3_inner == PF_INET ? ETH_P_IP + : ETH_P_IPV6); + break; + } + + switch (cfg_l3_extra) { + case PF_INET: + build_ipv4_header(buf, + cfg_l3_outer == PF_INET ? IPPROTO_IPIP + : IPPROTO_IPV6, + extra_saddr4.sin_addr.s_addr, + extra_daddr4.sin_addr.s_addr, + ol3_len + ol4_len + il3_len + il4_len + + cfg_payload_len, 0); + break; + case PF_INET6: + build_ipv6_header(buf, + cfg_l3_outer == PF_INET ? IPPROTO_IPIP + : IPPROTO_IPV6, + &extra_saddr6, &extra_daddr6, + ol3_len + ol4_len + il3_len + il4_len + + cfg_payload_len, 0); + break; + } + + return el3_len + ol3_len + ol4_len + il3_len + il4_len + + cfg_payload_len; +} + +/* sender transmits encapsulated over RAW or unencap'd over UDP */ +static int setup_tx(void) +{ + int family, fd, ret; + + if (cfg_l3_extra) + family = cfg_l3_extra; + else if (cfg_l3_outer) + family = cfg_l3_outer; + else + family = cfg_l3_inner; + + fd = socket(family, SOCK_RAW, IPPROTO_RAW); + if (fd == -1) + error(1, errno, "socket tx"); + + if (cfg_l3_extra) { + if (cfg_l3_extra == PF_INET) + ret = connect(fd, (void *) &extra_daddr4, + sizeof(extra_daddr4)); + else + ret = connect(fd, (void *) &extra_daddr6, + sizeof(extra_daddr6)); + if (ret) + error(1, errno, "connect tx"); + } else if (cfg_l3_outer) { + /* connect to destination if not encapsulated */ + if (cfg_l3_outer == PF_INET) + ret = connect(fd, (void *) &out_daddr4, + sizeof(out_daddr4)); + else + ret = connect(fd, (void *) &out_daddr6, + sizeof(out_daddr6)); + if (ret) + error(1, errno, "connect tx"); + } else { + /* otherwise using loopback */ + if (cfg_l3_inner == PF_INET) + ret = connect(fd, (void *) &in_daddr4, + sizeof(in_daddr4)); + else + ret = connect(fd, (void *) &in_daddr6, + sizeof(in_daddr6)); + if (ret) + error(1, errno, "connect tx"); + } + + return fd; +} + +/* receiver reads unencapsulated UDP */ +static int setup_rx(void) +{ + int fd, ret; + + fd = socket(cfg_l3_inner, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket rx"); + + if (cfg_l3_inner == PF_INET) + ret = bind(fd, (void *) &in_daddr4, sizeof(in_daddr4)); + else + ret = bind(fd, (void *) &in_daddr6, sizeof(in_daddr6)); + if (ret) + error(1, errno, "bind rx"); + + return fd; +} + +static int do_tx(int fd, const char *pkt, int len) +{ + int ret; + + ret = write(fd, pkt, len); + if (ret == -1) + error(1, errno, "send"); + if (ret != len) + error(1, errno, "send: len (%d < %d)\n", ret, len); + + return 1; +} + +static int do_poll(int fd, short events, int timeout) +{ + struct pollfd pfd; + int ret; + + pfd.fd = fd; + pfd.events = events; + + ret = poll(&pfd, 1, timeout); + if (ret == -1) + error(1, errno, "poll"); + if (ret && !(pfd.revents & POLLIN)) + error(1, errno, "poll: unexpected event 0x%x\n", pfd.revents); + + return ret; +} + +static int do_rx(int fd) +{ + char rbuf; + int ret, num = 0; + + while (1) { + ret = recv(fd, &rbuf, 1, MSG_DONTWAIT); + if (ret == -1 && errno == EAGAIN) + break; + if (ret == -1) + error(1, errno, "recv"); + if (rbuf != cfg_payload_char) + error(1, 0, "recv: payload mismatch"); + num++; + }; + + return num; +} + +static int do_main(void) +{ + unsigned long tstop, treport, tcur; + int fdt = -1, fdr = -1, len, tx = 0, rx = 0; + + if (!cfg_only_tx) + fdr = setup_rx(); + if (!cfg_only_rx) + fdt = setup_tx(); + + len = build_packet(); + + tcur = util_gettime(); + treport = tcur + 1000; + tstop = tcur + (cfg_num_secs * 1000); + + while (1) { + if (!cfg_only_rx) + tx += do_tx(fdt, buf, len); + + if (!cfg_only_tx) + rx += do_rx(fdr); + + if (cfg_num_secs) { + tcur = util_gettime(); + if (tcur >= tstop) + break; + if (tcur >= treport) { + fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx); + tx = 0; + rx = 0; + treport = tcur + 1000; + } + } else { + if (tx == cfg_num_pkt) + break; + } + } + + /* read straggler packets, if any */ + if (rx < tx) { + tstop = util_gettime() + 100; + while (rx < tx) { + tcur = util_gettime(); + if (tcur >= tstop) + break; + + do_poll(fdr, POLLIN, tstop - tcur); + rx += do_rx(fdr); + } + } + + fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx); + + if (fdr != -1 && close(fdr)) + error(1, errno, "close rx"); + if (fdt != -1 && close(fdt)) + error(1, errno, "close tx"); + + /* + * success (== 0) only if received all packets + * unless failure is expected, in which case none must arrive. + */ + if (cfg_expect_failure) + return rx != 0; + else + return rx != tx; +} + + +static void __attribute__((noreturn)) usage(const char *filepath) +{ + fprintf(stderr, "Usage: %s [-e gre|gue|bare|none] [-i 4|6] [-l len] " + "[-O 4|6] [-o 4|6] [-n num] [-t secs] [-R] [-T] " + "[-s [-d ] [-S ] [-D ] " + "[-x ] [-X ] [-f ] [-F]\n", + filepath); + exit(1); +} + +static void parse_addr(int family, void *addr, const char *optarg) +{ + int ret; + + ret = inet_pton(family, optarg, addr); + if (ret == -1) + error(1, errno, "inet_pton"); + if (ret == 0) + error(1, 0, "inet_pton: bad string"); +} + +static void parse_addr4(struct sockaddr_in *addr, const char *optarg) +{ + parse_addr(AF_INET, &addr->sin_addr, optarg); +} + +static void parse_addr6(struct sockaddr_in6 *addr, const char *optarg) +{ + parse_addr(AF_INET6, &addr->sin6_addr, optarg); +} + +static int parse_protocol_family(const char *filepath, const char *optarg) +{ + if (!strcmp(optarg, "4")) + return PF_INET; + if (!strcmp(optarg, "6")) + return PF_INET6; + + usage(filepath); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "d:D:e:f:Fhi:l:n:o:O:Rs:S:t:Tx:X:")) != -1) { + switch (c) { + case 'd': + if (cfg_l3_outer == AF_UNSPEC) + error(1, 0, "-d must be preceded by -o"); + if (cfg_l3_outer == AF_INET) + parse_addr4(&out_daddr4, optarg); + else + parse_addr6(&out_daddr6, optarg); + break; + case 'D': + if (cfg_l3_inner == AF_UNSPEC) + error(1, 0, "-D must be preceded by -i"); + if (cfg_l3_inner == AF_INET) + parse_addr4(&in_daddr4, optarg); + else + parse_addr6(&in_daddr6, optarg); + break; + case 'e': + if (!strcmp(optarg, "gre")) + cfg_encap_proto = IPPROTO_GRE; + else if (!strcmp(optarg, "gue")) + cfg_encap_proto = IPPROTO_UDP; + else if (!strcmp(optarg, "bare")) + cfg_encap_proto = IPPROTO_IPIP; + else if (!strcmp(optarg, "none")) + cfg_encap_proto = IPPROTO_IP; /* == 0 */ + else + usage(argv[0]); + break; + case 'f': + cfg_src_port = strtol(optarg, NULL, 0); + break; + case 'F': + cfg_expect_failure = true; + break; + case 'h': + usage(argv[0]); + break; + case 'i': + if (!strcmp(optarg, "4")) + cfg_l3_inner = PF_INET; + else if (!strcmp(optarg, "6")) + cfg_l3_inner = PF_INET6; + else + usage(argv[0]); + break; + case 'l': + cfg_payload_len = strtol(optarg, NULL, 0); + break; + case 'n': + cfg_num_pkt = strtol(optarg, NULL, 0); + break; + case 'o': + cfg_l3_outer = parse_protocol_family(argv[0], optarg); + break; + case 'O': + cfg_l3_extra = parse_protocol_family(argv[0], optarg); + break; + case 'R': + cfg_only_rx = true; + break; + case 's': + if (cfg_l3_outer == AF_INET) + parse_addr4(&out_saddr4, optarg); + else + parse_addr6(&out_saddr6, optarg); + break; + case 'S': + if (cfg_l3_inner == AF_INET) + parse_addr4(&in_saddr4, optarg); + else + parse_addr6(&in_saddr6, optarg); + break; + case 't': + cfg_num_secs = strtol(optarg, NULL, 0); + break; + case 'T': + cfg_only_tx = true; + break; + case 'x': + cfg_dsfield_outer = strtol(optarg, NULL, 0); + break; + case 'X': + cfg_dsfield_inner = strtol(optarg, NULL, 0); + break; + } + } + + if (cfg_only_rx && cfg_only_tx) + error(1, 0, "options: cannot combine rx-only and tx-only"); + + if (cfg_encap_proto && cfg_l3_outer == AF_UNSPEC) + error(1, 0, "options: must specify outer with encap"); + else if ((!cfg_encap_proto) && cfg_l3_outer != AF_UNSPEC) + error(1, 0, "options: cannot combine no-encap and outer"); + else if ((!cfg_encap_proto) && cfg_l3_extra != AF_UNSPEC) + error(1, 0, "options: cannot combine no-encap and extra"); + + if (cfg_l3_inner == AF_UNSPEC) + cfg_l3_inner = AF_INET6; + if (cfg_l3_inner == AF_INET6 && cfg_encap_proto == IPPROTO_IPIP) + cfg_encap_proto = IPPROTO_IPV6; + + /* RFC 6040 4.2: + * on decap, if outer encountered congestion (CE == 0x3), + * but inner cannot encode ECN (NoECT == 0x0), then drop packet. + */ + if (((cfg_dsfield_outer & 0x3) == 0x3) && + ((cfg_dsfield_inner & 0x3) == 0x0)) + cfg_expect_failure = true; +} + +static void print_opts(void) +{ + if (cfg_l3_inner == PF_INET6) { + util_printaddr("inner.dest6", (void *) &in_daddr6); + util_printaddr("inner.source6", (void *) &in_saddr6); + } else { + util_printaddr("inner.dest4", (void *) &in_daddr4); + util_printaddr("inner.source4", (void *) &in_saddr4); + } + + if (!cfg_l3_outer) + return; + + fprintf(stderr, "encap proto: %u\n", cfg_encap_proto); + + if (cfg_l3_outer == PF_INET6) { + util_printaddr("outer.dest6", (void *) &out_daddr6); + util_printaddr("outer.source6", (void *) &out_saddr6); + } else { + util_printaddr("outer.dest4", (void *) &out_daddr4); + util_printaddr("outer.source4", (void *) &out_saddr4); + } + + if (!cfg_l3_extra) + return; + + if (cfg_l3_outer == PF_INET6) { + util_printaddr("extra.dest6", (void *) &extra_daddr6); + util_printaddr("extra.source6", (void *) &extra_saddr6); + } else { + util_printaddr("extra.dest4", (void *) &extra_daddr4); + util_printaddr("extra.source4", (void *) &extra_saddr4); + } + +} + +int main(int argc, char **argv) +{ + parse_opts(argc, argv); + print_opts(); + return do_main(); +} diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh new file mode 100755 index 000000000000..c0fb073b5eab --- /dev/null +++ b/tools/testing/selftests/bpf/test_flow_dissector.sh @@ -0,0 +1,115 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Load BPF flow dissector and verify it correctly dissects traffic +export TESTNAME=test_flow_dissector +unmount=0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +msg="skip all tests:" +if [ $UID != 0 ]; then + echo $msg please run this as root >&2 + exit $ksft_skip +fi + +# This test needs to be run in a network namespace with in_netns.sh. Check if +# this is the case and run it with in_netns.sh if it is being run in the root +# namespace. +if [[ -z $(ip netns identify $$) ]]; then + ../net/in_netns.sh "$0" "$@" + exit $? +fi + +# Determine selftest success via shell exit code +exit_handler() +{ + if (( $? == 0 )); then + echo "selftests: $TESTNAME [PASS]"; + else + echo "selftests: $TESTNAME [FAILED]"; + fi + + set +e + + # Cleanup + tc filter del dev lo ingress pref 1337 2> /dev/null + tc qdisc del dev lo ingress 2> /dev/null + ./flow_dissector_load -d 2> /dev/null + if [ $unmount -ne 0 ]; then + umount bpffs 2> /dev/null + fi +} + +# Exit script immediately (well catched by trap handler) if any +# program/thing exits with a non-zero status. +set -e + +# (Use 'trap -l' to list meaning of numbers) +trap exit_handler 0 2 3 6 9 + +# Mount BPF file system +if /bin/mount | grep /sys/fs/bpf > /dev/null; then + echo "bpffs already mounted" +else + echo "bpffs not mounted. Mounting..." + unmount=1 + /bin/mount bpffs /sys/fs/bpf -t bpf +fi + +# Attach BPF program +./flow_dissector_load -p bpf_flow.o -s dissect + +# Setup +tc qdisc add dev lo ingress + +echo "Testing IPv4..." +# Drops all IP/UDP packets coming from port 9 +tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \ + udp src_port 9 action drop + +# Send 10 IPv4/UDP packets from port 8. Filter should not drop any. +./test_flow_dissector -i 4 -f 8 +# Send 10 IPv4/UDP packets from port 9. Filter should drop all. +./test_flow_dissector -i 4 -f 9 -F +# Send 10 IPv4/UDP packets from port 10. Filter should not drop any. +./test_flow_dissector -i 4 -f 10 + +echo "Testing IPIP..." +# Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any. +./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \ + -D 192.168.0.1 -S 1.1.1.1 -f 8 +# Send 10 IPv4/IPv4/UDP packets from port 9. Filter should drop all. +./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \ + -D 192.168.0.1 -S 1.1.1.1 -f 9 -F +# Send 10 IPv4/IPv4/UDP packets from port 10. Filter should not drop any. +./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \ + -D 192.168.0.1 -S 1.1.1.1 -f 10 + +echo "Testing IPv4 + GRE..." +# Send 10 IPv4/GRE/IPv4/UDP packets from port 8. Filter should not drop any. +./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \ + -D 192.168.0.1 -S 1.1.1.1 -f 8 +# Send 10 IPv4/GRE/IPv4/UDP packets from port 9. Filter should drop all. +./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \ + -D 192.168.0.1 -S 1.1.1.1 -f 9 -F +# Send 10 IPv4/GRE/IPv4/UDP packets from port 10. Filter should not drop any. +./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \ + -D 192.168.0.1 -S 1.1.1.1 -f 10 + +tc filter del dev lo ingress pref 1337 + +echo "Testing IPv6..." +# Drops all IPv6/UDP packets coming from port 9 +tc filter add dev lo parent ffff: protocol ipv6 pref 1337 flower ip_proto \ + udp src_port 9 action drop + +# Send 10 IPv6/UDP packets from port 8. Filter should not drop any. +./test_flow_dissector -i 6 -f 8 +# Send 10 IPv6/UDP packets from port 9. Filter should drop all. +./test_flow_dissector -i 6 -f 9 -F +# Send 10 IPv6/UDP packets from port 10. Filter should not drop any. +./test_flow_dissector -i 6 -f 10 + +exit 0 diff --git a/tools/testing/selftests/bpf/with_addr.sh b/tools/testing/selftests/bpf/with_addr.sh new file mode 100755 index 000000000000..ffcd3953f94c --- /dev/null +++ b/tools/testing/selftests/bpf/with_addr.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# add private ipv4 and ipv6 addresses to loopback + +readonly V6_INNER='100::a/128' +readonly V4_INNER='192.168.0.1/32' + +if getopts ":s" opt; then + readonly SIT_DEV_NAME='sixtofourtest0' + readonly V6_SIT='2::/64' + readonly V4_SIT='172.17.0.1/32' + shift +fi + +fail() { + echo "error: $*" 1>&2 + exit 1 +} + +setup() { + ip -6 addr add "${V6_INNER}" dev lo || fail 'failed to setup v6 address' + ip -4 addr add "${V4_INNER}" dev lo || fail 'failed to setup v4 address' + + if [[ -n "${V6_SIT}" ]]; then + ip link add "${SIT_DEV_NAME}" type sit remote any local any \ + || fail 'failed to add sit' + ip link set dev "${SIT_DEV_NAME}" up \ + || fail 'failed to bring sit device up' + ip -6 addr add "${V6_SIT}" dev "${SIT_DEV_NAME}" \ + || fail 'failed to setup v6 SIT address' + ip -4 addr add "${V4_SIT}" dev "${SIT_DEV_NAME}" \ + || fail 'failed to setup v4 SIT address' + fi + + sleep 2 # avoid race causing bind to fail +} + +cleanup() { + if [[ -n "${V6_SIT}" ]]; then + ip -4 addr del "${V4_SIT}" dev "${SIT_DEV_NAME}" + ip -6 addr del "${V6_SIT}" dev "${SIT_DEV_NAME}" + ip link del "${SIT_DEV_NAME}" + fi + + ip -4 addr del "${V4_INNER}" dev lo + ip -6 addr del "${V6_INNER}" dev lo +} + +trap cleanup EXIT + +setup +"$@" +exit "$?" diff --git a/tools/testing/selftests/bpf/with_tunnels.sh b/tools/testing/selftests/bpf/with_tunnels.sh new file mode 100755 index 000000000000..e24949ed3a20 --- /dev/null +++ b/tools/testing/selftests/bpf/with_tunnels.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# setup tunnels for flow dissection test + +readonly SUFFIX="test_$(mktemp -u XXXX)" +CONFIG="remote 127.0.0.2 local 127.0.0.1 dev lo" + +setup() { + ip link add "ipip_${SUFFIX}" type ipip ${CONFIG} + ip link add "gre_${SUFFIX}" type gre ${CONFIG} + ip link add "sit_${SUFFIX}" type sit ${CONFIG} + + echo "tunnels before test:" + ip tunnel show + + ip link set "ipip_${SUFFIX}" up + ip link set "gre_${SUFFIX}" up + ip link set "sit_${SUFFIX}" up +} + + +cleanup() { + ip tunnel del "ipip_${SUFFIX}" + ip tunnel del "gre_${SUFFIX}" + ip tunnel del "sit_${SUFFIX}" + + echo "tunnels after test:" + ip tunnel show +} + +trap cleanup EXIT + +setup +"$@" +exit "$?" From 70e88c758a6b8544b5e0d982e55d1e36f9aa0b85 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 14 Sep 2018 12:09:05 -0700 Subject: [PATCH 21/29] selftests/bpf: fix bpf_flow.c build fix the following build error: clang -I. -I./include/uapi -I../../../include/uapi -idirafter /usr/local/include -idirafter /data/users/ast/llvm/bld/lib/clang/7.0.0/include -idirafter /usr/include -Wno-compare-distinct-pointer-types \ -O2 -target bpf -emit-llvm -c bpf_flow.c -o - | \ llc -march=bpf -mcpu=generic -filetype=obj -o /data/users/ast/bpf-next/tools/testing/selftests/bpf/bpf_flow.o LLVM ERROR: 'dissect' label emitted multiple times to assembly file make: *** [/data/users/ast/bpf-next/tools/testing/selftests/bpf/bpf_flow.o] Error 1 Fixes: 9c98b13cc3bb ("flow_dissector: implements eBPF parser") Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_flow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c index 5fb809d95867..107350a7821d 100644 --- a/tools/testing/selftests/bpf/bpf_flow.c +++ b/tools/testing/selftests/bpf/bpf_flow.c @@ -117,7 +117,7 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto) } SEC("dissect") -int dissect(struct __sk_buff *skb) +int _dissect(struct __sk_buff *skb) { if (!skb->vlan_present) return parse_eth_proto(skb, skb->protocol); From 7900efc19214e326913dc0f0e8ded24adc0018f2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 17 Sep 2018 16:13:00 -0700 Subject: [PATCH 22/29] tools/bpf: bpftool: improve output format for bpftool net This is a followup patch for Commit f6f3bac08ff9 ("tools/bpf: bpftool: add net support"). Some improvements are made for the bpftool net output. Specially, plain output is more concise such that per attachment should nicely fit in one line. Compared to previous output, the prog tag is removed since it can be easily obtained with program id. Similar to xdp attachments, the device name is added to tc attachments. The bpf program attached through shared block mechanism is supported as well. $ ip link add dev v1 type veth peer name v2 $ tc qdisc add dev v1 ingress_block 10 egress_block 20 clsact $ tc qdisc add dev v2 ingress_block 10 egress_block 20 clsact $ tc filter add block 10 protocol ip prio 25 bpf obj bpf_shared.o sec ingress flowid 1:1 $ tc filter add block 20 protocol ip prio 30 bpf obj bpf_cyclic.o sec classifier flowid 1:1 $ bpftool net xdp: tc: v2(7) clsact/ingress bpf_shared.o:[ingress] id 23 v2(7) clsact/egress bpf_cyclic.o:[classifier] id 24 v1(8) clsact/ingress bpf_shared.o:[ingress] id 23 v1(8) clsact/egress bpf_cyclic.o:[classifier] id 24 The documentation and "bpftool net help" are updated to make it clear that current implementation only supports xdp and tc attachments. For programs attached to cgroups, "bpftool cgroup" can be used to dump attachments. For other programs e.g. sk_{filter,skb,msg,reuseport} and lwt/seg6, iproute2 tools should be used. The new output: $ bpftool net xdp: eth0(2) driver id 198 tc: eth0(2) clsact/ingress fbflow_icmp id 335 act [{icmp_action id 336}] eth0(2) clsact/egress fbflow_egress id 334 $ bpftool -jp net [{ "xdp": [{ "devname": "eth0", "ifindex": 2, "mode": "driver", "id": 198 } ], "tc": [{ "devname": "eth0", "ifindex": 2, "kind": "clsact/ingress", "name": "fbflow_icmp", "id": 335, "act": [{ "name": "icmp_action", "id": 336 } ] },{ "devname": "eth0", "ifindex": 2, "kind": "clsact/egress", "name": "fbflow_egress", "id": 334 } ] } ] Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- .../bpf/bpftool/Documentation/bpftool-net.rst | 76 +++++++------ tools/bpf/bpftool/main.h | 3 +- tools/bpf/bpftool/net.c | 103 ++++++++++++------ tools/bpf/bpftool/netlink_dumper.c | 85 +++++++-------- tools/bpf/bpftool/netlink_dumper.h | 22 ++-- 5 files changed, 160 insertions(+), 129 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index 48a61837a264..408ec30d8872 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -26,9 +26,20 @@ NET COMMANDS DESCRIPTION =========== **bpftool net { show | list } [ dev name ]** - List all networking device driver and tc attachment in the system. + List bpf program attachments in the kernel networking subsystem. - Output will start with all xdp program attachment, followed by + Currently, only device driver xdp attachments and tc filter + classification/action attachments are implemented, i.e., for + program types **BPF_PROG_TYPE_SCHED_CLS**, + **BPF_PROG_TYPE_SCHED_ACT** and **BPF_PROG_TYPE_XDP**. + For programs attached to a particular cgroup, e.g., + **BPF_PROG_TYPE_CGROUP_SKB**, **BPF_PROG_TYPE_CGROUP_SOCK**, + **BPF_PROG_TYPE_SOCK_OPS** and **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + users can use **bpftool cgroup** to dump cgroup attachments. + For sk_{filter, skb, msg, reuseport} and lwt/seg6 + bpf programs, users should consult other tools, e.g., iproute2. + + The current output will start with all xdp program attachments, followed by all tc class/qdisc bpf program attachments. Both xdp programs and tc programs are ordered based on ifindex number. If multiple bpf programs attached to the same networking device through **tc filter**, @@ -61,21 +72,15 @@ EXAMPLES :: - xdp [ - ifindex 2 devname eth0 prog_id 198 - ] - tc_filters [ - ifindex 2 kind qdisc_htb name prefix_matcher.o:[cls_prefix_matcher_htb] - prog_id 111727 tag d08fe3b4319bc2fd act [] - ifindex 2 kind qdisc_clsact_ingress name fbflow_icmp - prog_id 130246 tag 3f265c7f26db62c9 act [] - ifindex 2 kind qdisc_clsact_egress name prefix_matcher.o:[cls_prefix_matcher_clsact] - prog_id 111726 tag 99a197826974c876 - ifindex 2 kind qdisc_clsact_egress name cls_fg_dscp - prog_id 108619 tag dc4630674fd72dcc act [] - ifindex 2 kind qdisc_clsact_egress name fbflow_egress - prog_id 130245 tag 72d2d830d6888d2c - ] + xdp: + eth0(2) driver id 198 + + tc: + eth0(2) htb name prefix_matcher.o:[cls_prefix_matcher_htb] id 111727 act [] + eth0(2) clsact/ingress fbflow_icmp id 130246 act [] + eth0(2) clsact/egress prefix_matcher.o:[cls_prefix_matcher_clsact] id 111726 + eth0(2) clsact/egress cls_fg_dscp id 108619 act [] + eth0(2) clsact/egress fbflow_egress id 130245 | | **# bpftool -jp net** @@ -84,44 +89,45 @@ EXAMPLES [{ "xdp": [{ - "ifindex": 2, "devname": "eth0", - "prog_id": 198 + "ifindex": 2, + "mode": "driver", + "id": 198 } ], - "tc_filters": [{ + "tc": [{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_htb", + "kind": "htb", "name": "prefix_matcher.o:[cls_prefix_matcher_htb]", - "prog_id": 111727, - "tag": "d08fe3b4319bc2fd", + "id": 111727, "act": [] },{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_clsact_ingress", + "kind": "clsact/ingress", "name": "fbflow_icmp", - "prog_id": 130246, - "tag": "3f265c7f26db62c9", + "id": 130246, "act": [] },{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_clsact_egress", + "kind": "clsact/egress", "name": "prefix_matcher.o:[cls_prefix_matcher_clsact]", - "prog_id": 111726, - "tag": "99a197826974c876" + "id": 111726, },{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_clsact_egress", + "kind": "clsact/egress", "name": "cls_fg_dscp", - "prog_id": 108619, - "tag": "dc4630674fd72dcc", + "id": 108619, "act": [] },{ + "devname": "eth0", "ifindex": 2, - "kind": "qdisc_clsact_egress", + "kind": "clsact/egress", "name": "fbflow_egress", - "prog_id": 130245, - "tag": "72d2d830d6888d2c" + "id": 130245, } ] } diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 02dfbcb92a23..40492cdc4e53 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -171,5 +171,6 @@ struct nlattr; struct ifinfomsg; struct tcmsg; int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb); -int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind); +int do_filter_dump(struct tcmsg *ifinfo, struct nlattr **tb, const char *kind, + const char *devname, int ifindex); #endif diff --git a/tools/bpf/bpftool/net.c b/tools/bpf/bpftool/net.c index 77dd73dd9ade..ed205ee57655 100644 --- a/tools/bpf/bpftool/net.c +++ b/tools/bpf/bpftool/net.c @@ -2,6 +2,7 @@ // Copyright (C) 2018 Facebook #define _GNU_SOURCE +#include #include #include #include @@ -17,8 +18,13 @@ #include "main.h" #include "netlink_dumper.h" +struct ip_devname_ifindex { + char devname[64]; + int ifindex; +}; + struct bpf_netdev_t { - int *ifindex_array; + struct ip_devname_ifindex *devices; int used_len; int array_len; int filter_idx; @@ -36,6 +42,12 @@ struct bpf_tcinfo_t { bool is_qdisc; }; +struct bpf_filter_t { + const char *kind; + const char *devname; + int ifindex; +}; + static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) { struct bpf_netdev_t *netinfo = cookie; @@ -45,11 +57,20 @@ static int dump_link_nlmsg(void *cookie, void *msg, struct nlattr **tb) return 0; if (netinfo->used_len == netinfo->array_len) { - netinfo->ifindex_array = realloc(netinfo->ifindex_array, - (netinfo->array_len + 16) * sizeof(int)); + netinfo->devices = realloc(netinfo->devices, + (netinfo->array_len + 16) * + sizeof(struct ip_devname_ifindex)); + if (!netinfo->devices) + return -ENOMEM; + netinfo->array_len += 16; } - netinfo->ifindex_array[netinfo->used_len++] = ifinfo->ifi_index; + netinfo->devices[netinfo->used_len].ifindex = ifinfo->ifi_index; + snprintf(netinfo->devices[netinfo->used_len].devname, + sizeof(netinfo->devices[netinfo->used_len].devname), + "%s", + tb[IFLA_IFNAME] ? nla_getattr_str(tb[IFLA_IFNAME]) : ""); + netinfo->used_len++; return do_xdp_dump(ifinfo, tb); } @@ -71,13 +92,15 @@ static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb) if (tcinfo->used_len == tcinfo->array_len) { tcinfo->handle_array = realloc(tcinfo->handle_array, (tcinfo->array_len + 16) * sizeof(struct tc_kind_handle)); + if (!tcinfo->handle_array) + return -ENOMEM; + tcinfo->array_len += 16; } tcinfo->handle_array[tcinfo->used_len].handle = info->tcm_handle; snprintf(tcinfo->handle_array[tcinfo->used_len].kind, sizeof(tcinfo->handle_array[tcinfo->used_len].kind), - "%s_%s", - tcinfo->is_qdisc ? "qdisc" : "class", + "%s", tb[TCA_KIND] ? nla_getattr_str(tb[TCA_KIND]) : "unknown"); tcinfo->used_len++; @@ -86,60 +109,71 @@ static int dump_class_qdisc_nlmsg(void *cookie, void *msg, struct nlattr **tb) static int dump_filter_nlmsg(void *cookie, void *msg, struct nlattr **tb) { - const char *kind = cookie; + const struct bpf_filter_t *filter_info = cookie; - return do_filter_dump((struct tcmsg *)msg, tb, kind); + return do_filter_dump((struct tcmsg *)msg, tb, filter_info->kind, + filter_info->devname, filter_info->ifindex); } -static int show_dev_tc_bpf(int sock, unsigned int nl_pid, int ifindex) +static int show_dev_tc_bpf(int sock, unsigned int nl_pid, + struct ip_devname_ifindex *dev) { + struct bpf_filter_t filter_info; struct bpf_tcinfo_t tcinfo; - int i, handle, ret; + int i, handle, ret = 0; tcinfo.handle_array = NULL; tcinfo.used_len = 0; tcinfo.array_len = 0; tcinfo.is_qdisc = false; - ret = nl_get_class(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg, + ret = nl_get_class(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg, &tcinfo); if (ret) - return ret; + goto out; tcinfo.is_qdisc = true; - ret = nl_get_qdisc(sock, nl_pid, ifindex, dump_class_qdisc_nlmsg, + ret = nl_get_qdisc(sock, nl_pid, dev->ifindex, dump_class_qdisc_nlmsg, &tcinfo); if (ret) - return ret; + goto out; + filter_info.devname = dev->devname; + filter_info.ifindex = dev->ifindex; for (i = 0; i < tcinfo.used_len; i++) { - ret = nl_get_filter(sock, nl_pid, ifindex, + filter_info.kind = tcinfo.handle_array[i].kind; + ret = nl_get_filter(sock, nl_pid, dev->ifindex, tcinfo.handle_array[i].handle, dump_filter_nlmsg, - tcinfo.handle_array[i].kind); + &filter_info); if (ret) - return ret; + goto out; } /* root, ingress and egress handle */ handle = TC_H_ROOT; - ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, - "root"); + filter_info.kind = "root"; + ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); if (ret) - return ret; + goto out; handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS); - ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, - "qdisc_clsact_ingress"); + filter_info.kind = "clsact/ingress"; + ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); if (ret) - return ret; + goto out; handle = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS); - ret = nl_get_filter(sock, nl_pid, ifindex, handle, dump_filter_nlmsg, - "qdisc_clsact_egress"); + filter_info.kind = "clsact/egress"; + ret = nl_get_filter(sock, nl_pid, dev->ifindex, handle, + dump_filter_nlmsg, &filter_info); if (ret) - return ret; + goto out; +out: + free(tcinfo.handle_array); return 0; } @@ -168,7 +202,7 @@ static int do_show(int argc, char **argv) return -1; } - dev_array.ifindex_array = NULL; + dev_array.devices = NULL; dev_array.used_len = 0; dev_array.array_len = 0; dev_array.filter_idx = filter_idx; @@ -176,15 +210,15 @@ static int do_show(int argc, char **argv) if (json_output) jsonw_start_array(json_wtr); NET_START_OBJECT; - NET_START_ARRAY("xdp", "\n"); + NET_START_ARRAY("xdp", "%s:\n"); ret = nl_get_link(sock, nl_pid, dump_link_nlmsg, &dev_array); NET_END_ARRAY("\n"); if (!ret) { - NET_START_ARRAY("tc_filters", "\n"); + NET_START_ARRAY("tc", "%s:\n"); for (i = 0; i < dev_array.used_len; i++) { ret = show_dev_tc_bpf(sock, nl_pid, - dev_array.ifindex_array[i]); + &dev_array.devices[i]); if (ret) break; } @@ -200,7 +234,7 @@ static int do_show(int argc, char **argv) libbpf_strerror(ret, err_buf, sizeof(err_buf)); fprintf(stderr, "Error: %s\n", err_buf); } - free(dev_array.ifindex_array); + free(dev_array.devices); close(sock); return ret; } @@ -214,7 +248,12 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %s %s { show | list } [dev ]\n" - " %s %s help\n", + " %s %s help\n" + "Note: Only xdp and tc attachments are supported now.\n" + " For progs attached to cgroups, use \"bpftool cgroup\"\n" + " to dump program attachments. For program types\n" + " sk_{filter,skb,msg,reuseport} and lwt/seg6, please\n" + " consult iproute2.\n", bin_name, argv[-2], bin_name, argv[-2]); return 0; diff --git a/tools/bpf/bpftool/netlink_dumper.c b/tools/bpf/bpftool/netlink_dumper.c index e12494fd1d2e..6f5e9cc6836c 100644 --- a/tools/bpf/bpftool/netlink_dumper.c +++ b/tools/bpf/bpftool/netlink_dumper.c @@ -12,12 +12,18 @@ #include "netlink_dumper.h" static void xdp_dump_prog_id(struct nlattr **tb, int attr, - const char *type) + const char *mode, + bool new_json_object) { if (!tb[attr]) return; - NET_DUMP_UINT(type, nla_getattr_u32(tb[attr])) + if (new_json_object) + NET_START_OBJECT + NET_DUMP_STR("mode", " %s", mode); + NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[attr])) + if (new_json_object) + NET_END_OBJECT } static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex, @@ -37,18 +43,26 @@ static int do_xdp_dump_one(struct nlattr *attr, unsigned int ifindex, return 0; NET_START_OBJECT; - NET_DUMP_UINT("ifindex", ifindex); - if (name) - NET_DUMP_STR("devname", name); - - if (tb[IFLA_XDP_PROG_ID]) - NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[IFLA_XDP_PROG_ID])); + NET_DUMP_STR("devname", "%s", name); + NET_DUMP_UINT("ifindex", "(%d)", ifindex); if (mode == XDP_ATTACHED_MULTI) { - xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic_prog_id"); - xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "drv_prog_id"); - xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload_prog_id"); + if (json_output) { + jsonw_name(json_wtr, "multi_attachments"); + jsonw_start_array(json_wtr); + } + xdp_dump_prog_id(tb, IFLA_XDP_SKB_PROG_ID, "generic", true); + xdp_dump_prog_id(tb, IFLA_XDP_DRV_PROG_ID, "driver", true); + xdp_dump_prog_id(tb, IFLA_XDP_HW_PROG_ID, "offload", true); + if (json_output) + jsonw_end_array(json_wtr); + } else if (mode == XDP_ATTACHED_DRV) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "driver", false); + } else if (mode == XDP_ATTACHED_SKB) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "generic", false); + } else if (mode == XDP_ATTACHED_HW) { + xdp_dump_prog_id(tb, IFLA_XDP_PROG_ID, "offload", false); } NET_END_OBJECT_FINAL; @@ -64,26 +78,9 @@ int do_xdp_dump(struct ifinfomsg *ifinfo, struct nlattr **tb) nla_getattr_str(tb[IFLA_IFNAME])); } -static char *hexstring_n2a(const unsigned char *str, int len, - char *buf, int blen) -{ - char *ptr = buf; - int i; - - for (i = 0; i < len; i++) { - if (blen < 3) - break; - sprintf(ptr, "%02x", str[i]); - ptr += 2; - blen -= 2; - } - return buf; -} - static int do_bpf_dump_one_act(struct nlattr *attr) { struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; - char buf[256]; if (nla_parse_nested(tb, TCA_ACT_BPF_MAX, attr, NULL) < 0) return -LIBBPF_ERRNO__NLPARSE; @@ -93,13 +90,11 @@ static int do_bpf_dump_one_act(struct nlattr *attr) NET_START_OBJECT_NESTED2; if (tb[TCA_ACT_BPF_NAME]) - NET_DUMP_STR("name", nla_getattr_str(tb[TCA_ACT_BPF_NAME])); + NET_DUMP_STR("name", "%s", + nla_getattr_str(tb[TCA_ACT_BPF_NAME])); if (tb[TCA_ACT_BPF_ID]) - NET_DUMP_UINT("bpf_id", nla_getattr_u32(tb[TCA_ACT_BPF_ID])); - if (tb[TCA_ACT_BPF_TAG]) - NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_ACT_BPF_TAG]), - nla_len(tb[TCA_ACT_BPF_TAG]), - buf, sizeof(buf))); + NET_DUMP_UINT("id", " id %u", + nla_getattr_u32(tb[TCA_ACT_BPF_ID])); NET_END_OBJECT_NESTED; return 0; } @@ -128,13 +123,13 @@ static int do_bpf_act_dump(struct nlattr *attr) if (nla_parse_nested(tb, TCA_ACT_MAX_PRIO, attr, NULL) < 0) return -LIBBPF_ERRNO__NLPARSE; - NET_START_ARRAY("act", ""); + NET_START_ARRAY("act", " %s ["); for (act = 0; act <= TCA_ACT_MAX_PRIO; act++) { ret = do_dump_one_act(tb[act]); if (ret) break; } - NET_END_ARRAY(" "); + NET_END_ARRAY("] "); return ret; } @@ -142,20 +137,15 @@ static int do_bpf_act_dump(struct nlattr *attr) static int do_bpf_filter_dump(struct nlattr *attr) { struct nlattr *tb[TCA_BPF_MAX + 1]; - char buf[256]; int ret; if (nla_parse_nested(tb, TCA_BPF_MAX, attr, NULL) < 0) return -LIBBPF_ERRNO__NLPARSE; if (tb[TCA_BPF_NAME]) - NET_DUMP_STR("name", nla_getattr_str(tb[TCA_BPF_NAME])); + NET_DUMP_STR("name", " %s", nla_getattr_str(tb[TCA_BPF_NAME])); if (tb[TCA_BPF_ID]) - NET_DUMP_UINT("prog_id", nla_getattr_u32(tb[TCA_BPF_ID])); - if (tb[TCA_BPF_TAG]) - NET_DUMP_STR("tag", hexstring_n2a(nla_data(tb[TCA_BPF_TAG]), - nla_len(tb[TCA_BPF_TAG]), - buf, sizeof(buf))); + NET_DUMP_UINT("id", " id %u", nla_getattr_u32(tb[TCA_BPF_ID])); if (tb[TCA_BPF_ACT]) { ret = do_bpf_act_dump(tb[TCA_BPF_ACT]); if (ret) @@ -165,14 +155,17 @@ static int do_bpf_filter_dump(struct nlattr *attr) return 0; } -int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind) +int do_filter_dump(struct tcmsg *info, struct nlattr **tb, const char *kind, + const char *devname, int ifindex) { int ret = 0; if (tb[TCA_OPTIONS] && strcmp(nla_data(tb[TCA_KIND]), "bpf") == 0) { NET_START_OBJECT; - NET_DUMP_UINT("ifindex", info->tcm_ifindex); - NET_DUMP_STR("kind", kind); + if (devname[0] != '\0') + NET_DUMP_STR("devname", "%s", devname); + NET_DUMP_UINT("ifindex", "(%u)", ifindex); + NET_DUMP_STR("kind", " %s", kind); ret = do_bpf_filter_dump(tb[TCA_OPTIONS]); NET_END_OBJECT_FINAL; } diff --git a/tools/bpf/bpftool/netlink_dumper.h b/tools/bpf/bpftool/netlink_dumper.h index 552d8851ac06..0788cfbbed0e 100644 --- a/tools/bpf/bpftool/netlink_dumper.h +++ b/tools/bpf/bpftool/netlink_dumper.h @@ -50,13 +50,13 @@ fprintf(stderr, "\n"); \ } -#define NET_START_ARRAY(name, newline) \ +#define NET_START_ARRAY(name, fmt_str) \ { \ if (json_output) { \ jsonw_name(json_wtr, name); \ jsonw_start_array(json_wtr); \ } else { \ - fprintf(stderr, "%s [%s", name, newline);\ + fprintf(stderr, fmt_str, name); \ } \ } @@ -65,31 +65,23 @@ if (json_output) \ jsonw_end_array(json_wtr); \ else \ - fprintf(stderr, "]%s", endstr); \ + fprintf(stderr, "%s", endstr); \ } -#define NET_DUMP_UINT(name, val) \ +#define NET_DUMP_UINT(name, fmt_str, val) \ { \ if (json_output) \ jsonw_uint_field(json_wtr, name, val); \ else \ - fprintf(stderr, "%s %d ", name, val); \ + fprintf(stderr, fmt_str, val); \ } -#define NET_DUMP_LLUINT(name, val) \ -{ \ - if (json_output) \ - jsonw_lluint_field(json_wtr, name, val);\ - else \ - fprintf(stderr, "%s %lld ", name, val); \ -} - -#define NET_DUMP_STR(name, str) \ +#define NET_DUMP_STR(name, fmt_str, str) \ { \ if (json_output) \ jsonw_string_field(json_wtr, name, str);\ else \ - fprintf(stderr, "%s %s ", name, str); \ + fprintf(stderr, fmt_str, str); \ } #define NET_DUMP_STR_ONLY(str) \ From 664e7878451f62389b273e204ae8866c85ef1456 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 18 Sep 2018 09:45:34 +0800 Subject: [PATCH 23/29] samples/bpf: remove duplicated includes Remove duplicated includes. Signed-off-by: YueHaibing Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- samples/bpf/bpf_load.c | 1 - samples/bpf/sampleip_user.c | 1 - samples/bpf/test_current_task_under_cgroup_user.c | 1 - 3 files changed, 3 deletions(-) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 904e775d1a44..e6d7e0fe155b 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c index 60c2b73d1b4d..216c7ecbbbe9 100644 --- a/samples/bpf/sampleip_user.c +++ b/samples/bpf/sampleip_user.c @@ -9,7 +9,6 @@ */ #include #include -#include #include #include #include diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c index 4be4874ca2bc..2259f997a26c 100644 --- a/samples/bpf/test_current_task_under_cgroup_user.c +++ b/samples/bpf/test_current_task_under_cgroup_user.c @@ -11,7 +11,6 @@ #include #include #include "bpf_load.h" -#include #include "cgroup_helpers.h" #define CGROUP_PATH "/my-cgroup" From 534e0e52bc23de588e81b5a6f75e10c8c4b189fc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 17 Sep 2018 22:08:13 -0700 Subject: [PATCH 24/29] samples/bpf: fix a compilation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit samples/bpf build failed with the following errors: $ make samples/bpf/ ... HOSTCC samples/bpf/sockex3_user.o /data/users/yhs/work/net-next/samples/bpf/sockex3_user.c:16:8: error: redefinition of ‘struct bpf_flow_keys’ struct bpf_flow_keys { ^ In file included from /data/users/yhs/work/net-next/samples/bpf/sockex3_user.c:4:0: ./usr/include/linux/bpf.h:2338:9: note: originally defined here struct bpf_flow_keys *flow_keys; ^ make[3]: *** [samples/bpf/sockex3_user.o] Error 1 Commit d58e468b1112d ("flow_dissector: implements flow dissector BPF hook") introduced struct bpf_flow_keys in include/uapi/linux/bpf.h and hence caused the naming conflict with samples/bpf/sockex3_user.c. The fix is to rename struct bpf_flow_keys in samples/bpf/sockex3_user.c to flow_keys to avoid the conflict. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- samples/bpf/sockex3_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 5ba3ae9d180b..22f74d0e1493 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -13,7 +13,7 @@ #define PARSE_IP_PROG_FD (prog_fd[0]) #define PROG_ARRAY_FD (map_fd[0]) -struct bpf_flow_keys { +struct flow_keys { __be32 src; __be32 dst; union { @@ -64,7 +64,7 @@ int main(int argc, char **argv) (void) f; for (i = 0; i < 5; i++) { - struct bpf_flow_keys key = {}, next_key; + struct flow_keys key = {}, next_key; struct pair value; sleep(1); From 2dfd184abd38fd72d80715fa8b00c9de78490200 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 18 Sep 2018 16:20:18 -0400 Subject: [PATCH 25/29] flow_dissector: fix build failure without CONFIG_NET If boolean CONFIG_BPF_SYSCALL is enabled, kernel/bpf/syscall.c will call flow_dissector functions from net/core/flow_dissector.c. This causes this build failure if CONFIG_NET is disabled: kernel/bpf/syscall.o: In function `__x64_sys_bpf': syscall.c:(.text+0x3278): undefined reference to `skb_flow_dissector_bpf_prog_attach' syscall.c:(.text+0x3310): undefined reference to `skb_flow_dissector_bpf_prog_detach' kernel/bpf/syscall.o:(.rodata+0x3f0): undefined reference to `flow_dissector_prog_ops' kernel/bpf/verifier.o:(.rodata+0x250): undefined reference to `flow_dissector_verifier_ops' Analogous to other optional BPF program types in syscall.c, add stubs if the relevant functions are not compiled and move the BPF_PROG_TYPE definition in the #ifdef CONFIG_NET block. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Reported-by: Randy Dunlap Signed-off-by: Willem de Bruijn Acked-by: Randy Dunlap # build-tested Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 2 +- include/linux/skbuff.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 22083712dd18..c9bd6fb765b0 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -16,6 +16,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) @@ -32,7 +33,6 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) #endif -BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ce0e863f02a2..76be85ea392a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1194,10 +1194,23 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); +#ifdef CONFIG_NET int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); +#else +static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} +#endif bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, From 32c009798385ce21080beaa87a9b95faad3acd1e Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Thu, 20 Sep 2018 16:52:03 +0900 Subject: [PATCH 26/29] samples/bpf: fix compilation failure following commit: commit d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") added struct bpf_flow_keys which conflicts with the struct with same name in sockex2_kern.c and sockex3_kern.c similar to commit: commit 534e0e52bc23 ("samples/bpf: fix a compilation failure") we tried the rename it "flow_keys" but it also conflicted with struct having same name in include/net/flow_dissector.h. Hence renaming the struct to "flow_key_record". Also, this commit doesn't fix the compilation error completely because the similar struct is present in sockex3_kern.c. Hence renaming it in both files sockex3_user.c and sockex3_kern.c Signed-off-by: Prashant Bhole Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- samples/bpf/sockex2_kern.c | 11 ++++++----- samples/bpf/sockex3_kern.c | 8 ++++---- samples/bpf/sockex3_user.c | 4 ++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index f58acfc92556..f2f9dbc021b0 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -14,7 +14,7 @@ struct vlan_hdr { __be16 h_vlan_encapsulated_proto; }; -struct bpf_flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -59,7 +59,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) } static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, - struct bpf_flow_keys *flow) + struct flow_key_record *flow) { __u64 verlen; @@ -83,7 +83,7 @@ static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto } static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_proto, - struct bpf_flow_keys *flow) + struct flow_key_record *flow) { *ip_proto = load_byte(skb, nhoff + offsetof(struct ipv6hdr, nexthdr)); @@ -96,7 +96,8 @@ static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 *ip_pro return nhoff; } -static inline bool flow_dissector(struct __sk_buff *skb, struct bpf_flow_keys *flow) +static inline bool flow_dissector(struct __sk_buff *skb, + struct flow_key_record *flow) { __u64 nhoff = ETH_HLEN; __u64 ip_proto; @@ -198,7 +199,7 @@ struct bpf_map_def SEC("maps") hash_map = { SEC("socket2") int bpf_prog2(struct __sk_buff *skb) { - struct bpf_flow_keys flow = {}; + struct flow_key_record flow = {}; struct pair *value; u32 key; diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 95907f8d2b17..c527b57d3ec8 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -61,7 +61,7 @@ struct vlan_hdr { __be16 h_vlan_encapsulated_proto; }; -struct bpf_flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -88,7 +88,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) } struct globals { - struct bpf_flow_keys flow; + struct flow_key_record flow; }; struct bpf_map_def SEC("maps") percpu_map = { @@ -114,14 +114,14 @@ struct pair { struct bpf_map_def SEC("maps") hash_map = { .type = BPF_MAP_TYPE_HASH, - .key_size = sizeof(struct bpf_flow_keys), + .key_size = sizeof(struct flow_key_record), .value_size = sizeof(struct pair), .max_entries = 1024, }; static void update_stats(struct __sk_buff *skb, struct globals *g) { - struct bpf_flow_keys key = g->flow; + struct flow_key_record key = g->flow; struct pair *value; value = bpf_map_lookup_elem(&hash_map, &key); diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 22f74d0e1493..9d02e0404719 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -13,7 +13,7 @@ #define PARSE_IP_PROG_FD (prog_fd[0]) #define PROG_ARRAY_FD (map_fd[0]) -struct flow_keys { +struct flow_key_record { __be32 src; __be32 dst; union { @@ -64,7 +64,7 @@ int main(int argc, char **argv) (void) f; for (i = 0; i < 5; i++) { - struct flow_keys key = {}, next_key; + struct flow_key_record key = {}, next_key; struct pair value; sleep(1); From 788758d1fe874fd20ecb0ab490552d94c024a9de Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Thu, 20 Sep 2018 17:46:12 +0800 Subject: [PATCH 27/29] bpf: remove redundant null pointer check before consume_skb consume_skb has taken the null pointer into account. hence it is safe to remove the redundant null pointer check before consume_skb. Signed-off-by: zhong jiang Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 488ef9663c01..a9359cbc3f93 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -590,8 +590,7 @@ static int free_sg(struct sock *sk, int start, if (i == MAX_SKB_FRAGS) i = 0; } - if (md->skb) - consume_skb(md->skb); + consume_skb(md->skb); return free; } @@ -973,8 +972,7 @@ bytes_ready: if (!sg->length && md->sg_start == md->sg_end) { list_del(&md->list); - if (md->skb) - consume_skb(md->skb); + consume_skb(md->skb); kfree(md); } } From 7ea3c40605e9acf12f2dda26e28cd82c0dba6923 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 21 Sep 2018 22:47:20 +0000 Subject: [PATCH 28/29] bpftool: add support for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY maps Add BPF_MAP_TYPE_REUSEPORT_SOCKARRAY map type to the list of maps types which bpftool recognizes. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jakub Kicinski Cc: Yonghong Song Acked-by: Jakub Kicinski Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/map.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index af8ad32fa6e9..e22fbe8b975f 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -71,6 +71,7 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_XSKMAP] = "xskmap", [BPF_MAP_TYPE_SOCKHASH] = "sockhash", [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", + [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray", }; static bool map_is_per_cpu(__u32 type) From d0e13a1488ad30dc3c2c9347b931cb10f892e3a4 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 24 Sep 2018 16:49:57 -0400 Subject: [PATCH 29/29] flow_dissector: lookup netns by skb->sk if skb->dev is NULL BPF flow dissectors are configured per network namespace. __skb_flow_dissect looks up the netns through dev_net(skb->dev). In some dissector paths skb->dev is NULL, such as for Unix sockets. In these cases fall back to looking up the netns by socket. Analyzing the codepaths leading to __skb_flow_dissect I did not find a case where both skb->dev and skb->sk are NULL. Warn and fall back to standard flow dissector if one is found. Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Reported-by: Eric Dumazet Signed-off-by: Willem de Bruijn Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- net/core/flow_dissector.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5c5dd74b5b3b..738c7562e1e0 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -714,7 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; - struct bpf_prog *attached; + struct bpf_prog *attached = NULL; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -755,8 +755,14 @@ bool __skb_flow_dissect(const struct sk_buff *skb, target_container); rcu_read_lock(); - attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog) - : NULL; + if (skb) { + if (skb->dev) + attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog); + else if (skb->sk) + attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog); + else + WARN_ON_ONCE(1); + } if (attached) { /* Note that even though the const qualifier is discarded * throughout the execution of the BPF program, all changes(the