From 16d584d2fc8f4ea36203af45a76becd7093586f1 Mon Sep 17 00:00:00 2001 From: Liang He Date: Wed, 22 Jun 2022 12:06:21 +0800 Subject: [PATCH 01/55] net/dsa/hirschmann: Add missing of_node_get() in hellcreek_led_setup() of_find_node_by_name() will decrease the refcount of its first arg and we need a of_node_get() to keep refcount balance. Fixes: 7d9ee2e8ff15 ("net: dsa: hellcreek: Add PTP status LEDs") Signed-off-by: Liang He Link: https://lore.kernel.org/r/20220622040621.4094304-1-windhl@126.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/hirschmann/hellcreek_ptp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/hirschmann/hellcreek_ptp.c b/drivers/net/dsa/hirschmann/hellcreek_ptp.c index 2572c6087bb5..b28baab6d56a 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_ptp.c +++ b/drivers/net/dsa/hirschmann/hellcreek_ptp.c @@ -300,6 +300,7 @@ static int hellcreek_led_setup(struct hellcreek *hellcreek) const char *label, *state; int ret = -EINVAL; + of_node_get(hellcreek->dev->of_node); leds = of_find_node_by_name(hellcreek->dev->of_node, "leds"); if (!leds) { dev_err(hellcreek->dev, "No LEDs specified in device tree!\n"); From 7c97bc0128b2eecc703106112679a69d446d1a12 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Wed, 22 Jun 2022 20:02:04 -0700 Subject: [PATCH 02/55] net: dsa: bcm_sf2: force pause link settings The pause settings reported by the PHY should also be applied to the GMII port status override otherwise the switch will not generate pause frames towards the link partner despite the advertisement saying otherwise. Fixes: 246d7f773c13 ("net: dsa: add Broadcom SF2 switch driver") Signed-off-by: Doug Berger Signed-off-by: Florian Fainelli Link: https://lore.kernel.org/r/20220623030204.1966851-1-f.fainelli@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/bcm_sf2.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 87e81c636339..be0edfa093d0 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -878,6 +878,11 @@ static void bcm_sf2_sw_mac_link_up(struct dsa_switch *ds, int port, if (duplex == DUPLEX_FULL) reg |= DUPLX_MODE; + if (tx_pause) + reg |= TXFLOW_CNTL; + if (rx_pause) + reg |= RXFLOW_CNTL; + core_writel(priv, reg, offset); } From ad887a507d7386de09a532c5d303ed659eba2cfb Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 22 Jun 2022 13:54:16 +0200 Subject: [PATCH 03/55] net/ncsi: use proper "mellanox" DT vendor prefix "mlx" Devicetree vendor prefix is not documented and instead "mellanox" should be used. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220622115416.7400-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- net/ncsi/ncsi-manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 78814417d753..80713febfac6 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1803,7 +1803,8 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, pdev = to_platform_device(dev->dev.parent); if (pdev) { np = pdev->dev.of_node; - if (np && of_get_property(np, "mlx,multi-host", NULL)) + if (np && (of_get_property(np, "mellanox,multi-host", NULL) || + of_get_property(np, "mlx,multi-host", NULL))) ndp->mlx_multi_host = true; } From 1228b34c8d0ecf6de18c4c95d22f60cc8607c50a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Jun 2022 15:02:20 +0000 Subject: [PATCH 04/55] net: clear msg_get_inq in __sys_recvfrom() and __copy_msghdr_from_user() syzbot reported uninit-value in tcp_recvmsg() [1] Issue here is that msg->msg_get_inq should have been cleared, otherwise tcp_recvmsg() might read garbage and perform more work than needed, or have undefined behavior. Given CONFIG_INIT_STACK_ALL_ZERO=y is probably going to be the default soon, I chose to change __sys_recvfrom() to clear all fields but msghdr.addr which might be not NULL. For __copy_msghdr_from_user(), I added an explicit clear of kmsg->msg_get_inq. [1] BUG: KMSAN: uninit-value in tcp_recvmsg+0x6cf/0xb60 net/ipv4/tcp.c:2557 tcp_recvmsg+0x6cf/0xb60 net/ipv4/tcp.c:2557 inet_recvmsg+0x13a/0x5a0 net/ipv4/af_inet.c:850 sock_recvmsg_nosec net/socket.c:995 [inline] sock_recvmsg net/socket.c:1013 [inline] __sys_recvfrom+0x696/0x900 net/socket.c:2176 __do_sys_recvfrom net/socket.c:2194 [inline] __se_sys_recvfrom net/socket.c:2190 [inline] __x64_sys_recvfrom+0x122/0x1c0 net/socket.c:2190 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Local variable msg created at: __sys_recvfrom+0x81/0x900 net/socket.c:2154 __do_sys_recvfrom net/socket.c:2194 [inline] __se_sys_recvfrom net/socket.c:2190 [inline] __x64_sys_recvfrom+0x122/0x1c0 net/socket.c:2190 CPU: 0 PID: 3493 Comm: syz-executor170 Not tainted 5.19.0-rc3-syzkaller-30868-g4b28366af7d9 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: f94fd25cb0aa ("tcp: pass back data left in socket after receive") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Jens Axboe Tested-by: Alexander Potapenko Link: https://lore.kernel.org/r/20220622150220.1091182-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/socket.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/net/socket.c b/net/socket.c index 2bc8773d9dc5..96300cdc0625 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2149,10 +2149,13 @@ SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, struct sockaddr __user *addr, int __user *addr_len) { + struct sockaddr_storage address; + struct msghdr msg = { + /* Save some cycles and don't copy the address if not needed */ + .msg_name = addr ? (struct sockaddr *)&address : NULL, + }; struct socket *sock; struct iovec iov; - struct msghdr msg; - struct sockaddr_storage address; int err, err2; int fput_needed; @@ -2163,14 +2166,6 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags, if (!sock) goto out; - msg.msg_control = NULL; - msg.msg_controllen = 0; - /* Save some cycles and don't copy the address if not needed */ - msg.msg_name = addr ? (struct sockaddr *)&address : NULL; - /* We assume all kernel code knows the size of sockaddr_storage */ - msg.msg_namelen = 0; - msg.msg_iocb = NULL; - msg.msg_flags = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; err = sock_recvmsg(sock, &msg, flags); @@ -2375,6 +2370,7 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, return -EFAULT; kmsg->msg_control_is_user = true; + kmsg->msg_get_inq = 0; kmsg->msg_control_user = msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; From b968080808f7f28b89aa495b7402ba48eb17ee93 Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Wed, 22 Jun 2022 17:02:34 -0700 Subject: [PATCH 05/55] selftests/net: pass ipv6_args to udpgso_bench's IPv6 TCP test udpgso_bench.sh has been running its IPv6 TCP test with IPv4 arguments since its initial conmit. Looks like a typo. Fixes: 3a687bef148d ("selftests: udp gso benchmark") Cc: willemb@google.com Signed-off-by: Dimitris Michailidis Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/20220623000234.61774-1-dmichail@fungible.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/udpgso_bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh index 80b5d352702e..dc932fd65363 100755 --- a/tools/testing/selftests/net/udpgso_bench.sh +++ b/tools/testing/selftests/net/udpgso_bench.sh @@ -120,7 +120,7 @@ run_all() { run_udp "${ipv4_args}" echo "ipv6" - run_tcp "${ipv4_args}" + run_tcp "${ipv6_args}" run_udp "${ipv6_args}" } From 935336c191040809bf5739654ea337c13fe8f9af Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 23 Jun 2022 11:12:31 +0200 Subject: [PATCH 06/55] selftests/bpf: Test sockmap update when socket has ULP Cover the scenario when we cannot insert a socket into the sockmap, because it has it is using ULP. Failed insert should not have any effect on the ULP state. This is a regression test. Signed-off-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220623091231.417138-1-jakub@cloudflare.com Signed-off-by: Jakub Kicinski --- .../selftests/bpf/prog_tests/sockmap_ktls.c | 84 +++++++++++++++++-- 1 file changed, 75 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c index af293ea1542c..e172d89e92e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -4,6 +4,7 @@ * Tests for sockmap/sockhash holding kTLS sockets. */ +#include #include "test_progs.h" #define MAX_TEST_NAME 80 @@ -92,9 +93,78 @@ close_srv: close(srv); } +static void test_sockmap_ktls_update_fails_when_sock_has_ulp(int family, int map) +{ + struct sockaddr_storage addr = {}; + socklen_t len = sizeof(addr); + struct sockaddr_in6 *v6; + struct sockaddr_in *v4; + int err, s, zero = 0; + + switch (family) { + case AF_INET: + v4 = (struct sockaddr_in *)&addr; + v4->sin_family = AF_INET; + break; + case AF_INET6: + v6 = (struct sockaddr_in6 *)&addr; + v6->sin6_family = AF_INET6; + break; + default: + PRINT_FAIL("unsupported socket family %d", family); + return; + } + + s = socket(family, SOCK_STREAM, 0); + if (!ASSERT_GE(s, 0, "socket")) + return; + + err = bind(s, (struct sockaddr *)&addr, len); + if (!ASSERT_OK(err, "bind")) + goto close; + + err = getsockname(s, (struct sockaddr *)&addr, &len); + if (!ASSERT_OK(err, "getsockname")) + goto close; + + err = connect(s, (struct sockaddr *)&addr, len); + if (!ASSERT_OK(err, "connect")) + goto close; + + /* save sk->sk_prot and set it to tls_prots */ + err = setsockopt(s, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls")); + if (!ASSERT_OK(err, "setsockopt(TCP_ULP)")) + goto close; + + /* sockmap update should not affect saved sk_prot */ + err = bpf_map_update_elem(map, &zero, &s, BPF_ANY); + if (!ASSERT_ERR(err, "sockmap update elem")) + goto close; + + /* call sk->sk_prot->setsockopt to dispatch to saved sk_prot */ + err = setsockopt(s, IPPROTO_TCP, TCP_NODELAY, &zero, sizeof(zero)); + ASSERT_OK(err, "setsockopt(TCP_NODELAY)"); + +close: + close(s); +} + +static const char *fmt_test_name(const char *subtest_name, int family, + enum bpf_map_type map_type) +{ + const char *map_type_str = BPF_MAP_TYPE_SOCKMAP ? "SOCKMAP" : "SOCKHASH"; + const char *family_str = AF_INET ? "IPv4" : "IPv6"; + static char test_name[MAX_TEST_NAME]; + + snprintf(test_name, MAX_TEST_NAME, + "sockmap_ktls %s %s %s", + subtest_name, family_str, map_type_str); + + return test_name; +} + static void run_tests(int family, enum bpf_map_type map_type) { - char test_name[MAX_TEST_NAME]; int map; map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL); @@ -103,14 +173,10 @@ static void run_tests(int family, enum bpf_map_type map_type) return; } - snprintf(test_name, MAX_TEST_NAME, - "sockmap_ktls disconnect_after_delete %s %s", - family == AF_INET ? "IPv4" : "IPv6", - map_type == BPF_MAP_TYPE_SOCKMAP ? "SOCKMAP" : "SOCKHASH"); - if (!test__start_subtest(test_name)) - return; - - test_sockmap_ktls_disconnect_after_delete(family, map); + if (test__start_subtest(fmt_test_name("disconnect_after_delete", family, map_type))) + test_sockmap_ktls_disconnect_after_delete(family, map); + if (test__start_subtest(fmt_test_name("update_fails_when_sock_has_ulp", family, map_type))) + test_sockmap_ktls_update_fails_when_sock_has_ulp(family, map); close(map); } From 6f0012e35160cd08a53e46e3b3bbf724b92dfe68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Jun 2022 05:04:36 +0000 Subject: [PATCH 07/55] tcp: add a missing nf_reset_ct() in 3WHS handling When the third packet of 3WHS connection establishment contains payload, it is added into socket receive queue without the XFRM check and the drop of connection tracking context. This means that if the data is left unread in the socket receive queue, conntrack module can not be unloaded. As most applications usually reads the incoming data immediately after accept(), bug has been hiding for quite a long time. Commit 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists") exposed this bug because even if the application reads this data, the skb with nfct state could stay in a per-cpu cache for an arbitrary time, if said cpu no longer process RX softirqs. Many thanks to Ilya Maximets for reporting this issue, and for testing various patches: https://lore.kernel.org/netdev/20220619003919.394622-1-i.maximets@ovn.org/ Note that I also added a missing xfrm4_policy_check() call, although this is probably not a big issue, as the SYN packet should have been dropped earlier. Fixes: b59c270104f0 ("[NETFILTER]: Keep conntrack reference until IPsec policy checks are done") Reported-by: Ilya Maximets Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Pablo Neira Ayuso Cc: Steffen Klassert Tested-by: Ilya Maximets Reviewed-by: Ilya Maximets Link: https://lore.kernel.org/r/20220623050436.1290307-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fe8f23b95d32..da5a3c44c4fb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1964,7 +1964,10 @@ process: struct sock *nsk; sk = req->rsk_listener; - drop_reason = tcp_inbound_md5_hash(sk, skb, + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + drop_reason = SKB_DROP_REASON_XFRM_POLICY; + else + drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { @@ -2016,6 +2019,7 @@ process: } goto discard_and_relse; } + nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v4_restore_cb(skb); From 3b9bc84d311104906d2b4995a9a02d7b7ddab2db Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 22 Jun 2022 21:20:39 -0700 Subject: [PATCH 08/55] net: tun: unlink NAPI from device on destruction Syzbot found a race between tun file and device destruction. NAPIs live in struct tun_file which can get destroyed before the netdev so we have to del them explicitly. The current code is missing deleting the NAPI if the queue was detached first. Fixes: 943170998b20 ("tun: enable NAPI for TUN/TAP driver") Reported-by: syzbot+b75c138e9286ac742647@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20220623042039.2274708-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 87a635aac008..7fd0288c3789 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -727,6 +727,7 @@ static void tun_detach_all(struct net_device *dev) sock_put(&tfile->sk); } list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) { + tun_napi_del(tfile); tun_enable_queue(tfile); tun_queue_purge(tfile); xdp_rxq_info_unreg(&tfile->xdp_rxq); From c96614eeab663646f57f67aa591e015abd8bd0ba Mon Sep 17 00:00:00 2001 From: Enguerrand de Ribaucourt Date: Thu, 23 Jun 2022 15:46:44 +0200 Subject: [PATCH 09/55] net: dp83822: disable false carrier interrupt When unplugging an Ethernet cable, false carrier events were produced by the PHY at a very high rate. Once the false carrier counter full, an interrupt was triggered every few clock cycles until the cable was replugged. This resulted in approximately 10k/s interrupts. Since the false carrier counter (FCSCR) is never used, we can safely disable this interrupt. In addition to improving performance, this also solved MDIO read timeouts I was randomly encountering with an i.MX8 fec MAC because of the interrupt flood. The interrupt count and MDIO timeout fix were tested on a v5.4.110 kernel. Fixes: 87461f7a58ab ("net: phy: DP83822 initial driver submission") Signed-off-by: Enguerrand de Ribaucourt Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index e6ad3a494d32..95ef507053a6 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -230,7 +230,6 @@ static int dp83822_config_intr(struct phy_device *phydev) return misr_status; misr_status |= (DP83822_RX_ERR_HF_INT_EN | - DP83822_FALSE_CARRIER_HF_INT_EN | DP83822_LINK_STAT_INT_EN | DP83822_ENERGY_DET_INT_EN | DP83822_LINK_QUAL_INT_EN); From 0e597e2affb90d6ea48df6890d882924acf71e19 Mon Sep 17 00:00:00 2001 From: Enguerrand de Ribaucourt Date: Thu, 23 Jun 2022 15:46:45 +0200 Subject: [PATCH 10/55] net: dp83822: disable rx error interrupt Some RX errors, notably when disconnecting the cable, increase the RCSR register. Once half full (0x7fff), an interrupt flood is generated. I measured ~3k/s interrupts even after the RX errors transfer was stopped. Since we don't read and clear the RCSR register, we should disable this interrupt. Fixes: 87461f7a58ab ("net: phy: DP83822 initial driver submission") Signed-off-by: Enguerrand de Ribaucourt Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 95ef507053a6..8549e0e356c9 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -229,8 +229,7 @@ static int dp83822_config_intr(struct phy_device *phydev) if (misr_status < 0) return misr_status; - misr_status |= (DP83822_RX_ERR_HF_INT_EN | - DP83822_LINK_STAT_INT_EN | + misr_status |= (DP83822_LINK_STAT_INT_EN | DP83822_ENERGY_DET_INT_EN | DP83822_LINK_QUAL_INT_EN); From 3b89b511ea0c705cc418440e2abf9d692a556d84 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 Jun 2022 16:32:32 +0300 Subject: [PATCH 11/55] net: fix IFF_TX_SKB_NO_LINEAR definition The "1<<31" shift has a sign extension bug so IFF_TX_SKB_NO_LINEAR is 0xffffffff80000000 instead of 0x0000000080000000. Fixes: c2ff53d8049f ("net: Add priv_flags for allow tx skb without linear") Signed-off-by: Dan Carpenter Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/YrRrcGttfEVnf85Q@kili Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f615a66c89e9..2563d30736e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1671,7 +1671,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, - IFF_TX_SKB_NO_LINEAR = 1<<31, + IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; From 853a7614880231747040cada91d2b8d2e995c51a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Jun 2022 15:30:20 +0000 Subject: [PATCH 12/55] tunnels: do not assume mac header is set in skb_tunnel_check_pmtu() Recently added debug in commit f9aefd6b2aa3 ("net: warn if mac header was not set") caught a bug in skb_tunnel_check_pmtu(), as shown in this syzbot report [1]. In ndo_start_xmit() paths, there is really no need to use skb->mac_header, because skb->data is supposed to point at it. [1] WARNING: CPU: 1 PID: 8604 at include/linux/skbuff.h:2784 skb_mac_header_len include/linux/skbuff.h:2784 [inline] WARNING: CPU: 1 PID: 8604 at include/linux/skbuff.h:2784 skb_tunnel_check_pmtu+0x5de/0x2f90 net/ipv4/ip_tunnel_core.c:413 Modules linked in: CPU: 1 PID: 8604 Comm: syz-executor.3 Not tainted 5.19.0-rc2-syzkaller-00443-g8720bd951b8e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_mac_header_len include/linux/skbuff.h:2784 [inline] RIP: 0010:skb_tunnel_check_pmtu+0x5de/0x2f90 net/ipv4/ip_tunnel_core.c:413 Code: 00 00 00 00 fc ff df 4c 89 fa 48 c1 ea 03 80 3c 02 00 0f 84 b9 fe ff ff 4c 89 ff e8 7c 0f d7 f9 e9 ac fe ff ff e8 c2 13 8a f9 <0f> 0b e9 28 fc ff ff e8 b6 13 8a f9 48 8b 54 24 70 48 b8 00 00 00 RSP: 0018:ffffc90002e4f520 EFLAGS: 00010212 RAX: 0000000000000324 RBX: ffff88804d5fd500 RCX: ffffc90005b52000 RDX: 0000000000040000 RSI: ffffffff87f05e3e RDI: 0000000000000003 RBP: ffffc90002e4f650 R08: 0000000000000003 R09: 000000000000ffff R10: 000000000000ffff R11: 0000000000000000 R12: 000000000000ffff R13: 0000000000000000 R14: 000000000000ffcd R15: 000000000000001f FS: 00007f3babba9700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000080 CR3: 0000000075319000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: geneve_xmit_skb drivers/net/geneve.c:927 [inline] geneve_xmit+0xcf8/0x35d0 drivers/net/geneve.c:1107 __netdev_start_xmit include/linux/netdevice.h:4805 [inline] netdev_start_xmit include/linux/netdevice.h:4819 [inline] __dev_direct_xmit+0x500/0x730 net/core/dev.c:4309 dev_direct_xmit include/linux/netdevice.h:3007 [inline] packet_direct_xmit+0x1b8/0x2c0 net/packet/af_packet.c:282 packet_snd net/packet/af_packet.c:3073 [inline] packet_sendmsg+0x21f4/0x55d0 net/packet/af_packet.c:3104 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 ____sys_sendmsg+0x6eb/0x810 net/socket.c:2489 ___sys_sendmsg+0xf3/0x170 net/socket.c:2543 __sys_sendmsg net/socket.c:2572 [inline] __do_sys_sendmsg net/socket.c:2581 [inline] __se_sys_sendmsg net/socket.c:2579 [inline] __x64_sys_sendmsg+0x132/0x220 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7f3baaa89109 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f3babba9168 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f3baab9bf60 RCX: 00007f3baaa89109 RDX: 0000000000000000 RSI: 0000000020000a00 RDI: 0000000000000003 RBP: 00007f3baaae305d R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffe74f2543f R14: 00007f3babba9300 R15: 0000000000022000 Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Stefano Brivio Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6b2dc7b2b612..cc1caab4a654 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -410,7 +410,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, u32 mtu = dst_mtu(encap_dst) - headroom; if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) || - (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu)) + (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu)) return 0; skb_dst_update_pmtu_no_confirm(skb, mtu); From cb8092d70a6f5f01ec1490fce4d35efed3ed996c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 24 Jun 2022 12:24:31 -0400 Subject: [PATCH 13/55] tipc: move bc link creation back to tipc_node_create Shuang Li reported a NULL pointer dereference crash: [] BUG: kernel NULL pointer dereference, address: 0000000000000068 [] RIP: 0010:tipc_link_is_up+0x5/0x10 [tipc] [] Call Trace: [] [] tipc_bcast_rcv+0xa2/0x190 [tipc] [] tipc_node_bc_rcv+0x8b/0x200 [tipc] [] tipc_rcv+0x3af/0x5b0 [tipc] [] tipc_udp_recv+0xc7/0x1e0 [tipc] It was caused by the 'l' passed into tipc_bcast_rcv() is NULL. When it creates a node in tipc_node_check_dest(), after inserting the new node into hashtable in tipc_node_create(), it creates the bc link. However, there is a gap between this insert and bc link creation, a bc packet may come in and get the node from the hashtable then try to dereference its bc link, which is NULL. This patch is to fix it by moving the bc link creation before inserting into the hashtable. Note that for a preliminary node becoming "real", the bc link creation should also be called before it's rehashed, as we don't create it for preliminary nodes. Fixes: 4cbf8ac2fe5a ("tipc: enable creating a "preliminary" node") Reported-by: Shuang Li Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index 6ef95ce565bd..b48d97cbbe29 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -472,8 +472,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; - struct tipc_link *l; unsigned long intv; int bearer_id; int i; @@ -488,6 +488,16 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); + if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, + tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), + n->capabilities, &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { + pr_warn("Broadcast rcv link refresh failed, no memory\n"); + tipc_node_write_unlock_fast(n); + tipc_node_put(n); + n = NULL; + goto exit; + } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); @@ -567,7 +577,16 @@ update: n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - n->bc_entry.link = NULL; + if (!preliminary && + !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, + tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), + n->capabilities, &n->bc_entry.inputq1, + &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { + pr_warn("Broadcast rcv link creation failed, no memory\n"); + kfree(n); + n = NULL; + goto exit; + } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ @@ -1155,7 +1174,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, bool *respond, bool *dupl_addr) { struct tipc_node *n; - struct tipc_link *l, *snd_l; + struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; @@ -1175,22 +1194,6 @@ void tipc_node_check_dest(struct net *net, u32 addr, return; tipc_node_write_lock(n); - if (unlikely(!n->bc_entry.link)) { - snd_l = tipc_bc_sndlink(net); - if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, peer_id, U16_MAX, - tipc_link_min_win(snd_l), - tipc_link_max_win(snd_l), - n->capabilities, - &n->bc_entry.inputq1, - &n->bc_entry.namedq, snd_l, - &n->bc_entry.link)) { - pr_warn("Broadcast rcv link creation failed, no mem\n"); - tipc_node_write_unlock_fast(n); - tipc_node_put(n); - return; - } - } le = &n->links[b->identity]; From 05907f10e235680cc7fb196810e4ad3215d5e648 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 21 Jun 2022 14:01:41 +0200 Subject: [PATCH 14/55] netfilter: nft_dynset: restore set element counter when failing to update This patch fixes a race condition. nft_rhash_update() might fail for two reasons: - Element already exists in the hashtable. - Another packet won race to insert an entry in the hashtable. In both cases, new() has already bumped the counter via atomic_add_unless(), therefore, decrement the set element counter. Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index df40314de21f..76de6c8d9865 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -143,6 +143,7 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, /* Another cpu may race to insert the element with the same key */ if (prev) { nft_set_elem_destroy(set, he, true); + atomic_dec(&set->nelems); he = prev; } @@ -152,6 +153,7 @@ out: err2: nft_set_elem_destroy(set, he, true); + atomic_dec(&set->nelems); err1: return false; } From e34b9ed96ce3b06c79bf884009b16961ca478f87 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 22 Jun 2022 16:43:57 +0200 Subject: [PATCH 15/55] netfilter: nf_tables: avoid skb access on nf_stolen When verdict is NF_STOLEN, the skb might have been freed. When tracing is enabled, this can result in a use-after-free: 1. access to skb->nf_trace 2. access to skb->mark 3. computation of trace id 4. dump of packet payload To avoid 1, keep a cached copy of skb->nf_trace in the trace state struct. Refresh this copy whenever verdict is != STOLEN. Avoid 2 by skipping skb->mark access if verdict is STOLEN. 3 is avoided by precomputing the trace id. Only dump the packet when verdict is not "STOLEN". Reported-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 16 ++++++----- net/netfilter/nf_tables_core.c | 24 ++++++++++++++--- net/netfilter/nf_tables_trace.c | 44 +++++++++++++++++-------------- 3 files changed, 55 insertions(+), 29 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 279ae0fff7ad..5c4e5a96a984 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1338,24 +1338,28 @@ void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * + * @trace: other struct members are initialised + * @nf_trace: copy of skb->nf_trace before rule evaluation + * @type: event type (enum nft_trace_types) + * @skbid: hash of skb to be used as trace id + * @packet_dumped: packet headers sent in a previous traceinfo message * @pkt: pktinfo currently processed * @basechain: base chain currently processed * @chain: chain currently processed * @rule: rule that was evaluated * @verdict: verdict given by rule - * @type: event type (enum nft_trace_types) - * @packet_dumped: packet headers sent in a previous traceinfo message - * @trace: other struct members are initialised */ struct nft_traceinfo { + bool trace; + bool nf_trace; + bool packet_dumped; + enum nft_trace_types type:8; + u32 skbid; const struct nft_pktinfo *pkt; const struct nft_base_chain *basechain; const struct nft_chain *chain; const struct nft_rule_dp *rule; const struct nft_verdict *verdict; - enum nft_trace_types type; - bool packet_dumped; - bool trace; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 53f40e473855..3ddce24ac76d 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -25,9 +25,7 @@ static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, enum nft_trace_types type) { - const struct nft_pktinfo *pkt = info->pkt; - - if (!info->trace || !pkt->skb->nf_trace) + if (!info->trace || !info->nf_trace) return; info->chain = chain; @@ -42,11 +40,24 @@ static inline void nft_trace_packet(struct nft_traceinfo *info, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { + const struct nft_pktinfo *pkt = info->pkt; + + info->nf_trace = pkt->skb->nf_trace; info->rule = rule; __nft_trace_packet(info, chain, type); } } +static inline void nft_trace_copy_nftrace(struct nft_traceinfo *info) +{ + if (static_branch_unlikely(&nft_trace_enabled)) { + const struct nft_pktinfo *pkt = info->pkt; + + if (info->trace) + info->nf_trace = pkt->skb->nf_trace; + } +} + static void nft_bitwise_fast_eval(const struct nft_expr *expr, struct nft_regs *regs) { @@ -85,6 +96,7 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, const struct nft_chain *chain, const struct nft_regs *regs) { + const struct nft_pktinfo *pkt = info->pkt; enum nft_trace_types type; switch (regs->verdict.code) { @@ -92,8 +104,13 @@ static noinline void __nft_trace_verdict(struct nft_traceinfo *info, case NFT_RETURN: type = NFT_TRACETYPE_RETURN; break; + case NF_STOLEN: + type = NFT_TRACETYPE_RULE; + /* can't access skb->nf_trace; use copy */ + break; default: type = NFT_TRACETYPE_RULE; + info->nf_trace = pkt->skb->nf_trace; break; } @@ -254,6 +271,7 @@ next_rule: switch (regs.verdict.code) { case NFT_BREAK: regs.verdict.code = NFT_CONTINUE; + nft_trace_copy_nftrace(&info); continue; case NFT_CONTINUE: nft_trace_packet(&info, chain, rule, diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 5041725423c2..1163ba9c1401 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -25,22 +25,6 @@ DEFINE_STATIC_KEY_FALSE(nft_trace_enabled); EXPORT_SYMBOL_GPL(nft_trace_enabled); -static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb) -{ - __be32 id; - - /* using skb address as ID results in a limited number of - * values (and quick reuse). - * - * So we attempt to use as many skb members that will not - * change while skb is with netfilter. - */ - id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb), - skb->skb_iif); - - return nla_put_be32(nlskb, NFTA_TRACE_ID, id); -} - static int trace_fill_header(struct sk_buff *nlskb, u16 type, const struct sk_buff *skb, int off, unsigned int len) @@ -186,6 +170,7 @@ void nft_trace_notify(struct nft_traceinfo *info) struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; + u32 mark = 0; u16 event; if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE)) @@ -229,7 +214,7 @@ void nft_trace_notify(struct nft_traceinfo *info) if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type))) goto nla_put_failure; - if (trace_fill_id(skb, pkt->skb)) + if (nla_put_u32(skb, NFTA_TRACE_ID, info->skbid)) goto nla_put_failure; if (nla_put_string(skb, NFTA_TRACE_CHAIN, info->chain->name)) @@ -249,16 +234,24 @@ void nft_trace_notify(struct nft_traceinfo *info) case NFT_TRACETYPE_RULE: if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict)) goto nla_put_failure; + + /* pkt->skb undefined iff NF_STOLEN, disable dump */ + if (info->verdict->code == NF_STOLEN) + info->packet_dumped = true; + else + mark = pkt->skb->mark; + break; case NFT_TRACETYPE_POLICY: + mark = pkt->skb->mark; + if (nla_put_be32(skb, NFTA_TRACE_POLICY, htonl(info->basechain->policy))) goto nla_put_failure; break; } - if (pkt->skb->mark && - nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark))) + if (mark && nla_put_be32(skb, NFTA_TRACE_MARK, htonl(mark))) goto nla_put_failure; if (!info->packet_dumped) { @@ -283,9 +276,20 @@ void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_chain *chain) { + static siphash_key_t trace_key __read_mostly; + struct sk_buff *skb = pkt->skb; + info->basechain = nft_base_chain(chain); info->trace = true; + info->nf_trace = pkt->skb->nf_trace; info->packet_dumped = false; info->pkt = pkt; info->verdict = verdict; + + net_get_random_once(&trace_key, sizeof(trace_key)); + + info->skbid = (u32)siphash_3u32(hash32_ptr(skb), + skb_get_hash(skb), + skb->skb_iif, + &trace_key); } From c2577862eeb0be94f151f2f1fff662b028061b00 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 21 Jun 2022 18:26:03 +0200 Subject: [PATCH 16/55] netfilter: br_netfilter: do not skip all hooks with 0 priority When br_netfilter module is loaded, skbs may be diverted to the ipv4/ipv6 hooks, just like as if we were routing. Unfortunately, bridge filter hooks with priority 0 may be skipped in this case. Example: 1. an nftables bridge ruleset is loaded, with a prerouting hook that has priority 0. 2. interface is added to the bridge. 3. no tcp packet is ever seen by the bridge prerouting hook. 4. flush the ruleset 5. load the bridge ruleset again. 6. tcp packets are processed as expected. After 1) the only registered hook is the bridge prerouting hook, but its not called yet because the bridge hasn't been brought up yet. After 2), hook order is: 0 br_nf_pre_routing // br_netfilter internal hook 0 chain bridge f prerouting // nftables bridge ruleset The packet is diverted to br_nf_pre_routing. If call-iptables is off, the nftables bridge ruleset is called as expected. But if its enabled, br_nf_hook_thresh() will skip it because it assumes that all 0-priority hooks had been called previously in bridge context. To avoid this, check for the br_nf_pre_routing hook itself, we need to resume directly after it, even if this hook has a priority of 0. Unfortunately, this still results in different packet flow. With this fix, the eval order after in 3) is: 1. br_nf_pre_routing 2. ip(6)tables (if enabled) 3. nftables bridge but after 5 its the much saner: 1. nftables bridge 2. br_nf_pre_routing 3. ip(6)tables (if enabled) Unfortunately I don't see a solution here: It would be possible to move br_nf_pre_routing to a higher priority so that it will be called later in the pipeline, but this also impacts ebtables evaluation order, and would still result in this very ordering problem for all nftables-bridge hooks with the same priority as the br_nf_pre_routing one. Searching back through the git history I don't think this has ever behaved in any other way, hence, no fixes-tag. Reported-by: Radim Hrazdil Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 4fd882686b04..ff4779036649 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1012,9 +1012,24 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, return okfn(net, sk, skb); ops = nf_hook_entries_get_hook_ops(e); - for (i = 0; i < e->num_hook_entries && - ops[i]->priority <= NF_BR_PRI_BRNF; i++) - ; + for (i = 0; i < e->num_hook_entries; i++) { + /* These hooks have already been called */ + if (ops[i]->priority < NF_BR_PRI_BRNF) + continue; + + /* These hooks have not been called yet, run them. */ + if (ops[i]->priority > NF_BR_PRI_BRNF) + break; + + /* take a closer look at NF_BR_PRI_BRNF. */ + if (ops[i]->hook == br_nf_pre_routing) { + /* This hook diverted the skb to this function, + * hooks after this have not been run yet. + */ + i++; + break; + } + } nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); From a8fc8cb5692aebb9c6f7afd4265366d25dcd1d01 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 22 Jun 2022 21:21:05 -0700 Subject: [PATCH 17/55] net: tun: stop NAPI when detaching queues While looking at a syzbot report I noticed the NAPI only gets disabled before it's deleted. I think that user can detach the queue before destroying the device and the NAPI will never be stopped. Fixes: 943170998b20 ("tun: enable NAPI for TUN/TAP driver") Acked-by: Petar Penkov Link: https://lore.kernel.org/r/20220623042105.2274812-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7fd0288c3789..e2eb35887394 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -273,6 +273,12 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, } } +static void tun_napi_enable(struct tun_file *tfile) +{ + if (tfile->napi_enabled) + napi_enable(&tfile->napi); +} + static void tun_napi_disable(struct tun_file *tfile) { if (tfile->napi_enabled) @@ -653,8 +659,10 @@ static void __tun_detach(struct tun_file *tfile, bool clean) if (clean) { RCU_INIT_POINTER(tfile->tun, NULL); sock_put(&tfile->sk); - } else + } else { tun_disable_queue(tun, tfile); + tun_napi_disable(tfile); + } synchronize_net(); tun_flow_delete_by_queue(tun, tun->numqueues + 1); @@ -808,6 +816,7 @@ static int tun_attach(struct tun_struct *tun, struct file *file, if (tfile->detached) { tun_enable_queue(tfile); + tun_napi_enable(tfile); } else { sock_hold(&tfile->sk); tun_napi_init(tun, tfile, napi, napi_frags); From 8ee9d82cd0a45e7d050ade598c9f33032a0f2891 Mon Sep 17 00:00:00 2001 From: Tong Zhang Date: Sun, 26 Jun 2022 21:33:48 -0700 Subject: [PATCH 18/55] epic100: fix use after free on rmmod epic_close() calls epic_rx() and uses dma buffer, but in epic_remove_one() we already freed the dma buffer. To fix this issue, reorder function calls like in the .probe function. BUG: KASAN: use-after-free in epic_rx+0xa6/0x7e0 [epic100] Call Trace: epic_rx+0xa6/0x7e0 [epic100] epic_close+0xec/0x2f0 [epic100] unregister_netdev+0x18/0x20 epic_remove_one+0xaa/0xf0 [epic100] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Yilun Wu Signed-off-by: Tong Zhang Reviewed-by: Francois Romieu Link: https://lore.kernel.org/r/20220627043351.25615-1-ztong0001@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/smsc/epic100.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/smsc/epic100.c b/drivers/net/ethernet/smsc/epic100.c index a0654e88444c..0329caf63279 100644 --- a/drivers/net/ethernet/smsc/epic100.c +++ b/drivers/net/ethernet/smsc/epic100.c @@ -1515,14 +1515,14 @@ static void epic_remove_one(struct pci_dev *pdev) struct net_device *dev = pci_get_drvdata(pdev); struct epic_private *ep = netdev_priv(dev); + unregister_netdev(dev); dma_free_coherent(&pdev->dev, TX_TOTAL_SIZE, ep->tx_ring, ep->tx_ring_dma); dma_free_coherent(&pdev->dev, RX_TOTAL_SIZE, ep->rx_ring, ep->rx_ring_dma); - unregister_netdev(dev); pci_iounmap(pdev, ep->ioaddr); - pci_release_regions(pdev); free_netdev(dev); + pci_release_regions(pdev); pci_disable_device(pdev); /* pci_power_off(pdev, -1); */ } From 76b39b94382f9e0a639e1c70c3253de248cc4c83 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 23 Jun 2022 11:07:41 -0300 Subject: [PATCH 19/55] net/sched: act_api: Notify user space if any actions were flushed before error If during an action flush operation one of the actions is still being referenced, the flush operation is aborted and the kernel returns to user space with an error. However, if the kernel was able to flush, for example, 3 actions and failed on the fourth, the kernel will not notify user space that it deleted 3 actions before failing. This patch fixes that behaviour by notifying user space of how many actions were deleted before flush failed and by setting extack with a message describing what happened. Fixes: 55334a5db5cd ("net_sched: act: refuse to remove bound action outside") Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- net/sched/act_api.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index da9733da9868..817065aa2833 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -588,7 +588,8 @@ static int tcf_idr_release_unsafe(struct tc_action *p) } static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct nlattr *nest; int n_i = 0; @@ -604,20 +605,25 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; + ret = 0; mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p)) continue; ret = tcf_idr_release_unsafe(p); - if (ret == ACT_P_DELETED) { + if (ret == ACT_P_DELETED) module_put(ops->owner); - n_i++; - } else if (ret < 0) { - mutex_unlock(&idrinfo->lock); - goto nla_put_failure; - } + else if (ret < 0) + break; + n_i++; } mutex_unlock(&idrinfo->lock); + if (ret < 0) { + if (n_i) + NL_SET_ERR_MSG(extack, "Unable to flush all TC actions"); + else + goto nla_put_failure; + } ret = nla_put_u32(skb, TCA_FCNT, n_i); if (ret) @@ -638,7 +644,7 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct tcf_idrinfo *idrinfo = tn->idrinfo; if (type == RTM_DELACTION) { - return tcf_del_walker(idrinfo, skb, ops); + return tcf_del_walker(idrinfo, skb, ops, extack); } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { From 88153e29c1e0f3ace8c831b06f6cea9503f16cec Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 23 Jun 2022 11:07:42 -0300 Subject: [PATCH 20/55] selftests: tc-testing: Add testcases to test new flush behaviour Add tdc test cases to verify new flush behaviour is correct, which do the following: - Try to flush only one action which is being referenced by a filter - Try to flush three actions where the last one (index 3) is being referenced by a filter Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/actions/gact.json | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json b/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json index b24494c6f546..c652e8c1157d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json @@ -609,5 +609,82 @@ "teardown": [ "$TC actions flush action gact" ] + }, + { + "id": "7f52", + "name": "Try to flush action which is referenced by filter", + "category": [ + "actions", + "gact" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + [ + "$TC actions flush action gact", + 0, + 1, + 255 + ], + "$TC qdisc add dev $DEV1 ingress", + "$TC actions add action pass index 1", + "$TC filter add dev $DEV1 protocol all ingress prio 1 handle 0x1234 matchall action gact index 1" + ], + "cmdUnderTest": "$TC actions flush action gact", + "expExitCode": "1", + "verifyCmd": "$TC actions ls action gact", + "matchPattern": "total acts 1.*action order [0-9]*: gact action pass.*index 1 ref 2 bind 1", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress", + [ + "sleep 1; $TC actions flush action gact", + 0, + 1 + ] + ] + }, + { + "id": "ae1e", + "name": "Try to flush actions when last one is referenced by filter", + "category": [ + "actions", + "gact" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + [ + "$TC actions flush action gact", + 0, + 1, + 255 + ], + "$TC qdisc add dev $DEV1 ingress", + [ + "$TC actions add action pass index 1", + 0, + 1, + 255 + ], + "$TC actions add action reclassify index 2", + "$TC actions add action drop index 3", + "$TC filter add dev $DEV1 protocol all ingress prio 1 handle 0x1234 matchall action gact index 3" + ], + "cmdUnderTest": "$TC actions flush action gact", + "expExitCode": "0", + "verifyCmd": "$TC actions ls action gact", + "matchPattern": "total acts 1.*action order [0-9]*: gact action drop.*index 3 ref 2 bind 1", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 ingress", + [ + "sleep 1; $TC actions flush action gact", + 0, + 1 + ] + ] } ] From 4bbfed9112ca9da88ac83d5ffe62c988f7169e9f Mon Sep 17 00:00:00 2001 From: Shreenidhi Shedi Date: Sun, 26 Jun 2022 18:59:47 +0530 Subject: [PATCH 21/55] octeon_ep: use bitwise AND This should be bitwise operator not logical. Fixes: 862cd659a6fb ("octeon_ep: Add driver framework and device initialization") Signed-off-by: Shreenidhi Shedi Link: https://lore.kernel.org/r/20220626132947.3992423-1-sshedi@vmware.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h b/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h index cc51149790ff..3d5d39a52fe6 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_regs_cn9k_pf.h @@ -52,7 +52,7 @@ #define CN93_SDP_EPF_RINFO_SRN(val) ((val) & 0xFF) #define CN93_SDP_EPF_RINFO_RPVF(val) (((val) >> 32) & 0xF) -#define CN93_SDP_EPF_RINFO_NVFS(val) (((val) >> 48) && 0xFF) +#define CN93_SDP_EPF_RINFO_NVFS(val) (((val) >> 48) & 0xFF) /* SDP Function select */ #define CN93_SDP_FUNC_SEL_EPF_BIT_POS 8 From 6b9f1d46fdada756294d2d5f04df613478386d34 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Sun, 26 Jun 2022 22:00:39 +0200 Subject: [PATCH 22/55] MAINTAINERS: nfc: drop Charles Gorand from NXP-NCI Mails to Charles get an auto reply, that he is no longer working at Eff'Innov technologies. Drop the entry and mark the driver as orphaned. Signed-off-by: Michael Walle Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220626200039.4062784-1-michael@walle.cc Signed-off-by: Jakub Kicinski --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 6cc825857722..6a288d59ff79 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14350,9 +14350,8 @@ F: Documentation/devicetree/bindings/sound/nxp,tfa989x.yaml F: sound/soc/codecs/tfa989x.c NXP-NCI NFC DRIVER -R: Charles Gorand L: linux-nfc@lists.01.org (subscribers-only) -S: Supported +S: Orphan F: Documentation/devicetree/bindings/net/nfc/nxp,nci.yaml F: drivers/nfc/nxp-nci From 805206e66fab4ba1e0ebd19402006d62cd1d4902 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Jun 2022 09:51:38 +0200 Subject: [PATCH 23/55] net: asix: fix "can't send until first packet is send" issue If cable is attached after probe sequence, the usbnet framework would not automatically start processing RX packets except at least one packet was transmitted. On systems with any kind of address auto configuration this issue was not detected, because some packets are send immediately after link state is changed to "running". With this patch we will notify usbnet about link status change provided by the PHYlib. Fixes: e532a096be0e ("net: usb: asix: ax88772: add phylib support") Reported-by: Anton Lundin Signed-off-by: Oleksij Rempel Tested-by: Anton Lundin Link: https://lore.kernel.org/r/20220624075139.3139300-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 632fa6c1d5e3..b4a1b7abcfc9 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -431,6 +431,7 @@ void asix_adjust_link(struct net_device *netdev) asix_write_medium_mode(dev, mode, 0); phy_print_status(phydev); + usbnet_link_change(dev, phydev->link, 0); } int asix_write_gpio(struct usbnet *dev, u16 value, int sleep, int in_pm) From ce95ab775f8d8e89a038c0e5611a7381a2ef8e43 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 24 Jun 2022 09:51:39 +0200 Subject: [PATCH 24/55] net: usb: asix: do not force pause frames support We should respect link partner capabilities and not force flow control support on every link. Even more, in current state the MAC driver do not advertises pause support so we should not keep flow control enabled at all. Fixes: e532a096be0e ("net: usb: asix: ax88772: add phylib support") Reported-by: Anton Lundin Signed-off-by: Oleksij Rempel Tested-by: Anton Lundin Link: https://lore.kernel.org/r/20220624075139.3139300-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/usb/asix.h b/drivers/net/usb/asix.h index 2c81236c6c7c..45d3cc5cc355 100644 --- a/drivers/net/usb/asix.h +++ b/drivers/net/usb/asix.h @@ -126,8 +126,7 @@ AX_MEDIUM_RE) #define AX88772_MEDIUM_DEFAULT \ - (AX_MEDIUM_FD | AX_MEDIUM_RFC | \ - AX_MEDIUM_TFC | AX_MEDIUM_PS | \ + (AX_MEDIUM_FD | AX_MEDIUM_PS | \ AX_MEDIUM_AC | AX_MEDIUM_RE) /* AX88772 & AX88178 RX_CTL values */ From 3b0dc529f56b5f2328244130683210be98f16f7f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 23 Jun 2022 14:00:15 +0200 Subject: [PATCH 25/55] ipv6: take care of disable_policy when restoring routes When routes corresponding to addresses are restored by fixup_permanent_addr(), the dst_nopolicy parameter was not set. The typical use case is a user that configures an address on a down interface and then put this interface up. Let's take care of this flag in addrconf_f6i_alloc(), so that every callers benefit ont it. CC: stable@kernel.org CC: David Forster Fixes: df789fe75206 ("ipv6: Provide ipv6 version of "disable_policy" sysctl") Reported-by: Siwar Zitouni Signed-off-by: Nicolas Dichtel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220623120015.32640-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 4 ---- net/ipv6/route.c | 9 ++++++++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1b1932502e9e..5864cbc30db6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1109,10 +1109,6 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - if (net->ipv6.devconf_all->disable_policy || - idev->cnf.disable_policy) - f6i->dst_nopolicy = true; - neigh_parms_data_state_setall(idev->nd_parms); ifa->addr = *cfg->pfx; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d25dc83bac62..828355710c57 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4569,8 +4569,15 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, } f6i = ip6_route_info_create(&cfg, gfp_flags, NULL); - if (!IS_ERR(f6i)) + if (!IS_ERR(f6i)) { f6i->dst_nocount = true; + + if (!anycast && + (net->ipv6.devconf_all->disable_policy || + idev->cnf.disable_policy)) + f6i->dst_nopolicy = true; + } + return f6i; } From ab84db251c04d38b8dc7ee86e13d4050bedb1c88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Jun 2022 10:28:13 +0000 Subject: [PATCH 26/55] net: bonding: fix possible NULL deref in rlb code syzbot has two reports involving the same root cause. bond_alb_initialize() must not set bond->alb_info.rlb_enabled if a memory allocation error is detected. Report 1: general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 0 PID: 12276 Comm: kworker/u4:10 Not tainted 5.19.0-rc3-syzkaller-00132-g3b89b511ea0c #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: netns cleanup_net RIP: 0010:rlb_clear_slave+0x10e/0x690 drivers/net/bonding/bond_alb.c:393 Code: 8e fc 83 fb ff 0f 84 74 02 00 00 e8 cc 2a 8e fc 48 8b 44 24 08 89 dd 48 c1 e5 06 4c 8d 34 28 49 8d 7e 14 48 89 f8 48 c1 e8 03 <42> 0f b6 14 20 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 RSP: 0018:ffffc90018a8f678 EFLAGS: 00010203 RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000000 RDX: ffff88803375bb00 RSI: ffffffff84ec4ac4 RDI: 0000000000000014 RBP: 0000000000000000 R08: 0000000000000005 R09: 00000000ffffffff R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000 R13: ffff8880ac889000 R14: 0000000000000000 R15: ffff88815a668c80 FS: 0000000000000000(0000) GS:ffff8880b9a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005597077e10b0 CR3: 0000000026668000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: bond_alb_deinit_slave+0x43c/0x6b0 drivers/net/bonding/bond_alb.c:1663 __bond_release_one.cold+0x383/0xd53 drivers/net/bonding/bond_main.c:2370 bond_slave_netdev_event drivers/net/bonding/bond_main.c:3778 [inline] bond_netdev_event+0x993/0xad0 drivers/net/bonding/bond_main.c:3889 notifier_call_chain+0xb5/0x200 kernel/notifier.c:87 call_netdevice_notifiers_info+0xb5/0x130 net/core/dev.c:1945 call_netdevice_notifiers_extack net/core/dev.c:1983 [inline] call_netdevice_notifiers net/core/dev.c:1997 [inline] unregister_netdevice_many+0x948/0x18b0 net/core/dev.c:10839 default_device_exit_batch+0x449/0x590 net/core/dev.c:11333 ops_exit_list+0x125/0x170 net/core/net_namespace.c:167 cleanup_net+0x4ea/0xb00 net/core/net_namespace.c:594 process_one_work+0x996/0x1610 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e9/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:302 Report 2: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] CPU: 1 PID: 5206 Comm: syz-executor.1 Not tainted 5.18.0-syzkaller-12108-g58f9d52ff689 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:rlb_req_update_slave_clients+0x109/0x2f0 drivers/net/bonding/bond_alb.c:502 Code: 5d 18 8f fc 41 80 3e 00 0f 85 a5 01 00 00 89 d8 48 c1 e0 06 49 03 84 24 68 01 00 00 48 8d 78 30 49 89 c7 48 89 fa 48 c1 ea 03 <80> 3c 2a 00 0f 85 98 01 00 00 4d 39 6f 30 75 83 e8 22 18 8f fc 49 RSP: 0018:ffffc9000300ee80 EFLAGS: 00010206 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffc90016c11000 RDX: 0000000000000006 RSI: ffffffff84eb6bf3 RDI: 0000000000000030 RBP: dffffc0000000000 R08: 0000000000000005 R09: 00000000ffffffff R10: 0000000000000000 R11: 0000000000000000 R12: ffff888027c80c80 R13: ffff88807d7ff800 R14: ffffed1004f901bd R15: 0000000000000000 FS: 00007f6f46c58700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020010000 CR3: 00000000516cc000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: alb_fasten_mac_swap+0x886/0xa80 drivers/net/bonding/bond_alb.c:1070 bond_alb_handle_active_change+0x624/0x1050 drivers/net/bonding/bond_alb.c:1765 bond_change_active_slave+0xfa1/0x29b0 drivers/net/bonding/bond_main.c:1173 bond_select_active_slave+0x23f/0xa50 drivers/net/bonding/bond_main.c:1253 bond_enslave+0x3b34/0x53b0 drivers/net/bonding/bond_main.c:2159 do_set_master+0x1c8/0x220 net/core/rtnetlink.c:2577 rtnl_newlink_create net/core/rtnetlink.c:3380 [inline] __rtnl_newlink+0x13ac/0x17e0 net/core/rtnetlink.c:3580 rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3593 rtnetlink_rcv_msg+0x43a/0xc90 net/core/rtnetlink.c:6089 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2501 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x543/0x7f0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x917/0xe10 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 ____sys_sendmsg+0x6eb/0x810 net/socket.c:2492 ___sys_sendmsg+0xf3/0x170 net/socket.c:2546 __sys_sendmsg net/socket.c:2575 [inline] __do_sys_sendmsg net/socket.c:2584 [inline] __se_sys_sendmsg net/socket.c:2582 [inline] __x64_sys_sendmsg+0x132/0x220 net/socket.c:2582 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7f6f45a89109 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f6f46c58168 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f6f45b9c030 RCX: 00007f6f45a89109 RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000006 RBP: 00007f6f45ae308d R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffed99029af R14: 00007f6f46c58300 R15: 0000000000022000 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/20220627102813.126264-1-edumazet@google.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_alb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index 303c8d32d451..007d43e46dcb 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -1302,12 +1302,12 @@ int bond_alb_initialize(struct bonding *bond, int rlb_enabled) return res; if (rlb_enabled) { - bond->alb_info.rlb_enabled = 1; res = rlb_initialize(bond); if (res) { tlb_deinitialize(bond); return res; } + bond->alb_info.rlb_enabled = 1; } else { bond->alb_info.rlb_enabled = 0; } From 0c1f78a49af721490a5ad70b73e8b4d382465dae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 27 Jun 2022 18:02:35 -0700 Subject: [PATCH 27/55] mptcp: fix error mibs accounting The current accounting for MP_FAIL and FASTCLOSE is not very accurate: both can be increased even when the related option is not really sent. Move the accounting into the correct place. Fixes: eb7f33654dc1 ("mptcp: add the mibs for MP_FAIL") Fixes: 1e75629cb964 ("mptcp: add the mibs for MP_FASTCLOSE") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 5 +++-- net/mptcp/pm.c | 1 - net/mptcp/subflow.c | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index be3b918a6d15..2a6351d55a7d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -765,6 +765,7 @@ static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_bu opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } @@ -788,6 +789,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk, opts->rcvr_key = msk->remote_key; pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } @@ -809,6 +811,7 @@ static bool mptcp_established_options_mp_fail(struct sock *sk, opts->fail_seq = subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } @@ -833,13 +836,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); } return true; } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 59a85220edc9..91a62b598de4 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -310,7 +310,6 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) pr_debug("send MP_FAIL response and infinite map"); subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); subflow->send_infinite_map = 1; } else if (!sock_flag(sk, SOCK_DEAD)) { pr_debug("MP_FAIL response received"); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8841e8cd9ad8..50ad19adc003 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -958,10 +958,8 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); - if (subflow->mp_join || subflow->valid_csum_seen) { + if (subflow->mp_join || subflow->valid_csum_seen) subflow->send_mp_fail = 1; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); - } return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; } From 31bf11de146c3f8892093ff39f8f9b3069d6a852 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 27 Jun 2022 18:02:36 -0700 Subject: [PATCH 28/55] mptcp: introduce MAPPING_BAD_CSUM This allow moving a couple of conditional out of the fast path, making the code more easy to follow and will simplify the next patch. Fixes: ae66fb2ba6c3 ("mptcp: Do TCP fallback on early DSS checksum failure") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 50ad19adc003..42a7c18a1c24 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -843,7 +843,8 @@ enum mapping_status { MAPPING_INVALID, MAPPING_EMPTY, MAPPING_DATA_FIN, - MAPPING_DUMMY + MAPPING_DUMMY, + MAPPING_BAD_CSUM }; static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn) @@ -958,9 +959,7 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * subflow->map_data_csum); if (unlikely(csum)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); - if (subflow->mp_join || subflow->valid_csum_seen) - subflow->send_mp_fail = 1; - return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; + return MAPPING_BAD_CSUM; } subflow->valid_csum_seen = 1; @@ -1182,10 +1181,8 @@ static bool subflow_check_data_avail(struct sock *ssk) status = get_mapping_status(ssk, msk); trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); - if (unlikely(status == MAPPING_INVALID)) - goto fallback; - - if (unlikely(status == MAPPING_DUMMY)) + if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY || + status == MAPPING_BAD_CSUM)) goto fallback; if (status != MAPPING_OK) @@ -1227,7 +1224,10 @@ no_data: fallback: if (!__mptcp_check_fallback(msk)) { /* RFC 8684 section 3.7. */ - if (subflow->send_mp_fail) { + if (status == MAPPING_BAD_CSUM && + (subflow->mp_join || subflow->valid_csum_seen)) { + subflow->send_mp_fail = 1; + if (!READ_ONCE(msk->allow_infinite_fallback)) { ssk->sk_err = EBADMSG; tcp_set_state(ssk, TCP_CLOSE); From 76a13b315709b5b65a7b65caf9ede9a8a38d8930 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 27 Jun 2022 18:02:37 -0700 Subject: [PATCH 29/55] mptcp: invoke MP_FAIL response when needed mptcp_mp_fail_no_response shouldn't be invoked on each worker run, it should be invoked only when MP_FAIL response timeout occurs. This patch refactors the MP_FAIL response logic. It leverages the fact that only the MPC/first subflow can gracefully fail to avoid unneeded subflows traversal: the failing subflow can be only msk->first. A new 'fail_tout' field is added to the subflow context to record the MP_FAIL response timeout and use such field to reliably share the timeout timer between the MP_FAIL event and the MPTCP socket close timeout. Finally, a new ack is generated to send out MP_FAIL notification as soon as we hit the relevant condition, instead of waiting a possibly unbound time for the next data packet. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/281 Fixes: d9fb797046c5 ("mptcp: Do not traverse the subflow connection list without lock") Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 9 +++--- net/mptcp/protocol.c | 77 +++++++++++++++++++++++++++----------------- net/mptcp/protocol.h | 3 +- net/mptcp/subflow.c | 38 ++++++++++++++++------ 4 files changed, 82 insertions(+), 45 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 91a62b598de4..45e2a48397b9 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -299,22 +299,21 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - struct sock *s = (struct sock *)msk; pr_debug("fail_seq=%llu", fail_seq); if (!READ_ONCE(msk->allow_infinite_fallback)) return; - if (!READ_ONCE(subflow->mp_fail_response_expect)) { + if (!subflow->fail_tout) { pr_debug("send MP_FAIL response and infinite map"); subflow->send_mp_fail = 1; subflow->send_infinite_map = 1; - } else if (!sock_flag(sk, SOCK_DEAD)) { + tcp_send_ack(sk); + } else { pr_debug("MP_FAIL response received"); - - sk_stop_timer(s, &s->sk_timer); + WRITE_ONCE(subflow->fail_tout, 0); } } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 17e13396024a..e6fcb61443dd 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -500,7 +500,7 @@ static void mptcp_set_timeout(struct sock *sk) __mptcp_set_timeout(sk, tout); } -static bool tcp_can_send_ack(const struct sock *ssk) +static inline bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); @@ -2175,21 +2175,6 @@ static void mptcp_retransmit_timer(struct timer_list *t) sock_put(sk); } -static struct mptcp_subflow_context * -mp_fail_response_expect_subflow(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow, *ret = NULL; - - mptcp_for_each_subflow(msk, subflow) { - if (READ_ONCE(subflow->mp_fail_response_expect)) { - ret = subflow; - break; - } - } - - return ret; -} - static void mptcp_timeout_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); @@ -2518,27 +2503,50 @@ reset_timer: mptcp_reset_timer(sk); } +/* schedule the timeout timer for the relevant event: either close timeout + * or mp_fail timeout. The close timeout takes precedence on the mp_fail one + */ +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout) +{ + struct sock *sk = (struct sock *)msk; + unsigned long timeout, close_timeout; + + if (!fail_tout && !sock_flag(sk, SOCK_DEAD)) + return; + + close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN; + + /* the close timeout takes precedence on the fail one, and here at least one of + * them is active + */ + timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout; + + sk_reset_timer(sk, &sk->sk_timer, timeout); +} + static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) { - struct mptcp_subflow_context *subflow; - struct sock *ssk; + struct sock *ssk = msk->first; bool slow; - subflow = mp_fail_response_expect_subflow(msk); - if (subflow) { - pr_debug("MP_FAIL doesn't respond, reset the subflow"); + if (!ssk) + return; - ssk = mptcp_subflow_tcp_sock(subflow); - slow = lock_sock_fast(ssk); - mptcp_subflow_reset(ssk); - unlock_sock_fast(ssk, slow); - } + pr_debug("MP_FAIL doesn't respond, reset the subflow"); + + slow = lock_sock_fast(ssk); + mptcp_subflow_reset(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0); + unlock_sock_fast(ssk, slow); + + mptcp_reset_timeout(msk, 0); } static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *sk = &msk->sk.icsk_inet.sk; + unsigned long fail_tout; int state; lock_sock(sk); @@ -2575,7 +2583,9 @@ static void mptcp_worker(struct work_struct *work) if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) __mptcp_retrans(sk); - mptcp_mp_fail_no_response(msk); + fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0; + if (fail_tout && time_after(jiffies, fail_tout)) + mptcp_mp_fail_no_response(msk); unlock: release_sock(sk); @@ -2822,6 +2832,7 @@ static void __mptcp_destroy_sock(struct sock *sk) static void mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; lock_sock(sk); @@ -2840,10 +2851,16 @@ static void mptcp_close(struct sock *sk, long timeout) cleanup: /* orphan all the subflows */ inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; - mptcp_for_each_subflow(mptcp_sk(sk), subflow) { + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow = lock_sock_fast_nested(ssk); + /* since the close timeout takes precedence on the fail one, + * cancel the latter + */ + if (ssk == msk->first) + subflow->fail_tout = 0; + sock_orphan(ssk); unlock_sock_fast(ssk, slow); } @@ -2852,13 +2869,13 @@ cleanup: sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); if (mptcp_sk(sk)->token) - mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); if (sk->sk_state == TCP_CLOSE) { __mptcp_destroy_sock(sk); do_cancel_work = true; } else { - sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN); + mptcp_reset_timeout(msk, 0); } release_sock(sk); if (do_cancel_work) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 200f89f6d62f..1d2d71711872 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -468,7 +468,6 @@ struct mptcp_subflow_context { local_id_valid : 1, /* local_id is correctly initialized */ valid_csum_seen : 1; /* at least one csum validated */ enum mptcp_data_avail data_avail; - bool mp_fail_response_expect; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -482,6 +481,7 @@ struct mptcp_subflow_context { u8 stale_count; long delegated_status; + unsigned long fail_tout; ); @@ -662,6 +662,7 @@ void mptcp_get_options(const struct sk_buff *skb, void mptcp_finish_connect(struct sock *sk); void __mptcp_set_connected(struct sock *sk); +void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout); static inline bool mptcp_is_fully_established(struct sock *sk) { return inet_sk_state_load(sk) == TCP_ESTABLISHED && diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 42a7c18a1c24..8dfea6a8a82f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -971,7 +971,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); bool csum_reqd = READ_ONCE(msk->csum_enabled); - struct sock *sk = (struct sock *)msk; struct mptcp_ext *mpext; struct sk_buff *skb; u16 data_len; @@ -1013,9 +1012,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, pr_debug("infinite mapping received"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); subflow->map_data_len = 0; - if (!sock_flag(ssk, SOCK_DEAD)) - sk_stop_timer(sk, &sk->sk_timer); - return MAPPING_INVALID; } @@ -1162,6 +1158,33 @@ static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) return !subflow->fully_established; } +static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + unsigned long fail_tout; + + /* greceful failure can happen only on the MPC subflow */ + if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first))) + return; + + /* since the close timeout take precedence on the fail one, + * no need to start the latter when the first is already set + */ + if (sock_flag((struct sock *)msk, SOCK_DEAD)) + return; + + /* we don't need extreme accuracy here, use a zero fail_tout as special + * value meaning no fail timeout at all; + */ + fail_tout = jiffies + TCP_RTO_MAX; + if (!fail_tout) + fail_tout = 1; + WRITE_ONCE(subflow->fail_tout, fail_tout); + tcp_send_ack(ssk); + + mptcp_reset_timeout(msk, subflow->fail_tout); +} + static bool subflow_check_data_avail(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -1236,11 +1259,8 @@ fallback: tcp_send_active_reset(ssk, GFP_ATOMIC); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); - } else if (!sock_flag(ssk, SOCK_DEAD)) { - WRITE_ONCE(subflow->mp_fail_response_expect, true); - sk_reset_timer((struct sock *)msk, - &((struct sock *)msk)->sk_timer, - jiffies + TCP_RTO_MAX); + } else { + mptcp_subflow_fail(msk, ssk); } WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return true; From d51991e2e31477853e5b9c1005ac617707077286 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 27 Jun 2022 18:02:38 -0700 Subject: [PATCH 30/55] mptcp: fix shutdown vs fallback race If the MPTCP socket shutdown happens before a fallback to TCP, and all the pending data have been already spooled, we never close the TCP connection. Address the issue explicitly checking for critical condition at fallback time. Fixes: 1e39e5a32ad7 ("mptcp: infinite mapping sending") Fixes: 0348c690ed37 ("mptcp: add the fallback check") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 2 +- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 19 ++++++++++++++++--- net/mptcp/subflow.c | 2 +- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 2a6351d55a7d..aead331866a0 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -967,7 +967,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, goto reset; subflow->mp_capable = 0; pr_fallback(msk); - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); return false; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e6fcb61443dd..e63bc2bb7fff 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1245,7 +1245,7 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk, MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX); mptcp_subflow_ctx(ssk)->send_infinite_map = 0; pr_fallback(msk); - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); } static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 1d2d71711872..9860179bfd5e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -927,12 +927,25 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } -static inline void mptcp_do_fallback(struct sock *sk) +static inline void mptcp_do_fallback(struct sock *ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + struct mptcp_sock *msk; + msk = mptcp_sk(sk); __mptcp_do_fallback(msk); + if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) { + gfp_t saved_allocation = ssk->sk_allocation; + + /* we are in a atomic (BH) scope, override ssk default for data + * fin allocation + */ + ssk->sk_allocation = GFP_ATOMIC; + ssk->sk_shutdown |= SEND_SHUTDOWN; + tcp_shutdown(ssk, SEND_SHUTDOWN); + ssk->sk_allocation = saved_allocation; + } } #define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)", __func__, a) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8dfea6a8a82f..b34b96fb742f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1279,7 +1279,7 @@ fallback: return false; } - __mptcp_do_fallback(msk); + mptcp_do_fallback(ssk); } skb = skb_peek(&ssk->sk_receive_queue); From f745a3ebdfb9041d3a55e66eb345895cd8ecc90c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 27 Jun 2022 18:02:39 -0700 Subject: [PATCH 31/55] mptcp: consistent map handling on failure When the MPTCP receive path reach a non fatal fall-back condition, e.g. when the MPC sockets must fall-back to TCP, the existing code is a little self-inconsistent: it reports that new data is available - return true - but sets the MPC flag to the opposite value. As the consequence read operations in some exceptional scenario may block unexpectedly. Address the issue setting the correct MPC read status. Additionally avoid some code duplication in the fatal fall-back scenario. Fixes: 9c81be0dbc89 ("mptcp: add MP_FAIL response support") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index b34b96fb742f..03862103665d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1252,17 +1252,12 @@ fallback: subflow->send_mp_fail = 1; if (!READ_ONCE(msk->allow_infinite_fallback)) { - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; - tcp_send_active_reset(ssk, GFP_ATOMIC); - while ((skb = skb_peek(&ssk->sk_receive_queue))) - sk_eat_skb(ssk, skb); - } else { - mptcp_subflow_fail(msk, ssk); + goto reset; } - WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); + mptcp_subflow_fail(msk, ssk); + WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL); return true; } @@ -1270,10 +1265,14 @@ fallback: /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ - ssk->sk_err = EBADMSG; - tcp_set_state(ssk, TCP_CLOSE); subflow->reset_transient = 0; subflow->reset_reason = MPTCP_RST_EMPTCP; + +reset: + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); tcp_send_active_reset(ssk, GFP_ATOMIC); WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA); return false; From 6aeed9045071f2252ff4e98fc13d1e304f33e5b0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 27 Jun 2022 18:02:40 -0700 Subject: [PATCH 32/55] mptcp: fix race on unaccepted mptcp sockets When the listener socket owning the relevant request is closed, it frees the unaccepted subflows and that causes later deletion of the paired MPTCP sockets. The mptcp socket's worker can run in the time interval between such delete operations. When that happens, any access to msk->first will cause an UaF access, as the subflow cleanup did not cleared such field in the mptcp socket. Address the issue explicitly traversing the listener socket accept queue at close time and performing the needed cleanup on the pending msk. Note that the locking is a bit tricky, as we need to acquire the msk socket lock, while still owning the subflow socket one. Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 5 +++++ net/mptcp/protocol.h | 2 ++ net/mptcp/subflow.c | 52 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e63bc2bb7fff..e475212f2618 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2331,6 +2331,11 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, kfree_rcu(subflow, rcu); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(ssk); + inet_csk_listen_stop(ssk); + } __tcp_close(ssk, 0); /* close acquired an extra ref */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 9860179bfd5e..c14d70c036d0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -306,6 +306,7 @@ struct mptcp_sock { u32 setsockopt_seq; char ca_name[TCP_CA_NAME_MAX]; + struct mptcp_sock *dl_next; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) @@ -608,6 +609,7 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 03862103665d..63e8892ec807 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1723,6 +1723,58 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + struct mptcp_sock *msk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + msk = mptcp_sk(subflow->conn); + if (msk->dl_next || msk == head) + continue; + + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + struct sock *sk = (struct sock *)msk; + bool slow; + + slow = lock_sock_fast_nested(sk); + next = msk->dl_next; + msk->first = NULL; + msk->dl_next = NULL; + unlock_sock_fast(sk, slow); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); From 42fb6cddec3b306c9f6ef136b6438e0de1836431 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 27 Jun 2022 18:02:41 -0700 Subject: [PATCH 33/55] selftests: mptcp: more stable diag tests The mentioned test-case still use an hard-coded-len sleep to wait for a relative large number of connection to be established. On very slow VM and with debug build such timeout could be exceeded, causing failures in our CI. Address the issue polling for the expected condition several times, up to an unreasonable high amount of time. On reasonably fast system the self-tests will be faster then before, on very slow one we will still catch the correct condition. Fixes: df62f2ec3df6 ("selftests/mptcp: add diag interface tests") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/diag.sh | 48 +++++++++++++++++++---- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 9dd43d7d957b..515859a5168b 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -61,6 +61,39 @@ chk_msk_nr() __chk_nr "grep -c token:" $* } +wait_msk_nr() +{ + local condition="grep -c token:" + local expected=$1 + local timeout=20 + local msg nr + local max=0 + local i=0 + + shift 1 + msg=$* + + while [ $i -lt $timeout ]; do + nr=$(ss -inmHMN $ns | $condition) + [ $nr == $expected ] && break; + [ $nr -gt $max ] && max=$nr + i=$((i + 1)) + sleep 1 + done + + printf "%-50s" "$msg" + if [ $i -ge $timeout ]; then + echo "[ fail ] timeout while expecting $expected max $max last $nr" + ret=$test_cnt + elif [ $nr != $expected ]; then + echo "[ fail ] expected $expected found $nr" + ret=$test_cnt + else + echo "[ ok ]" + fi + test_cnt=$((test_cnt+1)) +} + chk_msk_fallback_nr() { __chk_nr "grep -c fallback" $* @@ -146,7 +179,7 @@ ip -n $ns link set dev lo up echo "a" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p 10000 -l -t ${timeout_poll} \ + ./mptcp_connect -p 10000 -l -t ${timeout_poll} -w 20 \ 0.0.0.0 >/dev/null & wait_local_port_listen $ns 10000 chk_msk_nr 0 "no msk on netns creation" @@ -155,7 +188,7 @@ chk_msk_listen 10000 echo "b" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p 10000 -r 0 -t ${timeout_poll} \ + ./mptcp_connect -p 10000 -r 0 -t ${timeout_poll} -w 20 \ 127.0.0.1 >/dev/null & wait_connected $ns 10000 chk_msk_nr 2 "after MPC handshake " @@ -167,13 +200,13 @@ flush_pids echo "a" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p 10001 -l -s TCP -t ${timeout_poll} \ + ./mptcp_connect -p 10001 -l -s TCP -t ${timeout_poll} -w 20 \ 0.0.0.0 >/dev/null & wait_local_port_listen $ns 10001 echo "b" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p 10001 -r 0 -t ${timeout_poll} \ + ./mptcp_connect -p 10001 -r 0 -t ${timeout_poll} -w 20 \ 127.0.0.1 >/dev/null & wait_connected $ns 10001 chk_msk_fallback_nr 1 "check fallback" @@ -184,7 +217,7 @@ for I in `seq 1 $NR_CLIENTS`; do echo "a" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p $((I+10001)) -l -w 10 \ + ./mptcp_connect -p $((I+10001)) -l -w 20 \ -t ${timeout_poll} 0.0.0.0 >/dev/null & done wait_local_port_listen $ns $((NR_CLIENTS + 10001)) @@ -193,12 +226,11 @@ for I in `seq 1 $NR_CLIENTS`; do echo "b" | \ timeout ${timeout_test} \ ip netns exec $ns \ - ./mptcp_connect -p $((I+10001)) -w 10 \ + ./mptcp_connect -p $((I+10001)) -w 20 \ -t ${timeout_poll} 127.0.0.1 >/dev/null & done -sleep 1.5 -chk_msk_nr $((NR_CLIENTS*2)) "many msk socket present" +wait_msk_nr $((NR_CLIENTS*2)) "many msk socket present" flush_pids exit $ret From 06e445f740c1a0fe5d16b3dff8a4ef18e124e54e Mon Sep 17 00:00:00 2001 From: Ossama Othman Date: Mon, 27 Jun 2022 18:02:42 -0700 Subject: [PATCH 34/55] mptcp: fix conflict with Including before the C library header causes symbol redefinition errors at compile-time due to duplicate declarations and definitions in the header included by . Explicitly include before in when __KERNEL__ is not defined so that the C library compatibility logic in is enabled when including in user space code. Fixes: c11c5906bc0a ("mptcp: add MPTCP_SUBFLOW_ADDRS getsockopt support") Signed-off-by: Ossama Othman Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 921963589904..dfe19bf13f4c 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -2,16 +2,17 @@ #ifndef _UAPI_MPTCP_H #define _UAPI_MPTCP_H +#ifndef __KERNEL__ +#include /* for sockaddr_in and sockaddr_in6 */ +#include /* for struct sockaddr */ +#endif + #include #include #include /* for sockaddr_in */ #include /* for sockaddr_in6 */ #include /* for sockaddr_storage and sa_family */ -#ifndef __KERNEL__ -#include /* for struct sockaddr */ -#endif - #define MPTCP_SUBFLOW_FLAG_MCAP_REM _BITUL(0) #define MPTCP_SUBFLOW_FLAG_MCAP_LOC _BITUL(1) #define MPTCP_SUBFLOW_FLAG_JOIN_REM _BITUL(2) From fd37c2ecb21f7aee04ccca5f561469f07d00063c Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Mon, 27 Jun 2022 18:02:43 -0700 Subject: [PATCH 35/55] selftests: mptcp: Initialize variables to quiet gcc 12 warnings In a few MPTCP selftest tools, gcc 12 complains that the 'sock' variable might be used uninitialized. This is a false positive because the only code path that could lead to uninitialized access is where getaddrinfo() fails, but the local xgetaddrinfo() wrapper exits if such a failure occurs. Initialize the 'sock' variable anyway to allow the tools to build with gcc 12. Fixes: 048d19d444be ("mptcp: add basic kselftest for mptcp") Acked-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_connect.c | 2 +- tools/testing/selftests/net/mptcp/mptcp_inq.c | 2 +- tools/testing/selftests/net/mptcp/mptcp_sockopt.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 8628aa61b763..e2ea6c126c99 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -265,7 +265,7 @@ static void sock_test_tcpulp(int sock, int proto, unsigned int line) static int sock_listen_mptcp(const char * const listenaddr, const char * const port) { - int sock; + int sock = -1; struct addrinfo hints = { .ai_protocol = IPPROTO_TCP, .ai_socktype = SOCK_STREAM, diff --git a/tools/testing/selftests/net/mptcp/mptcp_inq.c b/tools/testing/selftests/net/mptcp/mptcp_inq.c index 29f75e2a1116..8672d898f8cd 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_inq.c +++ b/tools/testing/selftests/net/mptcp/mptcp_inq.c @@ -88,7 +88,7 @@ static void xgetaddrinfo(const char *node, const char *service, static int sock_listen_mptcp(const char * const listenaddr, const char * const port) { - int sock; + int sock = -1; struct addrinfo hints = { .ai_protocol = IPPROTO_TCP, .ai_socktype = SOCK_STREAM, diff --git a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c index ac9a4d9c1764..ae61f39556ca 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_sockopt.c +++ b/tools/testing/selftests/net/mptcp/mptcp_sockopt.c @@ -136,7 +136,7 @@ static void xgetaddrinfo(const char *node, const char *service, static int sock_listen_mptcp(const char * const listenaddr, const char * const port) { - int sock; + int sock = -1; struct addrinfo hints = { .ai_protocol = IPPROTO_TCP, .ai_socktype = SOCK_STREAM, From adabdd8f6acabc0c3fdbba2e7f5a2edd9c5ef22d Mon Sep 17 00:00:00 2001 From: katrinzhou Date: Tue, 28 Jun 2022 11:50:30 +0800 Subject: [PATCH 36/55] ipv6/sit: fix ipip6_tunnel_get_prl return value When kcalloc fails, ipip6_tunnel_get_prl() should return -ENOMEM. Move the position of label "out" to return correctly. Addresses-Coverity: ("Unused value") Fixes: 300aaeeaab5f ("[IPV6] SIT: Add SIOCGETPRL ioctl to get/dump PRL.") Signed-off-by: katrinzhou Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220628035030.1039171-1-zys.zljxml@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/sit.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index c0b138c20992..6bcd5e419a08 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -323,8 +323,6 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; - rcu_read_lock(); - ca = min(t->prl_count, cmax); if (!kp) { @@ -341,7 +339,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u } } - c = 0; + rcu_read_lock(); for_each_prl_rcu(t->prl) { if (c >= cmax) break; @@ -353,7 +351,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __u if (kprl.addr != htonl(INADDR_ANY)) break; } -out: + rcu_read_unlock(); len = sizeof(*kp) * c; @@ -362,7 +360,7 @@ out: ret = -EFAULT; kfree(kp); - +out: return ret; } From 53ad46169fe2996fe1b623ba6c9c4fa33847876f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 28 Jun 2022 11:31:34 +0800 Subject: [PATCH 37/55] net: ipv6: unexport __init-annotated seg6_hmac_net_init() As of commit 5801f064e351 ("net: ipv6: unexport __init-annotated seg6_hmac_init()"), EXPORT_SYMBOL and __init is a bad combination because the .init.text section is freed up after the initialization. Hence, modules cannot use symbols annotated __init. The access to a freed symbol may end up with kernel panic. This remove the EXPORT_SYMBOL to fix modpost warning: WARNING: modpost: vmlinux.o(___ksymtab+seg6_hmac_net_init+0x0): Section mismatch in reference from the variable __ksymtab_seg6_hmac_net_init to the function .init.text:seg6_hmac_net_init() The symbol seg6_hmac_net_init is exported and annotated __init Fix this by removing the __init annotation of seg6_hmac_net_init or drop the export. Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support") Reported-by: Hulk Robot Signed-off-by: YueHaibing Link: https://lore.kernel.org/r/20220628033134.21088-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_hmac.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 6de01185cc68..d43c50a7310d 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -406,7 +406,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -EXPORT_SYMBOL(seg6_hmac_net_init); void seg6_hmac_exit(void) { From 5a478a653b4cca148d5c89832f007ec0809d7e6d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 27 Jun 2022 14:40:48 +0200 Subject: [PATCH 38/55] nfc: nfcmrvl: Fix irq_of_parse_and_map() return value The irq_of_parse_and_map() returns 0 on failure, not a negative ERRNO. Reported-by: Lv Ruyi Fixes: caf6e49bf6d0 ("NFC: nfcmrvl: add spi driver") Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220627124048.296253-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- drivers/nfc/nfcmrvl/i2c.c | 6 +++--- drivers/nfc/nfcmrvl/spi.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nfc/nfcmrvl/i2c.c b/drivers/nfc/nfcmrvl/i2c.c index ceef81d93ac9..01329b91d59d 100644 --- a/drivers/nfc/nfcmrvl/i2c.c +++ b/drivers/nfc/nfcmrvl/i2c.c @@ -167,9 +167,9 @@ static int nfcmrvl_i2c_parse_dt(struct device_node *node, pdata->irq_polarity = IRQF_TRIGGER_RISING; ret = irq_of_parse_and_map(node, 0); - if (ret < 0) { - pr_err("Unable to get irq, error: %d\n", ret); - return ret; + if (!ret) { + pr_err("Unable to get irq\n"); + return -EINVAL; } pdata->irq = ret; diff --git a/drivers/nfc/nfcmrvl/spi.c b/drivers/nfc/nfcmrvl/spi.c index a38e2fcdfd39..ad3359a4942c 100644 --- a/drivers/nfc/nfcmrvl/spi.c +++ b/drivers/nfc/nfcmrvl/spi.c @@ -115,9 +115,9 @@ static int nfcmrvl_spi_parse_dt(struct device_node *node, } ret = irq_of_parse_and_map(node, 0); - if (ret < 0) { - pr_err("Unable to get irq, error: %d\n", ret); - return ret; + if (!ret) { + pr_err("Unable to get irq\n"); + return -EINVAL; } pdata->irq = ret; From 00aff3590fc0a73bddd3b743863c14e76fd35c0c Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Wed, 29 Jun 2022 14:34:18 +0800 Subject: [PATCH 39/55] net: tipc: fix possible refcount leak in tipc_sk_create() Free sk in case tipc_sk_insert() fails. Signed-off-by: Hangyu Hua Reviewed-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 17f8c523e33b..43509c7e90fc 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -502,6 +502,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sock_init_data(sock, sk); tipc_set_sk_state(sk, TIPC_OPEN); if (tipc_sk_insert(tsk)) { + sk_free(sk); pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } From eddd95b9423946aaacb55cac6a9b2cea8ab944fc Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 27 Jun 2022 19:06:42 +0200 Subject: [PATCH 40/55] NFC: nxp-nci: Don't issue a zero length i2c_master_read() There are packets which doesn't have a payload. In that case, the second i2c_master_read() will have a zero length. But because the NFC controller doesn't have any data left, it will NACK the I2C read and -ENXIO will be returned. In case there is no payload, just skip the second i2c master read. Fixes: 6be88670fc59 ("NFC: nxp-nci_i2c: Add I2C support to NXP NCI driver") Signed-off-by: Michael Walle Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- drivers/nfc/nxp-nci/i2c.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nfc/nxp-nci/i2c.c b/drivers/nfc/nxp-nci/i2c.c index 7e451c10985d..e8f3b35afbee 100644 --- a/drivers/nfc/nxp-nci/i2c.c +++ b/drivers/nfc/nxp-nci/i2c.c @@ -162,6 +162,9 @@ static int nxp_nci_i2c_nci_read(struct nxp_nci_i2c_phy *phy, skb_put_data(*skb, (void *)&header, NCI_CTRL_HDR_SIZE); + if (!header.plen) + return 0; + r = i2c_master_recv(client, skb_put(*skb, header.plen), header.plen); if (r != header.plen) { nfc_err(&client->dev, From 9577fc5fdc8b07b891709af6453545db405e24ad Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 27 Jun 2022 19:06:43 +0200 Subject: [PATCH 41/55] NFC: nxp-nci: don't print header length mismatch on i2c error Don't print a misleading header length mismatch error if the i2c call returns an error. Instead just return the error code without any error message. Signed-off-by: Michael Walle Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller --- drivers/nfc/nxp-nci/i2c.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/nfc/nxp-nci/i2c.c b/drivers/nfc/nxp-nci/i2c.c index e8f3b35afbee..ae2ba08d8ac3 100644 --- a/drivers/nfc/nxp-nci/i2c.c +++ b/drivers/nfc/nxp-nci/i2c.c @@ -122,7 +122,9 @@ static int nxp_nci_i2c_fw_read(struct nxp_nci_i2c_phy *phy, skb_put_data(*skb, &header, NXP_NCI_FW_HDR_LEN); r = i2c_master_recv(client, skb_put(*skb, frame_len), frame_len); - if (r != frame_len) { + if (r < 0) { + goto fw_read_exit_free_skb; + } else if (r != frame_len) { nfc_err(&client->dev, "Invalid frame length: %u (expected %zu)\n", r, frame_len); @@ -166,7 +168,9 @@ static int nxp_nci_i2c_nci_read(struct nxp_nci_i2c_phy *phy, return 0; r = i2c_master_recv(client, skb_put(*skb, header.plen), header.plen); - if (r != header.plen) { + if (r < 0) { + goto nci_read_exit_free_skb; + } else if (r != header.plen) { nfc_err(&client->dev, "Invalid frame payload length: %u (expected %u)\n", r, header.plen); From 7b92aa9e613508cbaa29dd35bf27db4c35628b10 Mon Sep 17 00:00:00 2001 From: Coleman Dietsch Date: Tue, 28 Jun 2022 12:47:44 -0500 Subject: [PATCH 42/55] selftests net: fix kselftest net fatal error The incorrect path is causing the following error when trying to run net kselftests: In file included from bpf/nat6to4.c:43: ../../../lib/bpf/bpf_helpers.h:11:10: fatal error: 'bpf_helper_defs.h' file not found ^~~~~~~~~~~~~~~~~~~ 1 error generated. Fixes: cf67838c4422 ("selftests net: fix bpf build error") Signed-off-by: Coleman Dietsch Link: https://lore.kernel.org/r/20220628174744.7908-1-dietschc@csp.edu Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/bpf/Makefile b/tools/testing/selftests/net/bpf/Makefile index 8a69c91fcca0..8ccaf8732eb2 100644 --- a/tools/testing/selftests/net/bpf/Makefile +++ b/tools/testing/selftests/net/bpf/Makefile @@ -2,7 +2,7 @@ CLANG ?= clang CCINCLUDE += -I../../bpf -CCINCLUDE += -I../../../lib +CCINCLUDE += -I../../../../lib CCINCLUDE += -I../../../../../usr/include/ TEST_CUSTOM_PROGS = $(OUTPUT)/bpf/nat6to4.o From e65af5403e462ccd7dff6a045a886c64da598c2e Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 28 Jun 2022 11:35:17 +0200 Subject: [PATCH 43/55] usbnet: fix memory allocation in helpers usbnet provides some helper functions that are also used in the context of reset() operations. During a reset the other drivers on a device are unable to operate. As that can be block drivers, a driver for another interface cannot use paging in its memory allocations without risking a deadlock. Use GFP_NOIO in the helpers. Fixes: 877bd862f32b8 ("usbnet: introduce usbnet 3 command helpers") Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20220628093517.7469-1-oneukum@suse.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/usbnet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 1cb6dab3e2d0..e2135ab87a6e 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -2004,7 +2004,7 @@ static int __usbnet_read_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, cmd, reqtype, value, index, size); if (size) { - buf = kmalloc(size, GFP_KERNEL); + buf = kmalloc(size, GFP_NOIO); if (!buf) goto out; } @@ -2036,7 +2036,7 @@ static int __usbnet_write_cmd(struct usbnet *dev, u8 cmd, u8 reqtype, cmd, reqtype, value, index, size); if (data) { - buf = kmemdup(data, size, GFP_KERNEL); + buf = kmemdup(data, size, GFP_NOIO); if (!buf) goto out; } else { From 1758bde2e4aa5ff188d53e7d9d388bbb7e12eebb Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 28 Jun 2022 12:15:08 +0200 Subject: [PATCH 44/55] net: phy: Don't trigger state machine while in suspend Upon system sleep, mdio_bus_phy_suspend() stops the phy_state_machine(), but subsequent interrupts may retrigger it: They may have been left enabled to facilitate wakeup and are not quiesced until the ->suspend_noirq() phase. Unwanted interrupts may hence occur between mdio_bus_phy_suspend() and dpm_suspend_noirq(), as well as between dpm_resume_noirq() and mdio_bus_phy_resume(). Retriggering the phy_state_machine() through an interrupt is not only undesirable for the reason given in mdio_bus_phy_suspend() (freezing it midway with phydev->lock held), but also because the PHY may be inaccessible after it's suspended: Accesses to USB-attached PHYs are blocked once usb_suspend_both() clears the can_submit flag and PHYs on PCI network cards may become inaccessible upon suspend as well. Amend phy_interrupt() to avoid triggering the state machine if the PHY is suspended. Signal wakeup instead if the attached net_device or its parent has been configured as a wakeup source. (Those conditions are identical to mdio_bus_phy_may_suspend().) Postpone handling of the interrupt until the PHY has resumed. Before stopping the phy_state_machine() in mdio_bus_phy_suspend(), wait for a concurrent phy_interrupt() to run to completion. That is necessary because phy_interrupt() may have checked the PHY's suspend status before the system sleep transition commenced and it may thus retrigger the state machine after it was stopped. Likewise, after re-enabling interrupt handling in mdio_bus_phy_resume(), wait for a concurrent phy_interrupt() to complete to ensure that interrupts which it postponed are properly rerun. The issue was exposed by commit 1ce8b37241ed ("usbnet: smsc95xx: Forward PHY interrupts to PHY driver to avoid polling"), but has existed since forever. Fixes: 541cd3ee00a4 ("phylib: Fix deadlock on resume") Link: https://lore.kernel.org/netdev/a5315a8a-32c2-962f-f696-de9a26d30091@samsung.com/ Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Lukas Wunner Acked-by: Rafael J. Wysocki Cc: stable@vger.kernel.org # v2.6.33+ Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/b7f386d04e9b5b0e2738f0125743e30676f309ef.1656410895.git.lukas@wunner.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 23 +++++++++++++++++++++++ drivers/net/phy/phy_device.c | 23 +++++++++++++++++++++++ include/linux/phy.h | 6 ++++++ 3 files changed, 52 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index ef62f357b76d..8d3ee3a6495b 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -976,6 +977,28 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat) struct phy_driver *drv = phydev->drv; irqreturn_t ret; + /* Wakeup interrupts may occur during a system sleep transition. + * Postpone handling until the PHY has resumed. + */ + if (IS_ENABLED(CONFIG_PM_SLEEP) && phydev->irq_suspended) { + struct net_device *netdev = phydev->attached_dev; + + if (netdev) { + struct device *parent = netdev->dev.parent; + + if (netdev->wol_enabled) + pm_system_wakeup(); + else if (device_may_wakeup(&netdev->dev)) + pm_wakeup_dev_event(&netdev->dev, 0, true); + else if (parent && device_may_wakeup(parent)) + pm_wakeup_dev_event(parent, 0, true); + } + + phydev->irq_rerun = 1; + disable_irq_nosync(irq); + return IRQ_HANDLED; + } + mutex_lock(&phydev->lock); ret = drv->handle_interrupt(phydev); mutex_unlock(&phydev->lock); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 431a8719c635..46acddd865a7 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -278,6 +278,15 @@ static __maybe_unused int mdio_bus_phy_suspend(struct device *dev) if (phydev->mac_managed_pm) return 0; + /* Wakeup interrupts may occur during the system sleep transition when + * the PHY is inaccessible. Set flag to postpone handling until the PHY + * has resumed. Wait for concurrent interrupt handler to complete. + */ + if (phy_interrupt_is_valid(phydev)) { + phydev->irq_suspended = 1; + synchronize_irq(phydev->irq); + } + /* We must stop the state machine manually, otherwise it stops out of * control, possibly with the phydev->lock held. Upon resume, netdev * may call phy routines that try to grab the same lock, and that may @@ -315,6 +324,20 @@ static __maybe_unused int mdio_bus_phy_resume(struct device *dev) if (ret < 0) return ret; no_resume: + if (phy_interrupt_is_valid(phydev)) { + phydev->irq_suspended = 0; + synchronize_irq(phydev->irq); + + /* Rerun interrupts which were postponed by phy_interrupt() + * because they occurred during the system sleep transition. + */ + if (phydev->irq_rerun) { + phydev->irq_rerun = 0; + enable_irq(phydev->irq); + irq_wake_thread(phydev->irq, phydev); + } + } + if (phydev->attached_dev && phydev->adjust_link) phy_start_machine(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index 508f1149665b..b09f7d36cff2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -572,6 +572,10 @@ struct macsec_ops; * @mdix_ctrl: User setting of crossover * @pma_extable: Cached value of PMA/PMD Extended Abilities Register * @interrupts: Flag interrupts have been enabled + * @irq_suspended: Flag indicating PHY is suspended and therefore interrupt + * handling shall be postponed until PHY has resumed + * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, + * requiring a rerun of the interrupt handler after resume * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics @@ -626,6 +630,8 @@ struct phy_device { /* Interrupts are enabled */ unsigned interrupts:1; + unsigned irq_suspended:1; + unsigned irq_rerun:1; enum phy_state state; From fa152f626b24ec2ca3489100d8c5c0a0bce4e2ef Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 28 Jun 2022 13:43:49 +0200 Subject: [PATCH 45/55] net: phy: ax88772a: fix lost pause advertisement configuration In case of asix_ax88772a_link_change_notify() workaround, we run soft reset which will automatically clear MII_ADVERTISE configuration. The PHYlib framework do not know about changed configuration state of the PHY, so we need use phy_init_hw() to reinit PHY configuration. Fixes: dde258469257 ("net: usb/phy: asix: add support for ax88772A/C PHYs") Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220628114349.3929928-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/ax88796b.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/ax88796b.c b/drivers/net/phy/ax88796b.c index 457896337505..0f1e617a26c9 100644 --- a/drivers/net/phy/ax88796b.c +++ b/drivers/net/phy/ax88796b.c @@ -88,8 +88,10 @@ static void asix_ax88772a_link_change_notify(struct phy_device *phydev) /* Reset PHY, otherwise MII_LPA will provide outdated information. * This issue is reproducible only with some link partner PHYs */ - if (phydev->state == PHY_NOLINK && phydev->drv->soft_reset) - phydev->drv->soft_reset(phydev); + if (phydev->state == PHY_NOLINK) { + phy_init_hw(phydev); + phy_start_aneg(phydev); + } } static struct phy_driver asix_driver[] = { From 4e43e64d0f1332fcc503babad4dc31aead7131ca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Jun 2022 12:12:48 +0000 Subject: [PATCH 46/55] ipv6: fix lockdep splat in in6_dump_addrs() As reported by syzbot, we should not use rcu_dereference() when rcu_read_lock() is not held. WARNING: suspicious RCU usage 5.19.0-rc2-syzkaller #0 Not tainted net/ipv6/addrconf.c:5175 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by syz-executor326/3617: #0: ffffffff8d5848e8 (rtnl_mutex){+.+.}-{3:3}, at: netlink_dump+0xae/0xc20 net/netlink/af_netlink.c:2223 stack backtrace: CPU: 0 PID: 3617 Comm: syz-executor326 Not tainted 5.19.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 in6_dump_addrs+0x12d1/0x1790 net/ipv6/addrconf.c:5175 inet6_dump_addr+0x9c1/0xb50 net/ipv6/addrconf.c:5300 netlink_dump+0x541/0xc20 net/netlink/af_netlink.c:2275 __netlink_dump_start+0x647/0x900 net/netlink/af_netlink.c:2380 netlink_dump_start include/linux/netlink.h:245 [inline] rtnetlink_rcv_msg+0x73e/0xc90 net/core/rtnetlink.c:6046 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2501 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline] netlink_unicast+0x543/0x7f0 net/netlink/af_netlink.c:1345 netlink_sendmsg+0x917/0xe10 net/netlink/af_netlink.c:1921 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:734 ____sys_sendmsg+0x6eb/0x810 net/socket.c:2492 ___sys_sendmsg+0xf3/0x170 net/socket.c:2546 __sys_sendmsg net/socket.c:2575 [inline] __do_sys_sendmsg net/socket.c:2584 [inline] __se_sys_sendmsg net/socket.c:2582 [inline] __x64_sys_sendmsg+0x132/0x220 net/socket.c:2582 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Fixes: 88e2ca308094 ("mld: convert ifmcaddr6 to RCU") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Taehee Yoo Link: https://lore.kernel.org/r/20220628121248.858695-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5864cbc30db6..49cc6587dd77 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5168,9 +5168,9 @@ next: fillargs->event = RTM_GETMULTICAST; /* multicast address */ - for (ifmca = rcu_dereference(idev->mc_list); + for (ifmca = rtnl_dereference(idev->mc_list); ifmca; - ifmca = rcu_dereference(ifmca->next), ip_idx++) { + ifmca = rtnl_dereference(ifmca->next), ip_idx++) { if (ip_idx < s_ip_idx) continue; err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); From 050133e1aa2cb49bb17be847d48a4431598ef562 Mon Sep 17 00:00:00 2001 From: Yevhen Orlov Date: Wed, 29 Jun 2022 04:29:14 +0300 Subject: [PATCH 47/55] net: bonding: fix use-after-free after 802.3ad slave unbind commit 0622cab0341c ("bonding: fix 802.3ad aggregator reselection"), resolve case, when there is several aggregation groups in the same bond. bond_3ad_unbind_slave will invalidate (clear) aggregator when __agg_active_ports return zero. So, ad_clear_agg can be executed even, when num_of_ports!=0. Than bond_3ad_unbind_slave can be executed again for, previously cleared aggregator. NOTE: at this time bond_3ad_unbind_slave will not update slave ports list, because lag_ports==NULL. So, here we got slave ports, pointing to freed aggregator memory. Fix with checking actual number of ports in group (as was before commit 0622cab0341c ("bonding: fix 802.3ad aggregator reselection") ), before ad_clear_agg(). The KASAN logs are as follows: [ 767.617392] ================================================================== [ 767.630776] BUG: KASAN: use-after-free in bond_3ad_state_machine_handler+0x13dc/0x1470 [ 767.638764] Read of size 2 at addr ffff00011ba9d430 by task kworker/u8:7/767 [ 767.647361] CPU: 3 PID: 767 Comm: kworker/u8:7 Tainted: G O 5.15.11 #15 [ 767.655329] Hardware name: DNI AmazonGo1 A7040 board (DT) [ 767.660760] Workqueue: lacp_1 bond_3ad_state_machine_handler [ 767.666468] Call trace: [ 767.668930] dump_backtrace+0x0/0x2d0 [ 767.672625] show_stack+0x24/0x30 [ 767.675965] dump_stack_lvl+0x68/0x84 [ 767.679659] print_address_description.constprop.0+0x74/0x2b8 [ 767.685451] kasan_report+0x1f0/0x260 [ 767.689148] __asan_load2+0x94/0xd0 [ 767.692667] bond_3ad_state_machine_handler+0x13dc/0x1470 Fixes: 0622cab0341c ("bonding: fix 802.3ad aggregator reselection") Co-developed-by: Maksym Glubokiy Signed-off-by: Maksym Glubokiy Signed-off-by: Yevhen Orlov Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/20220629012914.361-1-yevhen.orlov@plvision.eu Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_3ad.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index a86b1f71762e..d7fb33c078e8 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2228,7 +2228,8 @@ void bond_3ad_unbind_slave(struct slave *slave) temp_aggregator->num_of_ports--; if (__agg_active_ports(temp_aggregator) == 0) { select_new_active_agg = temp_aggregator->is_active; - ad_clear_agg(temp_aggregator); + if (temp_aggregator->num_of_ports == 0) + ad_clear_agg(temp_aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ From f8ebb3ac881b17712e1d5967c97ab1806b16d3d6 Mon Sep 17 00:00:00 2001 From: Jose Alonso Date: Tue, 28 Jun 2022 12:13:02 -0300 Subject: [PATCH 48/55] net: usb: ax88179_178a: Fix packet receiving This patch corrects packet receiving in ax88179_rx_fixup. - problem observed: ifconfig shows allways a lot of 'RX Errors' while packets are received normally. This occurs because ax88179_rx_fixup does not recognise properly the usb urb received. The packets are normally processed and at the end, the code exits with 'return 0', generating RX Errors. (pkt_cnt==-2 and ptk_hdr over field rx_hdr trying to identify another packet there) This is a usb urb received by "tcpdump -i usbmon2 -X" on a little-endian CPU: 0x0000: eeee f8e3 3b19 87a0 94de 80e3 daac 0800 ^ packet 1 start (pkt_len = 0x05ec) ^^^^ IP alignment pseudo header ^ ethernet packet start last byte ethernet packet v padding (8-bytes aligned) vvvv vvvv 0x05e0: c92d d444 1420 8a69 83dd 272f e82b 9811 0x05f0: eeee f8e3 3b19 87a0 94de 80e3 daac 0800 ... ^ packet 2 0x0be0: eeee f8e3 3b19 87a0 94de 80e3 daac 0800 ... 0x1130: 9d41 9171 8a38 0ec5 eeee f8e3 3b19 87a0 ... 0x1720: 8cfc 15ff 5e4c e85c eeee f8e3 3b19 87a0 ... 0x1d10: ecfa 2a3a 19ab c78c eeee f8e3 3b19 87a0 ... 0x2070: eeee f8e3 3b19 87a0 94de 80e3 daac 0800 ... ^ packet 7 0x2120: 7c88 4ca5 5c57 7dcc 0d34 7577 f778 7e0a 0x2130: f032 e093 7489 0740 3008 ec05 0000 0080 ====1==== ====2==== hdr_off ^ pkt_len = 0x05ec ^^^^ AX_RXHDR_*=0x00830 ^^^^ ^ pkt_len = 0 ^^^^ AX_RXHDR_DROP_ERR=0x80000000 ^^^^ ^ 0x2140: 3008 ec05 0000 0080 3008 5805 0000 0080 0x2150: 3008 ec05 0000 0080 3008 ec05 0000 0080 0x2160: 3008 5803 0000 0080 3008 c800 0000 0080 ===11==== ===12==== ===13==== ===14==== 0x2170: 0000 0000 0e00 3821 ^^^^ ^^^^ rx_hdr ^^^^ pkt_cnt=14 ^^^^ hdr_off=0x2138 ^^^^ ^^^^ padding The dump shows that pkt_cnt is the number of entrys in the per-packet metadata. It is "2 * packet count". Each packet have two entrys. The first have a valid value (pkt_len and AX_RXHDR_*) and the second have a dummy-header 0x80000000 (pkt_len=0 with AX_RXHDR_DROP_ERR). Why exists dummy-header for each packet?!? My guess is that this was done probably to align the entry for each packet to 64-bits and maintain compatibility with old firmware. There is also a padding (0x00000000) before the rx_hdr to align the end of rx_hdr to 64-bit. Note that packets have a alignment of 64-bits (8-bytes). This patch assumes that the dummy-header and the last padding are optional. So it preserves semantics and recognises the same valid packets as the current code. This patch was made using only the dumpfile information and tested with only one device: 0b95:1790 ASIX Electronics Corp. AX88179 Gigabit Ethernet Fixes: 57bc3d3ae8c1 ("net: usb: ax88179_178a: Fix out-of-bounds accesses in RX fixup") Fixes: e2ca90c276e1 ("ax88179_178a: ASIX AX88179_178A USB 3.0/2.0 to gigabit ethernet adapter driver") Signed-off-by: Jose Alonso Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/d6970bb04bf67598af4d316eaeb1792040b18cfd.camel@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/usb/ax88179_178a.c | 105 ++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 27 deletions(-) diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 4704ed6f00ef..ac2d400d1d6c 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1472,6 +1472,42 @@ static int ax88179_rx_fixup(struct usbnet *dev, struct sk_buff *skb) * are bundled into this buffer and where we can find an array of * per-packet metadata (which contains elements encoded into u16). */ + + /* SKB contents for current firmware: + * + * ... + * + * + * ... + * + * + * + * where: + * contains pkt_len bytes: + * 2 bytes of IP alignment pseudo header + * packet received + * contains 4 bytes: + * pkt_len and fields AX_RXHDR_* + * 0-7 bytes to terminate at + * 8 bytes boundary (64-bit). + * 4 bytes to make rx_hdr terminate at + * 8 bytes boundary (64-bit) + * contains 4 bytes: + * pkt_len=0 and AX_RXHDR_DROP_ERR + * contains 4 bytes: + * pkt_cnt and hdr_off (offset of + * ) + * + * pkt_cnt is number of entrys in the per-packet metadata. + * In current firmware there is 2 entrys per packet. + * The first points to the packet and the + * second is a dummy header. + * This was done probably to align fields in 64-bit and + * maintain compatibility with old firmware. + * This code assumes that and are + * optional. + */ + if (skb->len < 4) return 0; skb_trim(skb, skb->len - 4); @@ -1485,51 +1521,66 @@ static int ax88179_rx_fixup(struct usbnet *dev, struct sk_buff *skb) /* Make sure that the bounds of the metadata array are inside the SKB * (and in front of the counter at the end). */ - if (pkt_cnt * 2 + hdr_off > skb->len) + if (pkt_cnt * 4 + hdr_off > skb->len) return 0; pkt_hdr = (u32 *)(skb->data + hdr_off); /* Packets must not overlap the metadata array */ skb_trim(skb, hdr_off); - for (; ; pkt_cnt--, pkt_hdr++) { + for (; pkt_cnt > 0; pkt_cnt--, pkt_hdr++) { + u16 pkt_len_plus_padd; u16 pkt_len; le32_to_cpus(pkt_hdr); pkt_len = (*pkt_hdr >> 16) & 0x1fff; + pkt_len_plus_padd = (pkt_len + 7) & 0xfff8; - if (pkt_len > skb->len) + /* Skip dummy header used for alignment + */ + if (pkt_len == 0) + continue; + + if (pkt_len_plus_padd > skb->len) return 0; /* Check CRC or runt packet */ - if (((*pkt_hdr & (AX_RXHDR_CRC_ERR | AX_RXHDR_DROP_ERR)) == 0) && - pkt_len >= 2 + ETH_HLEN) { - bool last = (pkt_cnt == 0); - - if (last) { - ax_skb = skb; - } else { - ax_skb = skb_clone(skb, GFP_ATOMIC); - if (!ax_skb) - return 0; - } - ax_skb->len = pkt_len; - /* Skip IP alignment pseudo header */ - skb_pull(ax_skb, 2); - skb_set_tail_pointer(ax_skb, ax_skb->len); - ax_skb->truesize = pkt_len + sizeof(struct sk_buff); - ax88179_rx_checksum(ax_skb, pkt_hdr); - - if (last) - return 1; - - usbnet_skb_return(dev, ax_skb); + if ((*pkt_hdr & (AX_RXHDR_CRC_ERR | AX_RXHDR_DROP_ERR)) || + pkt_len < 2 + ETH_HLEN) { + dev->net->stats.rx_errors++; + skb_pull(skb, pkt_len_plus_padd); + continue; } - /* Trim this packet away from the SKB */ - if (!skb_pull(skb, (pkt_len + 7) & 0xFFF8)) + /* last packet */ + if (pkt_len_plus_padd == skb->len) { + skb_trim(skb, pkt_len); + + /* Skip IP alignment pseudo header */ + skb_pull(skb, 2); + + skb->truesize = SKB_TRUESIZE(pkt_len_plus_padd); + ax88179_rx_checksum(skb, pkt_hdr); + return 1; + } + + ax_skb = skb_clone(skb, GFP_ATOMIC); + if (!ax_skb) return 0; + skb_trim(ax_skb, pkt_len); + + /* Skip IP alignment pseudo header */ + skb_pull(ax_skb, 2); + + skb->truesize = pkt_len_plus_padd + + SKB_DATA_ALIGN(sizeof(struct sk_buff)); + ax88179_rx_checksum(ax_skb, pkt_hdr); + usbnet_skb_return(dev, ax_skb); + + skb_pull(skb, pkt_len_plus_padd); } + + return 0; } static struct sk_buff * From 9cc02ede696272c5271a401e4f27c262359bc2f6 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Wed, 29 Jun 2022 08:26:40 +0800 Subject: [PATCH 49/55] net: rose: fix UAF bugs caused by timer handler There are UAF bugs in rose_heartbeat_expiry(), rose_timer_expiry() and rose_idletimer_expiry(). The root cause is that del_timer() could not stop the timer handler that is running and the refcount of sock is not managed properly. One of the UAF bugs is shown below: (thread 1) | (thread 2) | rose_bind | rose_connect | rose_start_heartbeat rose_release | (wait a time) case ROSE_STATE_0 | rose_destroy_socket | rose_heartbeat_expiry rose_stop_heartbeat | sock_put(sk) | ... sock_put(sk) // FREE | | bh_lock_sock(sk) // USE The sock is deallocated by sock_put() in rose_release() and then used by bh_lock_sock() in rose_heartbeat_expiry(). Although rose_destroy_socket() calls rose_stop_heartbeat(), it could not stop the timer that is running. The KASAN report triggered by POC is shown below: BUG: KASAN: use-after-free in _raw_spin_lock+0x5a/0x110 Write of size 4 at addr ffff88800ae59098 by task swapper/3/0 ... Call Trace: dump_stack_lvl+0xbf/0xee print_address_description+0x7b/0x440 print_report+0x101/0x230 ? irq_work_single+0xbb/0x140 ? _raw_spin_lock+0x5a/0x110 kasan_report+0xed/0x120 ? _raw_spin_lock+0x5a/0x110 kasan_check_range+0x2bd/0x2e0 _raw_spin_lock+0x5a/0x110 rose_heartbeat_expiry+0x39/0x370 ? rose_start_heartbeat+0xb0/0xb0 call_timer_fn+0x2d/0x1c0 ? rose_start_heartbeat+0xb0/0xb0 expire_timers+0x1f3/0x320 __run_timers+0x3ff/0x4d0 run_timer_softirq+0x41/0x80 __do_softirq+0x233/0x544 irq_exit_rcu+0x41/0xa0 sysvec_apic_timer_interrupt+0x8c/0xb0 asm_sysvec_apic_timer_interrupt+0x1b/0x20 RIP: 0010:default_idle+0xb/0x10 RSP: 0018:ffffc9000012fea0 EFLAGS: 00000202 RAX: 000000000000bcae RBX: ffff888006660f00 RCX: 000000000000bcae RDX: 0000000000000001 RSI: ffffffff843a11c0 RDI: ffffffff843a1180 RBP: dffffc0000000000 R08: dffffc0000000000 R09: ffffed100da36d46 R10: dfffe9100da36d47 R11: ffffffff83cf0950 R12: 0000000000000000 R13: 1ffff11000ccc1e0 R14: ffffffff8542af28 R15: dffffc0000000000 ... Allocated by task 146: __kasan_kmalloc+0xc4/0xf0 sk_prot_alloc+0xdd/0x1a0 sk_alloc+0x2d/0x4e0 rose_create+0x7b/0x330 __sock_create+0x2dd/0x640 __sys_socket+0xc7/0x270 __x64_sys_socket+0x71/0x80 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 Freed by task 152: kasan_set_track+0x4c/0x70 kasan_set_free_info+0x1f/0x40 ____kasan_slab_free+0x124/0x190 kfree+0xd3/0x270 __sk_destruct+0x314/0x460 rose_release+0x2fa/0x3b0 sock_close+0xcb/0x230 __fput+0x2d9/0x650 task_work_run+0xd6/0x160 exit_to_user_mode_loop+0xc7/0xd0 exit_to_user_mode_prepare+0x4e/0x80 syscall_exit_to_user_mode+0x20/0x40 do_syscall_64+0x4f/0x90 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This patch adds refcount of sock when we use functions such as rose_start_heartbeat() and so on to start timer, and decreases the refcount of sock when timer is finished or deleted by functions such as rose_stop_heartbeat() and so on. As a result, the UAF bugs could be mitigated. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Duoming Zhou Tested-by: Duoming Zhou Link: https://lore.kernel.org/r/20220629002640.5693-1-duoming@zju.edu.cn Signed-off-by: Paolo Abeni --- net/rose/rose_timer.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index b3138fc2e552..f06ddbed3fed 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -31,89 +31,89 @@ static void rose_idletimer_expiry(struct timer_list *); void rose_start_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); sk->sk_timer.function = rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; - add_timer(&sk->sk_timer); + sk_reset_timer(sk, &sk->sk_timer, sk->sk_timer.expires); } void rose_start_t1timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t1; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t2timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t2; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_t3timer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->t3; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_hbtimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->timer); + sk_stop_timer(sk, &rose->timer); rose->timer.function = rose_timer_expiry; rose->timer.expires = jiffies + rose->hb; - add_timer(&rose->timer); + sk_reset_timer(sk, &rose->timer, rose->timer.expires); } void rose_start_idletimer(struct sock *sk) { struct rose_sock *rose = rose_sk(sk); - del_timer(&rose->idletimer); + sk_stop_timer(sk, &rose->idletimer); if (rose->idle > 0) { rose->idletimer.function = rose_idletimer_expiry; rose->idletimer.expires = jiffies + rose->idle; - add_timer(&rose->idletimer); + sk_reset_timer(sk, &rose->idletimer, rose->idletimer.expires); } } void rose_stop_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + sk_stop_timer(sk, &sk->sk_timer); } void rose_stop_timer(struct sock *sk) { - del_timer(&rose_sk(sk)->timer); + sk_stop_timer(sk, &rose_sk(sk)->timer); } void rose_stop_idletimer(struct sock *sk) { - del_timer(&rose_sk(sk)->idletimer); + sk_stop_timer(sk, &rose_sk(sk)->idletimer); } static void rose_heartbeat_expiry(struct timer_list *t) @@ -130,6 +130,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { bh_unlock_sock(sk); rose_destroy_socket(sk); + sock_put(sk); return; } break; @@ -152,6 +153,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) rose_start_heartbeat(sk); bh_unlock_sock(sk); + sock_put(sk); } static void rose_timer_expiry(struct timer_list *t) @@ -181,6 +183,7 @@ static void rose_timer_expiry(struct timer_list *t) break; } bh_unlock_sock(sk); + sock_put(sk); } static void rose_idletimer_expiry(struct timer_list *t) @@ -205,4 +208,5 @@ static void rose_idletimer_expiry(struct timer_list *t) sock_set_flag(sk, SOCK_DEAD); } bh_unlock_sock(sk); + sock_put(sk); } From 665030fd0c1ed9f505932e6e73e7a2c788787a0a Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 29 Jun 2022 10:02:05 +0300 Subject: [PATCH 50/55] mlxsw: spectrum_router: Fix rollback in tunnel next hop init In mlxsw_sp_nexthop6_init(), a next hop is always added to the router linked list, and mlxsw_sp_nexthop_type_init() is invoked afterwards. When that function results in an error, the next hop will not have been removed from the linked list. As the error is propagated upwards and the caller frees the next hop object, the linked list ends up holding an invalid object. A similar issue comes up with mlxsw_sp_nexthop4_init(), where rollback block does exist, however does not include the linked list removal. Both IPv6 and IPv4 next hops have a similar issue with next-hop counter rollbacks. As these were introduced in the same patchset as the next hop linked list, include the cleanup in this patch. Fixes: dbe4598c1e92 ("mlxsw: spectrum_router: Keep nexthops in a linked list") Fixes: a5390278a5eb ("mlxsw: spectrum: Add support for setting counters on nexthops") Signed-off-by: Petr Machata Reviewed-by: Amit Cohen Signed-off-by: Ido Schimmel Link: https://lore.kernel.org/r/20220629070205.803952-1-idosch@nvidia.com Signed-off-by: Paolo Abeni --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 9dbb573d53ea..0d8a0068e4ca 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4415,6 +4415,8 @@ static int mlxsw_sp_nexthop4_init(struct mlxsw_sp *mlxsw_sp, return 0; err_nexthop_neigh_init: + list_del(&nh->router_list_node); + mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh); mlxsw_sp_nexthop_remove(mlxsw_sp, nh); return err; } @@ -6740,6 +6742,7 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, const struct fib6_info *rt) { struct net_device *dev = rt->fib6_nh->fib_nh_dev; + int err; nh->nhgi = nh_grp->nhgi; nh->nh_weight = rt->fib6_nh->fib_nh_weight; @@ -6755,7 +6758,16 @@ static int mlxsw_sp_nexthop6_init(struct mlxsw_sp *mlxsw_sp, return 0; nh->ifindex = dev->ifindex; - return mlxsw_sp_nexthop_type_init(mlxsw_sp, nh, dev); + err = mlxsw_sp_nexthop_type_init(mlxsw_sp, nh, dev); + if (err) + goto err_nexthop_type_init; + + return 0; + +err_nexthop_type_init: + list_del(&nh->router_list_node); + mlxsw_sp_nexthop_counter_free(mlxsw_sp, nh); + return err; } static void mlxsw_sp_nexthop6_fini(struct mlxsw_sp *mlxsw_sp, From 0a18d802d65cf662644fd1d369c86d84a5630652 Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Wed, 29 Jun 2022 15:55:50 +0800 Subject: [PATCH 51/55] net: sfp: fix memory leak in sfp_probe() sfp_probe() allocates a memory chunk from sfp with sfp_alloc(). When devm_add_action() fails, sfp is not freed, which leads to a memory leak. We should use devm_add_action_or_reset() instead of devm_add_action(). Signed-off-by: Jianglei Nie Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20220629075550.2152003-1-niejianglei2021@163.com Signed-off-by: Paolo Abeni --- drivers/net/phy/sfp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 9a5d5a10560f..e7b0e12cc75b 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -2516,7 +2516,7 @@ static int sfp_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sfp); - err = devm_add_action(sfp->dev, sfp_cleanup, sfp); + err = devm_add_action_or_reset(sfp->dev, sfp_cleanup, sfp); if (err < 0) return err; From 9c5de246c1dbe785268fc2e83c88624b92e4ec93 Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Thu, 30 Jun 2022 14:22:26 +0200 Subject: [PATCH 52/55] net: sparx5: mdb add/del handle non-sparx5 devices When adding/deleting mdb entries on other net_devices, eg., tap interfaces, it should not crash. Fixes: 3bacfccdcb2d ("net: sparx5: Add mdb handlers") Signed-off-by: Casper Andersson Reviewed-by: Steen Hegelund Link: https://lore.kernel.org/r/20220630122226.316812-1-casper.casan@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c index 3429660cd2e5..5edc8b7176c8 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c @@ -396,6 +396,9 @@ static int sparx5_handle_port_mdb_add(struct net_device *dev, u32 mact_entry; int res, err; + if (!sparx5_netdevice_check(dev)) + return -EOPNOTSUPP; + if (netif_is_bridge_master(v->obj.orig_dev)) { sparx5_mact_learn(spx5, PGID_CPU, v->addr, v->vid); return 0; @@ -466,6 +469,9 @@ static int sparx5_handle_port_mdb_del(struct net_device *dev, u32 mact_entry, res, pgid_entry[3]; int err; + if (!sparx5_netdevice_check(dev)) + return -EOPNOTSUPP; + if (netif_is_bridge_master(v->obj.orig_dev)) { sparx5_mact_forget(spx5, v->addr, v->vid); return 0; From ff1fa2081d173b01cebe2fbf0a2d0f1cee9ce4b5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Jun 2022 11:19:10 -0700 Subject: [PATCH 53/55] net: tun: avoid disabling NAPI twice Eric reports that syzbot made short work out of my speculative fix. Indeed when queue gets detached its tfile->tun remains, so we would try to stop NAPI twice with a detach(), close() sequence. Alternative fix would be to move tun_napi_disable() to tun_detach_all() and let the NAPI run after the queue has been detached. Fixes: a8fc8cb5692a ("net: tun: stop NAPI when detaching queues") Reported-by: syzbot Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220629181911.372047-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index e2eb35887394..259b2b84b2b3 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -640,7 +640,8 @@ static void __tun_detach(struct tun_file *tfile, bool clean) tun = rtnl_dereference(tfile->tun); if (tun && clean) { - tun_napi_disable(tfile); + if (!tfile->detached) + tun_napi_disable(tfile); tun_napi_del(tfile); } From 839b92fede7ba308f1a475aa00fea55f63b7fccf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Jun 2022 11:19:11 -0700 Subject: [PATCH 54/55] selftest: tun: add test for NAPI dismantle Being lazy does not pay, add the test for various ordering of tun queue close / detach / destroy. Link: https://lore.kernel.org/r/20220629181911.372047-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/tun.c | 162 +++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/tun.c diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 7ea54af55490..ddad703ace34 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -54,7 +54,7 @@ TEST_GEN_FILES += ipsec TEST_GEN_FILES += ioam6_parser TEST_GEN_FILES += gro TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls +TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c new file mode 100644 index 000000000000..fa83918b62d1 --- /dev/null +++ b/tools/testing/selftests/net/tun.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest_harness.h" + +static int tun_attach(int fd, char *dev) +{ + struct ifreq ifr; + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, dev); + ifr.ifr_flags = IFF_ATTACH_QUEUE; + + return ioctl(fd, TUNSETQUEUE, (void *) &ifr); +} + +static int tun_detach(int fd, char *dev) +{ + struct ifreq ifr; + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, dev); + ifr.ifr_flags = IFF_DETACH_QUEUE; + + return ioctl(fd, TUNSETQUEUE, (void *) &ifr); +} + +static int tun_alloc(char *dev) +{ + struct ifreq ifr; + int fd, err; + + fd = open("/dev/net/tun", O_RDWR); + if (fd < 0) { + fprintf(stderr, "can't open tun: %s\n", strerror(errno)); + return fd; + } + + memset(&ifr, 0, sizeof(ifr)); + strcpy(ifr.ifr_name, dev); + ifr.ifr_flags = IFF_TAP | IFF_NAPI | IFF_MULTI_QUEUE; + + err = ioctl(fd, TUNSETIFF, (void *) &ifr); + if (err < 0) { + fprintf(stderr, "can't TUNSETIFF: %s\n", strerror(errno)); + close(fd); + return err; + } + strcpy(dev, ifr.ifr_name); + return fd; +} + +static int tun_delete(char *dev) +{ + struct { + struct nlmsghdr nh; + struct ifinfomsg ifm; + unsigned char data[64]; + } req; + struct rtattr *rta; + int ret, rtnl; + + rtnl = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); + if (rtnl < 0) { + fprintf(stderr, "can't open rtnl: %s\n", strerror(errno)); + return 1; + } + + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_ALIGN(NLMSG_LENGTH(sizeof(req.ifm))); + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_type = RTM_DELLINK; + + req.ifm.ifi_family = AF_UNSPEC; + + rta = (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len)); + rta->rta_type = IFLA_IFNAME; + rta->rta_len = RTA_LENGTH(IFNAMSIZ); + req.nh.nlmsg_len += rta->rta_len; + memcpy(RTA_DATA(rta), dev, IFNAMSIZ); + + ret = send(rtnl, &req, req.nh.nlmsg_len, 0); + if (ret < 0) + fprintf(stderr, "can't send: %s\n", strerror(errno)); + ret = (unsigned int)ret != req.nh.nlmsg_len; + + close(rtnl); + return ret; +} + +FIXTURE(tun) +{ + char ifname[IFNAMSIZ]; + int fd, fd2; +}; + +FIXTURE_SETUP(tun) +{ + memset(self->ifname, 0, sizeof(self->ifname)); + + self->fd = tun_alloc(self->ifname); + ASSERT_GE(self->fd, 0); + + self->fd2 = tun_alloc(self->ifname); + ASSERT_GE(self->fd2, 0); +} + +FIXTURE_TEARDOWN(tun) +{ + if (self->fd >= 0) + close(self->fd); + if (self->fd2 >= 0) + close(self->fd2); +} + +TEST_F(tun, delete_detach_close) { + EXPECT_EQ(tun_delete(self->ifname), 0); + EXPECT_EQ(tun_detach(self->fd, self->ifname), -1); + EXPECT_EQ(errno, 22); +} + +TEST_F(tun, detach_delete_close) { + EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); + EXPECT_EQ(tun_delete(self->ifname), 0); +} + +TEST_F(tun, detach_close_delete) { + EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); + close(self->fd); + self->fd = -1; + EXPECT_EQ(tun_delete(self->ifname), 0); +} + +TEST_F(tun, reattach_delete_close) { + EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); + EXPECT_EQ(tun_attach(self->fd, self->ifname), 0); + EXPECT_EQ(tun_delete(self->ifname), 0); +} + +TEST_F(tun, reattach_close_delete) { + EXPECT_EQ(tun_detach(self->fd, self->ifname), 0); + EXPECT_EQ(tun_attach(self->fd, self->ifname), 0); + close(self->fd); + self->fd = -1; + EXPECT_EQ(tun_delete(self->ifname), 0); +} + +TEST_HARNESS_MAIN From 58bf4db695287c4bb2a5fc9fc12c78fdd4c36894 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 29 Jun 2022 21:30:07 +0300 Subject: [PATCH 55/55] net: dsa: felix: fix race between reading PSFP stats and port stats Both PSFP stats and the port stats read by ocelot_check_stats_work() are indirectly read through the same mechanism - write to STAT_CFG:STAT_VIEW, read from SYS:STAT:CNT[n]. It's just that for port stats, we write STAT_VIEW with the index of the port, and for PSFP stats, we write STAT_VIEW with the filter index. So if we allow them to run concurrently, ocelot_check_stats_work() may change the view from vsc9959_psfp_counters_get(), and vice versa. Fixes: 7d4b564d6add ("net: dsa: felix: support psfp filter on vsc9959") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220629183007.3808130-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix_vsc9959.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 570d0204b7be..9c27b9b0128d 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1886,6 +1886,8 @@ static void vsc9959_psfp_sgi_table_del(struct ocelot *ocelot, static void vsc9959_psfp_counters_get(struct ocelot *ocelot, u32 index, struct felix_stream_filter_counters *counters) { + mutex_lock(&ocelot->stats_lock); + ocelot_rmw(ocelot, SYS_STAT_CFG_STAT_VIEW(index), SYS_STAT_CFG_STAT_VIEW_M, SYS_STAT_CFG); @@ -1900,6 +1902,8 @@ static void vsc9959_psfp_counters_get(struct ocelot *ocelot, u32 index, SYS_STAT_CFG_STAT_VIEW(index) | SYS_STAT_CFG_STAT_CLEAR_SHOT(0x10), SYS_STAT_CFG); + + mutex_unlock(&ocelot->stats_lock); } static int vsc9959_psfp_filter_add(struct ocelot *ocelot, int port,