From e6194923237f3952b955c343b65b211f36bce01c Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 13 Jul 2017 09:13:30 +0200 Subject: [PATCH 001/118] esp: Fix memleaks on error paths. We leak the temporary allocated resources in error paths, fix this by freeing them. Fixes: fca11ebde3f ("esp4: Reorganize esp_output") Fixes: 383d0350f2c ("esp6: Reorganize esp_output") Fixes: 3f29770723f ("ipsec: check return value of skb_to_sgvec always") Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 13 ++++++++----- net/ipv6/esp6.c | 9 +++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 0cbee0a666ff..dbb31a942dfa 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -381,7 +381,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; if (!esp->inplace) { int allocsize; @@ -392,7 +392,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * spin_lock_bh(&x->lock); if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { spin_unlock_bh(&x->lock); - goto error; + goto error_free; } skb_shinfo(skb)->nr_frags = 1; @@ -409,7 +409,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -442,8 +442,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * if (sg != dsg) esp_ssg_unref(x, tmp); - kfree(tmp); +error_free: + kfree(tmp); error: return err; } @@ -695,8 +696,10 @@ skip_cow: sg_init_table(sg, nfrags); err = skb_to_sgvec(skb, sg, 0, skb->len); - if (unlikely(err < 0)) + if (unlikely(err < 0)) { + kfree(tmp); goto out; + } skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv6/esp6.c 
b/net/ipv6/esp6.c index 9ed35473dcb5..392def1fcf21 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -345,7 +345,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; if (!esp->inplace) { int allocsize; @@ -356,7 +356,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info spin_lock_bh(&x->lock); if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { spin_unlock_bh(&x->lock); - goto error; + goto error_free; } skb_shinfo(skb)->nr_frags = 1; @@ -373,7 +373,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info (unsigned char *)esph - skb->data, assoclen + ivlen + esp->clen + alen); if (unlikely(err < 0)) - goto error; + goto error_free; } if ((x->props.flags & XFRM_STATE_ESN)) @@ -406,8 +406,9 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info if (sg != dsg) esp_ssg_unref(x, tmp); - kfree(tmp); +error_free: + kfree(tmp); error: return err; } From 3840538ad384fb7891adeeaf36624f870c51fc0e Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 18 Jul 2017 14:56:17 +0200 Subject: [PATCH 002/118] netfilter: ipt_CLUSTERIP: fix use-after-free of proc entry When we delete a netns with a CLUSTERIP rule, clusterip_net_exit() is called first, removing /proc/net/ipt_CLUSTERIP. Then clusterip_config_entry_put() is called from clusterip_tg_destroy(), and tries to remove its entry under /proc/net/ipt_CLUSTERIP/. Fix this by checking that the parent directory of the entry to remove hasn't already been deleted. 
The following triggers a KASAN splat (stealing the reproducer from 202f59afd441, thanks to Jianlin Shi and Xin Long): ip netns add test ip link add veth0_in type veth peer name veth0_out ip link set veth0_in netns test ip netns exec test ip link set lo up ip netns exec test ip link set veth0_in up ip netns exec test iptables -I INPUT -d 1.2.3.4 -i veth0_in -j \ CLUSTERIP --new --clustermac 89:d4:47:eb:9a:fa --total-nodes 3 \ --local-node 1 --hashmode sourceip-sourceport ip netns del test Fixes: ce4ff76c15a8 ("netfilter: ipt_CLUSTERIP: make proc directory per net namespace") Signed-off-by: Sabrina Dubroca Reviewed-by: Xin Long Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 7d72decb80f9..efaa04dcc80e 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -117,7 +117,8 @@ clusterip_config_entry_put(struct net *net, struct clusterip_config *c) * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. */ #ifdef CONFIG_PROC_FS - proc_remove(c->pde); + if (cn->procdir) + proc_remove(c->pde); #endif return; } @@ -815,6 +816,7 @@ static void clusterip_net_exit(struct net *net) #ifdef CONFIG_PROC_FS struct clusterip_net *cn = net_generic(net, clusterip_net_id); proc_remove(cn->procdir); + cn->procdir = NULL; #endif nf_unregister_net_hook(net, &cip_arp_ops); } From f7fb77fc12352d15180dc3c08ffba10573d5167d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 18 Jul 2017 20:03:05 +0200 Subject: [PATCH 003/118] netfilter: nft_compat: check extension hook mask only if set If the x_tables extension comes with no hook mask, skip this validation. 
Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index f5a7cb68694e..b89f4f65b2a0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -305,7 +305,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, const struct nf_hook_ops *ops = &basechain->ops[0]; hook_mask = 1 << ops->hooknum; - if (!(hook_mask & target->hooks)) + if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; ret = nft_compat_chain_validate_dependency(target->table, @@ -484,7 +484,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, const struct nf_hook_ops *ops = &basechain->ops[0]; hook_mask = 1 << ops->hooknum; - if (!(hook_mask & match->hooks)) + if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; ret = nft_compat_chain_validate_dependency(match->table, From 9beceb54fa2c0b47532dd2b07f37e410641cf9b2 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 27 Jul 2017 11:22:04 +0900 Subject: [PATCH 004/118] netfilter: x_tables: Fix use-after-free in ipt_do_table. If verdict is NF_STOLEN in the SYNPROXY target, the skb is consumed. However, ipt_do_table() always tries to get the IP header from the skb, so KASAN reports a use-after-free. The report can be reproduced with the command below. # iptables -I INPUT -p tcp -j SYNPROXY --mss 1460 [ 193.542265] BUG: KASAN: use-after-free in ipt_do_table+0x1405/0x1c10 [ ... ] [ 193.578603] Call Trace: [ 193.581590] [ 193.584107] dump_stack+0x68/0xa0 [ 193.588168] print_address_description+0x78/0x290 [ 193.593828] ? ipt_do_table+0x1405/0x1c10 [ 193.598690] kasan_report+0x230/0x340 [ 193.603194] __asan_report_load2_noabort+0x19/0x20 [ 193.608950] ipt_do_table+0x1405/0x1c10 [ 193.613591] ? rcu_read_lock_held+0xae/0xd0 [ 193.618631] ? ip_route_input_rcu+0x27d7/0x4270 [ 193.624348] ? ipt_do_table+0xb68/0x1c10 [ 193.629124] ?
do_add_counters+0x620/0x620 [ 193.634234] ? iptable_filter_net_init+0x60/0x60 [ ... ] After this patch, only when verdict is XT_CONTINUE, ipt_do_table() tries to get ip header. Also arpt_do_table() is modified because it has same bug. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 10 +++++----- net/ipv4/netfilter/ip_tables.c | 9 +++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 0bc3c3d73e61..9e9d9afd18f7 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -268,14 +268,14 @@ unsigned int arpt_do_table(struct sk_buff *skb, acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); - /* Target might have changed stuff. */ - arp = arp_hdr(skb); - - if (verdict == XT_CONTINUE) + if (verdict == XT_CONTINUE) { + /* Target might have changed stuff. */ + arp = arp_hdr(skb); e = arpt_next_entry(e); - else + } else { /* Verdict */ break; + } } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2a55a40211cb..622ed2887cd5 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -352,13 +352,14 @@ ipt_do_table(struct sk_buff *skb, acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); - /* Target might have changed stuff. */ - ip = ip_hdr(skb); - if (verdict == XT_CONTINUE) + if (verdict == XT_CONTINUE) { + /* Target might have changed stuff. 
*/ + ip = ip_hdr(skb); e = ipt_next_entry(e); - else + } else { /* Verdict */ break; + } } while (!acpar.hotdrop); xt_write_recseq_end(addend); From 3f5a95ad6c6c05e6b00f0c20e30da66c986564d5 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Tue, 1 Aug 2017 23:21:46 +0900 Subject: [PATCH 005/118] xfrm: fix null pointer dereference on state and tmpl sort Creating sub policy that matches the same outer flow as main policy does leads to a null pointer dereference if the outer mode's family is ipv4. For userspace compatibility, this patch just eliminates the crash i.e., does not introduce any new sorting rule, which would fruitlessly affect all but the aforementioned case. Signed-off-by: Koichiro Den Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 6c0956d10db6..a792effdb0b5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1620,6 +1620,7 @@ int xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, unsigned short family, struct net *net) { + int i; int err = 0; struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (!afinfo) @@ -1628,6 +1629,9 @@ xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, spin_lock_bh(&net->xfrm.xfrm_state_lock); /*FIXME*/ if (afinfo->tmpl_sort) err = afinfo->tmpl_sort(dst, src, n); + else + for (i = 0; i < n; i++) + dst[i] = src[i]; spin_unlock_bh(&net->xfrm.xfrm_state_lock); rcu_read_unlock(); return err; @@ -1638,6 +1642,7 @@ int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family) { + int i; int err = 0; struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); struct net *net = xs_net(*src); @@ -1648,6 +1653,9 @@ xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, spin_lock_bh(&net->xfrm.xfrm_state_lock); if (afinfo->state_sort) err = afinfo->state_sort(dst, src, n); + else + for (i = 0; i 
< n; i++) + dst[i] = src[i]; spin_unlock_bh(&net->xfrm.xfrm_state_lock); rcu_read_unlock(); return err; From 7bab09631c2a303f87a7eb7e3d69e888673b9b7e Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Wed, 2 Aug 2017 19:50:14 +0200 Subject: [PATCH 006/118] xfrm: policy: check policy direction value The 'dir' parameter in xfrm_migrate() is a user-controlled byte which is used as an array index. This can lead to an out-of-bound access, kernel lockup and DoS. Add a check for the 'dir' value. This fixes CVE-2017-11600. References: https://bugzilla.redhat.com/show_bug.cgi?id=1474928 Fixes: 80c9abaabf42 ("[XFRM]: Extension for dynamic update of endpoint address(es)") Cc: # v2.6.21-rc1 Reported-by: "bo Zhang" Signed-off-by: Vladis Dronov Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ff61d8557929..6f5a0dad502f 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3308,9 +3308,15 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_state *x_new[XFRM_MAX_DEPTH]; struct xfrm_migrate *mp; + /* Stage 0 - sanity checks */ if ((err = xfrm_migrate_check(m, num_migrate)) < 0) goto out; + if (dir >= XFRM_POLICY_MAX) { + err = -EINVAL; + goto out; + } + /* Stage 1 - find policy */ if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { err = -ENOENT; From 4ff0308f06da5016aafb05330ed37809b54f81ae Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Aug 2017 08:31:07 +0200 Subject: [PATCH 007/118] esp: Fix error handling on layer 2 xmit. esp_output_tail() and esp6_output_tail() can return negative and positive error values. We currently treat only negative values as errors, fix this to treat both cases as error. 
Fixes: fca11ebde3f0 ("esp4: Reorganize esp_output") Fixes: 383d0350f2cc ("esp6: Reorganize esp_output") Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 2 +- net/ipv6/esp6_offload.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index e0666016a764..50112324fa5c 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -257,7 +257,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_ esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); err = esp_output_tail(x, skb, &esp); - if (err < 0) + if (err) return err; secpath_reset(skb); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index f02f131f6435..1cf437f75b0b 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -286,7 +286,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); err = esp6_output_tail(x, skb, &esp); - if (err < 0) + if (err) return err; secpath_reset(skb); From 3de33e1ba0506723ab25734e098cf280ecc34756 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 18 Aug 2017 14:40:53 +0200 Subject: [PATCH 008/118] ipv6: accept 64k - 1 packet length in ip6_find_1stfragopt() A packet length of exactly IPV6_MAXPLEN is allowed, we should refuse parsing options only if the size is 64KiB or more. While at it, remove one extra variable and one assignment which were also introduced by the commit that introduced the size check. Checking the sum 'offset + len' and only later adding 'len' to 'offset' doesn't provide any advantage over directly summing to 'offset' and checking it. Fixes: 6399f1fae4ec ("ipv6: avoid overflow of offset in ip6_find_1stfragopt") Signed-off-by: Stefano Brivio Signed-off-by: David S. 
Miller --- net/ipv6/output_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index abb2c307fbe8..a338bbc33cf3 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; - unsigned int len; switch (**nexthdr) { @@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); - len = ipv6_optlen(exthdr); - if (len + offset >= IPV6_MAXPLEN) + offset += ipv6_optlen(exthdr); + if (offset > IPV6_MAXPLEN) return -EINVAL; - offset += len; *nexthdr = &exthdr->nexthdr; } From c5cff8561d2d0006e972bd114afd51f082fee77c Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 21 Aug 2017 09:47:10 -0700 Subject: [PATCH 009/118] ipv6: add rcu grace period before freeing fib6_node We currently keep rt->rt6i_node pointing to the fib6_node for the route. And some functions make use of this pointer to dereference the fib6_node from rt structure, e.g. rt6_check(). However, as there is neither refcount nor rcu taken when dereferencing rt->rt6i_node, it could potentially cause crashes as rt->rt6i_node could be set to NULL by other CPUs when doing a route deletion. This patch introduces an rcu grace period before freeing fib6_node and makes sure the functions that dereference it takes rcu_read_lock(). Note: there is no "Fixes" tag because this bug was there in a very early stage. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/net/ip6_fib.h | 30 +++++++++++++++++++++++++++++- net/ipv6/ip6_fib.c | 20 ++++++++++++++++---- net/ipv6/route.c | 14 +++++++++++--- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1a88008cc6f5..e9c59db92942 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -70,6 +70,7 @@ struct fib6_node { __u16 fn_flags; int fn_sernum; struct rt6_info *rr_ptr; + struct rcu_head rcu; }; #ifndef CONFIG_IPV6_SUBTREES @@ -167,13 +168,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) rt0->rt6i_flags |= RTF_EXPIRES; } +/* Function to safely get fn->sernum for passed in rt + * and store result in passed in cookie. + * Return true if we can get cookie safely + * Return false if not + */ +static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, + u32 *cookie) +{ + struct fib6_node *fn; + bool status = false; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + + if (fn) { + *cookie = fn->fn_sernum; + status = true; + } + + rcu_read_unlock(); + return status; +} + static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + u32 cookie = 0; + if (rt->rt6i_flags & RTF_PCPU || (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) rt = (struct rt6_info *)(rt->dst.from); - return rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; + rt6_get_cookie_safe(rt, &cookie); + + return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5cc0ea038198..a5ebf86f6be8 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void) return fn; } -static void node_free(struct fib6_node *fn) +static void node_free_immediate(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } +static void node_free_rcu(struct rcu_head *head) +{ + struct fib6_node *fn = container_of(head, struct fib6_node, rcu); + + kmem_cache_free(fib6_node_kmem, fn); +} + +static void node_free(struct fib6_node *fn) +{ + call_rcu(&fn->rcu, node_free_rcu); +} + static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) { int cpu; @@ -601,9 +613,9 @@ insert_above: if (!in || !ln) { if (in) - node_free(in); + node_free_immediate(in); if (ln) - node_free(ln); + node_free_immediate(ln); return ERR_PTR(-ENOMEM); } @@ -1038,7 +1050,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, root, and then (in failure) stale node in main tree. 
*/ - node_free(sfn); + node_free_immediate(sfn); err = PTR_ERR(sn); goto failure; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 94d6a13d47f0..a9d3564caf49 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1289,7 +1289,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + u32 rt_cookie; + + if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -1357,8 +1359,14 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt->rt6i_flags & RTF_CACHE) { if (dst_hold_safe(&rt->dst)) ip6_del_rt(rt); - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { - rt->rt6i_node->fn_sernum = -1; + } else { + struct fib6_node *fn; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + rcu_read_unlock(); } } } From 414e7d76af6d3ec2dd4e9079927dbe0e2e4ca914 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 21 Aug 2017 12:59:10 -0700 Subject: [PATCH 010/118] net/hsr: Check skb_put_padto() return value skb_put_padto() will free the sk_buff passed as reference in case of errors, but we still need to check its return value and decide what to do. Detected by CoverityScan, CID#1416688 ("CHECKED_RETURN") Fixes: ee1c27977284 ("net/hsr: Added support for HSR v1") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- net/hsr/hsr_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 4e7bdb213cd0..172d8309f89e 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -314,7 +314,8 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr); - skb_put_padto(skb, ETH_ZLEN + HSR_HLEN); + if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) + return; hsr_forward_skb(skb, master); return; From 5160a153a00ef24dc3a47c215e21fbc197f60887 Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Tue, 22 Aug 2017 16:06:22 +0930 Subject: [PATCH 011/118] net: ftgmac100: Fix oops in probe on failure to find associated PHY netif_napi_del() should be paired with netif_napi_add(), however no such call takes place in ftgmac100_probe(). This triggers a NULL pointer dereference if e.g. no PHY is found by the MDIO probe: [ 2.770000] libphy: Fixed MDIO Bus: probed [ 2.770000] ftgmac100 1e660000.ethernet: Generated random MAC address 66:58:c0:5a:50:b8 [ 2.790000] libphy: ftgmac100_mdio: probed [ 2.790000] ftgmac100 1e660000.ethernet (unnamed net_device) (uninitialized): eth%d: no PHY found [ 2.790000] ftgmac100 1e660000.ethernet: MII Probe failed! 
[ 2.810000] Unable to handle kernel NULL pointer dereference at virtual address 00000004 [ 2.810000] pgd = 80004000 [ 2.810000] [00000004] *pgd=00000000 [ 2.810000] Internal error: Oops: 805 [#1] ARM [ 2.810000] CPU: 0 PID: 1 Comm: swapper Not tainted 4.10.17-1a4df30c39cf5ee0e3d2528c409787ccbb4a672a #1 [ 2.810000] Hardware name: ASpeed SoC [ 2.810000] task: 9e421b60 task.stack: 9e4a0000 [ 2.810000] PC is at netif_napi_del+0x74/0xa4 [ 2.810000] LR is at ftgmac100_probe+0x290/0x674 [ 2.810000] pc : [<80331004>] lr : [<80292b30>] psr: 60000013 [ 2.810000] sp : 9e4a1d70 ip : 9e4a1d88 fp : 9e4a1d84 [ 2.810000] r10: 9e565000 r9 : ffffffed r8 : 00000007 [ 2.810000] r7 : 9e565480 r6 : 9ec072c0 r5 : 00000000 r4 : 9e5654d8 [ 2.810000] r3 : 9e565530 r2 : 00000000 r1 : 00000000 r0 : 9e5654d8 [ 2.810000] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user [ 2.810000] Control: 00c5387d Table: 80004008 DAC: 00000055 [ 2.810000] Process swapper (pid: 1, stack limit = 0x9e4a0188) [ 2.810000] Stack: (0x9e4a1d70 to 0x9e4a2000) [ 2.810000] 1d60: 9e565000 9e549e10 9e4a1dcc 9e4a1d88 [ 2.810000] 1d80: 80292b30 80330f9c ffffffff 9e4a1d98 80146058 9ec072c0 00009e10 00000000 [ 2.810000] 1da0: 9e549e18 9e549e10 ffffffed 805f81f4 fffffdfb 00000000 00000000 00000000 [ 2.810000] 1dc0: 9e4a1dec 9e4a1dd0 80243df8 802928ac 9e549e10 8062cbd8 8062cbe0 805f81f4 [ 2.810000] 1de0: 9e4a1e24 9e4a1df0 80242178 80243da4 803001d0 802ffa60 9e4a1e24 9e549e10 [ 2.810000] 1e00: 9e549e44 805f81f4 00000000 00000000 805b8840 8058a6b0 9e4a1e44 9e4a1e28 [ 2.810000] 1e20: 80242434 80241f04 00000000 805f81f4 80242344 00000000 9e4a1e6c 9e4a1e48 [ 2.810000] 1e40: 80240148 80242350 9e425bac 9e4fdc90 9e790e94 805f81f4 9e790e60 805f5640 [ 2.810000] 1e60: 9e4a1e7c 9e4a1e70 802425dc 802400d8 9e4a1ea4 9e4a1e80 80240ba8 802425c0 [ 2.810000] 1e80: 8050b6ac 9e4a1e90 805f81f4 ffffe000 805b8838 80616720 9e4a1ebc 9e4a1ea8 [ 2.810000] 1ea0: 80243068 80240a68 805ab24c ffffe000 9e4a1ecc 9e4a1ec0 80244a38 80242fec [ 2.810000] 
1ec0: 9e4a1edc 9e4a1ed0 805ab264 80244a04 9e4a1f4c 9e4a1ee0 8058ae70 805ab258 [ 2.810000] 1ee0: 80032c68 801e3fd8 8052f800 8041af2c 9e4a1f4c 9e4a1f00 80032f90 8058a6bc [ 2.810000] 1f00: 9e4a1f2c 9e4a1f10 00000006 00000006 00000000 8052f220 805112f0 00000000 [ 2.810000] 1f20: 9e4a1f4c 00000006 80616720 805cf400 80616720 805b8838 80616720 00000057 [ 2.810000] 1f40: 9e4a1f94 9e4a1f50 8058b040 8058add0 00000006 00000006 00000000 8058a6b0 [ 2.810000] 1f60: 3940bf3d 00000007 f115c2e8 00000000 803fd158 00000000 00000000 00000000 [ 2.810000] 1f80: 00000000 00000000 9e4a1fac 9e4a1f98 803fd170 8058af38 00000000 803fd158 [ 2.810000] 1fa0: 00000000 9e4a1fb0 8000a5e8 803fd164 00000000 00000000 00000000 00000000 [ 2.810000] 1fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 2.810000] 1fe0: 00000000 00000000 00000000 00000000 00000013 00000000 d11dcae8 af8ddec5 [ 2.810000] [<80331004>] (netif_napi_del) from [<80292b30>] (ftgmac100_probe+0x290/0x674) [ 2.810000] [<80292b30>] (ftgmac100_probe) from [<80243df8>] (platform_drv_probe+0x60/0xc0) [ 2.810000] [<80243df8>] (platform_drv_probe) from [<80242178>] (driver_probe_device+0x280/0x44c) [ 2.810000] [<80242178>] (driver_probe_device) from [<80242434>] (__driver_attach+0xf0/0x104) [ 2.810000] [<80242434>] (__driver_attach) from [<80240148>] (bus_for_each_dev+0x7c/0xb0) [ 2.810000] [<80240148>] (bus_for_each_dev) from [<802425dc>] (driver_attach+0x28/0x30) [ 2.810000] [<802425dc>] (driver_attach) from [<80240ba8>] (bus_add_driver+0x14c/0x268) [ 2.810000] [<80240ba8>] (bus_add_driver) from [<80243068>] (driver_register+0x88/0x104) [ 2.810000] [<80243068>] (driver_register) from [<80244a38>] (__platform_driver_register+0x40/0x54) [ 2.810000] [<80244a38>] (__platform_driver_register) from [<805ab264>] (ftgmac100_driver_init+0x18/0x20) [ 2.810000] [<805ab264>] (ftgmac100_driver_init) from [<8058ae70>] (do_one_initcall+0xac/0x168) [ 2.810000] [<8058ae70>] (do_one_initcall) from [<8058b040>] 
(kernel_init_freeable+0x114/0x1cc) [ 2.810000] [<8058b040>] (kernel_init_freeable) from [<803fd170>] (kernel_init+0x18/0x104) [ 2.810000] [<803fd170>] (kernel_init) from [<8000a5e8>] (ret_from_fork+0x14/0x2c) [ 2.810000] Code: e594205c e5941058 e2843058 e3a05000 (e5812004) [ 3.210000] ---[ end trace f32811052fd3860c ]--- Signed-off-by: Andrew Jeffery Acked-by: Benjamin Herrenschmidt Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 34dae51effd4..59da7ac3c108 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1863,7 +1863,6 @@ err_setup_mdio: err_ioremap: release_resource(priv->res); err_req_mem: - netif_napi_del(&priv->napi); free_netdev(netdev); err_alloc_etherdev: return err; From 4eb6a3bdb4629b610a39dd222a0170c72ef1c690 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Tue, 22 Aug 2017 10:28:11 +0200 Subject: [PATCH 012/118] mlxsw: spectrum_switchdev: Fix mrouter flag update Update the value of the mrouter flag in struct mlxsw_sp_bridge_port when it is being changed. Fixes: c57529e1d5d8 ("mlxsw: spectrum: Replace vPorts with Port-VLAN") Signed-off-by: Nogah Frankel Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 5eb1606765c5..d39ffbfcc436 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -705,6 +705,7 @@ static int mlxsw_sp_port_attr_mc_router_set(struct mlxsw_sp_port *mlxsw_sp_port, bool is_port_mc_router) { struct mlxsw_sp_bridge_port *bridge_port; + int err; if (switchdev_trans_ph_prepare(trans)) return 0; @@ -715,11 +716,17 @@ static int mlxsw_sp_port_attr_mc_router_set(struct mlxsw_sp_port *mlxsw_sp_port, return 0; if (!bridge_port->bridge_device->multicast_enabled) - return 0; + goto out; - return mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, - MLXSW_SP_FLOOD_TYPE_MC, - is_port_mc_router); + err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, + MLXSW_SP_FLOOD_TYPE_MC, + is_port_mc_router); + if (err) + return err; + +out: + bridge_port->mrouter = is_port_mc_router; + return 0; } static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, From 458be024ef5f72f1697d5d55289c4ed45f8ee910 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 22 Aug 2017 12:28:40 +0200 Subject: [PATCH 013/118] tipc: remove subscription references only for pending timers In commit 139bb36f754a ("tipc: advance the time of deleting subscription from subscriber->subscrp_list"), we delete the subscription from the subscriber's list and from the nametable unconditionally. This leads to the following bug if the timer running tipc_subscrp_timeout() on another CPU accesses the subscription list after the subscription delete request.
[39.570] general protection fault: 0000 [#1] SMP :: [39.574] task: ffffffff81c10540 task.stack: ffffffff81c00000 [39.575] RIP: 0010:tipc_subscrp_timeout+0x32/0x80 [tipc] [39.576] RSP: 0018:ffff88003ba03e90 EFLAGS: 00010282 [39.576] RAX: dead000000000200 RBX: ffff88003f0f3600 RCX: 0000000000000101 [39.577] RDX: dead000000000100 RSI: 0000000000000201 RDI: ffff88003f0d7948 [39.578] RBP: ffff88003ba03ea0 R08: 0000000000000001 R09: ffff88003ba03ef8 [39.579] R10: 000000000000014f R11: 0000000000000000 R12: ffff88003f0d7948 [39.580] R13: ffff88003f0f3618 R14: ffffffffa006c250 R15: ffff88003f0f3600 [39.581] FS: 0000000000000000(0000) GS:ffff88003ba00000(0000) knlGS:0000000000000000 [39.582] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [39.583] CR2: 00007f831c6e0714 CR3: 000000003d3b0000 CR4: 00000000000006f0 [39.584] Call Trace: [39.584] [39.585] call_timer_fn+0x3d/0x180 [39.585] ? tipc_subscrb_rcv_cb+0x260/0x260 [tipc] [39.586] run_timer_softirq+0x168/0x1f0 [39.586] ? sched_clock_cpu+0x16/0xc0 [39.587] __do_softirq+0x9b/0x2de [39.587] irq_exit+0x60/0x70 [39.588] smp_apic_timer_interrupt+0x3d/0x50 [39.588] apic_timer_interrupt+0x86/0x90 [39.589] RIP: 0010:default_idle+0x20/0xf0 [39.589] RSP: 0018:ffffffff81c03e58 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10 [39.590] RAX: 0000000000000000 RBX: ffffffff81c10540 RCX: 0000000000000000 [39.591] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [39.592] RBP: ffffffff81c03e68 R08: 0000000000000000 R09: 0000000000000000 [39.593] R10: ffffc90001cbbe00 R11: 0000000000000000 R12: 0000000000000000 [39.594] R13: ffffffff81c10540 R14: 0000000000000000 R15: 0000000000000000 [39.595] :: [39.603] RIP: tipc_subscrp_timeout+0x32/0x80 [tipc] RSP: ffff88003ba03e90 [39.604] ---[ end trace 79ce94b7216cb459 ]--- Fixes: 139bb36f754a ("tipc: advance the time of deleting subscription from subscriber->subscrp_list") Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. 
Miller --- net/tipc/subscr.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 0bf91cd3733c..f2c81f42dfda 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -52,7 +52,6 @@ struct tipc_subscriber { struct list_head subscrp_list; }; -static void tipc_subscrp_delete(struct tipc_subscription *sub); static void tipc_subscrb_put(struct tipc_subscriber *subscriber); /** @@ -197,15 +196,19 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, { struct list_head *subscription_list = &subscriber->subscrp_list; struct tipc_subscription *sub, *temp; + u32 timeout; spin_lock_bh(&subscriber->lock); list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) { if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) continue; - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); - tipc_subscrp_delete(sub); + timeout = htohl(sub->evt.s.timeout, sub->swap); + if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) { + tipc_nametbl_unsubscribe(sub); + list_del(&sub->subscrp_list); + tipc_subscrp_put(sub); + } if (s) break; @@ -236,14 +239,6 @@ static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) tipc_subscrb_put(subscriber); } -static void tipc_subscrp_delete(struct tipc_subscription *sub) -{ - u32 timeout = htohl(sub->evt.s.timeout, sub->swap); - - if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) - tipc_subscrp_put(sub); -} - static void tipc_subscrp_cancel(struct tipc_subscr *s, struct tipc_subscriber *subscriber) { From fd849b7c41f0fabfe783d0691a63c5518e8ebc99 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Tue, 22 Aug 2017 12:28:41 +0200 Subject: [PATCH 014/118] tipc: fix a race condition of releasing subscriber object No matter whether a request is inserted into workqueue as a work item to cancel a subscription or to delete a subscription's subscriber asynchronously, the work items may be executed in different 
workers. As a result, it doesn't mean that one request which is raised prior to another request is definitely handled before the latter. By contrast, if the latter request is executed before the former request, below error may happen: [ 656.183644] BUG: spinlock bad magic on CPU#0, kworker/u8:0/12117 [ 656.184487] general protection fault: 0000 [#1] SMP [ 656.185160] Modules linked in: tipc ip6_udp_tunnel udp_tunnel 9pnet_virtio 9p 9pnet virtio_net virtio_pci virtio_ring virtio [last unloaded: ip6_udp_tunnel] [ 656.187003] CPU: 0 PID: 12117 Comm: kworker/u8:0 Not tainted 4.11.0-rc7+ #6 [ 656.187920] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 656.188690] Workqueue: tipc_rcv tipc_recv_work [tipc] [ 656.189371] task: ffff88003f5cec40 task.stack: ffffc90004448000 [ 656.190157] RIP: 0010:spin_bug+0xdd/0xf0 [ 656.190678] RSP: 0018:ffffc9000444bcb8 EFLAGS: 00010202 [ 656.191375] RAX: 0000000000000034 RBX: ffff88003f8d1388 RCX: 0000000000000000 [ 656.192321] RDX: ffff88003ba13708 RSI: ffff88003ba0cd08 RDI: ffff88003ba0cd08 [ 656.193265] RBP: ffffc9000444bcd0 R08: 0000000000000030 R09: 000000006b6b6b6b [ 656.194208] R10: ffff8800bde3e000 R11: 00000000000001b4 R12: 6b6b6b6b6b6b6b6b [ 656.195157] R13: ffffffff81a3ca64 R14: ffff88003f8d1388 R15: ffff88003f8d13a0 [ 656.196101] FS: 0000000000000000(0000) GS:ffff88003ba00000(0000) knlGS:0000000000000000 [ 656.197172] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 656.197935] CR2: 00007f0b3d2e6000 CR3: 000000003ef9e000 CR4: 00000000000006f0 [ 656.198873] Call Trace: [ 656.199210] do_raw_spin_lock+0x66/0xa0 [ 656.199735] _raw_spin_lock_bh+0x19/0x20 [ 656.200258] tipc_subscrb_subscrp_delete+0x28/0xf0 [tipc] [ 656.200990] tipc_subscrb_rcv_cb+0x45/0x260 [tipc] [ 656.201632] tipc_receive_from_sock+0xaf/0x100 [tipc] [ 656.202299] tipc_recv_work+0x2b/0x60 [tipc] [ 656.202872] process_one_work+0x157/0x420 [ 656.203404] worker_thread+0x69/0x4c0 [ 656.203898] kthread+0x138/0x170 [ 656.204328] ? 
process_one_work+0x420/0x420 [ 656.204889] ? kthread_create_on_node+0x40/0x40 [ 656.205527] ret_from_fork+0x29/0x40 [ 656.206012] Code: 48 8b 0c 25 00 c5 00 00 48 c7 c7 f0 24 a3 81 48 81 c1 f0 05 00 00 65 8b 15 61 ef f5 7e e8 9a 4c 09 00 4d 85 e4 44 8b 4b 08 74 92 <45> 8b 84 24 40 04 00 00 49 8d 8c 24 f0 05 00 00 eb 8d 90 0f 1f [ 656.208504] RIP: spin_bug+0xdd/0xf0 RSP: ffffc9000444bcb8 [ 656.209798] ---[ end trace e2a800e6eb0770be ]--- In above scenario, the request of deleting subscriber was performed earlier than the request of canceling a subscription although the latter was issued before the former, which means tipc_subscrb_delete() was called before tipc_subscrp_cancel(). As a result, when tipc_subscrb_subscrp_delete() called by tipc_subscrp_cancel() was executed to cancel a subscription, the subscription's subscriber refcnt had been decreased to 1. After tipc_subscrp_delete() where the subscriber was freed because its refcnt was decremented to zero, but the subscriber's lock had to be released, as a consequence, panic happened. By contrast, if we increase subscriber's refcnt before tipc_subscrb_subscrp_delete() is called in tipc_subscrp_cancel(), the panic issue can be avoided. Fixes: d094c4d5f5c7 ("tipc: add subscription refcount to avoid invalid delete") Reported-by: Parthasarathy Bhuvaragan Signed-off-by: Ying Xue Signed-off-by: David S. 
Miller --- net/tipc/subscr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index f2c81f42dfda..be3d9e3183dc 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -242,7 +242,9 @@ static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) static void tipc_subscrp_cancel(struct tipc_subscr *s, struct tipc_subscriber *subscriber) { + tipc_subscrb_get(subscriber); tipc_subscrb_subscrp_delete(subscriber, s); + tipc_subscrb_put(subscriber); } static struct tipc_subscription *tipc_subscrp_create(struct net *net, From 78362998f58c7c271e2719dcd0aaced435c801f9 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 22 Aug 2017 15:36:08 +0200 Subject: [PATCH 015/118] macsec: add genl family module alias This helps tools such as wpa_supplicant start even if the macsec module isn't loaded yet. Fixes: c09440f7dcb3 ("macsec: introduce IEEE 802.1AE driver") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller --- drivers/net/macsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 5e1ab1160856..98e4deaa3a6a 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3521,6 +3521,7 @@ module_init(macsec_init); module_exit(macsec_exit); MODULE_ALIAS_RTNL_LINK("macsec"); +MODULE_ALIAS_GENL_FAMILY("macsec"); MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); MODULE_LICENSE("GPL v2"); From fd6055a806edc4019be1b9fb7d25262599bca5b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Aug 2017 09:39:28 -0700 Subject: [PATCH 016/118] udp: on peeking bad csum, drop packets even if not at head When peeking, if a bad csum is discovered, the skb is unlinked from the queue with __sk_queue_drop_skb and the peek operation restarted. __sk_queue_drop_skb only drops packets that match the queue head. This fails if the skb was found after the head, using SO_PEEK_OFF socket option. This causes an infinite loop.
We MUST drop this problematic skb, and we can simply check if skb was already removed by another thread, by looking at skb->next : This pointer is set to NULL by the __skb_unlink() operation, that might have happened only under the spinlock protection. Many thanks to syzkaller team (and particularly Dmitry Vyukov who provided us nice C reproducers exhibiting the lockup) and Willem de Bruijn who provided first version for this patch and a test program. Fixes: 627d2d6b5500 ("udp: enable MSG_PEEK at non-zero offset") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Willem de Bruijn Acked-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/datagram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/datagram.c b/net/core/datagram.c index a21ca8dee5ea..8c2f4489ff8f 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -362,7 +362,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, if (flags & MSG_PEEK) { err = -ENOENT; spin_lock_bh(&sk_queue->lock); - if (skb == skb_peek(sk_queue)) { + if (skb->next) { __skb_unlink(skb, sk_queue); refcount_dec(&skb->users); if (destructor) From 744a4cf63e528c29840f45811d6fb93fd129b87d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 22 Aug 2017 22:46:49 +0200 Subject: [PATCH 017/118] net: sched: fix use after free when tcf_chain_destroy is called multiple times The goto_chain termination action takes a reference of a chain. In that case, there is an issue when block_put calls tcf_chain_destroy directly. The follow-up call of tcf_chain_put by goto_chain action free works with memory that is already freed. This was caught by kasan: [ 220.337908] BUG: KASAN: use-after-free in tcf_chain_put+0x1b/0x50 [ 220.344103] Read of size 4 at addr ffff88036d1f2cec by task systemd-journal/261 [ 220.353047] CPU: 0 PID: 261 Comm: systemd-journal Not tainted 4.13.0-rc5jiri+ #54 [ 220.360661] Hardware name: Mellanox Technologies Ltd.
Mellanox switch/Mellanox x86 mezzanine board, BIOS 4.6.5 08/02/2016 [ 220.371784] Call Trace: [ 220.374290] [ 220.376355] dump_stack+0xd5/0x150 [ 220.391485] print_address_description+0x86/0x410 [ 220.396308] kasan_report+0x181/0x4c0 [ 220.415211] tcf_chain_put+0x1b/0x50 [ 220.418949] free_tcf+0x95/0xc0 So allow tcf_chain_destroy to be called multiple times, free only in case the reference count drops to 0. Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9fd44c221347..45cd34eee727 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -215,9 +215,17 @@ static void tcf_chain_flush(struct tcf_chain *chain) static void tcf_chain_destroy(struct tcf_chain *chain) { - list_del(&chain->list); + /* May be already removed from the list by the previous call. */ + if (!list_empty(&chain->list)) + list_del_init(&chain->list); + tcf_chain_flush(chain); - kfree(chain); + + /* There might still be a reference held when we got here from + * tcf_block_put. Wait for the user to drop reference before free. + */ + if (!chain->refcnt) + kfree(chain); } struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, From 30d65e8f96ad01d9f998039e9af9ce5545e5a4ee Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 22 Aug 2017 22:46:50 +0200 Subject: [PATCH 018/118] net: sched: don't do tcf_chain_flush from tcf_chain_destroy tcf_chain_flush needs to be called with RTNL. However, on free_tcf-> tcf_action_goto_chain_fini-> tcf_chain_put-> tcf_chain_destroy-> tcf_chain_flush callpath, it is called without RTNL. 
This issue was reported by the following warning: [ 155.599052] WARNING: suspicious RCU usage [ 155.603165] 4.13.0-rc5jiri+ #54 Not tainted [ 155.607456] ----------------------------- [ 155.611561] net/sched/cls_api.c:195 suspicious rcu_dereference_protected() usage! Since on this callpath, the chain is guaranteed to be already empty by check in tcf_chain_put, move the tcf_chain_flush call out and call it only where it is needed - into tcf_block_put. Fixes: db50514f9a9c ("net: sched: add termination action to allow goto chain") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 45cd34eee727..6c5ea84d2682 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -219,8 +219,6 @@ static void tcf_chain_destroy(struct tcf_chain *chain) if (!list_empty(&chain->list)) list_del_init(&chain->list); - tcf_chain_flush(chain); - /* There might still be a reference held when we got here from * tcf_block_put. Wait for the user to drop reference before free. */ @@ -296,8 +294,10 @@ void tcf_block_put(struct tcf_block *block) if (!block) return; - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + tcf_chain_flush(chain); tcf_chain_destroy(chain); + } kfree(block); } EXPORT_SYMBOL(tcf_block_put); From fcd03e362b1cd17de487953aac34f2d4574895cf Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 22 Aug 2017 14:26:47 -0700 Subject: [PATCH 019/118] net: phy: Deal with unbound PHY driver in phy_attached_print() Priit reported that stmmac was crashing with the trace below. This is because phy_attached_print() is called too early right after the PHY device has been found, but before it has a driver attached, since that is only done in phy_probe() which occurs later.
Fix this by dealing with a possibly NULL phydev->drv point since that can happen here, but could also happen if we voluntarily did an unbind of the PHY device with the PHY driver. sun7i-dwmac 1c50000.ethernet: PTP uses main clock sun7i-dwmac 1c50000.ethernet: no reset control found sun7i-dwmac 1c50000.ethernet: no regulator found sun7i-dwmac 1c50000.ethernet: Ring mode enabled sun7i-dwmac 1c50000.ethernet: DMA HW capability register supported sun7i-dwmac 1c50000.ethernet: Normal descriptors libphy: stmmac: probed Unable to handle kernel NULL pointer dereference at virtual address 00000048 pgd = c0004000 [00000048] *pgd=00000000 Internal error: Oops: 5 [#1] SMP ARM Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-rc6-00318-g0065bd7fa384 #1 Hardware name: Allwinner sun7i (A20) Family task: ee868000 task.stack: ee85c000 PC is at phy_attached_print+0x1c/0x8c LR is at stmmac_mdio_register+0x12c/0x200 pc : [] lr : [] psr: 60000013 sp : ee85ddc8 ip : 00000000 fp : c07dfb5c r10: ee981210 r9 : 00000001 r8 : eea73000 r7 : eeaa6dd0 r6 : eeb49800 r5 : 00000000 r4 : 00000000 r3 : 00000000 r2 : 00000000 r1 : 00000000 r0 : eeb49800 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 4000406a DAC: 00000051 Process swapper/0 (pid: 1, stack limit = 0xee85c210) Stack: (0xee85ddc8 to 0xee85e000) ddc0: 00000000 00000002 eeb49400 eea72000 00000000 eeb49400 dde0: c045e6b4 00000000 ffffffff eeab0810 00000000 c08051f8 ee9292c0 c016d480 de00: eea725c0 eea73000 eea72000 00000001 eea726c0 c0457d0c 00000040 00000020 de20: 00000000 c045b850 00000001 00000000 ee981200 eeab0810 eeaa6ed0 ee981210 de40: 00000000 c094a4a0 00000000 c0465180 eeaa7550 f08d0000 c9ffb90c 00000032 de60: fffffffa 00000032 ee981210 ffffffed c0a46620 fffffdfb c0a46620 c03f7be8 de80: ee981210 c0a9a388 00000000 00000000 c0a46620 c03f63e0 ee981210 c0a46620 dea0: ee981244 00000000 00000007 000000c6 c094a4a0 c03f6534 00000000 c0a46620 dec0: c03f6490 c03f49ec ee828a58 ee9217b4 
c0a46620 eeaa4b00 c0a43230 c03f59fc dee0: c08051f8 c094a49c c0a46620 c0a46620 00000000 c091c668 c093783c c03f6dfc df00: ffffe000 00000000 c091c668 c010177c eefe0938 eefe0935 c085e200 000000c6 df20: 00000005 c0136bc8 60000013 c080b3a4 00000006 00000006 c07ce7b4 00000000 df40: c07d7ddc c07cef28 eefe0938 eefe093e c0a0b2f0 c0a641c0 c0a641c0 c0a641c0 df60: c0937834 00000007 000000c6 c094a4a0 00000000 c0900d88 00000006 00000006 df80: 00000000 c09005a8 00000000 c060ecf4 00000000 00000000 00000000 00000000 dfa0: 00000000 c060ecfc 00000000 c0107738 00000000 00000000 00000000 00000000 dfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 dfe0: 00000000 00000000 00000000 00000000 00000013 00000000 ffdeffff ffffffff [] (phy_attached_print) from [] (stmmac_mdio_register+0x12c/0x200) [] (stmmac_mdio_register) from [] (stmmac_dvr_probe+0x850/0x96c) [] (stmmac_dvr_probe) from [] (sun7i_gmac_probe+0x120/0x180) [] (sun7i_gmac_probe) from [] (platform_drv_probe+0x50/0xac) [] (platform_drv_probe) from [] (driver_probe_device+0x234/0x2e4) [] (driver_probe_device) from [] (__driver_attach+0xa4/0xa8) [] (__driver_attach) from [] (bus_for_each_dev+0x4c/0x9c) [] (bus_for_each_dev) from [] (bus_add_driver+0x190/0x214) [] (bus_add_driver) from [] (driver_register+0x78/0xf4) [] (driver_register) from [] (do_one_initcall+0x44/0x168) [] (do_one_initcall) from [] (kernel_init_freeable+0x144/0x1d0) [] (kernel_init_freeable) from [] (kernel_init+0x8/0x110) [] (kernel_init) from [] (ret_from_fork+0x14/0x3c) Code: e59021c8 e59d401c e590302c e3540000 (e5922048) ---[ end trace 39ae87c7923562d0 ]--- Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b Tested-By: Priit Laes Fixes: fbca164776e4 ("net: stmmac: Use the right logging function in stmmac_mdio_register") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/phy_device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 1790f7fec125..2f742ae5b92e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -864,15 +864,17 @@ EXPORT_SYMBOL(phy_attached_info); #define ATTACHED_FMT "attached PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)" void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) { + const char *drv_name = phydev->drv ? phydev->drv->name : "unbound"; + if (!fmt) { dev_info(&phydev->mdio.dev, ATTACHED_FMT "\n", - phydev->drv->name, phydev_name(phydev), + drv_name, phydev_name(phydev), phydev->irq); } else { va_list ap; dev_info(&phydev->mdio.dev, ATTACHED_FMT, - phydev->drv->name, phydev_name(phydev), + drv_name, phydev_name(phydev), phydev->irq); va_start(ap, fmt); From 33ba43ed0afc13a29b1314e3e45a9938d310ba13 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 23 Aug 2017 00:06:09 +0200 Subject: [PATCH 020/118] bpf: fix map value attribute for hash of maps Currently, iproute2's BPF ELF loader works fine with array of maps when retrieving the fd from a pinned node and doing a selfcheck against the provided map attributes from the object file, but we fail to do the same for hash of maps and thus refuse to get the map from pinned node. Reason is that when allocating hash of maps, fd_htab_map_alloc() will set the value size to sizeof(void *), and any user space map creation requests are forced to set 4 bytes as value size. Thus, selfcheck will complain about exposed 8 bytes on 64 bit archs vs. 4 bytes from object file as value size. Contract is that fdinfo or BPF_MAP_GET_FD_BY_ID returns the value size used to create the map. Fix it by handling it the same way as we do for array of maps, which means that we leave value size at 4 bytes and in the allocation phase round up value size to 8 bytes. 
alloc_htab_elem() needs an adjustment in order to copy rounded up 8 bytes due to bpf_fd_htab_map_update_elem() calling into htab_map_update_elem() with the pointer of the map pointer as value. Unlike array of maps where we just xchg(), we're using the generic htab_map_update_elem() callback also used from helper calls, which published the key/value already on return, so we need to ensure to memcpy() the right size. Fixes: bcc6b1b7ebf8 ("bpf: Add hash of maps support") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..d11c8181f4c5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && + BITS_PER_LONG == 64; +} + +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) +{ + u32 size = htab->map.value_size; + + if (percpu || fd_htab_map_needs_adjust(htab)) + size = round_up(size, 8); + return size; +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab->map.value_size; + u32 size = htab_size_value(htab, percpu); bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - /* round up value_size to 8 bytes */ - size = round_up(size, 8); - if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -1209,17 +1221,9 @@ const struct bpf_map_ops 
htab_lru_percpu_map_ops = { static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) { - struct bpf_map *map; - if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); - - /* pointer is stored internally */ - attr->value_size = sizeof(void *); - map = htab_map_alloc(attr); - attr->value_size = sizeof(u32); - - return map; + return htab_map_alloc(attr); } static void fd_htab_map_free(struct bpf_map *map) From a1a50c8e4c241a505b7270e1a3c6e50d94e794b1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 22 Aug 2017 15:24:47 -0700 Subject: [PATCH 021/118] fsl/man: Inherit parent device and of_node Junote Cai reported that he was not able to get a DSA setup involving the Freescale DPAA/FMAN driver to work and narrowed it down to of_find_net_device_by_node(). This function requires the network device's device reference to be correctly set which is the case here, though we have lost any device_node association there. The problem is that dpaa_eth_add_device() allocates a "dpaa-ethernet" platform device, and later on dpaa_eth_probe() is called but SET_NETDEV_DEV() won't be propagating &pdev->dev.of_node properly. Fix this by inheriting both the parent device and the of_node when dpaa_eth_add_device() creates the platform device. Fixes: 3933961682a3 ("fsl/fman: Add FMan MAC driver") Signed-off-by: Florian Fainelli Signed-off-by: David S.
Miller --- drivers/net/ethernet/freescale/fman/mac.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c index 6e67d22fd0d5..1c7da16ad0ff 100644 --- a/drivers/net/ethernet/freescale/fman/mac.c +++ b/drivers/net/ethernet/freescale/fman/mac.c @@ -623,6 +623,8 @@ static struct platform_device *dpaa_eth_add_device(int fman_id, goto no_mem; } + pdev->dev.of_node = node; + pdev->dev.parent = priv->dev; set_dma_ops(&pdev->dev, get_dma_ops(priv->dev)); ret = platform_device_add_data(pdev, &data, sizeof(data)); From 013dae5dbc07aa521a38f1ca2d32123ec674bd5d Mon Sep 17 00:00:00 2001 From: Stephan Gatzka Date: Tue, 22 Aug 2017 14:25:07 +0200 Subject: [PATCH 022/118] net: stmmac: socfgpa: Ensure emac bit set in sys manager for MII/GMII/SGMII. When using MII/GMII/SGMII in the Altera SoC, the phy needs to be wired through the FPGA. To ensure correct behavior, the appropriate bit in the System Manager FPGA Interface Group register needs to be set. Signed-off-by: Stephan Gatzka Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 17d4bbaeb65c..6e359572b9f0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -269,7 +269,10 @@ static int socfpga_dwmac_set_phy_mode(struct socfpga_dwmac *dwmac) ctrl &= ~(SYSMGR_EMACGRP_CTRL_PHYSEL_MASK << reg_shift); ctrl |= val << reg_shift; - if (dwmac->f2h_ptp_ref_clk) { + if (dwmac->f2h_ptp_ref_clk || + phymode == PHY_INTERFACE_MODE_MII || + phymode == PHY_INTERFACE_MODE_GMII || + phymode == PHY_INTERFACE_MODE_SGMII) { ctrl |= SYSMGR_EMACGRP_CTRL_PTP_REF_CLK_MASK << (reg_shift / 2); regmap_read(sys_mgr_base_addr, SYSMGR_FPGAGRP_MODULE_REG, &module); From cd0a137acbb66208368353723f5f1480995cf1c4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 22 Aug 2017 15:12:14 -0700 Subject: [PATCH 023/118] net: core: Specify skb_pad()/skb_put_padto() SKB freeing Rename skb_pad() into __skb_pad() and make it take a third argument: free_on_error which controls whether kfree_skb() should be called or not, skb_pad() directly makes use of it and passes true to preserve its existing behavior. Do exactly the same thing with __skb_put_padto() and skb_put_padto(). Suggested-by: David Miller Signed-off-by: Florian Fainelli Reviewed-by: Woojung Huh Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 53 ++++++++++++++++++++++++++++++++++-------- net/core/skbuff.c | 13 +++++++---- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index dbe29b6c9bd6..d67a8182e5eb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -973,7 +973,23 @@ int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len); int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer); -int skb_pad(struct sk_buff *skb, int pad); +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error); + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return error in out of memory cases. The skb is freed on error. + */ +static inline int skb_pad(struct sk_buff *skb, int pad) +{ + return __skb_pad(skb, pad, true); +} #define dev_kfree_skb(a) consume_skb(a) int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, @@ -2821,6 +2837,31 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) return skb_pad(skb, len - size); } +/** + * skb_put_padto - increase size and pad an skbuff up to a minimal size + * @skb: buffer to pad + * @len: minimal length + * @free_on_error: free buffer on error + * + * Pads up a buffer to ensure the trailing bytes exist and are + * blanked. If the buffer already contains sufficient data it + * is untouched. Otherwise it is extended. Returns zero on + * success. The skb is freed on error if @free_on_error is true. 
+ */ +static inline int __skb_put_padto(struct sk_buff *skb, unsigned int len, + bool free_on_error) +{ + unsigned int size = skb->len; + + if (unlikely(size < len)) { + len -= size; + if (__skb_pad(skb, len, free_on_error)) + return -ENOMEM; + __skb_put(skb, len); + } + return 0; +} + /** * skb_put_padto - increase size and pad an skbuff up to a minimal size * @skb: buffer to pad @@ -2833,15 +2874,7 @@ static inline int skb_padto(struct sk_buff *skb, unsigned int len) */ static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) { - unsigned int size = skb->len; - - if (unlikely(size < len)) { - len -= size; - if (skb_pad(skb, len)) - return -ENOMEM; - __skb_put(skb, len); - } - return 0; + return __skb_put_padto(skb, len, true); } static inline int skb_add_data(struct sk_buff *skb, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f990eb8b30a9..e07556606284 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1363,18 +1363,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, EXPORT_SYMBOL(skb_copy_expand); /** - * skb_pad - zero pad the tail of an skb + * __skb_pad - zero pad the tail of an skb * @skb: buffer to pad * @pad: space to pad + * @free_on_error: free buffer on error * * Ensure that a buffer is followed by a padding area that is zero * filled. Used by network drivers which may DMA or transfer data * beyond the buffer end onto the wire. * - * May return error in out of memory cases. The skb is freed on error. + * May return error in out of memory cases. The skb is freed on error + * if @free_on_error is true. 
*/ -int skb_pad(struct sk_buff *skb, int pad) +int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) { int err; int ntail; @@ -1403,10 +1405,11 @@ int skb_pad(struct sk_buff *skb, int pad) return 0; free_skb: - kfree_skb(skb); + if (free_on_error) + kfree_skb(skb); return err; } -EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(__skb_pad); /** * pskb_put - add data to the tail of a potentially fragmented buffer From 49716679248a0bf64d8914711b2d1ea48751853e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 22 Aug 2017 15:12:15 -0700 Subject: [PATCH 024/118] net: dsa: skb_put_padto() already frees nskb The first call of skb_put_padto() will free up the SKB on error, but we return NULL which tells dsa_slave_xmit() that the original SKB should be freed so this would lead to a double free here. The second skb_put_padto() already frees the passed sk_buff reference upon error, so calling kfree_skb() on it again is not necessary. Detected by CoverityScan, CID#1416687 ("USE_AFTER_FREE") Fixes: e71cb9e00922 ("net: dsa: ksz: fix skb freeing") Signed-off-by: Florian Fainelli Reviewed-by: Woojung Huh Signed-off-by: David S. Miller --- net/dsa/tag_ksz.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index de66ca8e6201..3bd6e2a83125 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -42,7 +42,8 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) padlen = (skb->len >= ETH_ZLEN) ? 
0 : ETH_ZLEN - skb->len; if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) { - if (skb_put_padto(skb, skb->len + padlen)) + /* Let dsa_slave_xmit() free skb */ + if (__skb_put_padto(skb, skb->len + padlen, false)) return NULL; nskb = skb; @@ -60,10 +61,11 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) skb_transport_header(skb) - skb->head); skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); - if (skb_put_padto(nskb, nskb->len + padlen)) { - kfree_skb(nskb); + /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free + * skb + */ + if (skb_put_padto(nskb, nskb->len + padlen)) return NULL; - } kfree_skb(skb); } From d6e1ab9ea3514840e4f32957c457b094646c2e9d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Aug 2017 23:22:42 -0700 Subject: [PATCH 025/118] nfp: don't hold PF lock while enabling SR-IOV Enabling SR-IOV VFs will cause the PCI subsystem to schedule a work and flush its workqueue. Since the nfp driver schedules its own work we can't enable VFs while holding driver load. Commit 6d48ceb27af1 ("nfp: allocate a private workqueue for driver work") tried to avoid this deadlock by creating a separate workqueue. Unfortunately, due to the architecture of workqueue subsystem this does not guarantee a separate thread of execution. Luckily we can simply take pci_enable_sriov() from under the driver lock. Take pci_disable_sriov() from under the lock too for symmetry. Fixes: 6d48ceb27af1 ("nfp: allocate a private workqueue for driver work") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_main.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index d67969d3e484..3f199db2002e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -98,21 +98,20 @@ static int nfp_pcie_sriov_enable(struct pci_dev *pdev, int num_vfs) struct nfp_pf *pf = pci_get_drvdata(pdev); int err; - mutex_lock(&pf->lock); - if (num_vfs > pf->limit_vfs) { nfp_info(pf->cpp, "Firmware limits number of VFs to %u\n", pf->limit_vfs); - err = -EINVAL; - goto err_unlock; + return -EINVAL; } err = pci_enable_sriov(pdev, num_vfs); if (err) { dev_warn(&pdev->dev, "Failed to enable PCI SR-IOV: %d\n", err); - goto err_unlock; + return err; } + mutex_lock(&pf->lock); + err = nfp_app_sriov_enable(pf->app, num_vfs); if (err) { dev_warn(&pdev->dev, @@ -129,9 +128,8 @@ static int nfp_pcie_sriov_enable(struct pci_dev *pdev, int num_vfs) return num_vfs; err_sriov_disable: - pci_disable_sriov(pdev); -err_unlock: mutex_unlock(&pf->lock); + pci_disable_sriov(pdev); return err; #endif return 0; @@ -158,10 +156,10 @@ static int nfp_pcie_sriov_disable(struct pci_dev *pdev) pf->num_vfs = 0; + mutex_unlock(&pf->lock); + pci_disable_sriov(pdev); dev_dbg(&pdev->dev, "Removed VFs.\n"); - - mutex_unlock(&pf->lock); #endif return 0; } From 326ce603015eefaa86fc6e490f43638e1010a838 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Aug 2017 23:22:43 -0700 Subject: [PATCH 026/118] nfp: make sure representors are destroyed before their lower netdev App start/stop callbacks can perform application initialization. Unfortunately, flower app started using them for creating and destroying representors. This can lead to a situation where lower vNIC netdev is destroyed while representors still try to pass traffic. 
This will most likely lead to a NULL-dereference on the lower netdev TX path. Move the start/stop callbacks, so that representors are created/ destroyed when vNICs are fully initialized. Fixes: 5de73ee46704 ("nfp: general representor implementation") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../net/ethernet/netronome/nfp/nfp_net_main.c | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c index 5797dbf2b507..1aca4e57bf41 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c @@ -456,10 +456,6 @@ static int nfp_net_pf_app_start(struct nfp_pf *pf) { int err; - err = nfp_net_pf_app_start_ctrl(pf); - if (err) - return err; - err = nfp_app_start(pf->app, pf->ctrl_vnic); if (err) goto err_ctrl_stop; @@ -484,7 +480,6 @@ static void nfp_net_pf_app_stop(struct nfp_pf *pf) if (pf->num_vfs) nfp_app_sriov_disable(pf->app); nfp_app_stop(pf->app); - nfp_net_pf_app_stop_ctrl(pf); } static void nfp_net_pci_unmap_mem(struct nfp_pf *pf) @@ -559,7 +554,7 @@ err_unmap_ctrl: static void nfp_net_pci_remove_finish(struct nfp_pf *pf) { - nfp_net_pf_app_stop(pf); + nfp_net_pf_app_stop_ctrl(pf); /* stop app first, to avoid double free of ctrl vNIC's ddir */ nfp_net_debugfs_dir_clean(&pf->ddir); @@ -690,6 +685,7 @@ int nfp_net_pci_probe(struct nfp_pf *pf) { struct nfp_net_fw_version fw_ver; u8 __iomem *ctrl_bar, *qc_bar; + struct nfp_net *nn; int stride; int err; @@ -766,7 +762,7 @@ int nfp_net_pci_probe(struct nfp_pf *pf) if (err) goto err_free_vnics; - err = nfp_net_pf_app_start(pf); + err = nfp_net_pf_app_start_ctrl(pf); if (err) goto err_free_irqs; @@ -774,12 +770,20 @@ int nfp_net_pci_probe(struct nfp_pf *pf) if (err) goto err_stop_app; + err = nfp_net_pf_app_start(pf); + if (err) + goto err_clean_vnics; + mutex_unlock(&pf->lock); return 0; 
+err_clean_vnics: + list_for_each_entry(nn, &pf->vnics, vnic_list) + if (nfp_net_is_data_vnic(nn)) + nfp_net_pf_clean_vnic(pf, nn); err_stop_app: - nfp_net_pf_app_stop(pf); + nfp_net_pf_app_stop_ctrl(pf); err_free_irqs: nfp_net_pf_free_irqs(pf); err_free_vnics: @@ -803,6 +807,8 @@ void nfp_net_pci_remove(struct nfp_pf *pf) if (list_empty(&pf->vnics)) goto out; + nfp_net_pf_app_stop(pf); + list_for_each_entry(nn, &pf->vnics, vnic_list) if (nfp_net_is_data_vnic(nn)) nfp_net_pf_clean_vnic(pf, nn); From 1691a4c0f4634d50ffeb74373fdeec63495c911e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 22 Aug 2017 23:22:44 -0700 Subject: [PATCH 027/118] nfp: avoid buffer leak when representor is missing When the driver receives a muxed frame but can't find the representor netdev it is destined to, it will try to "drop" that frame, i.e. reuse the buffer. The issue is that the replacement buffer has already been allocated at this point, and reusing the buffer from the received frame will leak it. Change the code to put the new buffer on the ring earlier and not reuse the old buffer (make the buffer parameter to nfp_net_rx_drop() a NULL). Fixes: 91bf82ca9eed ("nfp: add support for tx/rx with metadata portid") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 9f77ce038a4a..1ff0c577819e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1751,6 +1751,10 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) continue; } + nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr); + + nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr); + if (likely(!meta.portid)) { netdev = dp->netdev; } else { @@ -1759,16 +1763,12 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) nn = netdev_priv(dp->netdev); netdev = nfp_app_repr_get(nn->app, meta.portid); if (unlikely(!netdev)) { - nfp_net_rx_drop(dp, r_vec, rx_ring, rxbuf, skb); + nfp_net_rx_drop(dp, r_vec, rx_ring, NULL, skb); continue; } nfp_repr_inc_rx_stats(netdev, pkt_len); } - nfp_net_dma_unmap_rx(dp, rxbuf->dma_addr); - - nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr); - skb_reserve(skb, pkt_off); skb_put(skb, pkt_len); From 2b33bc8aa236b75d6e86a8a79126fd9739e4a5bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Aug 2017 21:40:32 -0700 Subject: [PATCH 028/118] net: dsa: use consume_skb() Two kfree_skb() calls should be consume_skb(), to be friendly to the drop monitor (perf record ... -e skb:kfree_skb) Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/dsa/tag_ksz.c | 2 +- net/dsa/tag_trailer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 3bd6e2a83125..fcd90f79458e 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -67,7 +67,7 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev) if (skb_put_padto(nskb, nskb->len + padlen)) return NULL; - kfree_skb(skb); + consume_skb(skb); } tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index b09e56214005..9c7b1d74a5c6 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -40,7 +40,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) skb_set_network_header(nskb, skb_network_header(skb) - skb->head); skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head); skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len)); - kfree_skb(skb); + consume_skb(skb); if (padlen) { skb_put_zero(nskb, padlen); From ee6c88bb754e3d363e568da78086adfedb692447 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 23 Aug 2017 13:27:13 +0200 Subject: [PATCH 029/118] sctp: Avoid out-of-bounds reads from address storage inet_diag_msg_sctp{,l}addr_fill() and sctp_get_sctp_info() copy sizeof(sockaddr_storage) bytes to fill in sockaddr structs used to export diagnostic information to userspace. However, the memory allocated to store sockaddr information is smaller than that and depends on the address family, so we leak up to 100 uninitialized bytes to userspace. Just use the size of the source structs instead, in all the three cases this is what userspace expects. Zero out the remaining memory. Unused bytes (i.e. when IPv4 addresses are used) in source structs sctp_sockaddr_entry and sctp_transport are already cleared by sctp_add_bind_addr() and sctp_transport_new(), respectively. 
Noticed while testing KASAN-enabled kernel with 'ss': [ 2326.885243] BUG: KASAN: slab-out-of-bounds in inet_sctp_diag_fill+0x42c/0x6c0 [sctp_diag] at addr ffff881be8779800 [ 2326.896800] Read of size 128 by task ss/9527 [ 2326.901564] CPU: 0 PID: 9527 Comm: ss Not tainted 4.11.0-22.el7a.x86_64 #1 [ 2326.909236] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/17/2017 [ 2326.917585] Call Trace: [ 2326.920312] dump_stack+0x63/0x8d [ 2326.924014] kasan_object_err+0x21/0x70 [ 2326.928295] kasan_report+0x288/0x540 [ 2326.932380] ? inet_sctp_diag_fill+0x42c/0x6c0 [sctp_diag] [ 2326.938500] ? skb_put+0x8b/0xd0 [ 2326.942098] ? memset+0x31/0x40 [ 2326.945599] check_memory_region+0x13c/0x1a0 [ 2326.950362] memcpy+0x23/0x50 [ 2326.953669] inet_sctp_diag_fill+0x42c/0x6c0 [sctp_diag] [ 2326.959596] ? inet_diag_msg_sctpasoc_fill+0x460/0x460 [sctp_diag] [ 2326.966495] ? __lock_sock+0x102/0x150 [ 2326.970671] ? sock_def_wakeup+0x60/0x60 [ 2326.975048] ? remove_wait_queue+0xc0/0xc0 [ 2326.979619] sctp_diag_dump+0x44a/0x760 [sctp_diag] [ 2326.985063] ? sctp_ep_dump+0x280/0x280 [sctp_diag] [ 2326.990504] ? memset+0x31/0x40 [ 2326.994007] ? mutex_lock+0x12/0x40 [ 2326.997900] __inet_diag_dump+0x57/0xb0 [inet_diag] [ 2327.003340] ? __sys_sendmsg+0x150/0x150 [ 2327.007715] inet_diag_dump+0x4d/0x80 [inet_diag] [ 2327.012979] netlink_dump+0x1e6/0x490 [ 2327.017064] __netlink_dump_start+0x28e/0x2c0 [ 2327.021924] inet_diag_handler_cmd+0x189/0x1a0 [inet_diag] [ 2327.028045] ? inet_diag_rcv_msg_compat+0x1b0/0x1b0 [inet_diag] [ 2327.034651] ? inet_diag_dump_compat+0x190/0x190 [inet_diag] [ 2327.040965] ? __netlink_lookup+0x1b9/0x260 [ 2327.045631] sock_diag_rcv_msg+0x18b/0x1e0 [ 2327.050199] netlink_rcv_skb+0x14b/0x180 [ 2327.054574] ? sock_diag_bind+0x60/0x60 [ 2327.058850] sock_diag_rcv+0x28/0x40 [ 2327.062837] netlink_unicast+0x2e7/0x3b0 [ 2327.067212] ? netlink_attachskb+0x330/0x330 [ 2327.071975] ? 
kasan_check_write+0x14/0x20 [ 2327.076544] netlink_sendmsg+0x5be/0x730 [ 2327.080918] ? netlink_unicast+0x3b0/0x3b0 [ 2327.085486] ? kasan_check_write+0x14/0x20 [ 2327.090057] ? selinux_socket_sendmsg+0x24/0x30 [ 2327.095109] ? netlink_unicast+0x3b0/0x3b0 [ 2327.099678] sock_sendmsg+0x74/0x80 [ 2327.103567] ___sys_sendmsg+0x520/0x530 [ 2327.107844] ? __get_locked_pte+0x178/0x200 [ 2327.112510] ? copy_msghdr_from_user+0x270/0x270 [ 2327.117660] ? vm_insert_page+0x360/0x360 [ 2327.122133] ? vm_insert_pfn_prot+0xb4/0x150 [ 2327.126895] ? vm_insert_pfn+0x32/0x40 [ 2327.131077] ? vvar_fault+0x71/0xd0 [ 2327.134968] ? special_mapping_fault+0x69/0x110 [ 2327.140022] ? __do_fault+0x42/0x120 [ 2327.144008] ? __handle_mm_fault+0x1062/0x17a0 [ 2327.148965] ? __fget_light+0xa7/0xc0 [ 2327.153049] __sys_sendmsg+0xcb/0x150 [ 2327.157133] ? __sys_sendmsg+0xcb/0x150 [ 2327.161409] ? SyS_shutdown+0x140/0x140 [ 2327.165688] ? exit_to_usermode_loop+0xd0/0xd0 [ 2327.170646] ? __do_page_fault+0x55d/0x620 [ 2327.175216] ? 
__sys_sendmsg+0x150/0x150 [ 2327.179591] SyS_sendmsg+0x12/0x20 [ 2327.183384] do_syscall_64+0xe3/0x230 [ 2327.187471] entry_SYSCALL64_slow_path+0x25/0x25 [ 2327.192622] RIP: 0033:0x7f41d18fa3b0 [ 2327.196608] RSP: 002b:00007ffc3b731218 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 2327.205055] RAX: ffffffffffffffda RBX: 00007ffc3b731380 RCX: 00007f41d18fa3b0 [ 2327.213017] RDX: 0000000000000000 RSI: 00007ffc3b731340 RDI: 0000000000000003 [ 2327.220978] RBP: 0000000000000002 R08: 0000000000000004 R09: 0000000000000040 [ 2327.228939] R10: 00007ffc3b730f30 R11: 0000000000000246 R12: 0000000000000003 [ 2327.236901] R13: 00007ffc3b731340 R14: 00007ffc3b7313d0 R15: 0000000000000084 [ 2327.244865] Object at ffff881be87797e0, in cache kmalloc-64 size: 64 [ 2327.251953] Allocated: [ 2327.254581] PID = 9484 [ 2327.257215] save_stack_trace+0x1b/0x20 [ 2327.261485] save_stack+0x46/0xd0 [ 2327.265179] kasan_kmalloc+0xad/0xe0 [ 2327.269165] kmem_cache_alloc_trace+0xe6/0x1d0 [ 2327.274138] sctp_add_bind_addr+0x58/0x180 [sctp] [ 2327.279400] sctp_do_bind+0x208/0x310 [sctp] [ 2327.284176] sctp_bind+0x61/0xa0 [sctp] [ 2327.288455] inet_bind+0x5f/0x3a0 [ 2327.292151] SYSC_bind+0x1a4/0x1e0 [ 2327.295944] SyS_bind+0xe/0x10 [ 2327.299349] do_syscall_64+0xe3/0x230 [ 2327.303433] return_from_SYSCALL_64+0x0/0x6a [ 2327.308194] Freed: [ 2327.310434] PID = 4131 [ 2327.313065] save_stack_trace+0x1b/0x20 [ 2327.317344] save_stack+0x46/0xd0 [ 2327.321040] kasan_slab_free+0x73/0xc0 [ 2327.325220] kfree+0x96/0x1a0 [ 2327.328530] dynamic_kobj_release+0x15/0x40 [ 2327.333195] kobject_release+0x99/0x1e0 [ 2327.337472] kobject_put+0x38/0x70 [ 2327.341266] free_notes_attrs+0x66/0x80 [ 2327.345545] mod_sysfs_teardown+0x1a5/0x270 [ 2327.350211] free_module+0x20/0x2a0 [ 2327.354099] SyS_delete_module+0x2cb/0x2f0 [ 2327.358667] do_syscall_64+0xe3/0x230 [ 2327.362750] return_from_SYSCALL_64+0x0/0x6a [ 2327.367510] Memory state around the buggy address: [ 2327.372855] ffff881be8779700: fc fc fc fc 00 00 00 
00 00 00 00 00 fc fc fc fc [ 2327.380914] ffff881be8779780: fb fb fb fb fb fb fb fb fc fc fc fc 00 00 00 00 [ 2327.388972] >ffff881be8779800: 00 00 00 00 fc fc fc fc fb fb fb fb fb fb fb fb [ 2327.397031] ^ [ 2327.401792] ffff881be8779880: fc fc fc fc fb fb fb fb fb fb fb fb fc fc fc fc [ 2327.409850] ffff881be8779900: 00 00 00 00 00 04 fc fc fc fc fc fc 00 00 00 00 [ 2327.417907] ================================================================== This fixes CVE-2017-7558. References: https://bugzilla.redhat.com/show_bug.cgi?id=1480266 Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Cc: Xin Long Cc: Vlad Yasevich Cc: Neil Horman Signed-off-by: Stefano Brivio Acked-by: Marcelo Ricardo Leitner Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sctp_diag.c | 7 +++++-- net/sctp/socket.c | 3 +-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 9a647214a91e..e99518e79b52 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -70,7 +70,8 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, info = nla_data(attr); list_for_each_entry_rcu(laddr, address_list, list) { - memcpy(info, &laddr->a, addrlen); + memcpy(info, &laddr->a, sizeof(laddr->a)); + memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; } @@ -93,7 +94,9 @@ static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, info = nla_data(attr); list_for_each_entry(from, &asoc->peer.transport_addr_list, transports) { - memcpy(info, &from->ipaddr, addrlen); + memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); + memset(info + sizeof(from->ipaddr), 0, + addrlen - sizeof(from->ipaddr)); info += addrlen; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1db478e34520..8d760863bc41 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4538,8 +4538,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; prim = 
asoc->peer.primary_path; - memcpy(&info->sctpi_p_address, &prim->ipaddr, - sizeof(struct sockaddr_storage)); + memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr)); info->sctpi_p_state = prim->state; info->sctpi_p_cwnd = prim->cwnd; info->sctpi_p_srtt = prim->srtt; From 46f1c52e66dbc0d7a99f7c2a3c9debb497fe7b7b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 23 Aug 2017 14:41:50 -0700 Subject: [PATCH 030/118] nfp: TX time stamp packets before HW doorbell is rung TX completion may happen any time after HW queue was kicked. We can't access the skb afterwards. Move the time stamping before ringing the doorbell. Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs") Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 1ff0c577819e..66a09e490cf5 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -895,6 +895,8 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev) netdev_tx_sent_queue(nd_q, txbuf->real_len); + skb_tx_timestamp(skb); + tx_ring->wr_p += nr_frags + 1; if (nfp_net_tx_ring_should_stop(tx_ring)) nfp_net_tx_ring_stop(nd_q, tx_ring); @@ -903,8 +905,6 @@ static int nfp_net_tx(struct sk_buff *skb, struct net_device *netdev) if (!skb->xmit_more || netif_xmit_stopped(nd_q)) nfp_net_tx_xmit_more_flush(tx_ring); - skb_tx_timestamp(skb); - return NETDEV_TX_OK; err_unmap: From 87e9b3778c94694c9e098c91a0cc05725f0e017f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 23 Aug 2017 19:34:03 -0400 Subject: [PATCH 031/118] bnxt_en: Fix .ndo_setup_tc() to include XDP rings. 
When the number of TX rings is changed in bnxt_setup_tc(), we need to include the XDP rings in the total TX ring count. Fixes: 38413406277f ("bnxt_en: Add support for XDP_TX action.") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index e7c8539cbddf..f2f2bbfa8a7f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7152,6 +7152,7 @@ int bnxt_setup_mq_tc(struct net_device *dev, u8 tc) bp->tx_nr_rings = bp->tx_nr_rings_per_tc; netdev_reset_tc(dev); } + bp->tx_nr_rings += bp->tx_nr_rings_xdp; bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) : bp->tx_nr_rings + bp->rx_nr_rings; bp->num_stat_ctxs = bp->cp_nr_rings; From 146ed3c5b87d8c65ec31bc56df26f027fe624b8f Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 23 Aug 2017 19:34:04 -0400 Subject: [PATCH 032/118] bnxt_en: Free MSIX vectors when unregistering the device from bnxt_re. Take back ownership of the MSIX vectors when unregistering the device from bnxt_re. Fixes: a588e4580a7e ("bnxt_en: Add interface to support RDMA driver.") Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 77da75a55c02..997e10e8b863 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -84,6 +84,8 @@ static int bnxt_unregister_dev(struct bnxt_en_dev *edev, int ulp_id) max_stat_ctxs = bnxt_get_max_func_stat_ctxs(bp); bnxt_set_max_func_stat_ctxs(bp, max_stat_ctxs + 1); + if (ulp->msix_requested) + edev->en_ops->bnxt_free_msix(edev, ulp_id); } if (ulp->max_async_event_id) bnxt_hwrm_func_rgtr_async_events(bp, NULL, 0); From a22a6ac2ff8080c87e446e20592725c064229c71 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 23 Aug 2017 19:34:05 -0400 Subject: [PATCH 033/118] bnxt_en: Do not setup MAC address in bnxt_hwrm_func_qcaps(). bnxt_hwrm_func_qcaps() is called during probe to get all device resources and it also sets up the factory MAC address. The same function is called when SRIOV is disabled to reclaim all resources. If the MAC address has been overridden by a user administered MAC address, calling this function will overwrite it. Separate the logic that sets up the default MAC address into a new function bnxt_init_mac_addr() that is only called during probe time. Fixes: 4a21b49b34c0 ("bnxt_en: Improve VF resource accounting.") Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 40 ++++++++++++++++------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index f2f2bbfa8a7f..f20b3d2a4c23 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4647,7 +4647,6 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp) pf->port_id = le16_to_cpu(resp->port_id); bp->dev->dev_port = pf->port_id; memcpy(pf->mac_addr, resp->mac_address, ETH_ALEN); - memcpy(bp->dev->dev_addr, pf->mac_addr, ETH_ALEN); pf->max_rsscos_ctxs = le16_to_cpu(resp->max_rsscos_ctx); pf->max_cp_rings = le16_to_cpu(resp->max_cmpl_rings); pf->max_tx_rings = le16_to_cpu(resp->max_tx_rings); @@ -4687,16 +4686,6 @@ static int bnxt_hwrm_func_qcaps(struct bnxt *bp) vf->max_stat_ctxs = le16_to_cpu(resp->max_stat_ctx); memcpy(vf->mac_addr, resp->mac_address, ETH_ALEN); - mutex_unlock(&bp->hwrm_cmd_lock); - - if (is_valid_ether_addr(vf->mac_addr)) { - /* overwrite netdev dev_adr with admin VF MAC */ - memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN); - } else { - eth_hw_addr_random(bp->dev); - rc = bnxt_approve_mac(bp, bp->dev->dev_addr); - } - return rc; #endif } @@ -7662,6 +7651,28 @@ void bnxt_restore_pf_fw_resources(struct bnxt *bp) bnxt_subtract_ulp_resources(bp, BNXT_ROCE_ULP); } +static int bnxt_init_mac_addr(struct bnxt *bp) +{ + int rc = 0; + + if (BNXT_PF(bp)) { + memcpy(bp->dev->dev_addr, bp->pf.mac_addr, ETH_ALEN); + } else { +#ifdef CONFIG_BNXT_SRIOV + struct bnxt_vf_info *vf = &bp->vf; + + if (is_valid_ether_addr(vf->mac_addr)) { + /* overwrite netdev dev_adr with admin VF MAC */ + memcpy(bp->dev->dev_addr, vf->mac_addr, ETH_ALEN); + } else { + eth_hw_addr_random(bp->dev); + rc = bnxt_approve_mac(bp, bp->dev->dev_addr); + } +#endif + } + return rc; +} + static void bnxt_parse_log_pcie_link(struct bnxt *bp) { enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN; @@ -7790,7 
+7801,12 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rc = -1; goto init_err_pci_clean; } - + rc = bnxt_init_mac_addr(bp); + if (rc) { + dev_err(&pdev->dev, "Unable to initialize mac address.\n"); + rc = -EADDRNOTAVAIL; + goto init_err_pci_clean; + } rc = bnxt_hwrm_queue_qportcfg(bp); if (rc) { netdev_err(bp->dev, "hwrm query qportcfg failure rc: %x\n", From 8a4b5784fac2af93ad6a8c08fb56d021e0c0416b Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Wed, 23 Aug 2017 17:14:39 +0900 Subject: [PATCH 034/118] net: xfrm: don't double-hold dst when sk_policy in use. While removing dst_entry garbage collection, commit 52df157f17e5 ("xfrm: take refcnt of dst when creating struct xfrm_dst bundle") changed xfrm_resolve_and_create_bundle so it returns an xdst with a refcount of 1 instead of 0. However, it did not delete the dst_hold performed by xfrm_lookup when a per-socket policy is in use. This means that when a socket policy is in use, dst entries returned by xfrm_lookup have a refcount of 2, and are not freed when no longer in use. 
Cc: Wei Wang Fixes: 52df157f17 ("xfrm: take refcnt of dst when creating struct xfrm_dst bundle") Tested: https://android-review.googlesource.com/417481 Tested: https://android-review.googlesource.com/418659 Tested: https://android-review.googlesource.com/424463 Tested: https://android-review.googlesource.com/452776 passes on net-next Signed-off-by: Lorenzo Colitti Acked-by: Wei Wang Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 6f5a0dad502f..69b16ee327d9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2226,7 +2226,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, goto no_transform; } - dst_hold(&xdst->u.dst); route = xdst->route; } } From 10a54d8196d11f6cc0db2f71249f93854cb9fe55 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Tue, 22 Aug 2017 10:37:29 +0300 Subject: [PATCH 035/118] iwlwifi: pcie: move rx workqueue initialization to iwl_trans_pcie_alloc() Work queues cannot be allocated when a mutex is held because the mutex may be in use and that would make it sleep. Doing so generates the following splat with 4.13+: [ 19.513298] ====================================================== [ 19.513429] WARNING: possible circular locking dependency detected [ 19.513557] 4.13.0-rc5+ #6 Not tainted [ 19.513638] ------------------------------------------------------ [ 19.513767] cpuhp/0/12 is trying to acquire lock: [ 19.513867] (&tz->lock){+.+.+.}, at: [] thermal_zone_get_temp+0x5b/0xb0 [ 19.514047] [ 19.514047] but task is already holding lock: [ 19.514166] (cpuhp_state){+.+.+.}, at: [] cpuhp_thread_fun+0x3a/0x210 [ 19.514338] [ 19.514338] which lock already depends on the new lock. This lock dependency already existed with previous kernel versions, but it was not detected until commit 49dfe2a67797 ("cpuhotplug: Link lock stacks for hotplug callbacks") was introduced. 
Reported-by: David Weinehall Reported-by: Jiri Kosina Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 2 ++ drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 10 +--------- drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 9 +++++++++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index fa315d84e98e..a1ea9ef97ed9 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -787,6 +787,8 @@ int iwl_pci_fw_enter_d0i3(struct iwl_trans *trans); void iwl_pcie_enable_rx_wake(struct iwl_trans *trans, bool enable); +void iwl_pcie_rx_allocator_work(struct work_struct *data); + /* common functions that are used by gen2 transport */ void iwl_pcie_apm_config(struct iwl_trans *trans); int iwl_pcie_prepare_card_hw(struct iwl_trans *trans); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c index 351c4423125a..942736d3fa75 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c @@ -597,7 +597,7 @@ static void iwl_pcie_rx_allocator_get(struct iwl_trans *trans, rxq->free_count += RX_CLAIM_REQ_ALLOC; } -static void iwl_pcie_rx_allocator_work(struct work_struct *data) +void iwl_pcie_rx_allocator_work(struct work_struct *data) { struct iwl_rb_allocator *rba_p = container_of(data, struct iwl_rb_allocator, rx_alloc); @@ -900,10 +900,6 @@ static int _iwl_pcie_rx_init(struct iwl_trans *trans) return err; } def_rxq = trans_pcie->rxq; - if (!rba->alloc_wq) - rba->alloc_wq = alloc_workqueue("rb_allocator", - WQ_HIGHPRI | WQ_UNBOUND, 1); - INIT_WORK(&rba->rx_alloc, iwl_pcie_rx_allocator_work); spin_lock(&rba->lock); atomic_set(&rba->req_pending, 0); @@ -1017,10 +1013,6 @@ void iwl_pcie_rx_free(struct iwl_trans *trans) } 
cancel_work_sync(&rba->rx_alloc); - if (rba->alloc_wq) { - destroy_workqueue(rba->alloc_wq); - rba->alloc_wq = NULL; - } iwl_pcie_free_rbs_pool(trans); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index f95eec52508e..3927bbf04f72 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -1786,6 +1786,11 @@ void iwl_trans_pcie_free(struct iwl_trans *trans) iwl_pcie_tx_free(trans); iwl_pcie_rx_free(trans); + if (trans_pcie->rba.alloc_wq) { + destroy_workqueue(trans_pcie->rba.alloc_wq); + trans_pcie->rba.alloc_wq = NULL; + } + if (trans_pcie->msix_enabled) { for (i = 0; i < trans_pcie->alloc_vecs; i++) { irq_set_affinity_hint( @@ -3169,6 +3174,10 @@ struct iwl_trans *iwl_trans_pcie_alloc(struct pci_dev *pdev, trans_pcie->inta_mask = CSR_INI_SET_MASK; } + trans_pcie->rba.alloc_wq = alloc_workqueue("rb_allocator", + WQ_HIGHPRI | WQ_UNBOUND, 1); + INIT_WORK(&trans_pcie->rba.rx_alloc, iwl_pcie_rx_allocator_work); + #ifdef CONFIG_IWLWIFI_PCIE_RTPM trans->runtime_pm_mode = IWL_PLAT_PM_MODE_D0I3; #else From ab6dd1beac7be3c17f8bf3d38bdf29ecb7293f1e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 10 Aug 2017 10:22:24 +0800 Subject: [PATCH 036/118] netfilter: check for seqadj ext existence before adding it in nf_nat_setup_info Commit 4440a2ab3b9f ("netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions") wanted to drop the packet when it fails to add seqadj ext due to no memory by checking if nfct_seqadj_ext_add returns NULL. But that nfct_seqadj_ext_add returns NULL can also happen when seqadj ext already exists in a nf_conn. It will cause that userspace protocol doesn't work when both dnat and snat are configured. 
Li Shuang found this issue in the case: Topo: ftp client router ftp server 10.167.131.2 <-> 10.167.131.254 10.167.141.254 <-> 10.167.141.1 Rules: # iptables -t nat -A PREROUTING -i eth1 -p tcp -m tcp --dport 21 -j \ DNAT --to-destination 10.167.141.1 # iptables -t nat -A POSTROUTING -o eth2 -p tcp -m tcp --dport 21 -j \ SNAT --to-source 10.167.141.254 In router, when both dnat and snat are added, nf_nat_setup_info will be called twice. The packet can be dropped at the 2nd time for DNAT due to seqadj ext is already added at the 1st time for SNAT. This patch is to fix it by checking for seqadj ext existence before adding it, so that the packet will not be dropped if seqadj ext already exists. Note that as Florian mentioned, as a long term, we should review ext_add() behaviour, it's better to return a pointer to the existing ext instead. Fixes: 4440a2ab3b9f ("netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions") Reported-by: Li Shuang Acked-by: Florian Westphal Signed-off-by: Xin Long Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index eb541786ccb7..b1d3740ae36a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -441,7 +441,7 @@ nf_nat_setup_info(struct nf_conn *ct, else ct->status |= IPS_DST_NAT; - if (nfct_help(ct)) + if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } From c26844eda9d4fdbd266660e3b3de2d0270e3a1ed Mon Sep 17 00:00:00 2001 From: andy zhou Date: Mon, 21 Aug 2017 12:38:53 -0700 Subject: [PATCH 037/118] netfilter: nf_tables: Fix nft limit burst handling Current implementation treats the burst configuration the same as rate configuration. This can cause the per packet cost to be lower than configured. In effect, this bug causes the token bucket to be refilled at a higher rate than what user has specified. 
This patch changes the implementation so that the token bucket size is controlled by "rate + burst", while keeping the token bucket refill rate the same as the user specified. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Andy Zhou Acked-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 18dd57a52651..14538b1d4d11 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -65,19 +65,23 @@ static int nft_limit_init(struct nft_limit *limit, limit->nsecs = unit * NSEC_PER_SEC; if (limit->rate == 0 || limit->nsecs < unit) return -EOVERFLOW; - limit->tokens = limit->tokens_max = limit->nsecs; - - if (tb[NFTA_LIMIT_BURST]) { - u64 rate; + if (tb[NFTA_LIMIT_BURST]) limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); + else + limit->burst = 0; - rate = limit->rate + limit->burst; - if (rate < limit->rate) - return -EOVERFLOW; + if (limit->rate + limit->burst < limit->rate) + return -EOVERFLOW; + + /* The token bucket size limits the number of tokens can be + * accumulated. tokens_max specifies the bucket size. + * tokens_max = unit * (rate + burst) / rate. + */ + limit->tokens = div_u64(limit->nsecs * (limit->rate + limit->burst), + limit->rate); + limit->tokens_max = limit->tokens; - limit->rate = rate; - } if (tb[NFTA_LIMIT_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS])); @@ -95,9 +99,8 @@ static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, { u32 flags = limit->invert ? 
NFT_LIMIT_F_INV : 0; u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); - u64 rate = limit->rate - limit->burst; - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate), + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate), NFTA_LIMIT_PAD) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), NFTA_LIMIT_PAD) || From dadc0736f7be553a25ad34dc437ae379c5ab4a68 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Aug 2017 09:02:49 -0700 Subject: [PATCH 038/118] virtio_net: be drop monitor friendly This change is needed to not fool drop monitor. (perf record ... -e skb:kfree_skb ) Packets were properly sent and are consumed after TX completion. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 98f17b05c68b..b06169ea60dc 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1058,7 +1058,7 @@ static void free_old_xmit_skbs(struct send_queue *sq) bytes += skb->len; packets++; - dev_kfree_skb_any(skb); + dev_consume_skb_any(skb); } /* Avoid overhead when no packets have been processed From e58f95831e7468d25eb6e41f234842ecfe6f014f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 23 Aug 2017 15:59:49 +0200 Subject: [PATCH 039/118] qlge: avoid memcpy buffer overflow gcc-8.0.0 (snapshot) points out that we copy a variable-length string into a fixed length field using memcpy() with the destination length, and that ends up copying whatever follows the string: inlined from 'ql_core_dump' at drivers/net/ethernet/qlogic/qlge/qlge_dbg.c:1106:2: drivers/net/ethernet/qlogic/qlge/qlge_dbg.c:708:2: error: 'memcpy' reading 15 bytes from a region of size 14 [-Werror=stringop-overflow=] memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); Changing it to use strncpy() will instead zero-pad the destination, which seems to be the right thing to do here. 
The bug is probably harmless, but it seems like a good idea to address it in stable kernels as well, if only for the purpose of building with gcc-8 without warnings. Fixes: a61f80261306 ("qlge: Add ethtool register dump function.") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlge/qlge_dbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c index 28ea0af89aef..e3223f2fe2ff 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c @@ -724,7 +724,7 @@ static void ql_build_coredump_seg_header( seg_hdr->cookie = MPI_COREDUMP_COOKIE; seg_hdr->segNum = seg_number; seg_hdr->segSize = seg_size; - memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); + strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); } /* From 6c7e983b220f89e03286dc70a41c7ef3a8b409df Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 23 Aug 2017 10:43:02 -0400 Subject: [PATCH 040/118] tipc: Fix tipc_sk_reinit handling of -EAGAIN In 9dbbfb0ab6680c6a85609041011484e6658e7d3c function tipc_sk_reinit had additional logic added to loop in the event that function rhashtable_walk_next() returned -EAGAIN. No worries. However, if rhashtable_walk_start returns -EAGAIN, it does "continue", and therefore skips the call to rhashtable_walk_stop(). That has the effect of calling rcu_read_lock() without its paired call to rcu_read_unlock(). Since rcu_read_lock() may be nested, the problem may not be apparent for a while, especially since resize events may be rare. But the comments to rhashtable_walk_start() state: * ...Note that we take the RCU lock in all * cases including when we return an error. So you must always call * rhashtable_walk_stop to clean up. This patch replaces the continue with a goto and label to ensure a matching call to rhashtable_walk_stop(). 
Signed-off-by: Bob Peterson Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/tipc/socket.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 101e3597338f..d50edd6e0019 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2255,8 +2255,8 @@ void tipc_sk_reinit(struct net *net) do { tsk = ERR_PTR(rhashtable_walk_start(&iter)); - if (tsk) - continue; + if (IS_ERR(tsk)) + goto walk_stop; while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); @@ -2265,7 +2265,7 @@ void tipc_sk_reinit(struct net *net) msg_set_orignode(msg, tn->own_addr); spin_unlock_bh(&tsk->sk.sk_lock.slock); } - +walk_stop: rhashtable_walk_stop(&iter); } while (tsk == ERR_PTR(-EAGAIN)); } From c45182eb967af11e9482168be5be41aa22e5d321 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 24 Aug 2017 15:20:41 -0700 Subject: [PATCH 041/118] net: systemport: Be drop monitor friendly Utilize dev_consume_skb_any(cb->skb) in bcm_sysport_free_cb() which is used when a TX packet is completed, as well as when the RX ring is cleaned on shutdown. None of these two cases are packet drops, so be drop monitor friendly. Suggested-by: Eric Dumazet Fixes: 80105befdb4b ("net: systemport: add Broadcom SYSTEMPORT Ethernet MAC driver") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index dc3052751bc1..e6add99cc31c 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -597,7 +597,7 @@ static int bcm_sysport_set_coalesce(struct net_device *dev, static void bcm_sysport_free_cb(struct bcm_sysport_cb *cb) { - dev_kfree_skb_any(cb->skb); + dev_consume_skb_any(cb->skb); cb->skb = NULL; dma_unmap_addr_set(cb, dma_addr, 0); } From 4e458debbb69af0cbde5bd6430d64519d5f59274 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 24 Aug 2017 15:48:21 -0700 Subject: [PATCH 042/118] bpf: fix bpf_setsockopts return value This patch fixes a bug causing any sock operations to always return EINVAL. Fixes: a5192c52377e ("bpf: fix to bpf_setsockops"). Reported-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Craig Gallek Acked-by: Daniel Borkmann Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- net/core/filter.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 6280a602604c..8eb81e5fae08 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2872,7 +2872,6 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } } - ret = -EINVAL; #endif } else { ret = -EINVAL; From d4fec855905fa8bd5fb1c59f73ad2d74a944876a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 24 Aug 2017 15:56:29 -0700 Subject: [PATCH 043/118] net: bcmgenet: Be drop monitor friendly There are 3 spots where we call dev_kfree_skb() but we are actually just doing a normal SKB consumption: __bcmgenet_tx_reclaim() for normal TX reclamation, bcmgenet_alloc_rx_buffers() during the initial RX ring setup and bcmgenet_free_rx_buffers() during RX ring cleanup. 
Fixes: d6707bec5986 ("net: bcmgenet: rewrite bcmgenet_rx_refill()") Fixes: f48bed16a756 ("net: bcmgenet: Free skb after last Tx frag") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index a981c4ee9d72..fea3f9a5fb2d 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -1360,7 +1360,7 @@ static unsigned int __bcmgenet_tx_reclaim(struct net_device *dev, if (skb) { pkts_compl++; bytes_compl += GENET_CB(skb)->bytes_sent; - dev_kfree_skb_any(skb); + dev_consume_skb_any(skb); } txbds_processed++; @@ -1875,7 +1875,7 @@ static int bcmgenet_alloc_rx_buffers(struct bcmgenet_priv *priv, cb = ring->cbs + i; skb = bcmgenet_rx_refill(priv, cb); if (skb) - dev_kfree_skb_any(skb); + dev_consume_skb_any(skb); if (!cb->skb) return -ENOMEM; } @@ -1894,7 +1894,7 @@ static void bcmgenet_free_rx_buffers(struct bcmgenet_priv *priv) skb = bcmgenet_free_rx_cb(&priv->pdev->dev, cb); if (skb) - dev_kfree_skb_any(skb); + dev_consume_skb_any(skb); } } From c2062ee3d9615828109ffe8089fbf69bed394d05 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 24 Aug 2017 16:01:13 -0700 Subject: [PATCH 044/118] net: systemport: Free DMA coherent descriptors on errors In case bcm_sysport_init_tx_ring() is not able to allocate ring->cbs, we would return with an error, and call bcm_sysport_fini_tx_ring() and it would see that ring->cbs is NULL and do nothing. This would leak the coherent DMA descriptor area, so we need to free it on error before returning. Reported-by: Eric Dumazet Fixes: 80105befdb4b ("net: systemport: add Broadcom SYSTEMPORT Ethernet MAC driver") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index e6add99cc31c..c28fa5a8734c 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1346,6 +1346,8 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv, ring->cbs = kcalloc(size, sizeof(struct bcm_sysport_cb), GFP_KERNEL); if (!ring->cbs) { + dma_free_coherent(kdev, sizeof(struct dma_desc), + ring->desc_cpu, ring->desc_dma); netif_err(priv, hw, priv->netdev, "CB allocation failed\n"); return -ENOMEM; } From 551143d8d954fe398324a5caa276f518466c428b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 24 Aug 2017 21:12:28 -0700 Subject: [PATCH 045/118] net_sched: fix a refcount_t issue with noop_qdisc syzkaller reported a refcount_t warning [1] Issue here is that noop_qdisc refcnt was never really considered as a true refcount, since qdisc_destroy() found TCQ_F_BUILTIN set : if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt))) return; Meaning that all atomic_inc() we did on noop_qdisc.refcnt were not really needed, but harmless until refcount_t came. To fix this problem, we simply need to not increment noop_qdisc.refcnt, since we never decrement it. [1] refcount_t: increment on 0; use-after-free. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 21754 at lib/refcount.c:152 refcount_inc+0x47/0x50 lib/refcount.c:152 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 0 PID: 21754 Comm: syz-executor7 Not tainted 4.13.0-rc6+ #20 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:180 __warn+0x1c4/0x1d9 kernel/panic.c:541 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:273 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:846 RIP: 0010:refcount_inc+0x47/0x50 lib/refcount.c:152 RSP: 0018:ffff8801c43477a0 EFLAGS: 00010282 RAX: 000000000000002b RBX: ffffffff86093c14 RCX: 0000000000000000 RDX: 000000000000002b RSI: ffffffff8159314e RDI: ffffed0038868ee8 RBP: ffff8801c43477a8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff86093ac0 R13: 0000000000000001 R14: ffff8801d0f3bac0 R15: dffffc0000000000 attach_default_qdiscs net/sched/sch_generic.c:792 [inline] dev_activate+0x7d3/0xaa0 net/sched/sch_generic.c:833 __dev_open+0x227/0x330 net/core/dev.c:1380 __dev_change_flags+0x695/0x990 net/core/dev.c:6726 dev_change_flags+0x88/0x140 net/core/dev.c:6792 dev_ifsioc+0x5a6/0x930 net/core/dev_ioctl.c:256 dev_ioctl+0x2bc/0xf90 net/core/dev_ioctl.c:554 sock_do_ioctl+0x94/0xb0 net/socket.c:968 sock_ioctl+0x2c2/0x440 net/socket.c:1058 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 Fixes: 7b9364050246 ("net, sched: convert Qdisc.refcnt from atomic_t to refcount_t") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Reshetova, Elena Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 7 +++++++ net/sched/sch_api.c | 6 +++--- net/sched/sch_generic.c | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 67f815e5d525..c1109cdbbfa6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -101,6 +101,13 @@ struct Qdisc { spinlock_t busylock ____cacheline_aligned_in_smp; }; +static inline void qdisc_refcount_inc(struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_BUILTIN) + return; + refcount_inc(&qdisc->refcnt); +} + static inline bool qdisc_is_running(const struct Qdisc *qdisc) { return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a3fa144b8648..4fb5a3222d0d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -836,7 +836,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, old = dev_graft_qdisc(dev_queue, new); if (new && i > 0) - refcount_inc(&new->refcnt); + qdisc_refcount_inc(new); if (!ingress) qdisc_destroy(old); @@ -847,7 +847,7 @@ skip: notify_and_destroy(net, skb, n, classid, dev->qdisc, new); if (new && !new->ops->attach) - refcount_inc(&new->refcnt); + qdisc_refcount_inc(new); dev->qdisc = new ? 
: &noop_qdisc; if (new && new->ops->attach) @@ -1256,7 +1256,7 @@ replay: if (q == p || (p && check_loop(q, p, 0))) return -ELOOP; - refcount_inc(&q->refcnt); + qdisc_refcount_inc(q); goto graft; } else { if (!q) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 57ba406f1437..4ba6da5fb254 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -785,7 +785,7 @@ static void attach_default_qdiscs(struct net_device *dev) dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; - refcount_inc(&dev->qdisc->refcnt); + qdisc_refcount_inc(dev->qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); if (qdisc) { From 27163138b4d80e36f2006273d66b6c122d241f30 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 24 Aug 2017 16:31:22 +0200 Subject: [PATCH 046/118] tipc: perform skb_linearize() before parsing the inner header In tipc_rcv(), we linearize only the header and usually the packets are consumed as the nodes permit direct reception. However, if the skb contains tunnelled message due to fail over or synchronization we parse it in tipc_node_check_state() without performing linearization. This will cause link disturbances if the skb was non linear. In this commit, we perform linearization for the above messages. Signed-off-by: Parthasarathy Bhuvaragan Reviewed-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/node.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/node.c b/net/tipc/node.c index 9b4dcb6a16b5..b113a52f8914 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1557,6 +1557,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) /* Check/update node state before receiving */ if (unlikely(skb)) { + if (unlikely(skb_linearize(skb))) + goto discard; tipc_node_write_lock(n); if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { if (le->link) { From 60d1d93664a0bb3d5af722ed38c57ee165a45bf7 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 24 Aug 2017 16:31:23 +0200 Subject: [PATCH 047/118] tipc: reassign pointers after skb reallocation / linearization In tipc_msg_reverse(), we assign skb attributes to local pointers in stack at startup. This is followed by skb_linearize() and for cloned buffers we perform skb relocation using pskb_expand_head(). Both these methods may update the skb attributes and thus making the pointers incorrect. In this commit, we fix this error by ensuring that the pointers are re-assigned after any of these skb operations. Fixes: 29042e19f2c60 ("tipc: let function tipc_msg_reverse() expand header when needed") Signed-off-by: Parthasarathy Bhuvaragan Reviewed-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/msg.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index dcd90e6fa7c3..6ef379f004ac 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -479,13 +479,14 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { struct sk_buff *_skb = *skb; - struct tipc_msg *hdr = buf_msg(_skb); + struct tipc_msg *hdr; struct tipc_msg ohdr; - int dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE); + int dlen; if (skb_linearize(_skb)) goto exit; hdr = buf_msg(_skb); + dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE); if (msg_dest_droppable(hdr)) goto exit; if (msg_errcode(hdr)) @@ -511,6 +512,8 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) goto exit; + /* reassign after skb header modifications */ + hdr = buf_msg(_skb); /* Now reverse the concerned fields */ msg_set_errcode(hdr, err); msg_set_non_seq(hdr, 0); From 991ca84daa001193066554fa49f3a934746317d6 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 24 Aug 2017 16:31:24 +0200 Subject: [PATCH 048/118] tipc: context imbalance at node read unlock If we fail to find a valid bearer in tipc_node_get_linkname(), node_read_unlock() is called without holding the node read lock. This commit fixes this error. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. 
Miller --- net/tipc/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/node.c b/net/tipc/node.c index b113a52f8914..7dd22330a6b4 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1126,8 +1126,8 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, strncpy(linkname, tipc_link_name(link), len); err = 0; } -exit: tipc_node_read_unlock(node); +exit: tipc_node_put(node); return err; } From 9b4e946ce14e20d7addbfb7d9139e604f9fda107 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 24 Aug 2017 16:49:16 -0700 Subject: [PATCH 049/118] netvsc: fix deadlock betwen link status and removal There is a deadlock possible when canceling the link status delayed work queue. The removal process is run with RTNL held, and the link status callback is acquiring RTNL. Resolve the issue by using trylock and rescheduling. If cancel is in progress, that blocks it from happening. Fixes: 122a5f6410f4 ("staging: hv: use delayed_work for netvsc_send_garp()") Signed-off-by: Stephen Hemminger Signed-off-by: David S.
Miller --- drivers/net/hyperv/netvsc_drv.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 0d78727f1a14..d91cbc6c3ca4 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1269,7 +1269,12 @@ static void netvsc_link_change(struct work_struct *w) bool notify = false, reschedule = false; unsigned long flags, next_reconfig, delay; - rtnl_lock(); + /* if changes are happening, comeback later */ + if (!rtnl_trylock()) { + schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT); + return; + } + net_device = rtnl_dereference(ndev_ctx->nvdev); if (!net_device) goto out_unlock; From 36ff0dd39f9b88ca83e1733b735e9f22b7be893b Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 25 Aug 2017 07:16:07 +0200 Subject: [PATCH 050/118] esp: Fix locking on page fragment allocation We allocate the page fragment for the ESP trailer inside a spinlock, but consume it outside of the lock. This is racy as some other CPU could get the same page fragment then. Fix this by consuming the page fragment inside the lock too.
Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 5 +++-- net/ipv6/esp6.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index dbb31a942dfa..a8ddb95e7f06 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -292,8 +292,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * kunmap_atomic(vaddr); - spin_unlock_bh(&x->lock); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -301,6 +299,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * skb_shinfo(skb)->nr_frags = ++nfrags; pfrag->offset = pfrag->offset + allocsize; + + spin_unlock_bh(&x->lock); + nfrags++; skb->len += tailen; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 392def1fcf21..4e3fdc888943 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -260,8 +260,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info kunmap_atomic(vaddr); - spin_unlock_bh(&x->lock); - nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -269,6 +267,9 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info skb_shinfo(skb)->nr_frags = ++nfrags; pfrag->offset = pfrag->offset + allocsize; + + spin_unlock_bh(&x->lock); + nfrags++; skb->len += tailen; From 54ffd790792898f05e215dce5aa593473e80e92f Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 25 Aug 2017 07:34:35 +0200 Subject: [PATCH 051/118] esp: Fix skb tailroom calculation We use skb_availroom to calculate the skb tailroom for the ESP trailer. skb_availroom calculates the tailroom and subtracts this value by reserved_tailroom. However reserved_tailroom is a union with the skb mark. This means that we subtract the tailroom by the skb mark if set. 
Fix this by using skb_tailroom instead. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index a8ddb95e7f06..df68963dc90a 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -258,7 +258,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * esp_output_udp_encap(x, skb, esp); if (!skb_cloned(skb)) { - if (tailen <= skb_availroom(skb)) { + if (tailen <= skb_tailroom(skb)) { nfrags = 1; trailer = skb; tail = skb_tail_pointer(trailer); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 4e3fdc888943..ab64f367d11c 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -226,7 +226,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info int tailen = esp->tailen; if (!skb_cloned(skb)) { - if (tailen <= skb_availroom(skb)) { + if (tailen <= skb_tailroom(skb)) { nfrags = 1; trailer = skb; tail = skb_tail_pointer(trailer); From 3614364527daa870264f6dde77f02853cdecd02c Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 25 Aug 2017 09:05:42 +0200 Subject: [PATCH 052/118] ipv6: Fix may be used uninitialized warning in rt6_check rt_cookie might be used uninitialized, fix this by initializing it. Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Steffen Klassert Signed-off-by: David S. 
Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a9d3564caf49..48c8c92dcbd3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1289,7 +1289,7 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { - u32 rt_cookie; + u32 rt_cookie = 0; if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) return NULL; From ebfa00c5745660fe7f0a91eea88d4dff658486c4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 25 Aug 2017 13:10:12 +0200 Subject: [PATCH 053/118] tcp: fix refcnt leak with ebpf congestion control There are a few bugs around refcnt handling in the new BPF congestion control setsockopt: - The new ca is assigned to icsk->icsk_ca_ops even in the case where we cannot get a reference on it. This would lead to a use after free, since that ca is going away soon. - Changing the congestion control case doesn't release the refcnt on the previous ca. - In the reinit case, we first leak a reference on the old ca, then we call tcp_reinit_congestion_control on the ca that we have just assigned, leading to deinitializing the wrong ca (->release of the new ca on the old ca's data) and releasing the refcount on the ca that we actually want to use. This is visible by building (for example) BIC as a module and setting net.ipv4.tcp_congestion_control=bic, and using tcp_cong_kern.c from samples/bpf. This patch fixes the refcount issues, and moves reinit back into tcp core to avoid passing a ca pointer back to BPF. Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control") Signed-off-by: Sabrina Dubroca Acked-by: Lawrence Brakmo Signed-off-by: David S. 
Miller --- include/net/tcp.h | 4 +--- net/core/filter.c | 7 ++----- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 19 ++++++++++++++----- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index ada65e767b28..f642a39f9eee 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1004,9 +1004,7 @@ void tcp_get_default_congestion_control(char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load); -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 8eb81e5fae08..169974998c76 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2836,15 +2836,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; + bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false); - if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) - /* replacing an existing ca */ - tcp_reinit_congestion_control(sk, - inet_csk(sk)->icsk_ca_ops); + ret = tcp_set_congestion_control(sk, name, false, reinit); } else { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71ce33decd97..a3e91b552edc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = 
tcp_set_congestion_control(sk, name, true); + err = tcp_set_congestion_control(sk, name, true, true); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index fde983f6376b..421ea1b918da 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk) INET_ECN_dontxmit(sk); } -void tcp_reinit_congestion_control(struct sock *sk, - const struct tcp_congestion_ops *ca) +static void tcp_reinit_congestion_control(struct sock *sk, + const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -338,7 +338,7 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -360,9 +360,18 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) if (!ca) { err = -ENOENT; } else if (!load) { - icsk->icsk_ca_ops = ca; - if (!try_module_get(ca->owner)) + const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; + + if (try_module_get(ca->owner)) { + if (reinit) { + tcp_reinit_congestion_control(sk, ca); + } else { + icsk->icsk_ca_ops = ca; + module_put(old_ca->owner); + } + } else { err = -EBUSY; + } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { err = -EPERM; From 1089650d8837095f63e001bbf14d7b48043d67ad Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 24 Aug 2017 18:34:43 -0700 Subject: [PATCH 054/118] r8169: Do not increment tx_dropped in TX ring cleaning rtl8169_tx_clear_range() is responsible for cleaning up the TX ring during interface shutdown, incrementing tx_dropped for every SKB that we left at the time in the ring is misleading. 
Fixes: cac4b22f3d6a ("r8169: do not account fragments as packets") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index bd07a15d3b7c..8a1bbd2a6a20 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6863,7 +6863,6 @@ static void rtl8169_tx_clear_range(struct rtl8169_private *tp, u32 start, rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb, tp->TxDescArray + entry); if (skb) { - tp->dev->stats.tx_dropped++; dev_kfree_skb_any(skb); tx_skb->skb = NULL; } From 7a4b813cb739ce598ffbad2e84d19d13fa23e25d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 24 Aug 2017 18:34:44 -0700 Subject: [PATCH 055/118] r8169: Be drop monitor friendly rtl_tx() is the TX reclamation process whereas rtl8169_tx_clear_range() does the TX ring cleaning during shutdown, both of these functions should call dev_consume_skb_any() to be drop monitor friendly. Fixes: cac4b22f3d6a ("r8169: do not account fragments as packets") Fixes: eb781397904e ("r8169: Do not use dev_kfree_skb in xmit path") Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/ethernet/realtek/r8169.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 8a1bbd2a6a20..e03fcf914690 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6863,7 +6863,7 @@ static void rtl8169_tx_clear_range(struct rtl8169_private *tp, u32 start, rtl8169_unmap_tx_skb(&tp->pci_dev->dev, tx_skb, tp->TxDescArray + entry); if (skb) { - dev_kfree_skb_any(skb); + dev_consume_skb_any(skb); tx_skb->skb = NULL; } } @@ -7318,7 +7318,7 @@ static void rtl_tx(struct net_device *dev, struct rtl8169_private *tp) tp->tx_stats.packets++; tp->tx_stats.bytes += tx_skb->skb->len; u64_stats_update_end(&tp->tx_stats.syncp); - dev_kfree_skb_any(tx_skb->skb); + dev_consume_skb_any(tx_skb->skb); tx_skb->skb = NULL; } dirty_tx++; From 2207d182c14294d78b98142f0b6a16bea5e8b0fb Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Fri, 25 Aug 2017 07:35:51 +0200 Subject: [PATCH 056/118] net: sxgbe: check memory allocation failure Check memory allocation failure and return -ENOMEM in such a case, as already done few lines below for another memory allocation. Signed-off-by: Christophe JAILLET Signed-off-by: David S. 
Miller --- drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c index 73427e29df2a..fbd00cb0cb7d 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c @@ -47,6 +47,8 @@ static int sxgbe_probe_config_dt(struct platform_device *pdev, plat->mdio_bus_data = devm_kzalloc(&pdev->dev, sizeof(*plat->mdio_bus_data), GFP_KERNEL); + if (!plat->mdio_bus_data) + return -ENOMEM; dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), GFP_KERNEL); if (!dma_cfg) From 64f0f5d18a47c703c85576375cc010e83dac6a48 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 25 Aug 2017 14:31:01 +0200 Subject: [PATCH 057/118] udp6: set rx_dst_cookie on rx_dst updates Currently, in the udp6 code, the dst cookie is not initialized/updated concurrently with the RX dst used by early demux. As a result, the dst_check() in the early_demux path always fails, the rx dst cache is always invalidated, and we can't really leverage significant gain from the demux lookup. Fix it adding udp6 specific variant of sk_rx_dst_set() and use it to set the dst cookie when the dst entry is really changed. The issue is there since the introduction of early demux for ipv6. Fixes: 5425077d73e0 ("net: ipv6: Add early demux handler for UDP unicast") Acked-by: Hannes Frederic Sowa Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- include/net/udp.h | 2 +- net/ipv4/udp.c | 4 +++- net/ipv6/udp.c | 11 ++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/net/udp.h b/include/net/udp.h index 586de4b811b5..626c2d8a70c5 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -260,7 +260,7 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, } void udp_v4_early_demux(struct sk_buff *skb); -void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); +bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd1d044a7fa5..a6dc48d76a29 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1929,14 +1929,16 @@ drop: /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ -void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = xchg(&sk->sk_rx_dst, dst); dst_release(old); + return old != dst; } + return false; } EXPORT_SYMBOL(udp_sk_rx_dst_set); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 20039c8501eb..d6886228e1d0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -768,6 +768,15 @@ start_lookup: return 0; } +static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) +{ + if (udp_sk_rx_dst_set(sk, dst)) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + + inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); + } +} + int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { @@ -817,7 +826,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int ret; if (unlikely(sk->sk_rx_dst != dst)) - udp_sk_rx_dst_set(sk, dst); + udp6_sk_rx_dst_set(sk, dst); ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); From 
5fe0d4bd8f86d19f7f24c1ae5a9b6e6a5a52e51a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 26 Aug 2017 17:08:57 +0200 Subject: [PATCH 058/118] xfrm_user: fix info leak in copy_user_offload() The memory reserved to dump the xfrm offload state includes padding bytes of struct xfrm_user_offload added by the compiler for alignment. Add an explicit memset(0) before filling the buffer to avoid the heap info leak. Cc: Steffen Klassert Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Mathias Krause Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2be4c6af008a..3259555ae7d7 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -796,7 +796,7 @@ static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb return -EMSGSIZE; xuo = nla_data(attr); - + memset(xuo, 0, sizeof(*xuo)); xuo->ifindex = xso->dev->ifindex; xuo->flags = xso->flags; From 50329c8a340c9dea60d837645fcf13fc36bfb84d Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 26 Aug 2017 17:08:58 +0200 Subject: [PATCH 059/118] xfrm_user: fix info leak in xfrm_notify_sa() The memory reserved to dump the ID of the xfrm state includes a padding byte in struct xfrm_usersa_id added by the compiler for alignment. To prevent the heap info leak, memset(0) the whole struct before filling it. 
Cc: Herbert Xu Fixes: 0603eac0d6b7 ("[IPSEC]: Add XFRMA_SA/XFRMA_POLICY for delete notification") Signed-off-by: Mathias Krause Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 3259555ae7d7..c33516ef52f2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2715,6 +2715,7 @@ static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c) struct nlattr *attr; id = nlmsg_data(nlh); + memset(id, 0, sizeof(*id)); memcpy(&id->daddr, &x->id.daddr, sizeof(id->daddr)); id->spi = x->id.spi; id->family = x->props.family; From e3e5fc1698ae35ac60d075b477e84accb96e2652 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 26 Aug 2017 17:08:59 +0200 Subject: [PATCH 060/118] xfrm_user: fix info leak in build_expire() The memory reserved to dump the expired xfrm state includes padding bytes in struct xfrm_user_expire added by the compiler for alignment. To prevent the heap info leak, memset(0) the remainder of the struct. Initializing the whole structure isn't needed as copy_to_user_state() already takes care of clearing the padding bytes within the 'state' member. Signed-off-by: Mathias Krause Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c33516ef52f2..2cbdc81610c6 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2578,6 +2578,8 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct ue = nlmsg_data(nlh); copy_to_user_state(x, &ue->state); ue->hard = (c->data.hard != 0) ? 
1 : 0; + /* clear the padding bytes */ + memset(&ue->hard + 1, 0, sizeof(*ue) - offsetofend(typeof(*ue), hard)); err = xfrm_mark_put(skb, &x->mark); if (err) From 931e79d7a7ddee4709c56b39de169a36804589a1 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 26 Aug 2017 17:09:00 +0200 Subject: [PATCH 061/118] xfrm_user: fix info leak in build_aevent() The memory reserved to dump the ID of the xfrm state includes a padding byte in struct xfrm_usersa_id added by the compiler for alignment. To prevent the heap info leak, memset(0) the sa_id before filling it. Cc: Jamal Hadi Salim Fixes: d51d081d6504 ("[IPSEC]: Sync series - user") Signed-off-by: Mathias Krause Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2cbdc81610c6..9391ced05259 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1869,6 +1869,7 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct return -EMSGSIZE; id = nlmsg_data(nlh); + memset(&id->sa_id, 0, sizeof(id->sa_id)); memcpy(&id->sa_id.daddr, &x->id.daddr, sizeof(x->id.daddr)); id->sa_id.spi = x->id.spi; id->sa_id.family = x->props.family; From 1e22391e8fbec9c3709bad82b997b108d1c6228b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 25 Aug 2017 15:04:32 +0200 Subject: [PATCH 062/118] net: missing call of trace_napi_poll in busy_poll_stop Noticed that busy_poll_stop() also invokes the driver's napi->poll() function pointer, but didn't have an associated call to trace_napi_poll() like all other call sites. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index ce15a06d5558..818dfa6e7ab5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5289,6 +5289,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) * Ideally, a new ndo_busy_poll_stop() could avoid another round. */ rc = napi->poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); netpoll_poll_unlock(have_poll_lock); if (rc == BUSY_POLL_BUDGET) __napi_schedule(napi); From 3b638f0f0b94fc41f419033f2a02c49d52f3fec9 Mon Sep 17 00:00:00 2001 From: Aleksander Morgado Date: Fri, 25 Aug 2017 15:39:16 +0200 Subject: [PATCH 063/118] cdc_ncm: flag the u-blox TOBY-L4 as wwan The u-blox TOBY-L4 is an LTE Advanced (Cat 6) module with HSPA+ and 2G fallback. Unlike the TOBY-L2, this module has one single USB layout and exposes several TTYs for control and an NCM interface for data. Connecting this module may be done just by activating the desired PDP context with 'AT+CGACT=1,<cid>' and then running DHCP on the NCM interface. Signed-off-by: Aleksander Morgado Signed-off-by: David S.
Miller --- drivers/net/usb/cdc_ncm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 8f572b9f3625..9c80e80c5493 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1758,6 +1758,13 @@ static const struct usb_device_id cdc_devs[] = { .driver_info = (unsigned long)&wwan_noarp_info, }, + /* u-blox TOBY-L4 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x1546, 0x1010, + USB_CLASS_COMM, + USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, + }, + /* Generic CDC-NCM devices */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), From 4c22868264516fe0c42817a87f37efb44254e7a9 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 25 Aug 2017 16:14:17 +0200 Subject: [PATCH 064/118] net: mvpp2: fix the mac address used when using PPv2.2 The mac address is only retrieved from h/w when using PPv2.1. Otherwise the variable holding it is still checked and used if it contains a valid value. As the variable isn't initialized to an invalid mac address value, we end up with random mac addresses which can be the same for all the ports handled by this PPv2 driver. Fix this by initializing the h/w mac address variable to {0}, which is an invalid mac address value. This way the random assignment fallback is used and all ports end up with their own addresses. Signed-off-by: Antoine Tenart Fixes: 2697582144dd ("net: mvpp2: handle misc PPv2.1/PPv2.2 differences") Signed-off-by: David S.
Miller --- drivers/net/ethernet/marvell/mvpp2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index 48d21c1e09f2..4d598ca8503a 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -6504,7 +6504,7 @@ static int mvpp2_port_probe(struct platform_device *pdev, struct resource *res; const char *dt_mac_addr; const char *mac_from; - char hw_mac_addr[ETH_ALEN]; + char hw_mac_addr[ETH_ALEN] = {0}; u32 id; int features; int phy_mode; From 9ee369a405c57613d7c83a3967780c3e30c52ecc Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Aug 2017 16:22:17 +0200 Subject: [PATCH 065/118] l2tp: initialise session's refcount before making it reachable Sessions must be fully initialised before calling l2tp_session_add_to_tunnel(). Otherwise, there's a short time frame where partially initialised sessions can be accessed by external users. Fixes: dbdbc73b4478 ("l2tp: fix duplicate session creation") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index b0c2d4ae781d..f363669eae47 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1844,6 +1844,8 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn l2tp_session_set_header_len(session, tunnel->version); + refcount_set(&session->ref_count, 1); + err = l2tp_session_add_to_tunnel(tunnel, session); if (err) { kfree(session); @@ -1851,10 +1853,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn return ERR_PTR(err); } - /* Bump the reference count. The session context is deleted - * only when this drops to zero. 
- */ - refcount_set(&session->ref_count, 1); l2tp_tunnel_inc_refcount(tunnel); /* Ensure tunnel socket isn't deleted */ From 54652eb12c1b72e9602d09cb2821d5760939190f Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Aug 2017 16:51:40 +0200 Subject: [PATCH 066/118] l2tp: hold tunnel while looking up sessions in l2tp_netlink l2tp_tunnel_find() doesn't take a reference on the returned tunnel. Therefore, it's unsafe to use it because the returned tunnel can go away on us anytime. Fix this by defining l2tp_tunnel_get(), which works like l2tp_tunnel_find(), but takes a reference on the returned tunnel. Caller then has to drop this reference using l2tp_tunnel_dec_refcount(). As l2tp_tunnel_dec_refcount() needs to be moved to l2tp_core.h, let's simplify the patch and not move the L2TP_REFCNT_DEBUG part. This code has been broken (not even compiling) in May 2012 by commit a4ca44fa578c ("net: l2tp: Standardize logging styles") and fixed more than two years later by commit 29abe2fda54f ("l2tp: fix missing line continuation"). So it doesn't appear to be used by anyone. Same thing for l2tp_tunnel_free(); instead of moving it to l2tp_core.h, let's just simplify things and call kfree_rcu() directly in l2tp_tunnel_dec_refcount(). Extra assertions and debugging code provided by l2tp_tunnel_free() didn't help catching any of the reference counting and socket handling issues found while working on this series. Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP") Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_core.c | 66 +++++++++++++---------------------------- net/l2tp/l2tp_core.h | 13 ++++++++ net/l2tp/l2tp_netlink.c | 6 ++-- 3 files changed, 38 insertions(+), 47 deletions(-) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index f363669eae47..90165a6874bc 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -113,7 +113,6 @@ struct l2tp_net { spinlock_t l2tp_session_hlist_lock; }; -static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) { @@ -127,39 +126,6 @@ static inline struct l2tp_net *l2tp_pernet(const struct net *net) return net_generic(net, l2tp_net_id); } -/* Tunnel reference counts. Incremented per session that is added to - * the tunnel. - */ -static inline void l2tp_tunnel_inc_refcount_1(struct l2tp_tunnel *tunnel) -{ - refcount_inc(&tunnel->ref_count); -} - -static inline void l2tp_tunnel_dec_refcount_1(struct l2tp_tunnel *tunnel) -{ - if (refcount_dec_and_test(&tunnel->ref_count)) - l2tp_tunnel_free(tunnel); -} -#ifdef L2TP_REFCNT_DEBUG -#define l2tp_tunnel_inc_refcount(_t) \ -do { \ - pr_debug("l2tp_tunnel_inc_refcount: %s:%d %s: cnt=%d\n", \ - __func__, __LINE__, (_t)->name, \ - refcount_read(&_t->ref_count)); \ - l2tp_tunnel_inc_refcount_1(_t); \ -} while (0) -#define l2tp_tunnel_dec_refcount(_t) \ -do { \ - pr_debug("l2tp_tunnel_dec_refcount: %s:%d %s: cnt=%d\n", \ - __func__, __LINE__, (_t)->name, \ - refcount_read(&_t->ref_count)); \ - l2tp_tunnel_dec_refcount_1(_t); \ -} while (0) -#else -#define l2tp_tunnel_inc_refcount(t) l2tp_tunnel_inc_refcount_1(t) -#define l2tp_tunnel_dec_refcount(t) l2tp_tunnel_dec_refcount_1(t) -#endif - /* Session hash global list for L2TPv3. * The session_id SHOULD be random according to RFC3931, but several * L2TP implementations use incrementing session_ids. 
So we do a real @@ -229,6 +195,27 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; } +/* Lookup a tunnel. A new reference is held on the returned tunnel. */ +struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) +{ + const struct l2tp_net *pn = l2tp_pernet(net); + struct l2tp_tunnel *tunnel; + + rcu_read_lock_bh(); + list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) { + if (tunnel->tunnel_id == tunnel_id) { + l2tp_tunnel_inc_refcount(tunnel); + rcu_read_unlock_bh(); + + return tunnel; + } + } + rcu_read_unlock_bh(); + + return NULL; +} +EXPORT_SYMBOL_GPL(l2tp_tunnel_get); + /* Lookup a session. A new reference is held on the returned session. * Optionally calls session->ref() too if do_ref is true. */ @@ -1348,17 +1335,6 @@ static void l2tp_udp_encap_destroy(struct sock *sk) } } -/* Really kill the tunnel. - * Come here only when all sessions have been cleared from the tunnel. 
- */ -static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) -{ - BUG_ON(refcount_read(&tunnel->ref_count) != 0); - BUG_ON(tunnel->sock != NULL); - l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: free...\n", tunnel->name); - kfree_rcu(tunnel, rcu); -} - /* Workqueue tunnel deletion function */ static void l2tp_tunnel_del_work(struct work_struct *work) { diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index cdb6e3327f74..9101297f27ad 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -231,6 +231,8 @@ out: return tunnel; } +struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); + struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, bool do_ref); @@ -269,6 +271,17 @@ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); +static inline void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel) +{ + refcount_inc(&tunnel->ref_count); +} + +static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) +{ + if (refcount_dec_and_test(&tunnel->ref_count)) + kfree_rcu(tunnel, rcu); +} + /* Session reference counts. Incremented when code obtains a reference * to a session. 
*/ diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 12cfcd0ca807..27ee94b5c189 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -65,10 +65,12 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info, (info->attrs[L2TP_ATTR_CONN_ID])) { tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel) + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (tunnel) { session = l2tp_session_get(net, tunnel, session_id, do_ref); + l2tp_tunnel_dec_refcount(tunnel); + } } return session; From bb0a32ce4389e17e47e198d2cddaf141561581ad Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Aug 2017 16:51:42 +0200 Subject: [PATCH 067/118] l2tp: hold tunnel while processing genl delete command l2tp_nl_cmd_tunnel_delete() needs to take a reference on the tunnel, to prevent it from being concurrently freed by l2tp_tunnel_destruct(). Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP") Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_netlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 27ee94b5c189..808966550620 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -273,8 +273,8 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { ret = -ENODEV; goto out; } @@ -284,6 +284,8 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info (void) l2tp_tunnel_delete(tunnel); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } From 8c0e421525c9eb50d68e8f633f703ca31680b746 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Aug 2017 16:51:42 +0200 Subject: [PATCH 068/118] l2tp: hold tunnel while handling genl tunnel updates We need to make sure the tunnel is not going to be destroyed by l2tp_tunnel_destruct() concurrently. Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP") Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_netlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 808966550620..d61e75b4e619 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -303,8 +303,8 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { ret = -ENODEV; goto out; } @@ -315,6 +315,8 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel, L2TP_CMD_TUNNEL_MODIFY); + l2tp_tunnel_dec_refcount(tunnel); + out: return ret; } From 4e4b21da3acc68a7ea55f850cacc13706b7480e9 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Aug 2017 16:51:43 +0200 Subject: [PATCH 069/118] l2tp: hold tunnel while handling genl TUNNEL_GET commands Use l2tp_tunnel_get() instead of l2tp_tunnel_find() so that we get a reference on the tunnel, preventing l2tp_tunnel_destruct() from freeing it from under us. Also move l2tp_tunnel_get() below nlmsg_new() so that we only take the reference when needed. Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP") Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_netlink.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index d61e75b4e619..ae5170e26281 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -444,34 +444,37 @@ static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[L2TP_ATTR_CONN_ID]) { ret = -EINVAL; - goto out; + goto err; } tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); - if (tunnel == NULL) { - ret = -ENODEV; - goto out; - } - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { ret = -ENOMEM; - goto out; + goto err; + } + + tunnel = l2tp_tunnel_get(net, tunnel_id); + if (!tunnel) { + ret = -ENODEV; + goto err_nlmsg; } ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq, NLM_F_ACK, tunnel, L2TP_CMD_TUNNEL_GET); if (ret < 0) - goto err_out; + goto err_nlmsg_tunnel; + + l2tp_tunnel_dec_refcount(tunnel); return genlmsg_unicast(net, msg, info->snd_portid); -err_out: +err_nlmsg_tunnel: + l2tp_tunnel_dec_refcount(tunnel); +err_nlmsg: nlmsg_free(msg); - -out: +err: return ret; } From e702c1204eb57788ef189c839c8c779368267d70 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 25 Aug 2017 16:51:46 +0200 Subject: [PATCH 070/118] l2tp: hold tunnel used while creating sessions with netlink Use l2tp_tunnel_get() to retrieve tunnel, so that it can't go away on us. Otherwise l2tp_tunnel_destruct() might release the last reference count concurrently, thus freeing the tunnel while we're using it. Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP") Signed-off-by: Guillaume Nault Signed-off-by: David S. 
Miller --- net/l2tp/l2tp_netlink.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index ae5170e26281..57427d430f10 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -518,8 +518,9 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf ret = -EINVAL; goto out; } + tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]); - tunnel = l2tp_tunnel_find(net, tunnel_id); + tunnel = l2tp_tunnel_get(net, tunnel_id); if (!tunnel) { ret = -ENODEV; goto out; @@ -527,24 +528,24 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf if (!info->attrs[L2TP_ATTR_SESSION_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } peer_session_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_SESSION_ID]); if (!info->attrs[L2TP_ATTR_PW_TYPE]) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.pw_type = nla_get_u16(info->attrs[L2TP_ATTR_PW_TYPE]); if (cfg.pw_type >= __L2TP_PWTYPE_MAX) { ret = -EINVAL; - goto out; + goto out_tunnel; } if (tunnel->version > 2) { @@ -566,7 +567,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]); if (len > 8) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.cookie_len = len; memcpy(&cfg.cookie[0], nla_data(info->attrs[L2TP_ATTR_COOKIE]), len); @@ -575,7 +576,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]); if (len > 8) { ret = -EINVAL; - goto out; + goto out_tunnel; } cfg.peer_cookie_len = len; memcpy(&cfg.peer_cookie[0], nla_data(info->attrs[L2TP_ATTR_PEER_COOKIE]), len); @@ -618,7 +619,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info 
*inf if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) || (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) { ret = -EPROTONOSUPPORT; - goto out; + goto out_tunnel; } /* Check that pseudowire-specific params are present */ @@ -628,7 +629,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf case L2TP_PWTYPE_ETH_VLAN: if (!info->attrs[L2TP_ATTR_VLAN_ID]) { ret = -EINVAL; - goto out; + goto out_tunnel; } break; case L2TP_PWTYPE_ETH: @@ -656,6 +657,8 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf } } +out_tunnel: + l2tp_tunnel_dec_refcount(tunnel); out: return ret; } From a7cd39e0c7805a93eaa4256370bcd48c506d46c1 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Fri, 25 Aug 2017 19:31:01 +0200 Subject: [PATCH 071/118] nfp: fix unchecked flow dissector use Previously flow dissectors were referenced without first checking that they are in use and correctly populated by TC. This patch fixes this by checking each flow dissector key before referencing them. Fixes: 5571e8c9f241 ("nfp: extend flower matching capabilities") Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../net/ethernet/netronome/nfp/flower/match.c | 133 +++++++++--------- .../ethernet/netronome/nfp/flower/offload.c | 41 +++--- 2 files changed, 93 insertions(+), 81 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index 0e08404480ef..b36511063a25 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -45,6 +45,7 @@ nfp_flower_compile_meta_tci(struct nfp_flower_meta_two *frame, struct flow_dissector_key_vlan *flow_vlan; u16 tmp_tci; + memset(frame, 0, sizeof(struct nfp_flower_meta_two)); /* Populate the metadata frame. 
*/ frame->nfp_flow_key_layer = key_type; frame->mask_id = ~0; @@ -54,21 +55,20 @@ nfp_flower_compile_meta_tci(struct nfp_flower_meta_two *frame, return; } - flow_vlan = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_VLAN, - flow->key); - - /* Populate the tci field. */ - if (!flow_vlan->vlan_id) { - tmp_tci = 0; - } else { - tmp_tci = FIELD_PREP(NFP_FLOWER_MASK_VLAN_PRIO, - flow_vlan->vlan_priority) | - FIELD_PREP(NFP_FLOWER_MASK_VLAN_VID, - flow_vlan->vlan_id) | - NFP_FLOWER_MASK_VLAN_CFI; + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_VLAN)) { + flow_vlan = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_VLAN, + flow->key); + /* Populate the tci field. */ + if (flow_vlan->vlan_id) { + tmp_tci = FIELD_PREP(NFP_FLOWER_MASK_VLAN_PRIO, + flow_vlan->vlan_priority) | + FIELD_PREP(NFP_FLOWER_MASK_VLAN_VID, + flow_vlan->vlan_id) | + NFP_FLOWER_MASK_VLAN_CFI; + frame->tci = cpu_to_be16(tmp_tci); + } } - frame->tci = cpu_to_be16(tmp_tci); } static void @@ -99,17 +99,18 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *frame, bool mask_version) { struct fl_flow_key *target = mask_version ? flow->mask : flow->key; - struct flow_dissector_key_eth_addrs *flow_mac; - - flow_mac = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_ETH_ADDRS, - target); + struct flow_dissector_key_eth_addrs *addr; memset(frame, 0, sizeof(struct nfp_flower_mac_mpls)); - /* Populate mac frame. */ - ether_addr_copy(frame->mac_dst, &flow_mac->dst[0]); - ether_addr_copy(frame->mac_src, &flow_mac->src[0]); + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + addr = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + target); + /* Populate mac frame. 
*/ + ether_addr_copy(frame->mac_dst, &addr->dst[0]); + ether_addr_copy(frame->mac_src, &addr->src[0]); + } if (mask_version) frame->mpls_lse = cpu_to_be32(~0); @@ -121,14 +122,17 @@ nfp_flower_compile_tport(struct nfp_flower_tp_ports *frame, bool mask_version) { struct fl_flow_key *target = mask_version ? flow->mask : flow->key; - struct flow_dissector_key_ports *flow_tp; + struct flow_dissector_key_ports *tp; - flow_tp = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_PORTS, - target); + memset(frame, 0, sizeof(struct nfp_flower_tp_ports)); - frame->port_src = flow_tp->src; - frame->port_dst = flow_tp->dst; + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_PORTS)) { + tp = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_PORTS, + target); + frame->port_src = tp->src; + frame->port_dst = tp->dst; + } } static void @@ -137,25 +141,27 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame, bool mask_version) { struct fl_flow_key *target = mask_version ? flow->mask : flow->key; - struct flow_dissector_key_ipv4_addrs *flow_ipv4; - struct flow_dissector_key_basic *flow_basic; + struct flow_dissector_key_ipv4_addrs *addr; + struct flow_dissector_key_basic *basic; - flow_ipv4 = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, - target); - - flow_basic = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_BASIC, - target); - - /* Populate IPv4 frame. */ - frame->reserved = 0; - frame->ipv4_src = flow_ipv4->src; - frame->ipv4_dst = flow_ipv4->dst; - frame->proto = flow_basic->ip_proto; /* Wildcard TOS/TTL for now. 
*/ - frame->tos = 0; - frame->ttl = 0; + memset(frame, 0, sizeof(struct nfp_flower_ipv4)); + + if (dissector_uses_key(flow->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + addr = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target); + frame->ipv4_src = addr->src; + frame->ipv4_dst = addr->dst; + } + + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + basic = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_BASIC, + target); + frame->proto = basic->ip_proto; + } } static void @@ -164,26 +170,27 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame, bool mask_version) { struct fl_flow_key *target = mask_version ? flow->mask : flow->key; - struct flow_dissector_key_ipv6_addrs *flow_ipv6; - struct flow_dissector_key_basic *flow_basic; + struct flow_dissector_key_ipv6_addrs *addr; + struct flow_dissector_key_basic *basic; - flow_ipv6 = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS, - target); - - flow_basic = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_BASIC, - target); - - /* Populate IPv6 frame. */ - frame->reserved = 0; - frame->ipv6_src = flow_ipv6->src; - frame->ipv6_dst = flow_ipv6->dst; - frame->proto = flow_basic->ip_proto; /* Wildcard LABEL/TOS/TTL for now. 
*/ - frame->ipv6_flow_label_exthdr = 0; - frame->tos = 0; - frame->ttl = 0; + memset(frame, 0, sizeof(struct nfp_flower_ipv6)); + + if (dissector_uses_key(flow->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + addr = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target); + frame->ipv6_src = addr->src; + frame->ipv6_dst = addr->dst; + } + + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + basic = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_BASIC, + target); + frame->proto = basic->ip_proto; + } } int nfp_flower_compile_flow_match(struct tc_cls_flower_offload *flow, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 4ad10bd5e139..6c8ecc211568 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -105,35 +105,40 @@ static int nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, struct tc_cls_flower_offload *flow) { - struct flow_dissector_key_control *mask_enc_ctl; - struct flow_dissector_key_basic *mask_basic; - struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_basic *mask_basic = NULL; + struct flow_dissector_key_basic *key_basic = NULL; u32 key_layer_two; u8 key_layer; int key_size; - mask_enc_ctl = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_ENC_CONTROL, - flow->mask); + if (dissector_uses_key(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL)) { + struct flow_dissector_key_control *mask_enc_ctl = + skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_ENC_CONTROL, + flow->mask); + /* We are expecting a tunnel. For now we ignore offloading. 
*/ + if (mask_enc_ctl->addr_type) + return -EOPNOTSUPP; + } - mask_basic = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_BASIC, - flow->mask); + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) { + mask_basic = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_BASIC, + flow->mask); + + key_basic = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_BASIC, + flow->key); + } - key_basic = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_BASIC, - flow->key); key_layer_two = 0; key_layer = NFP_FLOWER_LAYER_PORT | NFP_FLOWER_LAYER_MAC; key_size = sizeof(struct nfp_flower_meta_one) + sizeof(struct nfp_flower_in_port) + sizeof(struct nfp_flower_mac_mpls); - /* We are expecting a tunnel. For now we ignore offloading. */ - if (mask_enc_ctl->addr_type) - return -EOPNOTSUPP; - - if (mask_basic->n_proto) { + if (mask_basic && mask_basic->n_proto) { /* Ethernet type is present in the key. */ switch (key_basic->n_proto) { case cpu_to_be16(ETH_P_IP): @@ -166,7 +171,7 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, } } - if (mask_basic->ip_proto) { + if (mask_basic && mask_basic->ip_proto) { /* Ethernet type is present in the key. */ switch (key_basic->ip_proto) { case IPPROTO_TCP: From 74af5975108f54f9443952c0b4d52487031a7569 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Fri, 25 Aug 2017 19:31:02 +0200 Subject: [PATCH 072/118] nfp: fix supported key layers calculation Previously when calculating the supported key layers MPLS, IPv4/6 TTL and TOS were not considered. This patch checks that the TTL and TOS fields are masked out before offloading. Additionally this patch checks that MPLS packets are correctly handled, by not offloading them. Fixes: af9d842c1354 ("nfp: extend flower add flow offload") Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- .../ethernet/netronome/nfp/flower/offload.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 6c8ecc211568..74a96d6bb05c 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -107,6 +107,7 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, { struct flow_dissector_key_basic *mask_basic = NULL; struct flow_dissector_key_basic *key_basic = NULL; + struct flow_dissector_key_ip *mask_ip = NULL; u32 key_layer_two; u8 key_layer; int key_size; @@ -132,6 +133,11 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, flow->key); } + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) + mask_ip = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_IP, + flow->mask); + key_layer_two = 0; key_layer = NFP_FLOWER_LAYER_PORT | NFP_FLOWER_LAYER_MAC; key_size = sizeof(struct nfp_flower_meta_one) + @@ -142,11 +148,19 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, /* Ethernet type is present in the key. */ switch (key_basic->n_proto) { case cpu_to_be16(ETH_P_IP): + if (mask_ip && mask_ip->tos) + return -EOPNOTSUPP; + if (mask_ip && mask_ip->ttl) + return -EOPNOTSUPP; key_layer |= NFP_FLOWER_LAYER_IPV4; key_size += sizeof(struct nfp_flower_ipv4); break; case cpu_to_be16(ETH_P_IPV6): + if (mask_ip && mask_ip->tos) + return -EOPNOTSUPP; + if (mask_ip && mask_ip->ttl) + return -EOPNOTSUPP; key_layer |= NFP_FLOWER_LAYER_IPV6; key_size += sizeof(struct nfp_flower_ipv6); break; @@ -157,6 +171,11 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, case cpu_to_be16(ETH_P_ARP): return -EOPNOTSUPP; + /* Currently we do not offload MPLS. */ + case cpu_to_be16(ETH_P_MPLS_UC): + case cpu_to_be16(ETH_P_MPLS_MC): + return -EOPNOTSUPP; + /* Will be included in layer 2. 
*/ case cpu_to_be16(ETH_P_8021Q): break; From 6afd33e4384060e692705912337b184c1e159aff Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Fri, 25 Aug 2017 19:31:03 +0200 Subject: [PATCH 073/118] nfp: remove incorrect mask check for vlan matching Previously the vlan tci field was incorrectly exact matched. This patch fixes this by using the flow dissector to populate the vlan tci field. Fixes: 5571e8c9f241 ("nfp: extend flower matching capabilities") Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/flower/match.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index b36511063a25..d25b5038c3a2 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -42,6 +42,7 @@ nfp_flower_compile_meta_tci(struct nfp_flower_meta_two *frame, struct tc_cls_flower_offload *flow, u8 key_type, bool mask_version) { + struct fl_flow_key *target = mask_version ? flow->mask : flow->key; struct flow_dissector_key_vlan *flow_vlan; u16 tmp_tci; @@ -50,15 +51,10 @@ nfp_flower_compile_meta_tci(struct nfp_flower_meta_two *frame, frame->nfp_flow_key_layer = key_type; frame->mask_id = ~0; - if (mask_version) { - frame->tci = cpu_to_be16(~0); - return; - } - if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_VLAN)) { flow_vlan = skb_flow_dissector_target(flow->dissector, FLOW_DISSECTOR_KEY_VLAN, - flow->key); + target); /* Populate the tci field. 
*/ if (flow_vlan->vlan_id) { tmp_tci = FIELD_PREP(NFP_FLOWER_MASK_VLAN_PRIO, From ad4540cc5aa3dccb8e1e12458d57f8c40fae5a1c Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Fri, 25 Aug 2017 21:12:17 +0200 Subject: [PATCH 074/118] net: stmmac: sun8i: Remove the compatibles Since the bindings have been controversial, and we follow the DT stable ABI rule, we shouldn't let a driver with a DT binding that might change slip through in a stable release. Remove the compatibles to make sure the driver will not probe and no-one will start using the binding currently implemented. This commit will obviously need to be reverted in due time. Signed-off-by: Maxime Ripard Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index fffd6d5fc907..39c2122a4f26 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -979,14 +979,6 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) } static const struct of_device_id sun8i_dwmac_match[] = { - { .compatible = "allwinner,sun8i-h3-emac", - .data = &emac_variant_h3 }, - { .compatible = "allwinner,sun8i-v3s-emac", - .data = &emac_variant_v3s }, - { .compatible = "allwinner,sun8i-a83t-emac", - .data = &emac_variant_a83t }, - { .compatible = "allwinner,sun50i-a64-emac", - .data = &emac_variant_a64 }, { } }; MODULE_DEVICE_TABLE(of, sun8i_dwmac_match); From 0f3086868e8889a823a6e0f3d299102aa895d947 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 25 Aug 2017 22:48:48 +0200 Subject: [PATCH 075/118] cxgb4: Fix stack out-of-bounds read due to wrong size to t4_record_mbox() Passing commands for logging to t4_record_mbox() with size MBOX_LEN, when the actual command size is actually smaller, causes out-of-bounds stack accesses in t4_record_mbox() while copying command words here: 
for (i = 0; i < size / 8; i++) entry->cmd[i] = be64_to_cpu(cmd[i]); Up to 48 bytes from the stack are then leaked to debugfs. This happens whenever we send (and log) commands described by structs fw_sched_cmd (32 bytes leaked), fw_vi_rxmode_cmd (48), fw_hello_cmd (48), fw_bye_cmd (48), fw_initialize_cmd (48), fw_reset_cmd (48), fw_pfvf_cmd (32), fw_eq_eth_cmd (16), fw_eq_ctrl_cmd (32), fw_eq_ofld_cmd (32), fw_acl_mac_cmd(16), fw_rss_glb_config_cmd(32), fw_rss_vi_config_cmd(32), fw_devlog_cmd(32), fw_vi_enable_cmd(48), fw_port_cmd(32), fw_sched_cmd(32), fw_devlog_cmd(32). The cxgb4vf driver got this right instead. When we call t4_record_mbox() to log a command reply, a MBOX_LEN size can be used though, as get_mbox_rpl() will fill cmd_rpl up completely. Fixes: 7f080c3f2ff0 ("cxgb4: Add support to enable logging of firmware mailbox commands") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 82bf7aac6cdb..0293b41171a5 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -369,12 +369,12 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, list_del(&entry.list); spin_unlock(&adap->mbox_lock); ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT; - t4_record_mbox(adap, cmd, MBOX_LEN, access, ret); + t4_record_mbox(adap, cmd, size, access, ret); return ret; } /* Copy in the new mailbox command and send it on its way ... */ - t4_record_mbox(adap, cmd, MBOX_LEN, access, 0); + t4_record_mbox(adap, cmd, size, access, 0); for (i = 0; i < size; i += 8) t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++)); @@ -426,7 +426,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, } ret = (pcie_fw & PCIE_FW_ERR_F) ? 
-ENXIO : -ETIMEDOUT; - t4_record_mbox(adap, cmd, MBOX_LEN, access, ret); + t4_record_mbox(adap, cmd, size, access, ret); dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n", *(const u8 *)cmd, mbox); t4_report_fw_error(adap); From 4e587ea71bf924f7dac621f1351653bd41e446cb Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 25 Aug 2017 15:03:10 -0700 Subject: [PATCH 076/118] ipv6: fix sparse warning on rt6i_node Commit c5cff8561d2d adds rcu grace period before freeing fib6_node. This generates a new sparse warning on rt->rt6i_node related code: net/ipv6/route.c:1394:30: error: incompatible types in comparison expression (different address spaces) ./include/net/ip6_fib.h:187:14: error: incompatible types in comparison expression (different address spaces) This commit adds "__rcu" tag for rt6i_node and makes sure corresponding rcu API is used for it. After this fix, sparse no longer generates the above warning. Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 11 +++++++---- net/ipv6/route.c | 3 ++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index e9c59db92942..af509f801084 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -105,7 +105,7 @@ struct rt6_info { * the same cache line. 
*/ struct fib6_table *rt6i_table; - struct fib6_node *rt6i_node; + struct fib6_node __rcu *rt6i_node; struct in6_addr rt6i_gateway; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3c46e9513a31..936e9ab4dda5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5556,7 +5556,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!(ifp->rt->rt6i_node)) + if (!rcu_access_pointer(ifp->rt->rt6i_node)) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a5ebf86f6be8..10b4b1f8b838 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -889,7 +889,7 @@ add: rt->dst.rt6_next = iter; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); atomic_inc(&rt->rt6i_ref); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); @@ -915,7 +915,7 @@ add: return err; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); if (!info->skip_notify) @@ -1480,8 +1480,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, int fib6_del(struct rt6_info *rt, struct nl_info *info) { + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct net *net = info->nl_net; - struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; #if RT6_DEBUG >= 2 @@ -1670,7 +1671,9 @@ static int fib6_clean_node(struct fib6_walker *w) if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", - __func__, rt, rt->rt6i_node, res); + __func__, rt, + rcu_access_pointer(rt->rt6i_node), + res); #endif continue; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 48c8c92dcbd3..86fb2411e2bd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1383,7 +1383,8 @@ static void rt6_do_update_pmtu(struct rt6_info 
*rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); + (rt->rt6i_flags & RTF_PCPU || + rcu_access_pointer(rt->rt6i_node)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, From 1e2ea8ad37be25a7cdcc974945935829d534d5d3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 26 Aug 2017 20:10:10 +0800 Subject: [PATCH 077/118] ipv6: set dst.obsolete when a cached route has expired Now it doesn't check for the cached route expiration in ipv6's dst_ops->check(), because it trusts dst_gc that would clean the cached route up when it's expired. The problem is in dst_gc, it would clean the cached route only when it's refcount is 1. If some other module (like xfrm) keeps holding it and the module only release it when dst_ops->check() fails. But without checking for the cached route expiration, .check() may always return true. Meanwhile, without releasing the cached route, dst_gc couldn't del it. It will cause this cached route never to expire. This patch is to set dst.obsolete with DST_OBSOLETE_KILL in .gc when it's expired, and check obsolete != DST_OBSOLETE_FORCE_CHK in .check. Note that this is even needed when ipv6 dst_gc timer is removed one day. It would set dst.obsolete in .redirect and .update_pmtu instead, and check for cached route expiration when getting it, just like what ipv4 route does. Reported-by: Jianlin Shi Signed-off-by: Xin Long Acked-by: Hannes Frederic Sowa Signed-off-by: David S. 
Miller --- net/ipv6/ip6_fib.c | 4 +++- net/ipv6/route.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 10b4b1f8b838..e1c85bb4eac0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1795,8 +1795,10 @@ static int fib6_age(struct rt6_info *rt, void *arg) } gc_args->more++; } else if (rt->rt6i_flags & RTF_CACHE) { + if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) + rt->dst.obsolete = DST_OBSOLETE_KILL; if (atomic_read(&rt->dst.__refcnt) == 1 && - time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + rt->dst.obsolete == DST_OBSOLETE_KILL) { RT6_TRACE("aging clone %p\n", rt); return -1; } else if (rt->rt6i_flags & RTF_GATEWAY) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 86fb2411e2bd..2d0e7798c793 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -440,7 +440,8 @@ static bool rt6_check_expired(const struct rt6_info *rt) if (time_after(jiffies, rt->dst.expires)) return true; } else if (rt->dst.from) { - return rt6_check_expired((struct rt6_info *) rt->dst.from); + return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || + rt6_check_expired((struct rt6_info *)rt->dst.from); } return false; } From ef9a5a62c63456cbba1beef2fc1372fce105fbbc Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sat, 26 Aug 2017 21:13:48 -0700 Subject: [PATCH 078/118] bridge: check for null fdb->dst before notifying switchdev drivers current switchdev drivers dont seem to support offloading fdb entries pointing to the bridge device which have fdb->dst not set to any port. This patch adds a NULL fdb->dst check in the switchdev notifier code. 
This patch fixes the below NULL ptr dereference: $bridge fdb add 00:02:00:00:00:33 dev br0 self [ 69.953374] BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 [ 69.954044] IP: br_switchdev_fdb_notify+0x29/0x80 [ 69.954044] PGD 66527067 [ 69.954044] P4D 66527067 [ 69.954044] PUD 7899c067 [ 69.954044] PMD 0 [ 69.954044] [ 69.954044] Oops: 0000 [#1] SMP [ 69.954044] Modules linked in: [ 69.954044] CPU: 1 PID: 3074 Comm: bridge Not tainted 4.13.0-rc6+ #1 [ 69.954044] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 [ 69.954044] task: ffff88007b827140 task.stack: ffffc90001564000 [ 69.954044] RIP: 0010:br_switchdev_fdb_notify+0x29/0x80 [ 69.954044] RSP: 0018:ffffc90001567918 EFLAGS: 00010246 [ 69.954044] RAX: 0000000000000000 RBX: ffff8800795e0880 RCX: 00000000000000c0 [ 69.954044] RDX: ffffc90001567920 RSI: 000000000000001c RDI: ffff8800795d0600 [ 69.954044] RBP: ffffc90001567938 R08: ffff8800795d0600 R09: 0000000000000000 [ 69.954044] R10: ffffc90001567a88 R11: ffff88007b849400 R12: ffff8800795e0880 [ 69.954044] R13: ffff8800795d0600 R14: ffffffff81ef8880 R15: 000000000000001c [ 69.954044] FS: 00007f93d3085700(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 [ 69.954044] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 69.954044] CR2: 0000000000000008 CR3: 0000000066551000 CR4: 00000000000006e0 [ 69.954044] Call Trace: [ 69.954044] fdb_notify+0x3f/0xf0 [ 69.954044] __br_fdb_add.isra.12+0x1a7/0x370 [ 69.954044] br_fdb_add+0x178/0x280 [ 69.954044] rtnl_fdb_add+0x10a/0x200 [ 69.954044] rtnetlink_rcv_msg+0x1b4/0x240 [ 69.954044] ? skb_free_head+0x21/0x40 [ 69.954044] ? rtnl_calcit.isra.18+0xf0/0xf0 [ 69.954044] netlink_rcv_skb+0xed/0x120 [ 69.954044] rtnetlink_rcv+0x15/0x20 [ 69.954044] netlink_unicast+0x180/0x200 [ 69.954044] netlink_sendmsg+0x291/0x370 [ 69.954044] ___sys_sendmsg+0x180/0x2e0 [ 69.954044] ? filemap_map_pages+0x2db/0x370 [ 69.954044] ? 
do_wp_page+0x11d/0x420 [ 69.954044] ? __handle_mm_fault+0x794/0xd80 [ 69.954044] ? vma_link+0xcb/0xd0 [ 69.954044] __sys_sendmsg+0x4c/0x90 [ 69.954044] SyS_sendmsg+0x12/0x20 [ 69.954044] do_syscall_64+0x63/0xe0 [ 69.954044] entry_SYSCALL64_slow_path+0x25/0x25 [ 69.954044] RIP: 0033:0x7f93d2bad690 [ 69.954044] RSP: 002b:00007ffc7217a638 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 69.954044] RAX: ffffffffffffffda RBX: 00007ffc72182eac RCX: 00007f93d2bad690 [ 69.954044] RDX: 0000000000000000 RSI: 00007ffc7217a670 RDI: 0000000000000003 [ 69.954044] RBP: 0000000059a1f7f8 R08: 0000000000000006 R09: 000000000000000a [ 69.954044] R10: 00007ffc7217a400 R11: 0000000000000246 R12: 00007ffc7217a670 [ 69.954044] R13: 00007ffc72182a98 R14: 00000000006114c0 R15: 00007ffc72182aa0 [ 69.954044] Code: 1f 00 66 66 66 66 90 55 48 89 e5 48 83 ec 20 f6 47 20 04 74 0a 83 fe 1c 74 09 83 fe 1d 74 2c c9 66 90 c3 48 8b 47 10 48 8d 55 e8 <48> 8b 70 08 0f b7 47 1e 48 83 c7 18 48 89 7d f0 bf 03 00 00 00 [ 69.954044] RIP: br_switchdev_fdb_notify+0x29/0x80 RSP: ffffc90001567918 [ 69.954044] CR2: 0000000000000008 [ 69.954044] ---[ end trace 03e9eec4a82c238b ]--- Fixes: 6b26b51b1d13 ("net: bridge: Add support for notifying devices about FDB add/del") Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- net/bridge/br_switchdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 181a44d0f1da..f6b1c7de059d 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -115,7 +115,7 @@ br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac, void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) { - if (!fdb->added_by_user) + if (!fdb->added_by_user || !fdb->dst) return; switch (type) { From c7848399ec7612c2fa4dc0c2eeb9e0b89d00afef Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 28 Aug 2017 17:10:51 -0700 Subject: [PATCH 079/118] net: dsa: Don't dereference dst->cpu_dp->netdev If we do not have a master network device attached dst->cpu_dp will be NULL and accessing cpu_dp->netdev will create a trace similar to the one below. The correct check is on dst->cpu_dp period. [ 1.004650] DSA: switch 0 0 parsed [ 1.008078] Unable to handle kernel NULL pointer dereference at virtual address 00000010 [ 1.016195] pgd = c0003000 [ 1.018918] [00000010] *pgd=80000000004003, *pmd=00000000 [ 1.024349] Internal error: Oops: 206 [#1] SMP ARM [ 1.029157] Modules linked in: [ 1.032228] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-rc6-00071-g45b45afab9bd-dirty #7 [ 1.040772] Hardware name: Broadcom STB (Flattened Device Tree) [ 1.046704] task: ee08f840 task.stack: ee090000 [ 1.051258] PC is at dsa_register_switch+0x5e0/0x9dc [ 1.056234] LR is at dsa_register_switch+0x5d0/0x9dc [ 1.061211] pc : [] lr : [] psr: 60000213 [ 1.067491] sp : ee091d88 ip : 00000000 fp : 0000000c [ 1.072728] r10: 00000000 r9 : 00000001 r8 : ee208010 [ 1.077965] r7 : ee2b57b0 r6 : ee2b5780 r5 : 00000000 r4 : ee208e0c [ 1.084506] r3 : 00000000 r2 : 00040d00 r1 : 2d1b2000 r0 : 00000016 [ 1.091050] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user [ 1.098199] Control: 32c5387d Table: 00003000 DAC: fffffffd [ 1.103957] Process swapper/0 (pid: 1, stack limit = 
0xee090210) Reported-by: Dan Carpenter Fixes: 6d3c8c0dd88a ("net: dsa: Remove master_netdev and use dst->cpu_dp->netdev") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c442051d5a55..20bc9c56fca0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -577,7 +577,7 @@ static int dsa_dst_parse(struct dsa_switch_tree *dst) return err; } - if (!dst->cpu_dp->netdev) { + if (!dst->cpu_dp) { pr_warn("Tree has no master device\n"); return -EINVAL; } From e8d411d2980723b8f8ba8e4dd78b694c5fd9ea3e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 28 Aug 2017 10:45:01 +0800 Subject: [PATCH 080/118] ipv6: do not set sk_destruct in IPV6_ADDRFORM sockopt ChunYu found a kernel warn_on during syzkaller fuzzing: [40226.038539] WARNING: CPU: 5 PID: 23720 at net/ipv4/af_inet.c:152 inet_sock_destruct+0x78d/0x9a0 [40226.144849] Call Trace: [40226.147590] [40226.149859] dump_stack+0xe2/0x186 [40226.176546] __warn+0x1a4/0x1e0 [40226.180066] warn_slowpath_null+0x31/0x40 [40226.184555] inet_sock_destruct+0x78d/0x9a0 [40226.246355] __sk_destruct+0xfa/0x8c0 [40226.290612] rcu_process_callbacks+0xaa0/0x18a0 [40226.336816] __do_softirq+0x241/0x75e [40226.367758] irq_exit+0x1f6/0x220 [40226.371458] smp_apic_timer_interrupt+0x7b/0xa0 [40226.376507] apic_timer_interrupt+0x93/0xa0 The warn_on happned when sk->sk_rmem_alloc wasn't 0 in inet_sock_destruct. As after commit f970bd9e3a06 ("udp: implement memory accounting helpers"), udp has changed to use udp_destruct_sock as sk_destruct where it would udp_rmem_release all rmem. But IPV6_ADDRFORM sockopt sets sk_destruct with inet_sock_destruct after changing family to PF_INET. If rmem is not 0 at that time, and there is no place to release rmem before calling inet_sock_destruct, the warn_on will be triggered. This patch is to fix it by not setting sk_destruct in IPV6_ADDRFORM sockopt any more. 
This is safe because the IPV6_ADDRFORM sockopt only works for TCP and UDP: a TCP sock already has its sk_destruct set to inet_sock_destruct, and a UDP sock has had it set to udp_destruct_sock since it was created. Fixes: f970bd9e3a06 ("udp: implement memory accounting helpers") Reported-by: ChunYu Wang Signed-off-by: Xin Long Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 02d795fe3d7f..a5e466d4e093 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -242,7 +242,6 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, pktopt = xchg(&np->pktoptions, NULL); kfree_skb(pktopt); - sk->sk_destruct = inet_sock_destruct; /* * ... and add it to the refcnt debug socks count * in the new family. -acme From d55c60eba0ef44ec21831ce26db300763eafd865 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Mon, 28 Aug 2017 17:57:02 +0200 Subject: [PATCH 081/118] tipc: permit bond slave as bearer For a bond slave device as a tipc bearer, the dev represents the bond interface and orig_dev represents the slave in tipc_l2_rcv_msg(). Since we decode the tipc_ptr from bonding device (dev), we fail to find the bearer and thus tipc links are not established. In this commit, we register the tipc protocol callback per device and look for tipc bearer from both the devices. Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. 
Miller --- net/tipc/bearer.c | 26 +++++++++++--------------- net/tipc/bearer.h | 2 ++ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 767e0537dde5..89cd061c4468 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -65,6 +65,8 @@ static struct tipc_bearer *bearer_get(struct net *net, int bearer_id) } static void bearer_disable(struct net *net, struct tipc_bearer *b); +static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); /** * tipc_media_find - locates specified media object by name @@ -428,6 +430,10 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, /* Associate TIPC bearer with L2 bearer */ rcu_assign_pointer(b->media_ptr, dev); + b->pt.dev = dev; + b->pt.type = htons(ETH_P_TIPC); + b->pt.func = tipc_l2_rcv_msg; + dev_add_pack(&b->pt); memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len); b->bcast_addr.media_id = b->media->type_id; @@ -447,6 +453,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b) struct net_device *dev; dev = (struct net_device *)rtnl_dereference(b->media_ptr); + dev_remove_pack(&b->pt); RCU_INIT_POINTER(dev->tipc_ptr, NULL); synchronize_net(); dev_put(dev); @@ -594,11 +601,12 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(dev->tipc_ptr); + b = rcu_dereference_rtnl(dev->tipc_ptr) ?: + rcu_dereference_rtnl(orig_dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb->next = NULL; - tipc_rcv(dev_net(dev), skb, b); + tipc_rcv(dev_net(b->pt.dev), skb, b); rcu_read_unlock(); return NET_RX_SUCCESS; } @@ -659,11 +667,6 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, return NOTIFY_OK; } -static struct packet_type tipc_packet_type __read_mostly = { - .type = 
htons(ETH_P_TIPC), - .func = tipc_l2_rcv_msg, -}; - static struct notifier_block notifier = { .notifier_call = tipc_l2_device_event, .priority = 0, @@ -671,19 +674,12 @@ static struct notifier_block notifier = { int tipc_bearer_setup(void) { - int err; - - err = register_netdevice_notifier(&notifier); - if (err) - return err; - dev_add_pack(&tipc_packet_type); - return 0; + return register_netdevice_notifier(&notifier); } void tipc_bearer_cleanup(void) { unregister_netdevice_notifier(&notifier); - dev_remove_pack(&tipc_packet_type); } void tipc_bearer_stop(struct net *net) diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 635c9086e19a..e07a55a80c18 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -131,6 +131,7 @@ struct tipc_media { * @name: bearer name (format = media:interface) * @media: ptr to media structure associated with bearer * @bcast_addr: media address used in broadcasting + * @pt: packet type for bearer * @rcu: rcu struct for tipc_bearer * @priority: default link priority for bearer * @window: default window size for bearer @@ -151,6 +152,7 @@ struct tipc_bearer { char name[TIPC_MAX_BEARER_NAME]; struct tipc_media *media; struct tipc_media_addr bcast_addr; + struct packet_type pt; struct rcu_head rcu; u32 priority; u32 window; From edbd58be15a957f6a760c4a514cd475217eb97fd Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 28 Aug 2017 14:29:41 -0400 Subject: [PATCH 082/118] packet: Don't write vnet header beyond end of buffer ... which may happen with certain values of tp_reserve and maclen. Fixes: 58d19b19cd99 ("packet: vnet_hdr support for tpacket_rcv") Signed-off-by: Benjamin Poirier Cc: Willem de Bruijn Acked-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- net/packet/af_packet.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 008a45ca3112..1c61af9af67d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2191,6 +2191,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct timespec ts; __u32 ts_status; bool is_drop_n_account = false; + bool do_vnet = false; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing @@ -2241,8 +2242,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; - if (po->has_vnet_hdr) + if (po->has_vnet_hdr) { netoff += sizeof(struct virtio_net_hdr); + do_vnet = true; + } macoff = netoff - maclen; } if (po->tp_version <= TPACKET_V2) { @@ -2259,8 +2262,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, skb_set_owner_r(copy_skb, sk); } snaplen = po->rx_ring.frame_size - macoff; - if ((int)snaplen < 0) + if ((int)snaplen < 0) { snaplen = 0; + do_vnet = false; + } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { @@ -2273,6 +2278,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; + do_vnet = false; } } spin_lock(&sk->sk_receive_queue.lock); @@ -2298,7 +2304,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, } spin_unlock(&sk->sk_receive_queue.lock); - if (po->has_vnet_hdr) { + if (do_vnet) { if (virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true)) { From 278175aba363dcc5b0978abe16fa39dcdca67ffb Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Mon, 28 Aug 2017 21:52:08 +0300 Subject: [PATCH 083/118] net:ethernet:aquantia: 
Extra spinlocks removed. This patch removes datapath spinlocks which does not perform any useful work. Fixes: 6e70637f9f1e ("net: ethernet: aquantia: Add ring support code") Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/aq_nic.c | 40 +++++-------------- .../net/ethernet/aquantia/atlantic/aq_ring.c | 1 - .../net/ethernet/aquantia/atlantic/aq_utils.h | 1 - .../net/ethernet/aquantia/atlantic/aq_vec.c | 11 +---- 4 files changed, 13 insertions(+), 40 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 9ee1c5016784..08b727506575 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -597,14 +597,11 @@ exit: } int aq_nic_xmit(struct aq_nic_s *self, struct sk_buff *skb) -__releases(&ring->lock) -__acquires(&ring->lock) { struct aq_ring_s *ring = NULL; unsigned int frags = 0U; unsigned int vec = skb->queue_mapping % self->aq_nic_cfg.vecs; unsigned int tc = 0U; - unsigned int trys = AQ_CFG_LOCK_TRYS; int err = NETDEV_TX_OK; bool is_nic_in_bad_state; @@ -628,36 +625,21 @@ __acquires(&ring->lock) goto err_exit; } - do { - if (spin_trylock(&ring->header.lock)) { - frags = aq_nic_map_skb(self, skb, ring); + frags = aq_nic_map_skb(self, skb, ring); - if (likely(frags)) { - err = self->aq_hw_ops.hw_ring_tx_xmit( - self->aq_hw, - ring, frags); - if (err >= 0) { - if (aq_ring_avail_dx(ring) < - AQ_CFG_SKB_FRAGS_MAX + 1) - aq_nic_ndev_queue_stop( - self, - ring->idx); + if (likely(frags)) { + err = self->aq_hw_ops.hw_ring_tx_xmit(self->aq_hw, + ring, + frags); + if (err >= 0) { + if (aq_ring_avail_dx(ring) < AQ_CFG_SKB_FRAGS_MAX + 1) + aq_nic_ndev_queue_stop(self, ring->idx); - ++ring->stats.tx.packets; - ring->stats.tx.bytes += skb->len; - } - } else { - err = NETDEV_TX_BUSY; - } - - spin_unlock(&ring->header.lock); - break; + ++ring->stats.tx.packets; + ring->stats.tx.bytes += skb->len; } - } 
while (--trys); - - if (!trys) { + } else { err = NETDEV_TX_BUSY; - goto err_exit; } err_exit: diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 9a0817938eca..ec5579fb8268 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -101,7 +101,6 @@ int aq_ring_init(struct aq_ring_s *self) self->hw_head = 0; self->sw_head = 0; self->sw_tail = 0; - spin_lock_init(&self->header.lock); return 0; } diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_utils.h b/drivers/net/ethernet/aquantia/atlantic/aq_utils.h index f6012b34abe6..e12bcdfb874a 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_utils.h @@ -17,7 +17,6 @@ #define AQ_DIMOF(_ARY_) ARRAY_SIZE(_ARY_) struct aq_obj_s { - spinlock_t lock; /* spinlock for nic/rings processing */ atomic_t flags; }; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index ad5b4d4dac7f..fee446af748f 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -34,8 +34,6 @@ struct aq_vec_s { #define AQ_VEC_RX_ID 1 static int aq_vec_poll(struct napi_struct *napi, int budget) -__releases(&self->lock) -__acquires(&self->lock) { struct aq_vec_s *self = container_of(napi, struct aq_vec_s, napi); struct aq_ring_s *ring = NULL; @@ -47,7 +45,7 @@ __acquires(&self->lock) if (!self) { err = -EINVAL; - } else if (spin_trylock(&self->header.lock)) { + } else { for (i = 0U, ring = self->ring[0]; self->tx_rings > i; ++i, ring = self->ring[i]) { if (self->aq_hw_ops->hw_ring_tx_head_update) { @@ -105,11 +103,8 @@ __acquires(&self->lock) self->aq_hw_ops->hw_irq_enable(self->aq_hw, 1U << self->aq_ring_param.vec_idx); } - -err_exit: - spin_unlock(&self->header.lock); } - +err_exit: return work_done; } @@ -185,8 +180,6 @@ int aq_vec_init(struct 
aq_vec_s *self, struct aq_hw_ops *aq_hw_ops, self->aq_hw_ops = aq_hw_ops; self->aq_hw = aq_hw; - spin_lock_init(&self->header.lock); - for (i = 0U, ring = self->ring[0]; self->tx_rings > i; ++i, ring = self->ring[i]) { err = aq_ring_init(&ring[AQ_VEC_TX_ID]); From 64fc7953ffd9424726988dd04945c28141ee41af Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Mon, 28 Aug 2017 21:52:09 +0300 Subject: [PATCH 084/118] net:ethernet:aquantia: Fix for number of RSS queues. The number of RSS queues should not be more than the number of CPUs. Using more queues than that does not improve performance, and also causes problems on some motherboards. Fixes: 94f6c9e4cdf6 ("net: ethernet: aquantia: Support for NIC-specific code") Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 08b727506575..d6d8e7074c83 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -103,6 +103,8 @@ int aq_nic_cfg_start(struct aq_nic_s *self) else cfg->vecs = 1U; + cfg->num_rss_queues = min(cfg->vecs, AQ_CFG_NUM_RSS_QUEUES_DEF); + cfg->irq_type = aq_pci_func_get_irq_type(self->aq_pci_func); if ((cfg->irq_type == AQ_HW_IRQ_LEGACY) || From 0a402e7b9725611069dad4c873d1516f8c805f38 Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Mon, 28 Aug 2017 21:52:10 +0300 Subject: [PATCH 085/118] net:ethernet:aquantia: Workaround for HW checksum bug. The hardware has a HW Checksum Offload bug: small TCP packets (with length <= 60 bytes) have a wrong "checksum valid" bit. The solution is to ignore the checksum valid bit for small packets (with length <= 60 bytes) and mark them as CHECKSUM_NONE to allow the network stack to recalculate the checksum itself. 
Fixes: ccf9a5ed14be ("net: ethernet: aquantia: Atlantic A0 and B0 specific functions.") Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c | 6 ++++++ drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c index faeb4935ef3e..c5a02df7a48b 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c @@ -629,6 +629,12 @@ static int hw_atl_a0_hw_ring_rx_receive(struct aq_hw_s *self, buff->is_udp_cso = (is_err & 0x10U) ? 0 : 1; else if (0x0U == (pkt_type & 0x1CU)) buff->is_tcp_cso = (is_err & 0x10U) ? 0 : 1; + + /* Checksum offload workaround for small packets */ + if (rxd_wb->pkt_len <= 60) { + buff->is_ip_cso = 0U; + buff->is_cso_err = 0U; + } } is_err &= ~0x18U; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index 1bceb7358e5c..21784cc39dab 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -645,6 +645,12 @@ static int hw_atl_b0_hw_ring_rx_receive(struct aq_hw_s *self, buff->is_udp_cso = buff->is_cso_err ? 0U : 1U; else if (0x0U == (pkt_type & 0x1CU)) buff->is_tcp_cso = buff->is_cso_err ? 0U : 1U; + + /* Checksum offload workaround for small packets */ + if (rxd_wb->pkt_len <= 60) { + buff->is_ip_cso = 0U; + buff->is_cso_err = 0U; + } } is_err &= ~0x18U; From bd8ed4415ff8584ccdd1f61c8d7279dc1f9e623e Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Mon, 28 Aug 2017 21:52:11 +0300 Subject: [PATCH 086/118] net:ethernet:aquantia: Fix for incorrect speed index. The driver chooses the optimal interrupt throttling settings depending on the current link speed.
Due to this bug, the link_status field from aq_hw is never updated and, as a result, the same interrupt throttling values are always used. Fixes: 3d2ff7eebe26 ("net: ethernet: aquantia: Atlantic hardware abstraction layer") Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/aq_hw.h | 3 +- .../net/ethernet/aquantia/atlantic/aq_nic.c | 29 +++++++++---------- .../aquantia/atlantic/hw_atl/hw_atl_utils.c | 4 +-- .../aquantia/atlantic/hw_atl/hw_atl_utils.h | 3 +- 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h index fce0fd3f23ff..bf9b3f020e10 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h @@ -105,8 +105,7 @@ struct aq_hw_ops { int (*hw_set_mac_address)(struct aq_hw_s *self, u8 *mac_addr); - int (*hw_get_link_status)(struct aq_hw_s *self, - struct aq_hw_link_status_s *link_status); + int (*hw_get_link_status)(struct aq_hw_s *self); int (*hw_set_link_speed)(struct aq_hw_s *self, u32 speed); diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index d6d8e7074c83..dce17a5b82b1 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -125,33 +125,30 @@ static void aq_nic_service_timer_cb(unsigned long param) struct net_device *ndev = aq_nic_get_ndev(self); int err = 0; unsigned int i = 0U; - struct aq_hw_link_status_s link_status; struct aq_ring_stats_rx_s stats_rx; struct aq_ring_stats_tx_s stats_tx; if (aq_utils_obj_test(&self->header.flags, AQ_NIC_FLAGS_IS_NOT_READY)) goto err_exit; - err = self->aq_hw_ops.hw_get_link_status(self->aq_hw, &link_status); + err = self->aq_hw_ops.hw_get_link_status(self->aq_hw); if (err < 0) goto err_exit; + self->link_status = self->aq_hw->aq_link_status; + self->aq_hw_ops.hw_interrupt_moderation_set(self->aq_hw, -
self->aq_nic_cfg.is_interrupt_moderation); + self->aq_nic_cfg.is_interrupt_moderation); - if (memcmp(&link_status, &self->link_status, sizeof(link_status))) { - if (link_status.mbps) { - aq_utils_obj_set(&self->header.flags, - AQ_NIC_FLAG_STARTED); - aq_utils_obj_clear(&self->header.flags, - AQ_NIC_LINK_DOWN); - netif_carrier_on(self->ndev); - } else { - netif_carrier_off(self->ndev); - aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN); - } - - self->link_status = link_status; + if (self->link_status.mbps) { + aq_utils_obj_set(&self->header.flags, + AQ_NIC_FLAG_STARTED); + aq_utils_obj_clear(&self->header.flags, + AQ_NIC_LINK_DOWN); + netif_carrier_on(self->ndev); + } else { + netif_carrier_off(self->ndev); + aq_utils_obj_set(&self->header.flags, AQ_NIC_LINK_DOWN); } memset(&stats_rx, 0U, sizeof(struct aq_ring_stats_rx_s)); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 8d6d8f5804da..7a1332e9b9bc 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -313,11 +313,11 @@ void hw_atl_utils_mpi_set(struct aq_hw_s *self, err_exit:; } -int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self, - struct aq_hw_link_status_s *link_status) +int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self) { u32 cp0x036C = aq_hw_read_reg(self, HW_ATL_MPI_STATE_ADR); u32 link_speed_mask = cp0x036C >> HW_ATL_MPI_SPEED_SHIFT; + struct aq_hw_link_status_s *link_status = &self->aq_link_status; if (!link_speed_mask) { link_status->mbps = 0U; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h index a66aee51ab5b..e0360a6b2202 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h @@ -180,8 +180,7 @@ void hw_atl_utils_mpi_set(struct 
aq_hw_s *self, int hw_atl_utils_mpi_set_speed(struct aq_hw_s *self, u32 speed, enum hal_atl_utils_fw_state_e state); -int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self, - struct aq_hw_link_status_s *link_status); +int hw_atl_utils_mpi_get_link_status(struct aq_hw_s *self); int hw_atl_utils_get_mac_permanent(struct aq_hw_s *self, struct aq_hw_caps_s *aq_hw_caps, From b21f502f84be082fb63cca8e7ab6eb8f7ee88024 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 28 Aug 2017 21:52:12 +0300 Subject: [PATCH 087/118] net:ethernet:aquantia: Fix for multicast filter handling. Since the HW supports up to 32 multicast filters we should track count of multicast filters to avoid overflow. If we attempt to add >32 multicast filter - just set NETIF_ALLMULTI flag instead. Fixes: 94f6c9e4cdf6 ("net: ethernet: aquantia: Support for NIC-specific code") Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_nic.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index dce17a5b82b1..6ac9e2602d6d 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -669,11 +669,26 @@ int aq_nic_set_multicast_list(struct aq_nic_s *self, struct net_device *ndev) netdev_for_each_mc_addr(ha, ndev) { ether_addr_copy(self->mc_list.ar[i++], ha->addr); ++self->mc_list.count; + + if (i >= AQ_CFG_MULTICAST_ADDRESS_MAX) + break; } - return self->aq_hw_ops.hw_multicast_list_set(self->aq_hw, + if (i >= AQ_CFG_MULTICAST_ADDRESS_MAX) { + /* Number of filters is too big: atlantic does not support this. + * Force all multi filter to support this. 
+ * With this we disable all UC filters and setup "all pass" + * multicast mask + */ + self->packet_filter |= IFF_ALLMULTI; + self->aq_hw->aq_nic_cfg->mc_list_count = 0; + return self->aq_hw_ops.hw_packet_filter_set(self->aq_hw, + self->packet_filter); + } else { + return self->aq_hw_ops.hw_multicast_list_set(self->aq_hw, self->mc_list.ar, self->mc_list.count); + } } int aq_nic_set_mtu(struct aq_nic_s *self, int new_mtu) From 6d3f58e09f0c1457aa5a6c60e5da08786dd8a18e Mon Sep 17 00:00:00 2001 From: Pavel Belous Date: Mon, 28 Aug 2017 21:52:13 +0300 Subject: [PATCH 088/118] net:ethernet:aquantia: Show info message if bad firmware version detected. We should inform user about wrong firmware version by printing message in dmesg. Fixes: 3d2ff7eebe26 ("net: ethernet: aquantia: Atlantic hardware abstraction layer") Signed-off-by: Pavel Belous Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 7a1332e9b9bc..4f5ec9a0fbfb 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -141,6 +141,12 @@ static int hw_atl_utils_init_ucp(struct aq_hw_s *self, err = hw_atl_utils_ver_match(aq_hw_caps->fw_ver_expected, aq_hw_read_reg(self, 0x18U)); + + if (err < 0) + pr_err("%s: Bad FW version detected: expected=%x, actual=%x\n", + AQ_CFG_DRV_NAME, + aq_hw_caps->fw_ver_expected, + aq_hw_read_reg(self, 0x18U)); return err; } From e7562597b46d099a78d45c781ee8d1a7a93c53f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 29 Aug 2017 22:15:16 +0300 Subject: [PATCH 089/118] nfp: double free on error in probe Both the nfp_net_pf_app_start() and the nfp_net_pci_probe() functions call nfp_net_pf_app_stop_ctrl(pf) so there is a double free. 
The free should be done from the probe function because it's allocated there so I have removed the call from nfp_net_pf_app_start(). Fixes: 02082701b974 ("nfp: create control vNICs and wire up rx/tx") Signed-off-by: Dan Carpenter Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c index 1aca4e57bf41..34b985384d26 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c @@ -458,7 +458,7 @@ static int nfp_net_pf_app_start(struct nfp_pf *pf) err = nfp_app_start(pf->app, pf->ctrl_vnic); if (err) - goto err_ctrl_stop; + return err; if (pf->num_vfs) { err = nfp_app_sriov_enable(pf->app, pf->num_vfs); @@ -470,8 +470,6 @@ static int nfp_net_pf_app_start(struct nfp_pf *pf) err_app_stop: nfp_app_stop(pf->app); -err_ctrl_stop: - nfp_net_pf_app_stop_ctrl(pf); return err; } From 183db481279437590f75a8a0479d512e5dd597de Mon Sep 17 00:00:00 2001 From: Quan Nguyen Date: Tue, 29 Aug 2017 15:43:12 -0700 Subject: [PATCH 090/118] drivers: net: xgene: Correct probe sequence handling The phy is connected at early stage of probe but not properly disconnected if error occurs. This patch fixes the issue. Also changing the return type of xgene_enet_check_phy_handle(), since this function always returns success. Signed-off-by: Quan Nguyen Signed-off-by: Iyappan Subramanian Signed-off-by: David S. 
Miller --- .../net/ethernet/apm/xgene/xgene_enet_main.c | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 1d307f2def2d..6e253d913fe2 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -1661,21 +1661,21 @@ static int xgene_enet_get_irqs(struct xgene_enet_pdata *pdata) return 0; } -static int xgene_enet_check_phy_handle(struct xgene_enet_pdata *pdata) +static void xgene_enet_check_phy_handle(struct xgene_enet_pdata *pdata) { int ret; if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) - return 0; + return; if (!IS_ENABLED(CONFIG_MDIO_XGENE)) - return 0; + return; ret = xgene_enet_phy_connect(pdata->ndev); if (!ret) pdata->mdio_driver = true; - return 0; + return; } static void xgene_enet_gpiod_get(struct xgene_enet_pdata *pdata) @@ -1779,10 +1779,6 @@ static int xgene_enet_get_resources(struct xgene_enet_pdata *pdata) if (ret) return ret; - ret = xgene_enet_check_phy_handle(pdata); - if (ret) - return ret; - xgene_enet_gpiod_get(pdata); pdata->clk = devm_clk_get(&pdev->dev, NULL); @@ -2097,9 +2093,11 @@ static int xgene_enet_probe(struct platform_device *pdev) goto err; } + xgene_enet_check_phy_handle(pdata); + ret = xgene_enet_init_hw(pdata); if (ret) - goto err; + goto err2; link_state = pdata->mac_ops->link_state; if (pdata->phy_mode == PHY_INTERFACE_MODE_XGMII) { @@ -2117,29 +2115,30 @@ static int xgene_enet_probe(struct platform_device *pdev) spin_lock_init(&pdata->stats_lock); ret = xgene_extd_stats_init(pdata); if (ret) - goto err2; + goto err1; xgene_enet_napi_add(pdata); ret = register_netdev(ndev); if (ret) { netdev_err(ndev, "Failed to register netdev\n"); - goto err2; + goto err1; } return 0; -err2: +err1: /* * If necessary, free_netdev() will call netif_napi_del() and undo * the effects of xgene_enet_napi_add()'s calls to netif_napi_add(). 
*/ + xgene_enet_delete_desc_rings(pdata); + +err2: if (pdata->mdio_driver) xgene_enet_phy_disconnect(pdata); else if (phy_interface_mode_is_rgmii(pdata->phy_mode)) xgene_enet_mdio_remove(pdata); -err1: - xgene_enet_delete_desc_rings(pdata); err: free_netdev(ndev); return ret; From f0e82d737d1b727e93adec16cf7132391249356a Mon Sep 17 00:00:00 2001 From: Sekhar Nori Date: Wed, 30 Aug 2017 13:37:13 +0530 Subject: [PATCH 091/118] net: ti: cpsw-common: dont print error if ti_cm_get_macid() fails It is quite common for ti_cm_get_macid() to fail on some of the platforms it is invoked on. They include any platform where mac address is not part of SoC register space. On these platforms, mac address is read and populated in device-tree by bootloader. An example is TI DA850. Downgrade the severity of message to "information", so it does not spam logs when 'quiet' boot is desired. Signed-off-by: Sekhar Nori Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw-common.c b/drivers/net/ethernet/ti/cpsw-common.c index 56ba411421f0..38d1cc557c11 100644 --- a/drivers/net/ethernet/ti/cpsw-common.c +++ b/drivers/net/ethernet/ti/cpsw-common.c @@ -96,7 +96,7 @@ int ti_cm_get_macid(struct device *dev, int slave, u8 *mac_addr) if (of_machine_is_compatible("ti,dra7")) return davinci_emac_3517_get_macid(dev, 0x514, slave, mac_addr); - dev_err(dev, "incompatible machine/device type for reading mac address\n"); + dev_info(dev, "incompatible machine/device type for reading mac address\n"); return -ENOENT; } EXPORT_SYMBOL_GPL(ti_cm_get_macid); From 33c52b6718d2a6cb414440c98560818910d896dc Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Thu, 13 Jul 2017 13:45:11 -0500 Subject: [PATCH 092/118] net/mlx5e: Check for qos capability in dcbnl_initialize qos capability is the master capability bit that determines if the DCBX is supported for the PCI function. 
If this bit is off, driver cannot run any dcbx code. Fixes: e207b7e99176 ("net/mlx5e: ConnectX-4 firmware support for DCBX") Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 2eb54d36e16e..810b51029c7f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -754,6 +754,9 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) { struct mlx5e_dcbx *dcbx = &priv->dcbx; + if (!MLX5_CAP_GEN(priv->mdev, qos)) + return; + if (MLX5_CAP_GEN(priv->mdev, dcbx)) mlx5e_dcbnl_query_dcbx_mode(priv, &dcbx->mode); From 9e10bf1d349787f373484d835efe2dbb5f9c5614 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 10 Jul 2017 14:00:23 -0500 Subject: [PATCH 093/118] net/mlx5e: Fix DCB_CAP_ATTR_DCBX capability for DCBNL getcap. Current code doesn't report DCB_CAP_DCBX_HOST capability when query through getcap. User space lldptool expects capability to have HOST mode set when it wants to configure DCBX CEE mode. In absence of HOST mode capability, lldptool fails to switch to CEE mode. This fix returns DCB_CAP_DCBX_HOST capability when port's DCBX controlled mode is under software control. 
Fixes: 3a6a931dfb8e ("net/mlx5e: Support DCBX CEE API") Signed-off-by: Huy Nguyen Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../ethernet/mellanox/mlx5/core/en_dcbnl.c | 21 +++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 0039b4725405..2f26fb34d741 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -263,6 +263,7 @@ struct mlx5e_dcbx { /* The only setting that cannot be read from FW */ u8 tc_tsa[IEEE_8021QAZ_MAX_TCS]; + u8 cap; }; #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 810b51029c7f..c1d384fca4dc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -288,13 +288,8 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, static u8 mlx5e_dcbnl_getdcbx(struct net_device *dev) { struct mlx5e_priv *priv = netdev_priv(dev); - struct mlx5e_dcbx *dcbx = &priv->dcbx; - u8 mode = DCB_CAP_DCBX_VER_IEEE | DCB_CAP_DCBX_VER_CEE; - if (dcbx->mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) - mode |= DCB_CAP_DCBX_HOST; - - return mode; + return priv->dcbx.cap; } static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode) @@ -312,6 +307,7 @@ static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode) /* set dcbx to fw controlled */ if (!mlx5e_dcbnl_set_dcbx_mode(priv, MLX5E_DCBX_PARAM_VER_OPER_AUTO)) { dcbx->mode = MLX5E_DCBX_PARAM_VER_OPER_AUTO; + dcbx->cap &= ~DCB_CAP_DCBX_HOST; return 0; } @@ -324,6 +320,8 @@ static u8 mlx5e_dcbnl_setdcbx(struct net_device *dev, u8 mode) if (mlx5e_dcbnl_switch_to_host_mode(netdev_priv(dev))) return 1; + dcbx->cap = mode; + return 0; } @@ -628,9 +626,9 @@ static u8 mlx5e_dcbnl_getcap(struct net_device *netdev, *cap = false; break; 
case DCB_CAP_ATTR_DCBX: - *cap = (DCB_CAP_DCBX_LLD_MANAGED | - DCB_CAP_DCBX_VER_CEE | - DCB_CAP_DCBX_STATIC); + *cap = priv->dcbx.cap | + DCB_CAP_DCBX_VER_CEE | + DCB_CAP_DCBX_VER_IEEE; break; default: *cap = 0; @@ -760,5 +758,10 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) if (MLX5_CAP_GEN(priv->mdev, dcbx)) mlx5e_dcbnl_query_dcbx_mode(priv, &dcbx->mode); + priv->dcbx.cap = DCB_CAP_DCBX_VER_CEE | + DCB_CAP_DCBX_VER_IEEE; + if (priv->dcbx.mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) + priv->dcbx.cap |= DCB_CAP_DCBX_HOST; + mlx5e_ets_init(priv); } From 672d0880b7798a917bcc622308f25a0fbb991dab Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Sun, 30 Jul 2017 13:55:48 +0300 Subject: [PATCH 094/118] net/mlx5: Fix arm SRQ command for ISSI version 0 Support for ISSI version 0 was recently broken as the arm_srq_cmd command, which is used only for ISSI version 0, was given the opcode for ISSI version 1 instead of ISSI version 0. Change arm_srq_cmd to use the correct command opcode for ISSI version 0. 
Fixes: af1ba291c5e4 ('{net, IB}/mlx5: Refactor internal SRQ API') Signed-off-by: Noa Osherovich Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/srq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c index f774de6f5fcb..520f6382dfde 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -201,13 +201,13 @@ static int destroy_srq_cmd(struct mlx5_core_dev *dev, static int arm_srq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq) { - /* arm_srq structs missing using identical xrc ones */ - u32 srq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0}; - u32 srq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0}; + u32 srq_in[MLX5_ST_SZ_DW(arm_rq_in)] = {0}; + u32 srq_out[MLX5_ST_SZ_DW(arm_rq_out)] = {0}; - MLX5_SET(arm_xrc_srq_in, srq_in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); - MLX5_SET(arm_xrc_srq_in, srq_in, xrc_srqn, srq->srqn); - MLX5_SET(arm_xrc_srq_in, srq_in, lwm, lwm); + MLX5_SET(arm_rq_in, srq_in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, srq_in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ); + MLX5_SET(arm_rq_in, srq_in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, srq_in, lwm, lwm); return mlx5_cmd_exec(dev, srq_in, sizeof(srq_in), srq_out, sizeof(srq_out)); From b3cb5388499c5e219324bfe7da2e46cbad82bfcf Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 8 Aug 2017 13:17:00 -0500 Subject: [PATCH 095/118] net/mlx5: Skip mlx5_unload_one if mlx5_load_one fails There is an issue where the firmware fails during mlx5_load_one, the health_care timer detects the issue and schedules a health_care call. Then the mlx5_load_one detects the issue, cleans up and quits. Then the health_care starts and calls mlx5_unload_one to clean up the resources that no longer exist and causes kernel panic. 
The root cause is that the bit MLX5_INTERFACE_STATE_DOWN is not set after mlx5_load_one fails. The solution is removing the bit MLX5_INTERFACE_STATE_DOWN and quit mlx5_unload_one if the bit MLX5_INTERFACE_STATE_UP is not set. The bit MLX5_INTERFACE_STATE_DOWN is redundant and we can use MLX5_INTERFACE_STATE_UP instead. Fixes: 5fc7197d3a25 ("net/mlx5: Add pci shutdown callback") Signed-off-by: Huy Nguyen Reviewed-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 4 +--- include/linux/mlx5/driver.h | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c065132b956d..4cdb414aa2d5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1186,7 +1186,6 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, } } - clear_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state); set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); out: mutex_unlock(&dev->intf_state_mutex); @@ -1261,7 +1260,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_drain_health_recovery(dev); mutex_lock(&dev->intf_state_mutex); - if (test_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state)) { + if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) { dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n", __func__); if (cleanup) @@ -1270,7 +1269,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, } clear_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state); - set_bit(MLX5_INTERFACE_STATE_DOWN, &dev->intf_state); if (mlx5_device_registered(dev)) mlx5_detach_device(dev); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index df6ce59a1f95..918f5e644506 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -673,9 +673,8 @@ enum mlx5_device_state { }; enum 
mlx5_interface_state { - MLX5_INTERFACE_STATE_DOWN = BIT(0), - MLX5_INTERFACE_STATE_UP = BIT(1), - MLX5_INTERFACE_STATE_SHUTDOWN = BIT(2), + MLX5_INTERFACE_STATE_UP = BIT(0), + MLX5_INTERFACE_STATE_SHUTDOWN = BIT(1), }; enum mlx5_pci_status { From 10a8d00707082955b177164d4b4e758ffcbd4017 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 9 Aug 2017 10:03:40 -0500 Subject: [PATCH 096/118] net/mlx5: Remove the flag MLX5_INTERFACE_STATE_SHUTDOWN MLX5_INTERFACE_STATE_SHUTDOWN is not used in the code. Fixes: 5fc7197d3a25 ("net/mlx5: Add pci shutdown callback") Signed-off-by: Huy Nguyen Reviewed-by: Daniel Jurgens Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 -- include/linux/mlx5/driver.h | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 4cdb414aa2d5..16885827367b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1563,8 +1563,6 @@ static void shutdown(struct pci_dev *pdev) int err; dev_info(&pdev->dev, "Shutdown was called\n"); - /* Notify mlx5 clients that the kernel is being shut down */ - set_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &dev->intf_state); err = mlx5_try_fast_unload(dev); if (err) mlx5_unload_one(dev, priv, false); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 918f5e644506..205d82d4c468 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -674,7 +674,6 @@ enum mlx5_device_state { enum mlx5_interface_state { MLX5_INTERFACE_STATE_UP = BIT(0), - MLX5_INTERFACE_STATE_SHUTDOWN = BIT(1), }; enum mlx5_pci_status { From 0556ce72ab16156af6c94cdc7964e4310acc97c0 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 16 Aug 2017 14:37:11 +0300 Subject: [PATCH 097/118] net/mlx5e: Fix dangling page pointer on DMA mapping error Function mlx5e_dealloc_rx_wqe is using page pointer value as an indication to valid 
DMA mapping. In case that the mapping failed, we released the page but kept the dangling pointer. Store the page pointer only after the DMA mapping passed to avoid invalid page DMA unmap. Fixes: bc77b240b3c5 ("net/mlx5e: Add fragmented memory support for RX multi packet WQE") Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 325b2c8c1c6d..7344433259fc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -222,13 +222,13 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq, if (unlikely(!page)) return -ENOMEM; - dma_info->page = page; dma_info->addr = dma_map_page(rq->pdev, page, 0, RQ_PAGE_SIZE(rq), rq->buff.map_dir); if (unlikely(dma_mapping_error(rq->pdev, dma_info->addr))) { put_page(page); return -ENOMEM; } + dma_info->page = page; return 0; } From 5a8e12678c767ccf8bb16d6237569e4a707d655b Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Mon, 14 Aug 2017 16:12:16 +0300 Subject: [PATCH 098/118] net/mlx5e: Don't override user RSS upon set channels Currently, increasing the number of combined channels is changing the RSS spread to use the new created channels. Prevent the RSS spread change in case the user explicitly declare it, to avoid overriding user configuration. Tested: when RSS default: # ethtool -L ens8 combined 4 RSS spread will change and point to 4 channels. # ethtool -X ens8 equal 4 # ethtool -L ens8 combined 6 RSS will not change after increasing the number of the channels. 
Fixes: 8bf368620486 ('ethtool: ensure channel counts are within bounds during SCHANNELS') Signed-off-by: Inbar Karmy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 917fade5f5d5..f5594014715b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -641,8 +641,10 @@ int mlx5e_ethtool_set_channels(struct mlx5e_priv *priv, new_channels.params = priv->channels.params; new_channels.params.num_channels = count; - mlx5e_build_default_indir_rqt(priv->mdev, new_channels.params.indirection_rqt, - MLX5E_INDIR_RQT_SIZE, count); + if (!netif_is_rxfh_configured(priv->netdev)) + mlx5e_build_default_indir_rqt(priv->mdev, + new_channels.params.indirection_rqt, + MLX5E_INDIR_RQT_SIZE, count); if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { priv->channels.params = new_channels.params; From 08820528c9d3ff0d0eda047d7ef5ecac2da1ef6c Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Tue, 22 Aug 2017 13:51:56 +0300 Subject: [PATCH 099/118] net/mlx5e: Properly resolve TC offloaded ipv6 vxlan tunnel source address Currently if vxlan tunnel ipv6 src isn't supplied the driver fails to resolve it as part of the route lookup. The resulting encap header is left with a zeroed out ipv6 src address so the packets are sent with this src ip. Use an appropriate route lookup API that also resolves the source ipv6 address if it's not supplied. 
Fixes: ce99f6b97fcd ('net/mlx5e: Support SRIOV TC encapsulation offloads for IPv6 tunnels') Signed-off-by: Paul Blakey Reviewed-by: Or Gerlitz Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 3c536f560dd2..7f282e8f4e7f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1443,12 +1443,10 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; int ret; - dst = ip6_route_output(dev_net(mirred_dev), NULL, fl6); - ret = dst->error; - if (ret) { - dst_release(dst); + ret = ipv6_stub->ipv6_dst_lookup(dev_net(mirred_dev), NULL, &dst, + fl6); + if (ret < 0) return ret; - } *out_ttl = ip6_dst_hoplimit(dst); From 191220396db840822fc818edf03c49f0c02eb237 Mon Sep 17 00:00:00 2001 From: Shahar Klein Date: Tue, 1 Aug 2017 15:29:55 +0300 Subject: [PATCH 100/118] net/mlx5: E-Switch, Unload the representors in the correct order When changing from switchdev to legacy mode, all the representor port devices (uplink nic and reps) are cleaned up. Part of this cleaning process is removing the neigh entries and the hash table containing them. However, a representor neigh entry might be linked to the uplink port hash table and if the uplink nic is cleaned first the cleaning of the representor will end up in null deref. Fix that by unloading the representors in the opposite order of load. 
Fixes: cb67b832921c ("net/mlx5e: Introduce SRIOV VF representors") Signed-off-by: Shahar Klein Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 95b64025ce36..5bc0593bd76e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -815,7 +815,7 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports) struct mlx5_eswitch_rep *rep; int vport; - for (vport = 0; vport < nvports; vport++) { + for (vport = nvports - 1; vport >= 0; vport--) { rep = &esw->offloads.vport_reps[vport]; if (!rep->valid) continue; From 6aace17e64f4aa1c49802c46bd10688968b3787f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 8 Aug 2017 15:56:37 +0300 Subject: [PATCH 101/118] net/mlx5e: Fix inline header size for small packets Fix inline header size, make sure it is not greater than skb len. This bug affects small packets, for example L2 packets with size < 18.
Fixes: ae76715d153e ("net/mlx5e: Check the minimum inline header mode before xmit") Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index aaa0f4ebba9a..31353e5c3c78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -128,10 +128,10 @@ static inline int mlx5e_skb_l3_header_offset(struct sk_buff *skb) return mlx5e_skb_l2_header_offset(skb); } -static inline unsigned int mlx5e_calc_min_inline(enum mlx5_inline_modes mode, - struct sk_buff *skb) +static inline u16 mlx5e_calc_min_inline(enum mlx5_inline_modes mode, + struct sk_buff *skb) { - int hlen; + u16 hlen; switch (mode) { case MLX5_INLINE_MODE_NONE: @@ -140,19 +140,22 @@ static inline unsigned int mlx5e_calc_min_inline(enum mlx5_inline_modes mode, hlen = eth_get_headlen(skb->data, skb_headlen(skb)); if (hlen == ETH_HLEN && !skb_vlan_tag_present(skb)) hlen += VLAN_HLEN; - return hlen; + break; case MLX5_INLINE_MODE_IP: /* When transport header is set to zero, it means no transport * header. When transport header is set to 0xff's, it means * transport header wasn't set. 
*/ - if (skb_transport_offset(skb)) - return mlx5e_skb_l3_header_offset(skb); + if (skb_transport_offset(skb)) { + hlen = mlx5e_skb_l3_header_offset(skb); + break; + } /* fall through */ case MLX5_INLINE_MODE_L2: default: - return mlx5e_skb_l2_header_offset(skb); + hlen = mlx5e_skb_l2_header_offset(skb); } + return min_t(u16, hlen, skb->len); } static inline void mlx5e_tx_skb_pull_inline(unsigned char **skb_data, From 1213ad28f9595a08e3877248bbba1a25c40225d6 Mon Sep 17 00:00:00 2001 From: Tal Gilboa Date: Mon, 28 Aug 2017 18:45:08 +0300 Subject: [PATCH 102/118] net/mlx5e: Fix CQ moderation mode not set properly cq_period_mode assignment was mistakenly removed so it was always set to "0", which is EQE based moderation, regardless of the device CAPs and requested value in ethtool. Fixes: 6a9764efb255 ("net/mlx5e: Isolate open_channels from priv->params") Signed-off-by: Tal Gilboa Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 57f31fa478ce..6ad7f07e7861 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1969,6 +1969,7 @@ static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv, } mlx5e_build_common_cq_param(priv, param); + param->cq_period_mode = params->rx_cq_period_mode; } static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv, From 88c2ace69dbef696edba77712882af03879abc9c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:48:57 +0300 Subject: [PATCH 103/118] sch_htb: fix crash on init failure The commit below added a call to the ->destroy() callback for all qdiscs which failed in their ->init(), but some were not prepared for such change and can't handle partially initialized qdisc. 
HTB is one of them and if any error occurs before the qdisc watchdog timer and qdisc work are initialized then we can hit either a null ptr deref (timer->base) when canceling in ->destroy or lockdep error info about trying to register a non-static key and a stack dump. So to fix these two move the watchdog timer and workqueue init before anything that can err out. To reproduce userspace needs to send broken htb qdisc create request, tested with a modified tc (q_htb.c). Trace log: [ 2710.897602] BUG: unable to handle kernel NULL pointer dereference at (null) [ 2710.897977] IP: hrtimer_active+0x17/0x8a [ 2710.898174] PGD 58fab067 [ 2710.898175] P4D 58fab067 [ 2710.898353] PUD 586c0067 [ 2710.898531] PMD 0 [ 2710.898710] [ 2710.899045] Oops: 0000 [#1] SMP [ 2710.899232] Modules linked in: [ 2710.899419] CPU: 1 PID: 950 Comm: tc Not tainted 4.13.0-rc6+ #54 [ 2710.899646] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 2710.900035] task: ffff880059ed2700 task.stack: ffff88005ad4c000 [ 2710.900262] RIP: 0010:hrtimer_active+0x17/0x8a [ 2710.900467] RSP: 0018:ffff88005ad4f960 EFLAGS: 00010246 [ 2710.900684] RAX: 0000000000000000 RBX: ffff88003701e298 RCX: 0000000000000000 [ 2710.900933] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88003701e298 [ 2710.901177] RBP: ffff88005ad4f980 R08: 0000000000000001 R09: 0000000000000001 [ 2710.901419] R10: ffff88005ad4f800 R11: 0000000000000400 R12: 0000000000000000 [ 2710.901663] R13: ffff88003701e298 R14: ffffffff822a4540 R15: ffff88005ad4fac0 [ 2710.901907] FS: 00007f2f5e90f740(0000) GS:ffff88005d880000(0000) knlGS:0000000000000000 [ 2710.902277] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2710.902500] CR2: 0000000000000000 CR3: 0000000058ca3000 CR4: 00000000000406e0 [ 2710.902744] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2710.902977] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2710.903180] Call Trace: [ 
2710.903332] hrtimer_try_to_cancel+0x1a/0x93 [ 2710.903504] hrtimer_cancel+0x15/0x20 [ 2710.903667] qdisc_watchdog_cancel+0x12/0x14 [ 2710.903866] htb_destroy+0x2e/0xf7 [ 2710.904097] qdisc_create+0x377/0x3fd [ 2710.904330] tc_modify_qdisc+0x4d2/0x4fd [ 2710.904511] rtnetlink_rcv_msg+0x188/0x197 [ 2710.904682] ? rcu_read_unlock+0x3e/0x5f [ 2710.904849] ? rtnl_newlink+0x729/0x729 [ 2710.905017] netlink_rcv_skb+0x6c/0xce [ 2710.905183] rtnetlink_rcv+0x23/0x2a [ 2710.905345] netlink_unicast+0x103/0x181 [ 2710.905511] netlink_sendmsg+0x326/0x337 [ 2710.905679] sock_sendmsg_nosec+0x14/0x3f [ 2710.905847] sock_sendmsg+0x29/0x2e [ 2710.906010] ___sys_sendmsg+0x209/0x28b [ 2710.906176] ? do_raw_spin_unlock+0xcd/0xf8 [ 2710.906346] ? _raw_spin_unlock+0x27/0x31 [ 2710.906514] ? __handle_mm_fault+0x651/0xdb1 [ 2710.906685] ? check_chain_key+0xb0/0xfd [ 2710.906855] __sys_sendmsg+0x45/0x63 [ 2710.907018] ? __sys_sendmsg+0x45/0x63 [ 2710.907185] SyS_sendmsg+0x19/0x1b [ 2710.907344] entry_SYSCALL_64_fastpath+0x23/0xc2 Note that probably this bug goes further back because the default qdisc handling always calls ->destroy on init failure too. Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 0fbbeb1ba43b ("[PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/sched/sch_htb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 5d65ec5207e9..5bf5177b2bd3 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1017,6 +1017,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) int err; int i; + qdisc_watchdog_init(&q->watchdog, sch); + INIT_WORK(&q->work, htb_work_func); + if (!opt) return -EINVAL; @@ -1041,8 +1044,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < TC_HTB_NUMPRIO; i++) INIT_LIST_HEAD(q->drops + i); - qdisc_watchdog_init(&q->watchdog, sch); - INIT_WORK(&q->work, htb_work_func); qdisc_skb_head_init(&q->direct_queue); if (tb[TCA_HTB_DIRECT_QLEN]) From e89d469e3be3ed3d7124a803211a463ff83d0964 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:48:58 +0300 Subject: [PATCH 104/118] sch_multiq: fix double free on init failure The below commit added a call to ->destroy() on init failure, but multiq still frees ->queues on error in init, but ->queues is also freed by ->destroy() thus we get double free and corrupted memory. 
Very easy to reproduce (eth0 not multiqueue): $ tc qdisc add dev eth0 root multiq RTNETLINK answers: Operation not supported $ ip l add dumdum type dummy (crash) Trace log: [ 3929.467747] general protection fault: 0000 [#1] SMP [ 3929.468083] Modules linked in: [ 3929.468302] CPU: 3 PID: 967 Comm: ip Not tainted 4.13.0-rc6+ #56 [ 3929.468625] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 3929.469124] task: ffff88003716a700 task.stack: ffff88005872c000 [ 3929.469449] RIP: 0010:__kmalloc_track_caller+0x117/0x1be [ 3929.469746] RSP: 0018:ffff88005872f6a0 EFLAGS: 00010246 [ 3929.470042] RAX: 00000000000002de RBX: 0000000058a59000 RCX: 00000000000002df [ 3929.470406] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff821f7020 [ 3929.470770] RBP: ffff88005872f6e8 R08: 000000000001f010 R09: 0000000000000000 [ 3929.471133] R10: ffff88005872f730 R11: 0000000000008cdd R12: ff006d75646d7564 [ 3929.471496] R13: 00000000014000c0 R14: ffff88005b403c00 R15: ffff88005b403c00 [ 3929.471869] FS: 00007f0b70480740(0000) GS:ffff88005d980000(0000) knlGS:0000000000000000 [ 3929.472286] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3929.472677] CR2: 00007ffcee4f3000 CR3: 0000000059d45000 CR4: 00000000000406e0 [ 3929.473209] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3929.474109] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 3929.474873] Call Trace: [ 3929.475337] ? kstrdup_const+0x23/0x25 [ 3929.475863] kstrdup+0x2e/0x4b [ 3929.476338] kstrdup_const+0x23/0x25 [ 3929.478084] __kernfs_new_node+0x28/0xbc [ 3929.478478] kernfs_new_node+0x35/0x55 [ 3929.478929] kernfs_create_link+0x23/0x76 [ 3929.479478] sysfs_do_create_link_sd.isra.2+0x85/0xd7 [ 3929.480096] sysfs_create_link+0x33/0x35 [ 3929.480649] device_add+0x200/0x589 [ 3929.481184] netdev_register_kobject+0x7c/0x12f [ 3929.481711] register_netdevice+0x373/0x471 [ 3929.482174] rtnl_newlink+0x614/0x729 [ 3929.482610] ? 
rtnl_newlink+0x17f/0x729 [ 3929.483080] rtnetlink_rcv_msg+0x188/0x197 [ 3929.483533] ? rcu_read_unlock+0x3e/0x5f [ 3929.483984] ? rtnl_newlink+0x729/0x729 [ 3929.484420] netlink_rcv_skb+0x6c/0xce [ 3929.484858] rtnetlink_rcv+0x23/0x2a [ 3929.485291] netlink_unicast+0x103/0x181 [ 3929.485735] netlink_sendmsg+0x326/0x337 [ 3929.486181] sock_sendmsg_nosec+0x14/0x3f [ 3929.486614] sock_sendmsg+0x29/0x2e [ 3929.486973] ___sys_sendmsg+0x209/0x28b [ 3929.487340] ? do_raw_spin_unlock+0xcd/0xf8 [ 3929.487719] ? _raw_spin_unlock+0x27/0x31 [ 3929.488092] ? __handle_mm_fault+0x651/0xdb1 [ 3929.488471] ? check_chain_key+0xb0/0xfd [ 3929.488847] __sys_sendmsg+0x45/0x63 [ 3929.489206] ? __sys_sendmsg+0x45/0x63 [ 3929.489576] SyS_sendmsg+0x19/0x1b [ 3929.489901] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 3929.490172] RIP: 0033:0x7f0b6fb93690 [ 3929.490423] RSP: 002b:00007ffcee4ed588 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 3929.490881] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007f0b6fb93690 [ 3929.491198] RDX: 0000000000000000 RSI: 00007ffcee4ed5d0 RDI: 0000000000000003 [ 3929.491521] RBP: ffff88005872ff98 R08: 0000000000000001 R09: 0000000000000000 [ 3929.491801] R10: 00007ffcee4ed350 R11: 0000000000000246 R12: 0000000000000002 [ 3929.492075] R13: 000000000066f1a0 R14: 00007ffcee4f5680 R15: 0000000000000000 [ 3929.492352] ? trace_hardirqs_off_caller+0xa7/0xcf [ 3929.492590] Code: 8b 45 c0 48 8b 45 b8 74 17 48 8b 4d c8 83 ca ff 44 89 ee 4c 89 f7 e8 83 ca ff ff 49 89 c4 eb 49 49 63 56 20 48 8d 48 01 4d 8b 06 <49> 8b 1c 14 48 89 c2 4c 89 e0 65 49 0f c7 08 0f 94 c0 83 f0 01 [ 3929.493335] RIP: __kmalloc_track_caller+0x117/0x1be RSP: ffff88005872f6a0 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: f07d1501292b ("multiq: Further multiqueue cleanup") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/sched/sch_multiq.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index f143b7bbaa0d..9c454f5d6c38 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -257,12 +257,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; - err = multiq_tune(sch, opt); - - if (err) - kfree(q->queues); - - return err; + return multiq_tune(sch, opt); } static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb) From 32db864d33c21fd70a217ba53cb7224889354ffb Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:48:59 +0300 Subject: [PATCH 105/118] sch_hhf: fix null pointer dereference on init failure If sch_hhf fails in its ->init() function (either due to wrong user-space arguments as below or memory alloc failure of hh_flows) it will do a null pointer deref of q->hh_flows in its ->destroy() function. To reproduce the crash: $ tc qdisc add dev eth0 root hhf quantum 2000000 non_hh_weight 10000000 Crash log: [ 690.654882] BUG: unable to handle kernel NULL pointer dereference at (null) [ 690.655565] IP: hhf_destroy+0x48/0xbc [ 690.655944] PGD 37345067 [ 690.655948] P4D 37345067 [ 690.656252] PUD 58402067 [ 690.656554] PMD 0 [ 690.656857] [ 690.657362] Oops: 0000 [#1] SMP [ 690.657696] Modules linked in: [ 690.658032] CPU: 3 PID: 920 Comm: tc Not tainted 4.13.0-rc6+ #57 [ 690.658525] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 690.659255] task: ffff880058578000 task.stack: ffff88005acbc000 [ 690.659747] RIP: 0010:hhf_destroy+0x48/0xbc [ 690.660146] RSP: 0018:ffff88005acbf9e0 EFLAGS: 00010246 [ 690.660601] RAX: 0000000000000000 RBX: 0000000000000020 RCX: 0000000000000000 [ 690.661155] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffff821f63f0 [ 690.661710] RBP: ffff88005acbfa08 R08: ffffffff81b10a90 R09: 0000000000000000 [ 
690.662267] R10: 00000000f42b7019 R11: ffff880058578000 R12: 00000000ffffffea [ 690.662820] R13: ffff8800372f6400 R14: 0000000000000000 R15: 0000000000000000 [ 690.663769] FS: 00007f8ae5e8b740(0000) GS:ffff88005d980000(0000) knlGS:0000000000000000 [ 690.667069] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 690.667965] CR2: 0000000000000000 CR3: 0000000058523000 CR4: 00000000000406e0 [ 690.668918] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 690.669945] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 690.671003] Call Trace: [ 690.671743] qdisc_create+0x377/0x3fd [ 690.672534] tc_modify_qdisc+0x4d2/0x4fd [ 690.673324] rtnetlink_rcv_msg+0x188/0x197 [ 690.674204] ? rcu_read_unlock+0x3e/0x5f [ 690.675091] ? rtnl_newlink+0x729/0x729 [ 690.675877] netlink_rcv_skb+0x6c/0xce [ 690.676648] rtnetlink_rcv+0x23/0x2a [ 690.677405] netlink_unicast+0x103/0x181 [ 690.678179] netlink_sendmsg+0x326/0x337 [ 690.678958] sock_sendmsg_nosec+0x14/0x3f [ 690.679743] sock_sendmsg+0x29/0x2e [ 690.680506] ___sys_sendmsg+0x209/0x28b [ 690.681283] ? __handle_mm_fault+0xc7d/0xdb1 [ 690.681915] ? check_chain_key+0xb0/0xfd [ 690.682449] __sys_sendmsg+0x45/0x63 [ 690.682954] ? __sys_sendmsg+0x45/0x63 [ 690.683471] SyS_sendmsg+0x19/0x1b [ 690.683974] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 690.684516] RIP: 0033:0x7f8ae529d690 [ 690.685016] RSP: 002b:00007fff26d2d6b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 690.685931] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007f8ae529d690 [ 690.686573] RDX: 0000000000000000 RSI: 00007fff26d2d700 RDI: 0000000000000003 [ 690.687047] RBP: ffff88005acbff98 R08: 0000000000000001 R09: 0000000000000000 [ 690.687519] R10: 00007fff26d2d480 R11: 0000000000000246 R12: 0000000000000002 [ 690.687996] R13: 0000000001258070 R14: 0000000000000001 R15: 0000000000000000 [ 690.688475] ? 
trace_hardirqs_off_caller+0xa7/0xcf [ 690.688887] Code: 00 00 e8 2a 02 ae ff 49 8b bc 1d 60 02 00 00 48 83 c3 08 e8 19 02 ae ff 48 83 fb 20 75 dc 45 31 f6 4d 89 f7 4d 03 bd 20 02 00 00 <49> 8b 07 49 39 c7 75 24 49 83 c6 10 49 81 fe 00 40 00 00 75 e1 [ 690.690200] RIP: hhf_destroy+0x48/0xbc RSP: ffff88005acbf9e0 [ 690.690636] CR2: 0000000000000000 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/sched/sch_hhf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 51d3ba682af9..73a53c08091b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -477,6 +477,9 @@ static void hhf_destroy(struct Qdisc *sch) kvfree(q->hhf_valid_bits[i]); } + if (!q->hh_flows) + return; + for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; struct list_head *head = &q->hh_flows[i]; From 3bdac362a2f89ed3e148fa6f38c5f5d858f50b1a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:49:00 +0300 Subject: [PATCH 106/118] sch_hfsc: fix null pointer deref and double free on init failure Depending on where ->init fails we can get a null pointer deref due to uninitialized hires timer (watchdog) or a double free of the qdisc hash because it is already freed by ->destroy(). Fixes: 8d5537387505 ("net/sched/hfsc: allocate tcf block for hfsc root class") Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/sched/sch_hfsc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index fd15200f8627..11ab8dace901 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1418,6 +1418,8 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) struct tc_hfsc_qopt *qopt; int err; + qdisc_watchdog_init(&q->watchdog, sch); + if (opt == NULL || nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); @@ -1430,7 +1432,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) err = tcf_block_get(&q->root.block, &q->root.filter_list); if (err) - goto err_tcf; + return err; q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; @@ -1448,13 +1450,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) qdisc_class_hash_insert(&q->clhash, &q->root.cl_common); qdisc_class_hash_grow(sch, &q->clhash); - qdisc_watchdog_init(&q->watchdog, sch); - return 0; - -err_tcf: - qdisc_class_hash_destroy(&q->clhash); - return err; } static int From 3501d059921246ff617b43e86250a719c140bd97 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:49:01 +0300 Subject: [PATCH 107/118] sch_cbq: fix null pointer dereferences on init failure CBQ can fail on ->init by wrong nl attributes or simply for missing any, f.e. if it's set as a default qdisc then TCA_OPTIONS (opt) will be NULL when it is activated. The first thing init does is parse opt but it will dereference a null pointer if used as a default qdisc, also since init failure at default qdisc invokes ->reset() which cancels all timers then we'll also dereference two more null pointers (timer->base) as they were never initialized. 
To reproduce: $ sysctl net.core.default_qdisc=cbq $ ip l set ethX up Crash log of the first null ptr deref: [44727.907454] BUG: unable to handle kernel NULL pointer dereference at (null) [44727.907600] IP: cbq_init+0x27/0x205 [44727.907676] PGD 59ff4067 [44727.907677] P4D 59ff4067 [44727.907742] PUD 59c70067 [44727.907807] PMD 0 [44727.907873] [44727.907982] Oops: 0000 [#1] SMP [44727.908054] Modules linked in: [44727.908126] CPU: 1 PID: 21312 Comm: ip Not tainted 4.13.0-rc6+ #60 [44727.908235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [44727.908477] task: ffff88005ad42700 task.stack: ffff880037214000 [44727.908672] RIP: 0010:cbq_init+0x27/0x205 [44727.908838] RSP: 0018:ffff8800372175f0 EFLAGS: 00010286 [44727.909018] RAX: ffffffff816c3852 RBX: ffff880058c53800 RCX: 0000000000000000 [44727.909222] RDX: 0000000000000004 RSI: 0000000000000000 RDI: ffff8800372175f8 [44727.909427] RBP: ffff880037217650 R08: ffffffff81b0f380 R09: 0000000000000000 [44727.909631] R10: ffff880037217660 R11: 0000000000000020 R12: ffffffff822a44c0 [44727.909835] R13: ffff880058b92000 R14: 00000000ffffffff R15: 0000000000000001 [44727.910040] FS: 00007ff8bc583740(0000) GS:ffff88005d880000(0000) knlGS:0000000000000000 [44727.910339] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [44727.910525] CR2: 0000000000000000 CR3: 00000000371e5000 CR4: 00000000000406e0 [44727.910731] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [44727.910936] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [44727.911141] Call Trace: [44727.911291] ? lockdep_init_map+0xb6/0x1ba [44727.911461] ? qdisc_alloc+0x14e/0x187 [44727.911626] qdisc_create_dflt+0x7a/0x94 [44727.911794] ? 
dev_activate+0x129/0x129 [44727.911959] attach_one_default_qdisc+0x36/0x63 [44727.912132] netdev_for_each_tx_queue+0x3d/0x48 [44727.912305] dev_activate+0x4b/0x129 [44727.912468] __dev_open+0xe7/0x104 [44727.912631] __dev_change_flags+0xc6/0x15c [44727.912799] dev_change_flags+0x25/0x59 [44727.912966] do_setlink+0x30c/0xb3f [44727.913129] ? check_chain_key+0xb0/0xfd [44727.913294] ? check_chain_key+0xb0/0xfd [44727.913463] rtnl_newlink+0x3a4/0x729 [44727.913626] ? rtnl_newlink+0x117/0x729 [44727.913801] ? ns_capable_common+0xd/0xb1 [44727.913968] ? ns_capable+0x13/0x15 [44727.914131] rtnetlink_rcv_msg+0x188/0x197 [44727.914300] ? rcu_read_unlock+0x3e/0x5f [44727.914465] ? rtnl_newlink+0x729/0x729 [44727.914630] netlink_rcv_skb+0x6c/0xce [44727.914796] rtnetlink_rcv+0x23/0x2a [44727.914956] netlink_unicast+0x103/0x181 [44727.915122] netlink_sendmsg+0x326/0x337 [44727.915291] sock_sendmsg_nosec+0x14/0x3f [44727.915459] sock_sendmsg+0x29/0x2e [44727.915619] ___sys_sendmsg+0x209/0x28b [44727.915784] ? do_raw_spin_unlock+0xcd/0xf8 [44727.915954] ? _raw_spin_unlock+0x27/0x31 [44727.916121] ? __handle_mm_fault+0x651/0xdb1 [44727.916290] ? check_chain_key+0xb0/0xfd [44727.916461] __sys_sendmsg+0x45/0x63 [44727.916626] ? __sys_sendmsg+0x45/0x63 [44727.916792] SyS_sendmsg+0x19/0x1b [44727.916950] entry_SYSCALL_64_fastpath+0x23/0xc2 [44727.917125] RIP: 0033:0x7ff8bbc96690 [44727.917286] RSP: 002b:00007ffc360991e8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [44727.917579] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007ff8bbc96690 [44727.917783] RDX: 0000000000000000 RSI: 00007ffc36099230 RDI: 0000000000000003 [44727.917987] RBP: ffff880037217f98 R08: 0000000000000001 R09: 0000000000000003 [44727.918190] R10: 00007ffc36098fb0 R11: 0000000000000246 R12: 0000000000000006 [44727.918393] R13: 000000000066f1a0 R14: 00007ffc360a12e0 R15: 0000000000000000 [44727.918597] ? 
trace_hardirqs_off_caller+0xa7/0xcf [44727.918774] Code: 41 5f 5d c3 66 66 66 66 90 55 48 8d 56 04 45 31 c9 49 c7 c0 80 f3 b0 81 48 89 e5 41 55 41 54 53 48 89 fb 48 8d 7d a8 48 83 ec 48 <0f> b7 0e be 07 00 00 00 83 e9 04 e8 e6 f7 d8 ff 85 c0 0f 88 bb [44727.919332] RIP: cbq_init+0x27/0x205 RSP: ffff8800372175f0 [44727.919516] CR2: 0000000000000000 Fixes: 0fbbeb1ba43b ("[PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 780db43300b1..156c8a33c677 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1139,6 +1139,13 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) struct tc_ratespec *r; int err; + qdisc_watchdog_init(&q->watchdog, sch); + hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + q->delay_timer.function = cbq_undelay; + + if (!opt) + return -EINVAL; + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL); if (err < 0) return err; @@ -1177,9 +1184,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->link.avpkt = q->link.allot/2; q->link.minidle = -0x7FFFFFFF; - qdisc_watchdog_init(&q->watchdog, sch); - hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); From 30c31d746d0eb458ae327f522bc8e4c44cbea0f0 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:49:02 +0300 Subject: [PATCH 108/118] sch_fq_codel: avoid double free on init failure It is very unlikely to happen but the backlogs memory allocation could fail and will free q->flows, but then ->destroy() will free q->flows too. For correctness remove the first free and let ->destroy clean up. 
Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 337f2d6d81e4..2c0c05f2cc34 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -491,10 +491,8 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) if (!q->flows) return -ENOMEM; q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL); - if (!q->backlogs) { - kvfree(q->flows); + if (!q->backlogs) return -ENOMEM; - } for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; From 634576a1844dba15bc5e6fc61d72f37e13a21615 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:49:03 +0300 Subject: [PATCH 109/118] sch_netem: avoid null pointer deref on init failure netem can fail in ->init due to missing options (either not supplied by user-space or used as a default qdisc) causing a timer->base null pointer deref in its ->destroy() and ->reset() callbacks. 
Reproduce: $ sysctl net.core.default_qdisc=netem $ ip l set ethX up Crash log: [ 1814.846943] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1814.847181] IP: hrtimer_active+0x17/0x8a [ 1814.847270] PGD 59c34067 [ 1814.847271] P4D 59c34067 [ 1814.847337] PUD 37374067 [ 1814.847403] PMD 0 [ 1814.847468] [ 1814.847582] Oops: 0000 [#1] SMP [ 1814.847655] Modules linked in: sch_netem(O) sch_fq_codel(O) [ 1814.847761] CPU: 3 PID: 1573 Comm: ip Tainted: G O 4.13.0-rc6+ #62 [ 1814.847884] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 1814.848043] task: ffff88003723a700 task.stack: ffff88005adc8000 [ 1814.848235] RIP: 0010:hrtimer_active+0x17/0x8a [ 1814.848407] RSP: 0018:ffff88005adcb590 EFLAGS: 00010246 [ 1814.848590] RAX: 0000000000000000 RBX: ffff880058e359d8 RCX: 0000000000000000 [ 1814.848793] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880058e359d8 [ 1814.848998] RBP: ffff88005adcb5b0 R08: 00000000014080c0 R09: 00000000ffffffff [ 1814.849204] R10: ffff88005adcb660 R11: 0000000000000020 R12: 0000000000000000 [ 1814.849410] R13: ffff880058e359d8 R14: 00000000ffffffff R15: 0000000000000001 [ 1814.849616] FS: 00007f733bbca740(0000) GS:ffff88005d980000(0000) knlGS:0000000000000000 [ 1814.849919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1814.850107] CR2: 0000000000000000 CR3: 0000000059f0d000 CR4: 00000000000406e0 [ 1814.850313] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1814.850518] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1814.850723] Call Trace: [ 1814.850875] hrtimer_try_to_cancel+0x1a/0x93 [ 1814.851047] hrtimer_cancel+0x15/0x20 [ 1814.851211] qdisc_watchdog_cancel+0x12/0x14 [ 1814.851383] netem_reset+0xe6/0xed [sch_netem] [ 1814.851561] qdisc_destroy+0x8b/0xe5 [ 1814.851723] qdisc_create_dflt+0x86/0x94 [ 1814.851890] ? 
dev_activate+0x129/0x129 [ 1814.852057] attach_one_default_qdisc+0x36/0x63 [ 1814.852232] netdev_for_each_tx_queue+0x3d/0x48 [ 1814.852406] dev_activate+0x4b/0x129 [ 1814.852569] __dev_open+0xe7/0x104 [ 1814.852730] __dev_change_flags+0xc6/0x15c [ 1814.852899] dev_change_flags+0x25/0x59 [ 1814.853064] do_setlink+0x30c/0xb3f [ 1814.853228] ? check_chain_key+0xb0/0xfd [ 1814.853396] ? check_chain_key+0xb0/0xfd [ 1814.853565] rtnl_newlink+0x3a4/0x729 [ 1814.853728] ? rtnl_newlink+0x117/0x729 [ 1814.853905] ? ns_capable_common+0xd/0xb1 [ 1814.854072] ? ns_capable+0x13/0x15 [ 1814.854234] rtnetlink_rcv_msg+0x188/0x197 [ 1814.854404] ? rcu_read_unlock+0x3e/0x5f [ 1814.854572] ? rtnl_newlink+0x729/0x729 [ 1814.854737] netlink_rcv_skb+0x6c/0xce [ 1814.854902] rtnetlink_rcv+0x23/0x2a [ 1814.855064] netlink_unicast+0x103/0x181 [ 1814.855230] netlink_sendmsg+0x326/0x337 [ 1814.855398] sock_sendmsg_nosec+0x14/0x3f [ 1814.855584] sock_sendmsg+0x29/0x2e [ 1814.855747] ___sys_sendmsg+0x209/0x28b [ 1814.855912] ? do_raw_spin_unlock+0xcd/0xf8 [ 1814.856082] ? _raw_spin_unlock+0x27/0x31 [ 1814.856251] ? __handle_mm_fault+0x651/0xdb1 [ 1814.856421] ? check_chain_key+0xb0/0xfd [ 1814.856592] __sys_sendmsg+0x45/0x63 [ 1814.856755] ? __sys_sendmsg+0x45/0x63 [ 1814.856923] SyS_sendmsg+0x19/0x1b [ 1814.857083] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 1814.857256] RIP: 0033:0x7f733b2dd690 [ 1814.857419] RSP: 002b:00007ffe1d3387d8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 1814.858238] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007f733b2dd690 [ 1814.858445] RDX: 0000000000000000 RSI: 00007ffe1d338820 RDI: 0000000000000003 [ 1814.858651] RBP: ffff88005adcbf98 R08: 0000000000000001 R09: 0000000000000003 [ 1814.858856] R10: 00007ffe1d3385a0 R11: 0000000000000246 R12: 0000000000000002 [ 1814.859060] R13: 000000000066f1a0 R14: 00007ffe1d3408d0 R15: 0000000000000000 [ 1814.859267] ? 
trace_hardirqs_off_caller+0xa7/0xcf [ 1814.859446] Code: 10 55 48 89 c7 48 89 e5 e8 45 a1 fb ff 31 c0 5d c3 31 c0 c3 66 66 66 66 90 55 48 89 e5 41 56 41 55 41 54 53 49 89 fd 49 8b 45 30 <4c> 8b 20 41 8b 5c 24 38 31 c9 31 d2 48 c7 c7 50 8e 1d 82 41 89 [ 1814.860022] RIP: hrtimer_active+0x17/0x8a RSP: ffff88005adcb590 [ 1814.860214] CR2: 0000000000000000 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 0fbbeb1ba43b ("[PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1b3dd6190e93..14d1724e0dc4 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -933,11 +933,11 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt) struct netem_sched_data *q = qdisc_priv(sch); int ret; + qdisc_watchdog_init(&q->watchdog, sch); + if (!opt) return -EINVAL; - qdisc_watchdog_init(&q->watchdog, sch); - q->loss_model = CLG_RANDOM; ret = netem_change(sch, opt); if (ret) From e232657661242cc8a169595160847b3e66aa7056 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:49:04 +0300 Subject: [PATCH 110/118] sch_sfq: fix null pointer dereference on init failure Currently only a memory allocation failure can lead to this, so let's initialize the timer first. Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/sched/sch_sfq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 82469ef9655e..fc69fc5956e9 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -716,13 +716,13 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) int i; int err; + setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, + (unsigned long)sch); + err = tcf_block_get(&q->block, &q->filter_list); if (err) return err; - setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, - (unsigned long)sch); - for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { q->dep[i].next = i + SFQ_MAX_FLOWS; q->dep[i].prev = i + SFQ_MAX_FLOWS; From c2d6511e6a4f1f3673d711569c00c3849549e9b0 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 30 Aug 2017 12:49:05 +0300 Subject: [PATCH 111/118] sch_tbf: fix two null pointer dereferences on init failure sch_tbf calls qdisc_watchdog_cancel() in both its ->reset and ->destroy callbacks but it may fail before the timer is initialized due to missing options (either not supplied by user-space or set as a default qdisc), also q->qdisc is used by ->reset and ->destroy so we need it initialized. 
Reproduce: $ sysctl net.core.default_qdisc=tbf $ ip l set ethX up Crash log: [ 959.160172] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 [ 959.160323] IP: qdisc_reset+0xa/0x5c [ 959.160400] PGD 59cdb067 [ 959.160401] P4D 59cdb067 [ 959.160466] PUD 59ccb067 [ 959.160532] PMD 0 [ 959.160597] [ 959.160706] Oops: 0000 [#1] SMP [ 959.160778] Modules linked in: sch_tbf sch_sfb sch_prio sch_netem [ 959.160891] CPU: 2 PID: 1562 Comm: ip Not tainted 4.13.0-rc6+ #62 [ 959.160998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 959.161157] task: ffff880059c9a700 task.stack: ffff8800376d0000 [ 959.161263] RIP: 0010:qdisc_reset+0xa/0x5c [ 959.161347] RSP: 0018:ffff8800376d3610 EFLAGS: 00010286 [ 959.161531] RAX: ffffffffa001b1dd RBX: ffff8800373a2800 RCX: 0000000000000000 [ 959.161733] RDX: ffffffff8215f160 RSI: ffffffff8215f160 RDI: 0000000000000000 [ 959.161939] RBP: ffff8800376d3618 R08: 00000000014080c0 R09: 00000000ffffffff [ 959.162141] R10: ffff8800376d3578 R11: 0000000000000020 R12: ffffffffa001d2c0 [ 959.162343] R13: ffff880037538000 R14: 00000000ffffffff R15: 0000000000000001 [ 959.162546] FS: 00007fcc5126b740(0000) GS:ffff88005d900000(0000) knlGS:0000000000000000 [ 959.162844] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 959.163030] CR2: 0000000000000018 CR3: 000000005abc4000 CR4: 00000000000406e0 [ 959.163233] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 959.163436] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 959.163638] Call Trace: [ 959.163788] tbf_reset+0x19/0x64 [sch_tbf] [ 959.163957] qdisc_destroy+0x8b/0xe5 [ 959.164119] qdisc_create_dflt+0x86/0x94 [ 959.164284] ? 
dev_activate+0x129/0x129 [ 959.164449] attach_one_default_qdisc+0x36/0x63 [ 959.164623] netdev_for_each_tx_queue+0x3d/0x48 [ 959.164795] dev_activate+0x4b/0x129 [ 959.164957] __dev_open+0xe7/0x104 [ 959.165118] __dev_change_flags+0xc6/0x15c [ 959.165287] dev_change_flags+0x25/0x59 [ 959.165451] do_setlink+0x30c/0xb3f [ 959.165613] ? check_chain_key+0xb0/0xfd [ 959.165782] rtnl_newlink+0x3a4/0x729 [ 959.165947] ? rtnl_newlink+0x117/0x729 [ 959.166121] ? ns_capable_common+0xd/0xb1 [ 959.166288] ? ns_capable+0x13/0x15 [ 959.166450] rtnetlink_rcv_msg+0x188/0x197 [ 959.166617] ? rcu_read_unlock+0x3e/0x5f [ 959.166783] ? rtnl_newlink+0x729/0x729 [ 959.166948] netlink_rcv_skb+0x6c/0xce [ 959.167113] rtnetlink_rcv+0x23/0x2a [ 959.167273] netlink_unicast+0x103/0x181 [ 959.167439] netlink_sendmsg+0x326/0x337 [ 959.167607] sock_sendmsg_nosec+0x14/0x3f [ 959.167772] sock_sendmsg+0x29/0x2e [ 959.167932] ___sys_sendmsg+0x209/0x28b [ 959.168098] ? do_raw_spin_unlock+0xcd/0xf8 [ 959.168267] ? _raw_spin_unlock+0x27/0x31 [ 959.168432] ? __handle_mm_fault+0x651/0xdb1 [ 959.168602] ? check_chain_key+0xb0/0xfd [ 959.168773] __sys_sendmsg+0x45/0x63 [ 959.168934] ? __sys_sendmsg+0x45/0x63 [ 959.169100] SyS_sendmsg+0x19/0x1b [ 959.169260] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 959.169432] RIP: 0033:0x7fcc5097e690 [ 959.169592] RSP: 002b:00007ffd0d5c7b48 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 959.169887] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007fcc5097e690 [ 959.170089] RDX: 0000000000000000 RSI: 00007ffd0d5c7b90 RDI: 0000000000000003 [ 959.170292] RBP: ffff8800376d3f98 R08: 0000000000000001 R09: 0000000000000003 [ 959.170494] R10: 00007ffd0d5c7910 R11: 0000000000000246 R12: 0000000000000006 [ 959.170697] R13: 000000000066f1a0 R14: 00007ffd0d5cfc40 R15: 0000000000000000 [ 959.170900] ? 
trace_hardirqs_off_caller+0xa7/0xcf [ 959.171076] Code: 00 41 c7 84 24 14 01 00 00 00 00 00 00 41 c7 84 24 98 00 00 00 00 00 00 00 41 5c 41 5d 41 5e 5d c3 66 66 66 66 90 55 48 89 e5 53 <48> 8b 47 18 48 89 fb 48 8b 40 48 48 85 c0 74 02 ff d0 48 8b bb [ 959.171637] RIP: qdisc_reset+0xa/0x5c RSP: ffff8800376d3610 [ 959.171821] CR2: 0000000000000018 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 0fbbeb1ba43b ("[PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/sched/sch_tbf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index b2e4b6ad241a..493270f0d5b0 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -425,12 +425,13 @@ static int tbf_init(struct Qdisc *sch, struct nlattr *opt) { struct tbf_sched_data *q = qdisc_priv(sch); + qdisc_watchdog_init(&q->watchdog, sch); + q->qdisc = &noop_qdisc; + if (opt == NULL) return -EINVAL; q->t_c = ktime_get_ns(); - qdisc_watchdog_init(&q->watchdog, sch); - q->qdisc = &noop_qdisc; return tbf_change(sch, opt); } From 351050ecd6523374b370341cc29fe61e2201556b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Aug 2017 09:29:31 -0700 Subject: [PATCH 112/118] kcm: do not attach PF_KCM sockets to avoid deadlock syzkaller had no problem to trigger a deadlock, attaching a KCM socket to another one (or itself). (original syzkaller report was a very confusing lockdep splat during a sendmsg()) It seems KCM claims to only support TCP, but no enforcement is done, so we might need to add additional checks. Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Acked-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/kcm/kcmsock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index da49191f7ad0..4abf6287d7e1 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1383,6 +1383,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock, if (!csk) return -EINVAL; + /* We must prevent loops or risk deadlock ! */ + if (csk->sk_family == PF_KCM) + return -EOPNOTSUPP; + psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); if (!psock) return -ENOMEM; From df191632f814357ee4d646421662d866028b569d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 30 Aug 2017 12:39:33 -0700 Subject: [PATCH 113/118] net: dsa: bcm_sf2: Fix number of CFP entries for BCM7278 BCM7278 has only 128 entries while BCM7445 has the full 256 entries set, fix that. Fixes: 7318166cacad ("net: dsa: bcm_sf2: Add support for ethtool::rxnfc") Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 4 ++++ drivers/net/dsa/bcm_sf2.h | 1 + drivers/net/dsa/bcm_sf2_cfp.c | 8 ++++---- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 648f91b58d1e..9b6ce7c3f6c3 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1048,6 +1048,7 @@ struct bcm_sf2_of_data { u32 type; const u16 *reg_offsets; unsigned int core_reg_align; + unsigned int num_cfp_rules; }; /* Register offsets for the SWITCH_REG_* block */ @@ -1071,6 +1072,7 @@ static const struct bcm_sf2_of_data bcm_sf2_7445_data = { .type = BCM7445_DEVICE_ID, .core_reg_align = 0, .reg_offsets = bcm_sf2_7445_reg_offsets, + .num_cfp_rules = 256, }; static const u16 bcm_sf2_7278_reg_offsets[] = { @@ -1093,6 +1095,7 @@ static const struct bcm_sf2_of_data bcm_sf2_7278_data = { .type = BCM7278_DEVICE_ID, .core_reg_align = 1, .reg_offsets = bcm_sf2_7278_reg_offsets, + .num_cfp_rules = 128, }; static const struct of_device_id bcm_sf2_of_match[] = { @@ 
-1149,6 +1152,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) priv->type = data->type; priv->reg_offsets = data->reg_offsets; priv->core_reg_align = data->core_reg_align; + priv->num_cfp_rules = data->num_cfp_rules; /* Auto-detection using standard registers will not work, so * provide an indication of what kind of device we are for diff --git a/drivers/net/dsa/bcm_sf2.h b/drivers/net/dsa/bcm_sf2.h index 7d3030e04f11..7f9125eef3df 100644 --- a/drivers/net/dsa/bcm_sf2.h +++ b/drivers/net/dsa/bcm_sf2.h @@ -72,6 +72,7 @@ struct bcm_sf2_priv { u32 type; const u16 *reg_offsets; unsigned int core_reg_align; + unsigned int num_cfp_rules; /* spinlock protecting access to the indirect registers */ spinlock_t indir_lock; diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index 2fb32d67065f..8a1da7e67707 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -98,7 +98,7 @@ static inline void bcm_sf2_cfp_rule_addr_set(struct bcm_sf2_priv *priv, { u32 reg; - WARN_ON(addr >= CFP_NUM_RULES); + WARN_ON(addr >= priv->num_cfp_rules); reg = core_readl(priv, CORE_CFP_ACC); reg &= ~(XCESS_ADDR_MASK << XCESS_ADDR_SHIFT); @@ -109,7 +109,7 @@ static inline void bcm_sf2_cfp_rule_addr_set(struct bcm_sf2_priv *priv, static inline unsigned int bcm_sf2_cfp_rule_size(struct bcm_sf2_priv *priv) { /* Entry #0 is reserved */ - return CFP_NUM_RULES - 1; + return priv->num_cfp_rules - 1; } static int bcm_sf2_cfp_rule_set(struct dsa_switch *ds, int port, @@ -523,7 +523,7 @@ static int bcm_sf2_cfp_rule_get_all(struct bcm_sf2_priv *priv, if (!(reg & OP_STR_DONE)) break; - } while (index < CFP_NUM_RULES); + } while (index < priv->num_cfp_rules); /* Put the TCAM size here */ nfc->data = bcm_sf2_cfp_rule_size(priv); @@ -544,7 +544,7 @@ int bcm_sf2_get_rxnfc(struct dsa_switch *ds, int port, case ETHTOOL_GRXCLSRLCNT: /* Subtract the default, unusable rule */ nfc->rule_cnt = bitmap_weight(priv->cfp.used, - CFP_NUM_RULES) - 1; + 
priv->num_cfp_rules) - 1; /* We support specifying rule locations */ nfc->data |= RX_CLS_LOC_SPECIAL; break; From ebc8254aeae34226d0bc8fda309fd9790d4dccfe Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 30 Aug 2017 17:49:29 -0700 Subject: [PATCH 114/118] Revert "net: phy: Correctly process PHY_HALTED in phy_stop_machine()" This reverts commit 7ad813f208533cebfcc32d3d7474dc1677d1b09a ("net: phy: Correctly process PHY_HALTED in phy_stop_machine()") because it is creating the possibility for a NULL pointer dereference. David Daney provided the following call trace and diagram of events: When ndo_stop() is called we call: phy_disconnect() +---> phy_stop_interrupts() implies: phydev->irq = PHY_POLL; +---> phy_stop_machine() | +---> phy_state_machine() | +----> queue_delayed_work(): Work queued. +--->phy_detach() implies: phydev->attached_dev = NULL; Now at a later time the queued work does: phy_state_machine() +---->netif_carrier_off(phydev->attached_dev): Oh no! It is NULL: CPU 12 Unable to handle kernel paging request at virtual address 0000000000000048, epc == ffffffff80de37ec, ra == ffffffff80c7c Oops[#1]: CPU: 12 PID: 1502 Comm: kworker/12:1 Not tainted 4.9.43-Cavium-Octeon+ #1 Workqueue: events_power_efficient phy_state_machine task: 80000004021ed100 task.stack: 8000000409d70000 $ 0 : 0000000000000000 ffffffff84720060 0000000000000048 0000000000000004 $ 4 : 0000000000000000 0000000000000001 0000000000000004 0000000000000000 $ 8 : 0000000000000000 0000000000000000 00000000ffff98f3 0000000000000000 $12 : 8000000409d73fe0 0000000000009c00 ffffffff846547c8 000000000000af3b $16 : 80000004096bab68 80000004096babd0 0000000000000000 80000004096ba800 $20 : 0000000000000000 0000000000000000 ffffffff81090000 0000000000000008 $24 : 0000000000000061 ffffffff808637b0 $28 : 8000000409d70000 8000000409d73cf0 80000000271bd300 ffffffff80c7804c Hi : 000000000000002a Lo : 000000000000003f epc : ffffffff80de37ec netif_carrier_off+0xc/0x58 ra : ffffffff80c7804c
phy_state_machine+0x48c/0x4f8 Status: 14009ce3 KX SX UX KERNEL EXL IE Cause : 00800008 (ExcCode 02) BadVA : 0000000000000048 PrId : 000d9501 (Cavium Octeon III) Modules linked in: Process kworker/12:1 (pid: 1502, threadinfo=8000000409d70000, task=80000004021ed100, tls=0000000000000000) Stack : 8000000409a54000 80000004096bab68 80000000271bd300 80000000271c1e00 0000000000000000 ffffffff808a1708 8000000409a54000 80000000271bd300 80000000271bd320 8000000409a54030 ffffffff80ff0f00 0000000000000001 ffffffff81090000 ffffffff808a1ac0 8000000402182080 ffffffff84650000 8000000402182080 ffffffff84650000 ffffffff80ff0000 8000000409a54000 ffffffff808a1970 0000000000000000 80000004099e8000 8000000402099240 0000000000000000 ffffffff808a8598 0000000000000000 8000000408eeeb00 8000000409a54000 00000000810a1d00 0000000000000000 8000000409d73de8 8000000409d73de8 0000000000000088 000000000c009c00 8000000409d73e08 8000000409d73e08 8000000402182080 ffffffff808a84d0 8000000402182080 ... Call Trace: [] netif_carrier_off+0xc/0x58 [] phy_state_machine+0x48c/0x4f8 [] process_one_work+0x158/0x368 [] worker_thread+0x150/0x4c0 [] kthread+0xc8/0xe0 [] ret_from_kernel_thread+0x14/0x1c The original motivation for this change came from Marc Gonzales indicating that his network driver did not have its adjust_link callback executing with phydev->link = 0 while he was expecting it. PHYLIB has never made any such guarantee, because phy_stop() merely tells the workqueue to move into the PHY_HALTED state, which will happen asynchronously. Reported-by: Geert Uytterhoeven Reported-by: David Daney Fixes: 7ad813f20853 ("net: phy: Correctly process PHY_HALTED in phy_stop_machine()") Signed-off-by: Florian Fainelli Signed-off-by: David S.
Miller --- drivers/net/phy/phy.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 5068c582d502..d0626bf5c540 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -749,9 +749,6 @@ void phy_stop_machine(struct phy_device *phydev) if (phydev->state > PHY_UP && phydev->state != PHY_HALTED) phydev->state = PHY_UP; mutex_unlock(&phydev->lock); - - /* Now we can run the state machine synchronously */ - phy_state_machine(&phydev->state_queue.work); } /** From f581a0dd744fe32b0a8805e279c59ec1ac676d60 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 31 Aug 2017 16:47:43 +0200 Subject: [PATCH 115/118] wl1251: add a missing spin_lock_init() wl1251: add a missing spin_lock_init() This fixes the following kernel warning: [ 5668.771453] BUG: spinlock bad magic on CPU#0, kworker/u2:3/9745 [ 5668.771850] lock: 0xce63ef20, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 5668.772277] CPU: 0 PID: 9745 Comm: kworker/u2:3 Tainted: G W 4.12.0-03002-gec979a4-dirty #40 [ 5668.772796] Hardware name: Nokia RX-51 board [ 5668.773071] Workqueue: phy1 wl1251_irq_work [ 5668.773345] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 5668.773803] [] (show_stack) from [] (do_raw_spin_lock+0x6c/0xa0) [ 5668.774230] [] (do_raw_spin_lock) from [] (_raw_spin_lock_irqsave+0x10/0x18) [ 5668.774658] [] (_raw_spin_lock_irqsave) from [] (wl1251_op_tx+0x38/0x5c) [ 5668.775115] [] (wl1251_op_tx) from [] (ieee80211_tx_frags+0x188/0x1c0) [ 5668.775543] [] (ieee80211_tx_frags) from [] (__ieee80211_tx+0x6c/0x130) [ 5668.775970] [] (__ieee80211_tx) from [] (ieee80211_tx+0xdc/0x104) [ 5668.776367] [] (ieee80211_tx) from [] (__ieee80211_subif_start_xmit+0x454/0x8c8) [ 5668.776824] [] (__ieee80211_subif_start_xmit) from [] (ieee80211_subif_start_xmit+0x30/0x2fc) [ 5668.777343] [] (ieee80211_subif_start_xmit) from [] (dev_hard_start_xmit+0x80/0x118) ... by adding the missing spin_lock_init(). 
Reported-by: Pavel Machek Cc: Kalle Valo Signed-off-by: Cong Wang Acked-by: Pavel Machek Signed-off-by: Kalle Valo Signed-off-by: Pavel Machek Cc: stable@kernel.org Signed-off-by: David S. Miller --- drivers/net/wireless/ti/wl1251/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c index 08f0477f78d9..9915d83a4a30 100644 --- a/drivers/net/wireless/ti/wl1251/main.c +++ b/drivers/net/wireless/ti/wl1251/main.c @@ -1571,6 +1571,7 @@ struct ieee80211_hw *wl1251_alloc_hw(void) wl->state = WL1251_STATE_OFF; mutex_init(&wl->mutex); + spin_lock_init(&wl->wl_lock); wl->tx_mgmt_frm_rate = DEFAULT_HW_GEN_TX_RATE; wl->tx_mgmt_frm_mod = DEFAULT_HW_GEN_MODULATION_TYPE; From 25cc72a33835ed8a6f53180a822cadab855852ac Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 1 Sep 2017 10:52:31 +0200 Subject: [PATCH 116/118] mlxsw: spectrum: Forbid linking to devices that have uppers The mlxsw driver relies on NETDEV_CHANGEUPPER events to configure the device in case a port is enslaved to a master netdev such as a bridge or bond. Since the driver ignores events unrelated to its ports and their uppers, it's possible to engineer situations in which the device's data path differs from the kernel's. One example of such a situation is when a port is enslaved to a bond that is already enslaved to a bridge. When the bond was enslaved, the driver ignored the event - as the bond wasn't one of its uppers - and therefore a bridge port instance isn't created in the device. Until such configurations are supported, forbid them by checking that the upper device doesn't have uppers of its own. Fixes: 0d65fc13042f ("mlxsw: spectrum: Implement LAG port join/leave") Signed-off-by: Ido Schimmel Reported-by: Nogah Frankel Tested-by: Nogah Frankel Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 6 ++++++ include/linux/netdevice.h | 2 ++ net/core/dev.c | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 60bf8f27cc00..c6a3e61b53bd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -4139,6 +4139,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, return -EINVAL; if (!info->linking) break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; if (netif_is_lag_master(upper_dev) && !mlxsw_sp_master_lag_check(mlxsw_sp, upper_dev, info->upper_info)) @@ -4258,6 +4260,10 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, upper_dev = info->upper_dev; if (!netif_is_bridge_master(upper_dev)) return -EINVAL; + if (!info->linking) + break; + if (netdev_has_any_upper_dev(upper_dev)) + return -EINVAL; break; case NETDEV_CHANGEUPPER: upper_dev = info->upper_dev; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 779b23595596..c99ba7914c0a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3866,6 +3866,8 @@ int netdev_walk_all_upper_dev_rcu(struct net_device *dev, bool netdev_has_upper_dev_all_rcu(struct net_device *dev, struct net_device *upper_dev); +bool netdev_has_any_upper_dev(struct net_device *dev); + void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter); void *netdev_lower_get_next_private_rcu(struct net_device *dev, diff --git a/net/core/dev.c b/net/core/dev.c index 818dfa6e7ab5..86b4b0a79e7a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5668,12 +5668,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu); * Find out if a device is linked to an upper device and return true in case * it is. The caller must hold the RTNL lock. 
*/ -static bool netdev_has_any_upper_dev(struct net_device *dev) +bool netdev_has_any_upper_dev(struct net_device *dev) { ASSERT_RTNL(); return !list_empty(&dev->adj_list.upper); } +EXPORT_SYMBOL(netdev_has_any_upper_dev); /** * netdev_master_upper_dev_get - Get master upper device From 79e99bdd60b484af9afe0147e85a13e66d5c1cdb Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 1 Sep 2017 12:22:25 +0300 Subject: [PATCH 117/118] bridge: switchdev: Clear forward mark when transmitting packet Commit 6bc506b4fb06 ("bridge: switchdev: Add forward mark support for stacked devices") added the 'offload_fwd_mark' bit to the skb in order to allow drivers to indicate to the bridge driver that they already forwarded the packet in L2. In case the bit is set, before transmitting the packet from each port, the port's mark is compared with the mark stored in the skb's control block. If both marks are equal, we know the packet arrived from a switch device that already forwarded the packet and it's not re-transmitted. However, if the packet is transmitted from the bridge device itself (e.g., br0), we should clear the 'offload_fwd_mark' bit as the mark stored in the skb's control block isn't valid. This scenario can happen in rare cases where a packet was trapped during L3 forwarding and forwarded by the kernel to a bridge device. Fixes: 6bc506b4fb06 ("bridge: switchdev: Add forward mark support for stacked devices") Signed-off-by: Ido Schimmel Reported-by: Yotam Gigi Tested-by: Yotam Gigi Reviewed-by: Jiri Pirko Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_device.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 861ae2a165f4..5a7be3bddfa9 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -53,6 +53,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) brstats->tx_bytes += skb->len; u64_stats_update_end(&brstats->syncp); +#ifdef CONFIG_NET_SWITCHDEV + skb->offload_fwd_mark = 0; +#endif BR_INPUT_SKB_CB(skb)->brdev = dev; skb_reset_mac_header(skb); From e8a732d1bc3ac313e22249c13a153c3fe54aa577 Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Fri, 1 Sep 2017 14:42:30 +0200 Subject: [PATCH 118/118] udp: fix secpath leak After commit dce4551cb2ad ("udp: preserve head state for IP_CMSG_PASSSEC") we preserve the secpath for the whole skb lifecycle, but we also end up leaking a reference to it. We must clear the head state on skb reception, if secpath is present. Fixes: dce4551cb2ad ("udp: preserve head state for IP_CMSG_PASSSEC") Signed-off-by: Yossi Kuperman Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a6dc48d76a29..62344804baae 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1176,7 +1176,7 @@ static void udp_set_dev_scratch(struct sk_buff *skb) scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); #endif - if (likely(!skb->_skb_refdst)) + if (likely(!skb->_skb_refdst && !skb_sec_path(skb))) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; }