From 5ffd5cddd4d353fe18c01b89397dcad02035bb95 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Sep 2013 17:48:47 +0200 Subject: [PATCH 1/7] net: netlink: filter particular protocols from analyzers Fix finer-grained control and let only a whitelist of allowed netlink protocols pass, in our case related to networking. If later on, other subsystems decide they want to add their protocol as well to the list of allowed protocols they shall simply add it. While at it, we also need to tell what protocol is in use otherwise BPF_S_ANC_PROTOCOL can not pick it up (as it's not filled out). Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a17dda1bbee0..8df7f64c6db3 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -168,16 +168,43 @@ int netlink_remove_tap(struct netlink_tap *nt) } EXPORT_SYMBOL_GPL(netlink_remove_tap); +static bool netlink_filter_tap(const struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + bool pass = false; + + /* We take the more conservative approach and + * whitelist socket protocols that may pass. + */ + switch (sk->sk_protocol) { + case NETLINK_ROUTE: + case NETLINK_USERSOCK: + case NETLINK_SOCK_DIAG: + case NETLINK_NFLOG: + case NETLINK_XFRM: + case NETLINK_FIB_LOOKUP: + case NETLINK_NETFILTER: + case NETLINK_GENERIC: + pass = true; + break; + } + + return pass; +} + static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; + struct sock *sk = skb->sk; int ret = -ENOMEM; dev_hold(dev); nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; + nskb->protocol = htons((u16) sk->sk_protocol); + ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); @@ -192,6 +219,9 @@ static void __netlink_deliver_tap(struct sk_buff *skb) int ret; struct netlink_tap *tmp; + if (!netlink_filter_tap(skb)) + return; + list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) From 16edfe7ee02dd7fcc13855ea19158333529533b2 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 5 Sep 2013 15:36:09 -0700 Subject: [PATCH 2/7] tcp: fix no cwnd growth after timeout In commit 0f7cc9a3 "tcp: increase throughput when reordering is high", it only allows cwnd to increase in Open state. This mistakenly disables slow start after timeout (CA_Loss). Moreover cwnd won't grow if the state moves from Disorder to Open later in tcp_fastretrans_alert(). Therefore the correct logic should be to allow cwnd to grow as long as the data is received in order in Open, Loss, or even Disorder state. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1969e16d936d..894bc174f472 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3162,16 +3162,14 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) /* If reordering is high then always grow cwnd whenever data is * delivered regardless of its ordering. Otherwise stay conservative - * and only grow cwnd on in-order delivery in Open state, and retain - * cwnd in Disordered state (RFC5681). A stretched ACK with + * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) return flag & FLAG_FORWARD_PROGRESS; - return inet_csk(sk)->icsk_ca_state == TCP_CA_Open && - flag & FLAG_DATA_ACKED; + return flag & FLAG_DATA_ACKED; } /* Check that window update is acceptable. From 9b0be651ccb2fbe0e8fb08427d90237719b1022e Mon Sep 17 00:00:00 2001 From: Dmitry Kravkov Date: Fri, 6 Sep 2013 10:35:28 +0300 Subject: [PATCH 3/7] bnx2x: fix broken compilation with CONFIG_BNX2X_SRIOV is not set Since commit 60cad4e67bd6ff400e7ea61fe762b3042b12ae9d "bnx2x: VF RSS support - VF side" fails to compile w/o CONFIG_BNX2X_SRIOV option. Reported-by: Eric Dumazet CC: Ariel Elior Signed-off-by: Dmitry Kravkov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h index 2a8c1dc65d9c..059f0d460af2 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.h @@ -816,6 +816,8 @@ static inline int bnx2x_vfpf_setup_q(struct bnx2x *bp, struct bnx2x_fastpath *fp static inline int bnx2x_vfpf_teardown_queue(struct bnx2x *bp, int qidx) {return 0; } static inline int bnx2x_vfpf_config_mac(struct bnx2x *bp, u8 *addr, u8 vf_qid, bool set) {return 0; } +static inline int bnx2x_vfpf_config_rss(struct bnx2x *bp, + struct bnx2x_config_rss_params *params) {return 0; } static inline int bnx2x_vfpf_set_mcast(struct net_device *dev) {return 0; } static inline int bnx2x_vfpf_storm_rx_mode(struct bnx2x *bp) {return 0; } static inline int bnx2x_iov_nic_init(struct bnx2x *bp) {return 0; } From 937e5c3d63a17c0543912522ec4061a995558a70 Mon Sep 17 00:00:00 2001 From: Eilon Greenstein Date: Fri, 6 Sep 2013 12:55:02 +0300 Subject: [PATCH 4/7] bnx2x: Restore a call to config_init Commit c0a77ec74f295013d7ba3204dd3ed25fccf83cb4 'bnx2x: Add missing braces in bnx2x:bnx2x_link_initialize' identified indentation problem, but resolved it by adding braces instead of fixing the indentation. The braces now prevents a config_init call in some cases, though it should be called regardless of that condition. This patch removes the braces and fix the confusing indentation that caused this mess. Signed-off-by: Eilon Greenstein CC: Dave Jones Tested-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c index 664568420c9b..d60a2ea3da19 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c @@ -6501,13 +6501,10 @@ static int bnx2x_link_initialize(struct link_params *params, struct bnx2x_phy *phy = ¶ms->phy[INT_PHY]; if (vars->line_speed == SPEED_AUTO_NEG && (CHIP_IS_E1x(bp) || - CHIP_IS_E2(bp))) { + CHIP_IS_E2(bp))) bnx2x_set_parallel_detection(phy, params); - if (params->phy[INT_PHY].config_init) - params->phy[INT_PHY].config_init(phy, - params, - vars); - } + if (params->phy[INT_PHY].config_init) + params->phy[INT_PHY].config_init(phy, params, vars); } /* Init external phy*/ From 635ad31002fad736af85adfc96bb9408fb813a7b Mon Sep 17 00:00:00 2001 From: Michael Opdenacker Date: Fri, 6 Sep 2013 13:49:22 +0200 Subject: [PATCH 5/7] mlx5: remove unused MLX5_DEBUG param in Kconfig This patch proposes to remove the MLX5_DEBUG kernel configuration parameter defined in drivers/net/ethernet/mellanox/mlx5/core/Kconfig, but used nowhere in the makefiles and source code. This could also be fixed by using this parameter, but this may be a leftover from driver development... Signed-off-by: Michael Opdenacker Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 21962828925a..157fe8df2c3e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -6,13 +6,3 @@ config MLX5_CORE tristate depends on PCI && X86 default n - -config MLX5_DEBUG - bool "Verbose debugging output" if (MLX5_CORE && EXPERT) - depends on MLX5_CORE - default y - ---help--- - This option causes debugging code to be compiled into the - mlx5_core driver. The output can be turned on via the - debug_mask module parameter (which can also be set after - the driver is loaded through sysfs). From 0042d0c840c616186a5b09207a0e77fab7581db3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 6 Sep 2013 16:58:00 +0100 Subject: [PATCH 6/7] net: add documentation for BQL helpers Provide a kernel-doc comment documentation for the BQL helpers: - netdev_sent_queue - netdev_completed_queue - netdev_reset_queue Similarly to how it is done for the other functions, the documentation only covers the function operating on struct net_device and not struct netdev_queue. Signed-off-by: Florian Fainelli Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8ed4ae943053..041b42a305f6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2101,6 +2101,15 @@ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, #endif } +/** + * netdev_sent_queue - report the number of bytes queued to hardware + * @dev: network device + * @bytes: number of bytes queued to the hardware device queue + * + * Report the number of bytes queued for sending/completion to the network + * device hardware queue. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes + */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes); @@ -2130,6 +2139,16 @@ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, #endif } +/** + * netdev_completed_queue - report bytes and packets completed by device + * @dev: network device + * @pkts: actual number of packets sent over the medium + * @bytes: actual number of bytes sent over the medium + * + * Report the number of bytes and packets transmitted by the network device + * hardware queue over the physical medium, @bytes must exactly match the + * @bytes amount passed to netdev_sent_queue() + */ static inline void netdev_completed_queue(struct net_device *dev, unsigned int pkts, unsigned int bytes) { @@ -2144,6 +2163,13 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) #endif } +/** + * netdev_reset_queue - reset the packets and bytes count of a network device + * @dev_queue: network device + * + * Reset the bytes and packet count of a network device and clear the + * software flow control OFF bit for this network device + */ static inline void netdev_reset_queue(struct net_device *dev_queue) { netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0)); From 4e4f1fc226816905c937f9b29dabe351075dfe0f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Sep 2013 10:35:58 -0700 Subject: [PATCH 7/7] tcp: properly increase rcv_ssthresh for ofo packets TCP receive window handling is multi staged. A socket has a memory budget, static or dynamic, in sk_rcvbuf. Because we do not really know how this memory budget translates to a TCP window (payload), TCP announces a small initial window (about 20 MSS). When a packet is received, we increase TCP rcv_win depending on the payload/truesize ratio of this packet. Good citizen packets give a hint that it's reasonable to have rcv_win = sk_rcvbuf/2 This heuristic takes place in tcp_grow_window() Problem is : We currently call tcp_grow_window() only for in-order packets. This means that reorders or packet losses stop proper grow of rcv_win, and senders are unable to benefit from fast recovery, or proper reordering level detection. Really, a packet being stored in OFO queue is not a bad citizen. It should be part of the game as in-order packets. In our traces, we very often see sender is limited by linux small receive windows, even if linux hosts use autotuning (DRS) and should allow rcv_win to grow to ~3MB. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 894bc174f472..25a89eaa669d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4139,6 +4139,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (!tcp_try_coalesce(sk, skb1, skb, &fragstolen)) { __skb_queue_after(&tp->out_of_order_queue, skb1, skb); } else { + tcp_grow_window(sk, skb); kfree_skb_partial(skb, fragstolen); skb = NULL; } @@ -4214,8 +4215,10 @@ add_sack: if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); end: - if (skb) + if (skb) { + tcp_grow_window(sk, skb); skb_set_owner_r(skb, sk); + } } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,