From 584ec4355355ffac43571b02a314d43eb2f7fcbf Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 16 Jul 2013 10:49:41 -0400 Subject: [PATCH 01/19] atl1e: unmap partially mapped skb on dma error and free skb Ben Hutchings pointed out that my recent update to atl1e in commit 352900b583b2852152a1e05ea0e8b579292e731e ("atl1e: fix dma mapping warnings") was missing a bit of code. Specifically, it reset the hardware tx ring to its original state when we hit a dma error, but didn't unmap any existing mappings from the operation. This patch fixes that up. It also remembers to free the skb in the event that an error occurs, so we don't leak. Untested, as I don't have hardware. I think it's pretty straightforward, but please review closely. Signed-off-by: Neil Horman CC: Ben Hutchings CC: Jay Cliburn CC: Chris Snook CC: "David S. Miller" Signed-off-by: David S. Miller --- .../net/ethernet/atheros/atl1e/atl1e_main.c | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index 6d1a62a84c9d..1966444590f6 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -1678,6 +1678,7 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter, u16 f; int segment; int ring_start = adapter->tx_ring.next_to_use; + int ring_end; nr_frags = skb_shinfo(skb)->nr_frags; segment = (tpd->word3 >> TPD_SEGMENT_EN_SHIFT) & TPD_SEGMENT_EN_MASK; @@ -1721,6 +1722,15 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter, map_len, PCI_DMA_TODEVICE); if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) { + /* We need to unwind the mappings we've done */ + ring_end = adapter->tx_ring.next_to_use; + adapter->tx_ring.next_to_use = ring_start; + while (adapter->tx_ring.next_to_use != ring_end) { + tpd = atl1e_get_tpd(adapter); + tx_buffer = atl1e_get_tx_buffer(adapter, tpd); + pci_unmap_single(adapter->pdev, tx_buffer->dma, + tx_buffer->length, PCI_DMA_TODEVICE); + } /* Reset the tx rings next pointer */ adapter->tx_ring.next_to_use = ring_start; return -ENOSPC; @@ -1763,6 +1773,16 @@ static int atl1e_tx_map(struct atl1e_adapter *adapter, DMA_TO_DEVICE); if (dma_mapping_error(&adapter->pdev->dev, tx_buffer->dma)) { + /* We need to unwind the mappings we've done */ + ring_end = adapter->tx_ring.next_to_use; + adapter->tx_ring.next_to_use = ring_start; + while (adapter->tx_ring.next_to_use != ring_end) { + tpd = atl1e_get_tpd(adapter); + tx_buffer = atl1e_get_tx_buffer(adapter, tpd); + dma_unmap_page(&adapter->pdev->dev, tx_buffer->dma, + tx_buffer->length, DMA_TO_DEVICE); + } + /* Reset the ring next to use pointer */ adapter->tx_ring.next_to_use = ring_start; return -ENOSPC; @@ -1853,8 +1873,10 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb, return NETDEV_TX_OK; } - if (atl1e_tx_map(adapter, skb, tpd)) + if (atl1e_tx_map(adapter, skb, tpd)) { + dev_kfree_skb_any(skb); goto out; + } atl1e_tx_queue(adapter, tpd_req, tpd); From f2f79cca13e36972978ffd1fb6483c8a1141b510 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Sat, 13 Jul 2013 11:26:51 +0300 Subject: [PATCH 02/19] ndisc: bool initializations should use true and false Signed-off-by: Daniel Baluta Signed-off-by: David S.
Miller --- net/ipv6/ndisc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b3b5730b48c5..24c03396e008 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -479,7 +479,7 @@ static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, if (ifp) { src_addr = solicited_addr; if (ifp->flags & IFA_F_OPTIMISTIC) - override = 0; + override = false; inc_opt |= ifp->idev->cnf.force_tllao; in6_ifa_put(ifp); } else { @@ -557,7 +557,7 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, } if (ipv6_addr_any(saddr)) - inc_opt = 0; + inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev); @@ -790,7 +790,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) { if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) && skb->pkt_type != PACKET_HOST && - inc != 0 && + inc && idev->nd_parms->proxy_delay != 0) { /* * for anycast or proxy, From 8a73125c36809527236e1d8d367a09a3f0e9f9e1 Mon Sep 17 00:00:00 2001 From: Dragos Foianu Date: Sat, 13 Jul 2013 14:43:00 +0100 Subject: [PATCH 03/19] ethtool: fixed trailing statements in ethtool Applied fixes suggested by checkpatch.pl. Signed-off-by: Dragos Foianu Signed-off-by: David S. Miller --- net/core/ethtool.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index ab5fa6336c84..78e9d9223e40 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -279,11 +279,16 @@ static u32 __ethtool_get_flags(struct net_device *dev) { u32 flags = 0; - if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; - if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) flags |= ETH_FLAG_RXVLAN; - if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) flags |= ETH_FLAG_TXVLAN; - if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; - if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; + if (dev->features & NETIF_F_LRO) + flags |= ETH_FLAG_LRO; + if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) + flags |= ETH_FLAG_RXVLAN; + if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) + flags |= ETH_FLAG_TXVLAN; + if (dev->features & NETIF_F_NTUPLE) + flags |= ETH_FLAG_NTUPLE; + if (dev->features & NETIF_F_RXHASH) + flags |= ETH_FLAG_RXHASH; return flags; } @@ -295,11 +300,16 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) if (data & ~ETH_ALL_FLAGS) return -EINVAL; - if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; - if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_CTAG_RX; - if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_CTAG_TX; - if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; - if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; + if (data & ETH_FLAG_LRO) + features |= NETIF_F_LRO; + if (data & ETH_FLAG_RXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_RX; + if (data & ETH_FLAG_TXVLAN) + features |= NETIF_F_HW_VLAN_CTAG_TX; + if (data & ETH_FLAG_NTUPLE) + features |= NETIF_F_NTUPLE; + if (data & ETH_FLAG_RXHASH) + features |= NETIF_F_RXHASH; /* allow changing only bits set in hw_features */ changed = (features ^ dev->features) & ETH_ALL_FEATURES; From b6a82dd233cabcc1517c0744d7a8f0b61f559caf Mon Sep 17 00:00:00 2001 From: Dragos Foianu Date: Sat, 13 Jul 2013 15:03:55 +0100 Subject: [PATCH 04/19] net/irda: fixed style issues in irlan_eth Applied fixes suggested by checkpatch.pl Signed-off-by: Dragos Foianu Signed-off-by: David S. 
Miller --- net/irda/irlan/irlan_eth.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index d14152e866d9..ffcec225b5d9 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -44,12 +44,12 @@ static int irlan_eth_open(struct net_device *dev); static int irlan_eth_close(struct net_device *dev); static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev); -static void irlan_eth_set_multicast_list( struct net_device *dev); +static void irlan_eth_set_multicast_list(struct net_device *dev); static const struct net_device_ops irlan_eth_netdev_ops = { - .ndo_open = irlan_eth_open, - .ndo_stop = irlan_eth_close, - .ndo_start_xmit = irlan_eth_xmit, + .ndo_open = irlan_eth_open, + .ndo_stop = irlan_eth_close, + .ndo_start_xmit = irlan_eth_xmit, .ndo_set_rx_mode = irlan_eth_set_multicast_list, .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, @@ -110,7 +110,7 @@ static int irlan_eth_open(struct net_device *dev) { struct irlan_cb *self = netdev_priv(dev); - IRDA_DEBUG(2, "%s()\n", __func__ ); + IRDA_DEBUG(2, "%s()\n", __func__); /* Ready to play! */ netif_stop_queue(dev); /* Wait until data link is ready */ @@ -137,7 +137,7 @@ static int irlan_eth_close(struct net_device *dev) { struct irlan_cb *self = netdev_priv(dev); - IRDA_DEBUG(2, "%s()\n", __func__ ); + IRDA_DEBUG(2, "%s()\n", __func__); /* Stop device */ netif_stop_queue(dev); @@ -310,35 +310,32 @@ static void irlan_eth_set_multicast_list(struct net_device *dev) { struct irlan_cb *self = netdev_priv(dev); - IRDA_DEBUG(2, "%s()\n", __func__ ); + IRDA_DEBUG(2, "%s()\n", __func__); /* Check if data channel has been connected yet */ if (self->client.state != IRLAN_DATA) { - IRDA_DEBUG(1, "%s(), delaying!\n", __func__ ); + IRDA_DEBUG(1, "%s(), delaying!\n", __func__); return; } if (dev->flags & IFF_PROMISC) { /* Enable promiscuous mode */ IRDA_WARNING("Promiscuous mode not implemented by IrLAN!\n"); - } - else if ((dev->flags & IFF_ALLMULTI) || + } else if ((dev->flags & IFF_ALLMULTI) || netdev_mc_count(dev) > HW_MAX_ADDRS) { /* Disable promiscuous mode, use normal mode. */ - IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__ ); + IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__); /* hardware_set_filter(NULL); */ irlan_set_multicast_filter(self, TRUE); - } - else if (!netdev_mc_empty(dev)) { - IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__ ); + } else if (!netdev_mc_empty(dev)) { + IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __func__); /* Walk the address list, and load the filter */ /* hardware_set_filter(dev->mc_list); */ irlan_set_multicast_filter(self, TRUE); - } - else { - IRDA_DEBUG(4, "%s(), Clearing multicast filter\n", __func__ ); + } else { + IRDA_DEBUG(4, "%s(), Clearing multicast filter\n", __func__); irlan_set_multicast_filter(self, FALSE); } From 31bd29776b8575b8cf7c25bbf7496a460278dc83 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 15 Jul 2013 13:26:27 +0200 Subject: [PATCH 05/19] bgmac: add dependency to phylib bgmac is using functions from phylib, add the dependency. Signed-off-by: Hauke Mehrtens Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig index 1d680baf43d6..52c96036dcc4 100644 --- a/drivers/net/ethernet/broadcom/Kconfig +++ b/drivers/net/ethernet/broadcom/Kconfig @@ -131,6 +131,7 @@ config BNX2X_SRIOV config BGMAC tristate "BCMA bus GBit core support" depends on BCMA_HOST_SOC && HAS_DMA + select PHYLIB ---help--- This driver supports GBit MAC and BCM4706 GBit MAC cores on BCMA bus. They can be found on BCM47xx SoCs and provide gigabit ethernet. From 9a0f06fee1f453370e87b6568dc1d39d676f53fc Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Mon, 15 Jul 2013 08:56:45 -0600 Subject: [PATCH 06/19] mlx5 core: Fix __udivdi3 when compiling for 32 bit arches Cc: Eli Cohen Signed-off-by: Tim Gardner Acked-by: Randy Dunlap Reported-by: Randy Dunlap Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 4273c06e2e96..9c7194b26ee2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -156,7 +156,7 @@ static ssize_t average_read(struct file *filp, char __user *buf, size_t count, stats = filp->private_data; spin_lock(&stats->lock); if (stats->n) - field = stats->sum / stats->n; + field = div64_u64(stats->sum, stats->n); spin_unlock(&stats->lock); ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field); if (ret > 0) { From 21d1196a35f5686c4323e42a62fdb4b23b0ab4a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Jul 2013 20:03:19 -0700 Subject: [PATCH 07/19] ipv4: set transport header earlier commit 45f00f99d6e ("ipv4: tcp: clean up tcp_v4_early_demux()") added a performance regression for non GRO traffic, basically disabling IP early demux. IPv6 stack resets transport header in ip6_rcv() before calling IP early demux in ip6_rcv_finish(), while IPv4 does this only in ip_local_deliver_finish(), _after_ IP early demux. GRO traffic happened to enable IP early demux because transport header is also set in inet_gro_receive() Instead of reverting the faulty commit, we can make IPv4/IPv6 behave the same : transport_header should be set in ip_rcv() instead of ip_local_deliver_finish() ip_local_deliver_finish() can also use skb_network_header_len() which is faster than ip_hdrlen() Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 3da817b89e9b..15e3e683adec 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -190,10 +190,7 @@ static int ip_local_deliver_finish(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); - __skb_pull(skb, ip_hdrlen(skb)); - - /* Point into the IP datagram, just past the header. 
*/ - skb_reset_transport_header(skb); + __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); { @@ -437,6 +434,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; } + skb->transport_header = skb->network_header + iph->ihl*4; + /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); From 82a19eb8c02ab98bfe0bf6fa4915de370acb2858 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 16 Jul 2013 13:36:33 +0800 Subject: [PATCH 08/19] macvtap: fix the missing ret value of TUNSETQUEUE Commit 441ac0fcaadc76ad09771812382345001dd2b813 (macvtap: Convert to using rtnl lock) forgot to return what macvtap_ioctl_set_queue() returns to its caller. This may break the multiqueue API by always falling through to TUNGETFEATURES. Cc: Vlad Yasevich Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 876c72246ae9..0e5492ec753a 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -1107,6 +1107,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, rtnl_lock(); ret = macvtap_ioctl_set_queue(file, u); rtnl_unlock(); + return ret; case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR | From 9a0f06fee1f453370e87b6568dc1d39d676f53fc Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 16 Jul 2013 13:36:34 +0800 Subject: [PATCH 09/19] macvtap: do not assume 802.1Q when sending vlan packets The hard-coded 802.1Q proto will break 802.1ad traffic. So switch to using skb->vlan_proto. Cc: Basil Gor Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0e5492ec753a..a7c5654a2e5c 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -873,7 +873,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; - veth.h_vlan_proto = htons(ETH_P_8021Q); + veth.h_vlan_proto = skb->vlan_proto; veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); From 52fe29e4bb614367c108b717c6d7fe5953eb7af3 Mon Sep 17 00:00:00 2001 From: Sarveshwar Bandi Date: Tue, 16 Jul 2013 12:44:02 +0530 Subject: [PATCH 10/19] be2net: Fix to avoid hardware workaround when not needed The hardware workaround requesting the hardware to skip vlan insertion is necessary only when umc or qnq is enabled. Enabling this workaround in other scenarios could cause the controller to stall. Signed-off-by: Sarveshwar Bandi Signed-off-by: David S.
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 2df48bb0f1ca..181edb522450 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -782,16 +782,22 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, if (vlan_tx_tag_present(skb)) vlan_tag = be_get_tx_vlan_tag(adapter, skb); - else if (qnq_async_evt_rcvd(adapter) && adapter->pvid) - vlan_tag = adapter->pvid; + + if (qnq_async_evt_rcvd(adapter) && adapter->pvid) { + if (!vlan_tag) + vlan_tag = adapter->pvid; + /* f/w workaround to set skip_hw_vlan = 1, informs the F/W to + * skip VLAN insertion + */ + if (skip_hw_vlan) + *skip_hw_vlan = true; + } if (vlan_tag) { skb = __vlan_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); if (unlikely(!skb)) return skb; skb->vlan_tci = 0; - if (skip_hw_vlan) - *skip_hw_vlan = true; } /* Insert the outer VLAN, if any */ From ae8e9c5a1a7889315229a741fd48a5dd0bc2964c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 16 Jul 2013 17:09:15 -0700 Subject: [PATCH 11/19] net: Fix sysfs_format_mac() code duplication. It's just a duplicate implementation of "%*phC". Thanks to Joe Perches for showing that we had exactly this support in the lib/vsprintf.c code already. Signed-off-by: David S. Miller --- net/ethernet/eth.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 5359560926bc..be1f64d35358 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -401,27 +401,8 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, } EXPORT_SYMBOL(alloc_etherdev_mqs); -static size_t _format_mac_addr(char *buf, int buflen, - const unsigned char *addr, int len) -{ - int i; - char *cp = buf; - - for (i = 0; i < len; i++) { - cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]); - if (i == len - 1) - break; - cp += scnprintf(cp, buflen - (cp - buf), ":"); - } - return cp - buf; -} - ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { - size_t l; - - l = _format_mac_addr(buf, PAGE_SIZE, addr, len); - l += scnprintf(buf + l, PAGE_SIZE - l, "\n"); - return (ssize_t)l; + return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); } EXPORT_SYMBOL(sysfs_format_mac); From f45708209dc445bac0844f6ce86e315a2ffe8a29 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 16 Jul 2013 23:01:20 -0700 Subject: [PATCH 12/19] hyperv: Fix the NETIF_F_SG flag setting in netvsc SG mode is not currently supported by netvsc, so remove this flag for now. Otherwise, it will be unconditionally enabled by commit ec5f0615642 "Kill link between CSUM and SG features" Previously, the SG feature is disabled because CSUM is not set here. Signed-off-by: Haiyang Zhang Reviewed-by: K. Y. Srinivasan Signed-off-by: David S. 
Miller --- drivers/net/hyperv/netvsc_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 4dccead586be..23a0fff0df52 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -431,8 +431,8 @@ static int netvsc_probe(struct hv_device *dev, net->netdev_ops = &device_ops; /* TODO: Add GSO and Checksum offload */ - net->hw_features = NETIF_F_SG; - net->features = NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_TX; + net->hw_features = 0; + net->features = NETIF_F_HW_VLAN_CTAG_TX; SET_ETHTOOL_OPS(net, ðtool_ops); SET_NETDEV_DEV(net, &dev->device); From fe5c3561e6f0ac7c9546209f01351113c1b77ec8 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 13 Jul 2013 10:18:18 -0700 Subject: [PATCH 13/19] vxlan: add necessary locking on device removal The socket management is now done in workqueue (outside of RTNL) and protected by vn->sock_lock. There were two possible bugs, first the vxlan device was removed from the VNI hash table per socket without holding lock. And there was a race when device is created and the workqueue could run after deletion. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0ba1e7edbb1b..a5ba8dd7e6be 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1767,9 +1767,15 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, static void vxlan_dellink(struct net_device *dev, struct list_head *head) { + struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); + flush_workqueue(vxlan_wq); + + spin_lock(&vn->sock_lock); hlist_del_rcu(&vxlan->hlist); + spin_unlock(&vn->sock_lock); + list_del(&vxlan->next); unregister_netdevice_queue(dev, head); } From 093b9c71b6e450e375f4646ba86faed0195ec7df Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 17 Jul 2013 08:09:37 +0100 Subject: [PATCH 14/19] xen-netfront: pull on receive skb may need to happen earlier Due to commit 3683243b ("xen-netfront: use __pskb_pull_tail to ensure linear area is big enough on RX") xennet_fill_frags() may end up filling MAX_SKB_FRAGS + 1 fragments in a receive skb, and only reduce the fragment count subsequently via __pskb_pull_tail(). That's a result of xennet_get_responses() allowing a maximum of one more slot to be consumed (and intermediately transformed into a fragment) if the head slot has a size less than or equal to RX_COPY_THRESHOLD. Hence we need to adjust xennet_fill_frags() to pull earlier if we reached the maximum fragment count - due to the described behavior of xennet_get_responses() this guarantees that at least the first fragment will get completely consumed, and hence the fragment count reduced. In order to not needlessly call __pskb_pull_tail() twice, make the original call conditional upon the pull target not having been reached yet, and defer the newly added one as much as possible (an alternative would have been to always call the function right before the call to xennet_fill_frags(), but that would imply more frequent cases of needing to call it twice). Signed-off-by: Jan Beulich Acked-by: Wei Liu Cc: Ian Campbell Cc: stable@vger.kernel.org (3.6 onwards) Acked-by: Ian Campbell Signed-off-by: David S. 
Miller --- drivers/net/xen-netfront.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ff7f111fffee..36808bf25677 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -286,8 +286,7 @@ no_skb: break; } - __skb_fill_page_desc(skb, 0, page, 0, 0); - skb_shinfo(skb)->nr_frags = 1; + skb_add_rx_frag(skb, 0, page, 0, 0, PAGE_SIZE); __skb_queue_tail(&np->rx_batch, skb); } @@ -831,7 +830,6 @@ static RING_IDX xennet_fill_frags(struct netfront_info *np, struct sk_buff_head *list) { struct skb_shared_info *shinfo = skb_shinfo(skb); - int nr_frags = shinfo->nr_frags; RING_IDX cons = np->rx.rsp_cons; struct sk_buff *nskb; @@ -840,19 +838,21 @@ static RING_IDX xennet_fill_frags(struct netfront_info *np, RING_GET_RESPONSE(&np->rx, ++cons); skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0]; - __skb_fill_page_desc(skb, nr_frags, - skb_frag_page(nfrag), - rx->offset, rx->status); + if (shinfo->nr_frags == MAX_SKB_FRAGS) { + unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to; - skb->data_len += rx->status; + BUG_ON(pull_to <= skb_headlen(skb)); + __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); + } + BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS); + + skb_add_rx_frag(skb, shinfo->nr_frags, skb_frag_page(nfrag), + rx->offset, rx->status, PAGE_SIZE); skb_shinfo(nskb)->nr_frags = 0; kfree_skb(nskb); - - nr_frags++; } - shinfo->nr_frags = nr_frags; return cons; } @@ -933,7 +933,8 @@ static int handle_incoming_queue(struct net_device *dev, while ((skb = __skb_dequeue(rxq)) != NULL) { int pull_to = NETFRONT_SKB_CB(skb)->pull_to; - __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); + if (pull_to > skb_headlen(skb)) + __pskb_pull_tail(skb, pull_to - skb_headlen(skb)); /* Ethernet work: Delayed to here as it peeks the header. */ skb->protocol = eth_type_trans(skb, dev); @@ -1019,16 +1020,10 @@ err: skb_shinfo(skb)->frags[0].page_offset = rx->offset; skb_frag_size_set(&skb_shinfo(skb)->frags[0], rx->status); skb->data_len = rx->status; + skb->len += rx->status; i = xennet_fill_frags(np, skb, &tmpq); - /* - * Truesize is the actual allocation size, even if the - * allocation is only partially used. - */ - skb->truesize += PAGE_SIZE * skb_shinfo(skb)->nr_frags; - skb->len += skb->data_len; - if (rx->flags & XEN_NETRXF_csum_blank) skb->ip_summed = CHECKSUM_PARTIAL; else if (rx->flags & XEN_NETRXF_data_validated) From 87f40dd6ce7042caca0b3b557e8923127f51f902 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 16 Jul 2013 08:52:30 +0200 Subject: [PATCH 15/19] pkt_sched: sch_qfq: remove a source of high packet delay/jitter QFQ+ inherits from QFQ a design choice that may cause a high packet delay/jitter and a severe short-term unfairness. As QFQ, QFQ+ uses a special quantity, the system virtual time, to track the service provided by the ideal system it approximates. When a packet is dequeued, this quantity must be incremented by the size of the packet, divided by the sum of the weights of the aggregates waiting to be served. Tracking this sum correctly is a non-trivial task, because, to preserve tight service guarantees, the decrement of this sum must be delayed in a special way [1]: this sum can be decremented only after that its value would decrease also in the ideal system approximated by QFQ+. 
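(A minimal userspace sketch, to make the size of this effect concrete: it mirrors the fixed-point update q->V += (u64)len * q->iwsum from the sch_qfq.c patch below, with FRAC_BITS and the 2^10 weight-sum ceiling taken from that code, while the 1500-byte packet and the weight sum of 11 are purely illustrative assumptions.)

#include <stdio.h>
#include <stdint.h>

#define FRAC_BITS 30                     /* same fixed-point scheme as sch_qfq.c */
#define ONE_FP    (1ULL << FRAC_BITS)
#define MAX_WSUM  1024                   /* the 2^10 ceiling the old code assumed */

int main(void)
{
	uint64_t len  = 1500;            /* bytes dequeued (illustrative) */
	uint64_t wsum = 11;              /* actual weight sum of backlogged aggregates */

	uint64_t v_old = len * (ONE_FP / MAX_WSUM); /* old: len * IWSUM */
	uint64_t v_new = len * (ONE_FP / wsum);     /* new: len * q->iwsum */

	printf("V increment with 2^10 ceiling : %llu\n", (unsigned long long)v_old);
	printf("V increment with actual wsum  : %llu\n", (unsigned long long)v_new);
	printf("ratio ~ %.1f\n", (double)v_new / (double)v_old);
	return 0;
}

With only 11 units of weight backlogged, the virtual time advances roughly 93 times faster once the real sum is used, which is what restores the interleaved service order described below.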
For efficiency, QFQ+ keeps track only of the 'instantaneous' weight sum, increased and decreased immediately as the weight of an aggregate changes, and as an aggregate is created or destroyed (which, in its turn, happens as a consequence of some class being created/destroyed/changed). However, to avoid the problems caused to service guarantees by these immediate decreases, QFQ+ increments the system virtual time using the maximum value allowed for the weight sum, 2^10, in place of the dynamic, instantaneous value. The instantaneous value of the weight sum is used only to check whether a request of weight increase or a class creation can be satisfied. Unfortunately, the problems caused by this choice are worse than the temporary degradation of the service guarantees that may occur, when a class is changed or destroyed, if the instantaneous value of the weight sum was used to update the system virtual time. In fact, the fraction of the link bandwidth guaranteed by QFQ+ to each aggregate is equal to the ratio between the weight of the aggregate and the sum of the weights of the competing aggregates. The packet delay guaranteed to the aggregate is instead inversely proportional to the guaranteed bandwidth. By using the maximum possible value, and not the actual value of the weight sum, QFQ+ provides each aggregate with the worst possible service guarantees, and not with service guarantees related to the actual set of competing aggregates. To see the consequences of this fact, consider the following simple example. Suppose that only the following aggregates are backlogged, i.e., that only the classes in the following aggregates have packets to transmit: one aggregate with weight 10, say A, and ten aggregates with weight 1, say B1, B2, ..., B10. In particular, suppose that these aggregates are always backlogged. Given the weight distribution, the smoothest and fairest service order would be: A B1 A B2 A B3 A B4 A B5 A B6 A B7 A B8 A B9 A B10 A B1 A B2 ... QFQ+ would provide exactly this optimal service if it used the actual value for the weight sum instead of the maximum possible value, i.e., 11 instead of 2^10. In contrast, since QFQ+ uses the latter value, it serves aggregates as follows (easy to prove and to reproduce experimentally): A B1 B2 B3 B4 B5 B6 B7 B8 B9 B10 A A A A A A A A A A B1 B2 ... B10 A A ... By replacing 10 with N in the above example, and by increasing N, one can increase at will the maximum packet delay and the jitter experienced by the classes in aggregate A. This patch addresses this issue by just using the above 'instantaneous' value of the weight sum, instead of the maximum possible value, when updating the system virtual time. After the instantaneous weight sum is decreased, QFQ+ may deviate from the ideal service for a time interval in the order of the time to serve one maximum-size packet for each backlogged class. The worst-case extent of the deviation exhibited by QFQ+ during this time interval [1] is basically the same as of the deviation described above (but, without this patch, QFQ+ suffers from such a deviation all the time). Finally, this patch modifies the comment to the function qfq_slot_insert, to make it coherent with the fact that the weight sum used by QFQ+ can now be lower than the maximum possible value. [1] P. Valente, "Extending WF2Q+ to support a dynamic traffic mix", Proceedings of AAA-IDEA'05, June 2005. Signed-off-by: Paolo Valente Signed-off-by: David S. 
Miller --- net/sched/sch_qfq.c | 85 +++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index a7ab323849b6..8056fb4e618a 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -113,7 +113,6 @@ #define FRAC_BITS 30 /* fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) -#define IWSUM (ONE_FP/QFQ_MAX_WSUM) #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ #define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ @@ -189,6 +188,7 @@ struct qfq_sched { struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ u32 num_active_agg; /* Num. of active aggregates */ u32 wsum; /* weight sum */ + u32 iwsum; /* inverse weight sum */ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ @@ -314,6 +314,7 @@ static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, q->wsum += (int) agg->class_weight * (new_num_classes - agg->num_classes); + q->iwsum = ONE_FP / q->wsum; agg->num_classes = new_num_classes; } @@ -340,6 +341,10 @@ static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { if (!hlist_unhashed(&agg->nonfull_next)) hlist_del_init(&agg->nonfull_next); + q->wsum -= agg->class_weight; + if (q->wsum != 0) + q->iwsum = ONE_FP / q->wsum; + if (q->in_serv_agg == agg) q->in_serv_agg = qfq_choose_next_agg(q); kfree(agg); @@ -834,38 +839,60 @@ static void qfq_make_eligible(struct qfq_sched *q) } } - /* - * The index of the slot in which the aggregate is to be inserted must - * not be higher than QFQ_MAX_SLOTS-2. There is a '-2' and not a '-1' - * because the start time of the group may be moved backward by one - * slot after the aggregate has been inserted, and this would cause - * non-empty slots to be right-shifted by one position. + * The index of the slot in which the input aggregate agg is to be + * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2' + * and not a '-1' because the start time of the group may be moved + * backward by one slot after the aggregate has been inserted, and + * this would cause non-empty slots to be right-shifted by one + * position. * - * If the weight and lmax (max_pkt_size) of the classes do not change, - * then QFQ+ does meet the above contraint according to the current - * values of its parameters. In fact, if the weight and lmax of the - * classes do not change, then, from the theory, QFQ+ guarantees that - * the slot index is never higher than - * 2 + QFQ_MAX_AGG_CLASSES * ((1<budget -= len; - q->V += (u64)len * IWSUM; + q->V += (u64)len * q->iwsum; pr_debug("qfq dequeue: len %u F %lld now %lld\n", len, (unsigned long long) in_serv_agg->F, (unsigned long long) q->V); From 885291761dba2bfe04df4c0f7bb75e4c920ab82e Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 18 Jul 2013 10:55:15 +0800 Subject: [PATCH 16/19] tuntap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS We try to linearize part of the skb when the number of iov is greater than MAX_SKB_FRAGS. This is not enough since each single vector may occupy more than one pages, so zerocopy_sg_fromiovec() may still fail and may break the guest network. Solve this problem by calculate the pages needed for iov before trying to do zerocopy and switch to use copy instead of zerocopy if it needs more than MAX_SKB_FRAGS. This is done through introducing a new helper to count the pages for iov, and call uarg->callback() manually when switching from zerocopy to copy to notify vhost. 
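(Sketch, for illustration only: a userspace analogue of that "notify even when falling back to copying" rule. The zc_buf type and function names below are hypothetical; in the patch the real object is struct ubuf_info and the call is uarg->callback(uarg, false).)

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct zc_buf {                          /* hypothetical stand-in for ubuf_info */
	void (*callback)(struct zc_buf *zb, bool zerocopy_success);
};

static void producer_done(struct zc_buf *zb, bool zerocopy_success)
{
	/* the producer (vhost in the real code) releases its buffers here */
	printf("completion, zerocopy=%d\n", zerocopy_success);
}

static void send_packet(struct zc_buf *zb, const void *data, size_t len,
			size_t pages_needed, size_t max_frags)
{
	char copy[2048];

	if (pages_needed <= max_frags)
		return;                  /* zerocopy path: completion fires later */

	/* copy path: data is duplicated, so tell the producer right away */
	memcpy(copy, data, len < sizeof(copy) ? len : sizeof(copy));
	if (zb && zb->callback)
		zb->callback(zb, false);
}

int main(void)
{
	struct zc_buf zb = { .callback = producer_done };

	send_packet(&zb, "payload", 7, 40, 17); /* too many pages -> copy + notify */
	return 0;
}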
We can do further optimization on top. The bug were introduced from commit 0690899b4d4501b3505be069b9a687e68ccbe15b (tun: experimental zero copy tx support) Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/tun.c | 62 +++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 5cdcf92eb310..db690a372260 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1035,6 +1035,29 @@ static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from, return 0; } +static unsigned long iov_pages(const struct iovec *iv, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iv->iov_len)) { + offset -= iv->iov_len; + ++iv; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iv[seg].iov_base + offset; + len = iv[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} + /* Get packet from user space buffer */ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, void *msg_control, const struct iovec *iv, @@ -1082,32 +1105,18 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return -EINVAL; } - if (msg_control) - zerocopy = true; - - if (zerocopy) { - /* Userspace may produce vectors with count greater than - * MAX_SKB_FRAGS, so we need to linearize parts of the skb - * to let the rest of data to be fit in the frags. - */ - if (count > MAX_SKB_FRAGS) { - copylen = iov_length(iv, count - MAX_SKB_FRAGS); - if (copylen < offset) - copylen = 0; - else - copylen -= offset; - } else - copylen = 0; - /* There are 256 bytes to be copied in skb, so there is enough - * room for skb expand head in case it is used. + if (msg_control) { + /* There are 256 bytes to be copied in skb, so there is + * enough room for skb expand head in case it is used. * The rest of the buffer is mapped from userspace. */ - if (copylen < gso.hdr_len) - copylen = gso.hdr_len; - if (!copylen) - copylen = GOODCOPY_LEN; + copylen = gso.hdr_len ? gso.hdr_len : GOODCOPY_LEN; linear = copylen; - } else { + if (iov_pages(iv, offset + copylen, count) <= MAX_SKB_FRAGS) + zerocopy = true; + } + + if (!zerocopy) { copylen = len; linear = gso.hdr_len; } @@ -1121,8 +1130,13 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (zerocopy) err = zerocopy_sg_from_iovec(skb, iv, offset, count); - else + else { err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len); + if (!err && msg_control) { + struct ubuf_info *uarg = msg_control; + uarg->callback(uarg, false); + } + } if (err) { tun->dev->stats.rx_dropped++; From ece793fcfc417b3925844be88a6a6dc82ae8f7c6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 18 Jul 2013 10:55:16 +0800 Subject: [PATCH 17/19] macvtap: do not zerocopy if iov needs more pages than MAX_SKB_FRAGS We try to linearize part of the skb when the number of iov is greater than MAX_SKB_FRAGS. This is not enough since each single vector may occupy more than one pages, so zerocopy_sg_fromiovec() may still fail and may break the guest network. Solve this problem by calculate the pages needed for iov before trying to do zerocopy and switch to use copy instead of zerocopy if it needs more than MAX_SKB_FRAGS. 
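(A minimal userspace sketch of the page-counting idea, mirroring the rounding done by the iov_pages() helper introduced in the previous patch; the 4 KB page size, the value 17 for MAX_SKB_FRAGS and the three 100 KB segments are assumptions made purely for illustration.)

#include <stdio.h>
#include <sys/uio.h>

static unsigned long pages_for(const struct iovec *iv, unsigned long nr_segs,
			       unsigned long page_size)
{
	unsigned long pages = 0;

	for (unsigned long seg = 0; seg < nr_segs; seg++) {
		unsigned long base = (unsigned long)iv[seg].iov_base;
		unsigned long len  = iv[seg].iov_len;

		/* partial first page + full pages + partial last page */
		pages += ((base % page_size) + len + page_size - 1) / page_size;
	}
	return pages;
}

int main(void)
{
	static char buf[3][100 * 1024];          /* three 100 KB userspace buffers */
	struct iovec iv[3] = {
		{ buf[0], sizeof(buf[0]) },
		{ buf[1], sizeof(buf[1]) },
		{ buf[2], sizeof(buf[2]) },
	};
	unsigned long max_frags = 17;            /* assumed MAX_SKB_FRAGS for 4 KB pages */

	printf("segments: 3, pages: %lu, MAX_SKB_FRAGS: %lu\n",
	       pages_for(iv, 3, 4096), max_frags);
	/* 3 <= 17, yet ~75 pages cannot fit in the frags array: copy, don't zerocopy */
	return 0;
}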
This is done through introducing a new helper to count the pages for iov, and call uarg->callback() manually when switching from zerocopy to copy to notify vhost. We can do further optimization on top. This bug were introduced from b92946e2919134ebe2a4083e4302236295ea2a73 (macvtap: zerocopy: validate vectors before building skb). Cc: Michael S. Tsirkin Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 62 ++++++++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index a7c5654a2e5c..a98fb0ed6aef 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -698,6 +698,28 @@ static int macvtap_skb_to_vnet_hdr(const struct sk_buff *skb, return 0; } +static unsigned long iov_pages(const struct iovec *iv, int offset, + unsigned long nr_segs) +{ + unsigned long seg, base; + int pages = 0, len, size; + + while (nr_segs && (offset >= iv->iov_len)) { + offset -= iv->iov_len; + ++iv; + --nr_segs; + } + + for (seg = 0; seg < nr_segs; seg++) { + base = (unsigned long)iv[seg].iov_base + offset; + len = iv[seg].iov_len - offset; + size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; + pages += size; + offset = 0; + } + + return pages; +} /* Get packet from user space buffer */ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, @@ -744,31 +766,15 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (unlikely(count > UIO_MAXIOV)) goto err; - if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) - zerocopy = true; - - if (zerocopy) { - /* Userspace may produce vectors with count greater than - * MAX_SKB_FRAGS, so we need to linearize parts of the skb - * to let the rest of data to be fit in the frags. - */ - if (count > MAX_SKB_FRAGS) { - copylen = iov_length(iv, count - MAX_SKB_FRAGS); - if (copylen < vnet_hdr_len) - copylen = 0; - else - copylen -= vnet_hdr_len; - } - /* There are 256 bytes to be copied in skb, so there is enough - * room for skb expand head in case it is used. - * The rest buffer is mapped from userspace. - */ - if (copylen < vnet_hdr.hdr_len) - copylen = vnet_hdr.hdr_len; - if (!copylen) - copylen = GOODCOPY_LEN; + if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) { + copylen = vnet_hdr.hdr_len ? vnet_hdr.hdr_len : GOODCOPY_LEN; linear = copylen; - } else { + if (iov_pages(iv, vnet_hdr_len + copylen, count) + <= MAX_SKB_FRAGS) + zerocopy = true; + } + + if (!zerocopy) { copylen = len; linear = vnet_hdr.hdr_len; } @@ -780,9 +786,15 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, if (zerocopy) err = zerocopy_sg_from_iovec(skb, iv, vnet_hdr_len, count); - else + else { err = skb_copy_datagram_from_iovec(skb, 0, iv, vnet_hdr_len, len); + if (!err && m && m->msg_control) { + struct ubuf_info *uarg = m->msg_control; + uarg->callback(uarg, false); + } + } + if (err) goto err_kfree; From d4b812dea4a236f729526facf97df1a9d18e191c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jul 2013 07:19:26 -0700 Subject: [PATCH 18/19] vlan: mask vlan prio bits In commit 48cc32d38a52d0b68f91a171a8d00531edc6a46e ("vlan: don't deliver frames for unknown vlans to protocols") Florian made sure we set pkt_type to PACKET_OTHERHOST if the vlan id is set and we could find a vlan device for this particular id. But we also have a problem if prio bits are set. 
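(A small sketch decoding such a tag: the masks follow the standard 802.1Q PCP/DEI/VID layout and match the VLAN_VID_MASK-based vlan_tx_tag_get_id() helper added below; the 0x4000 value anticipates the report that follows.)

#include <stdio.h>
#include <stdint.h>

#define VLAN_VID_MASK   0x0fff          /* low 12 bits: VLAN id */
#define VLAN_PRIO_MASK  0xe000          /* top 3 bits: priority code point */
#define VLAN_PRIO_SHIFT 13

int main(void)
{
	uint16_t tci = 0x4000;          /* the tag from the report below */

	printf("tci=0x%04x vid=%u prio=%u\n",
	       tci, tci & VLAN_VID_MASK, (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT);
	/* vid == 0, so a vid-only test treats the frame as untagged, yet the
	 * non-zero tci still leaks into forwarded packets unless it is cleared. */
	return 0;
}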
Steinar reported an issue on a router receiving IPv6 frames with a vlan tag of 4000 (id 0, prio 2), and tunneled into a sit device, because skb->vlan_tci is set. Forwarded frame is completely corrupted : We can see (8100:4000) being inserted in the middle of IPv6 source address : 16:48:00.780413 IP6 2001:16d8:8100:4000:ee1c:0:9d9:bc87 > 9f94:4d95:2001:67c:29f4::: ICMP6, unknown icmp6 type (0), length 64 0x0000: 0000 0029 8000 c7c3 7103 0001 a0ae e651 0x0010: 0000 0000 ccce 0b00 0000 0000 1011 1213 0x0020: 1415 1617 1819 1a1b 1c1d 1e1f 2021 2223 0x0030: 2425 2627 2829 2a2b 2c2d 2e2f 3031 3233 It seems we are not really ready to properly cope with this right now. We can probably do better in future kernels : vlan_get_ingress_priority() should be a netdev property instead of a per vlan_dev one. For stable kernels, lets clear vlan_tci to fix the bugs. Reported-by: Steinar H. Gunderson Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 3 +-- net/8021q/vlan_core.c | 2 +- net/core/dev.c | 11 +++++++++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index cdcbafa9b39a..715c343f7c00 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -79,9 +79,8 @@ static inline int is_vlan_dev(struct net_device *dev) } #define vlan_tx_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) -#define vlan_tx_nonzero_tag_present(__skb) \ - (vlan_tx_tag_present(__skb) && ((__skb)->vlan_tci & VLAN_VID_MASK)) #define vlan_tx_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) +#define vlan_tx_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 8a15eaadc4bd..4a78c4de9f20 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -9,7 +9,7 @@ bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; __be16 vlan_proto = skb->vlan_proto; - u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; + u16 vlan_id = vlan_tx_tag_get_id(skb); struct net_device *vlan_dev; struct vlan_pcpu_stats *rx_stats; diff --git a/net/core/dev.c b/net/core/dev.c index a3d8d44cb7f4..26755dd40daa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3580,8 +3580,15 @@ ncls: } } - if (vlan_tx_nonzero_tag_present(skb)) - skb->pkt_type = PACKET_OTHERHOST; + if (unlikely(vlan_tx_tag_present(skb))) { + if (vlan_tx_tag_get_id(skb)) + skb->pkt_type = PACKET_OTHERHOST; + /* Note: we might in the future use prio bits + * and set skb->priority like in vlan_do_receive() + * For the time being, just ignore Priority Code Point + */ + skb->vlan_tci = 0; + } /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; From 3e3aac497513c669e1c62c71e1d552ea85c1d974 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jul 2013 09:35:10 -0700 Subject: [PATCH 19/19] vlan: fix a race in egress prio management egress_priority_map[] hash table updates are protected by rtnl, and we never remove elements until device is dismantled. We have to make sure that before inserting an new element in hash table, all its fields are committed to memory or else another cpu could find corrupt values and crash. Signed-off-by: Eric Dumazet Cc: Patrick McHardy Signed-off-by: David S. 
Miller --- net/8021q/vlan_dev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 3a8c8fd63c88..1cd3d2a406f5 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -73,6 +73,8 @@ vlan_dev_get_egress_qos_mask(struct net_device *dev, struct sk_buff *skb) { struct vlan_priority_tci_mapping *mp; + smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ + mp = vlan_dev_priv(dev)->egress_priority_map[(skb->priority & 0xF)]; while (mp) { if (mp->priority == skb->priority) { @@ -249,6 +251,11 @@ int vlan_dev_set_egress_priority(const struct net_device *dev, np->next = mp; np->priority = skb_prio; np->vlan_qos = vlan_qos; + /* Before inserting this element in hash table, make sure all its fields + * are committed to memory. + * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask() + */ + smp_wmb(); vlan->egress_priority_map[skb_prio & 0xF] = np; if (vlan_qos) vlan->nr_egress_mappings++;
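(A userspace analogue of the publish pattern this patch enforces, sketched with C11 release/acquire atomics in place of smp_wmb()/smp_rmb(); the structure and names are simplified stand-ins for the real egress_priority_map chains, and the reader is shown single-threaded only for brevity.)

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct prio_map {
	unsigned int priority;
	unsigned int vlan_qos;
	struct prio_map *next;
};

static _Atomic(struct prio_map *) head;   /* the published hash-chain head */

static void publish(unsigned int prio, unsigned int qos)
{
	struct prio_map *np = malloc(sizeof(*np));

	np->priority = prio;
	np->vlan_qos = qos;
	np->next = atomic_load_explicit(&head, memory_order_relaxed);
	/* all fields above must be visible before the pointer is */
	atomic_store_explicit(&head, np, memory_order_release);
}

static unsigned int lookup(unsigned int prio)
{
	/* pairs with the release store in publish() */
	struct prio_map *mp = atomic_load_explicit(&head, memory_order_acquire);

	for (; mp; mp = mp->next)
		if (mp->priority == prio)
			return mp->vlan_qos;
	return 0;
}

int main(void)
{
	publish(3, 0x6000);               /* prio 3 -> qos bits 3 << 13 */
	printf("qos for prio 3: 0x%x\n", lookup(3));
	return 0;
}

Without the release/acquire pairing, a concurrent reader could observe the newly published pointer but stale priority/vlan_qos fields, which is exactly the race the smp_wmb()/smp_rmb() pair in the patch closes.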