From 6542df2f8412a8a065e987aac940130884028715 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Tue, 12 Jun 2018 01:54:47 +0900
Subject: [PATCH 01/38] netfilter: nft_reject_bridge: remove unnecessary ttl
 set

In the nft_reject_br_send_v4_tcp_reset(), a ttl is set by the
nf_reject_iphdr_put(). so, below code is unnecessary.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nft_reject_bridge.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 6de981270566..08cbed7d940e 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -89,8 +89,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net,
 	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
 				   net->ipv4.sysctl_ip_default_ttl);
 	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
-	niph->ttl	= net->ipv4.sysctl_ip_default_ttl;
-	niph->tot_len	= htons(nskb->len);
+	niph->tot_len = htons(nskb->len);
 	ip_send_check(niph);
 
 	nft_reject_br_push_etherhdr(oldskb, nskb);

From e97d9404d5e8aea1f91f4c00dbe7854008f3a1e1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 15 Jun 2018 23:46:42 +0200
Subject: [PATCH 02/38] netfilter: flowtables: use fixed renew timeout on
 teardown

This is one of the very few external callers of ->get_timeouts(),

We can use a fixed timeout instead, conntrack core will refresh this in
case a new packet comes within this period.

Use of ESTABLISHED timeout seems way too huge anyway.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_core.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index eb0d1658ac05..d8125616edc7 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -107,11 +107,12 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
 	tcp->seen[1].td_maxwin = 0;
 }
 
+#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
+#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)
+
 static void flow_offload_fixup_ct_state(struct nf_conn *ct)
 {
 	const struct nf_conntrack_l4proto *l4proto;
-	struct net *net = nf_ct_net(ct);
-	unsigned int *timeouts;
 	unsigned int timeout;
 	int l4num;
 
@@ -123,14 +124,10 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct)
 	if (!l4proto)
 		return;
 
-	timeouts = l4proto->get_timeouts(net);
-	if (!timeouts)
-		return;
-
 	if (l4num == IPPROTO_TCP)
-		timeout = timeouts[TCP_CONNTRACK_ESTABLISHED];
+		timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
 	else if (l4num == IPPROTO_UDP)
-		timeout = timeouts[UDP_CT_REPLIED];
+		timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
 	else
 		return;
 

From f286586df68e7733a8e651098401f139dc2e17f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= <ecklm94@gmail.com>
Date: Mon, 18 Jun 2018 15:12:52 +0200
Subject: [PATCH 03/38] netfilter: nft_tproxy: Move nf_tproxy_assign_sock() to
 nf_tproxy.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This function is also necessary to implement nft tproxy support

Fixes: 45ca4e0cf273 ("netfilter: Libify xt_TPROXY")
Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tproxy.h | 8 ++++++++
 net/netfilter/xt_TPROXY.c         | 9 ---------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h
index 9754a50ecde9..d5a80888cbe4 100644
--- a/include/net/netfilter/nf_tproxy.h
+++ b/include/net/netfilter/nf_tproxy.h
@@ -17,6 +17,14 @@ static inline bool nf_tproxy_sk_is_transparent(struct sock *sk)
 	return false;
 }
 
+/* assign a socket to the skb -- consumes sk */
+static inline void nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
+{
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = sock_edemux;
+}
+
 __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr);
 
 /**
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 58fce4e749a9..35df0827e2ca 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -36,15 +36,6 @@
 #include <net/netfilter/nf_tproxy.h>
 #include <linux/netfilter/xt_TPROXY.h>
 
-/* assign a socket to the skb -- consumes sk */
-static void
-nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
-{
-	skb_orphan(skb);
-	skb->sk = sk;
-	skb->destructor = sock_edemux;
-}
-
 static unsigned int
 tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
 	   u_int32_t mark_mask, u_int32_t mark_value)

From d7e5a9a50245b91f016c814b0f076f7e55cbb980 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 25 Jun 2018 17:49:43 +0200
Subject: [PATCH 04/38] netfilter: utils: move nf_ip_checksum* from ipv4 to
 utils

allows to make nf_ip_checksum_partial static, it no longer
has an external caller.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv4.h | 11 -------
 net/ipv4/netfilter.c           | 53 --------------------------------
 net/netfilter/utils.c          | 55 ++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 64 deletions(-)

diff --git a/include/linux/netfilter_ipv4.h b/include/linux/netfilter_ipv4.h
index b31dabfdb453..95ab5cc64422 100644
--- a/include/linux/netfilter_ipv4.h
+++ b/include/linux/netfilter_ipv4.h
@@ -23,9 +23,6 @@ struct nf_queue_entry;
 #ifdef CONFIG_INET
 __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 		       unsigned int dataoff, u_int8_t protocol);
-__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
-			       unsigned int dataoff, unsigned int len,
-			       u_int8_t protocol);
 int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		bool strict);
 int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry);
@@ -35,14 +32,6 @@ static inline __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 {
 	return 0;
 }
-static inline __sum16 nf_ip_checksum_partial(struct sk_buff *skb,
-					     unsigned int hook,
-					     unsigned int dataoff,
-					     unsigned int len,
-					     u_int8_t protocol)
-{
-	return 0;
-}
 static inline int nf_ip_route(struct net *net, struct dst_entry **dst,
 			      struct flowi *fl, bool strict)
 {
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index e6774ccb7731..8d2e5dc9a827 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
 }
 EXPORT_SYMBOL_GPL(nf_ip_reroute);
 
-__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
-			    unsigned int dataoff, u_int8_t protocol)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-	__sum16 csum = 0;
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_COMPLETE:
-		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
-			break;
-		if ((protocol == 0 && !csum_fold(skb->csum)) ||
-		    !csum_tcpudp_magic(iph->saddr, iph->daddr,
-				       skb->len - dataoff, protocol,
-				       skb->csum)) {
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
-			break;
-		}
-		/* fall through */
-	case CHECKSUM_NONE:
-		if (protocol == 0)
-			skb->csum = 0;
-		else
-			skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
-						       skb->len - dataoff,
-						       protocol, 0);
-		csum = __skb_checksum_complete(skb);
-	}
-	return csum;
-}
-EXPORT_SYMBOL(nf_ip_checksum);
-
-__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
-			       unsigned int dataoff, unsigned int len,
-			       u_int8_t protocol)
-{
-	const struct iphdr *iph = ip_hdr(skb);
-	__sum16 csum = 0;
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_COMPLETE:
-		if (len == skb->len - dataoff)
-			return nf_ip_checksum(skb, hook, dataoff, protocol);
-		/* fall through */
-	case CHECKSUM_NONE:
-		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
-					       skb->len - dataoff, 0);
-		skb->ip_summed = CHECKSUM_NONE;
-		return __skb_checksum_complete_head(skb, dataoff + len);
-	}
-	return csum;
-}
-EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
-
 int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		bool strict __always_unused)
 {
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 0b660c568156..8980c8a0fe5c 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -1,9 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/kernel.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_queue.h>
 
+#ifdef CONFIG_INET
+__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
+		       unsigned int dataoff, u8 protocol)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+			break;
+		if ((protocol == 0 && !csum_fold(skb->csum)) ||
+		    !csum_tcpudp_magic(iph->saddr, iph->daddr,
+				       skb->len - dataoff, protocol,
+				       skb->csum)) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			break;
+		}
+		/* fall through */
+	case CHECKSUM_NONE:
+		if (protocol == 0)
+			skb->csum = 0;
+		else
+			skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+						       skb->len - dataoff,
+						       protocol, 0);
+		csum = __skb_checksum_complete(skb);
+	}
+	return csum;
+}
+EXPORT_SYMBOL(nf_ip_checksum);
+#endif
+
+static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+				      unsigned int dataoff, unsigned int len,
+				      u8 protocol)
+{
+	const struct iphdr *iph = ip_hdr(skb);
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (len == skb->len - dataoff)
+			return nf_ip_checksum(skb, hook, dataoff, protocol);
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
+					       skb->len - dataoff, 0);
+		skb->ip_summed = CHECKSUM_NONE;
+		return __skb_checksum_complete_head(skb, dataoff + len);
+	}
+	return csum;
+}
+
 __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 		    unsigned int dataoff, u_int8_t protocol,
 		    unsigned short family)

From ebee5a50d0b7cdc576aa8081f05b86971880054d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 25 Jun 2018 17:49:59 +0200
Subject: [PATCH 05/38] netfilter: utils: move nf_ip6_checksum* from ipv6 to
 utils

similar to previous change, this also allows to remove it
from nf_ipv6_ops and avoid the indirection.

It also removes the bogus dependency of nf_conntrack_ipv6 on ipv6 module:
ipv6 checksum functions are built into kernel even if CONFIG_IPV6=m,
but ipv6/netfilter.o isn't.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h |  5 ---
 net/ipv6/netfilter.c           | 62 ---------------------------
 net/netfilter/utils.c          | 76 +++++++++++++++++++++++++++++-----
 3 files changed, 65 insertions(+), 78 deletions(-)

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 288c597e75b3..c0dc4dd78887 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -30,11 +30,6 @@ struct nf_ipv6_ops {
 	void (*route_input)(struct sk_buff *skb);
 	int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
 			int (*output)(struct net *, struct sock *, struct sk_buff *));
-	__sum16 (*checksum)(struct sk_buff *skb, unsigned int hook,
-			    unsigned int dataoff, u_int8_t protocol);
-	__sum16 (*checksum_partial)(struct sk_buff *skb, unsigned int hook,
-				    unsigned int dataoff, unsigned int len,
-				    u_int8_t protocol);
 	int (*route)(struct net *net, struct dst_entry **dst, struct flowi *fl,
 		     bool strict);
 	int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 531d6957af36..5ae8e1c51079 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -15,7 +15,6 @@
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
 #include <net/xfrm.h>
-#include <net/ip6_checksum.h>
 #include <net/netfilter/nf_queue.h>
 
 int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
@@ -106,71 +105,10 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst,
 	return err;
 }
 
-__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
-			     unsigned int dataoff, u_int8_t protocol)
-{
-	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
-	__sum16 csum = 0;
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_COMPLETE:
-		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
-			break;
-		if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
-				     skb->len - dataoff, protocol,
-				     csum_sub(skb->csum,
-					      skb_checksum(skb, 0,
-							   dataoff, 0)))) {
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
-			break;
-		}
-		/* fall through */
-	case CHECKSUM_NONE:
-		skb->csum = ~csum_unfold(
-				csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
-					     skb->len - dataoff,
-					     protocol,
-					     csum_sub(0,
-						      skb_checksum(skb, 0,
-								   dataoff, 0))));
-		csum = __skb_checksum_complete(skb);
-	}
-	return csum;
-}
-EXPORT_SYMBOL(nf_ip6_checksum);
-
-static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
-				       unsigned int dataoff, unsigned int len,
-				       u_int8_t protocol)
-{
-	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
-	__wsum hsum;
-	__sum16 csum = 0;
-
-	switch (skb->ip_summed) {
-	case CHECKSUM_COMPLETE:
-		if (len == skb->len - dataoff)
-			return nf_ip6_checksum(skb, hook, dataoff, protocol);
-		/* fall through */
-	case CHECKSUM_NONE:
-		hsum = skb_checksum(skb, 0, dataoff, 0);
-		skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
-							 &ip6h->daddr,
-							 skb->len - dataoff,
-							 protocol,
-							 csum_sub(0, hsum)));
-		skb->ip_summed = CHECKSUM_NONE;
-		return __skb_checksum_complete_head(skb, dataoff + len);
-	}
-	return csum;
-};
-
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr		= ipv6_chk_addr,
 	.route_input    	= ip6_route_input,
 	.fragment		= ip6_fragment,
-	.checksum		= nf_ip6_checksum,
-	.checksum_partial	= nf_ip6_checksum_partial,
 	.route			= nf_ip6_route,
 	.reroute		= nf_ip6_reroute,
 };
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 8980c8a0fe5c..e8da9a9bba73 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -4,6 +4,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/ip6_checksum.h>
 
 #ifdef CONFIG_INET
 __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
@@ -59,11 +60,69 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
 	return csum;
 }
 
+__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+			unsigned int dataoff, u8 protocol)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+			break;
+		if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+				     skb->len - dataoff, protocol,
+				     csum_sub(skb->csum,
+					      skb_checksum(skb, 0,
+							   dataoff, 0)))) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+			break;
+		}
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = ~csum_unfold(
+				csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					     skb->len - dataoff,
+					     protocol,
+					     csum_sub(0,
+						      skb_checksum(skb, 0,
+								   dataoff, 0))));
+		csum = __skb_checksum_complete(skb);
+	}
+	return csum;
+}
+EXPORT_SYMBOL(nf_ip6_checksum);
+
+static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
+				       unsigned int dataoff, unsigned int len,
+				       u8 protocol)
+{
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	__wsum hsum;
+	__sum16 csum = 0;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_COMPLETE:
+		if (len == skb->len - dataoff)
+			return nf_ip6_checksum(skb, hook, dataoff, protocol);
+		/* fall through */
+	case CHECKSUM_NONE:
+		hsum = skb_checksum(skb, 0, dataoff, 0);
+		skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
+							 &ip6h->daddr,
+							 skb->len - dataoff,
+							 protocol,
+							 csum_sub(0, hsum)));
+		skb->ip_summed = CHECKSUM_NONE;
+		return __skb_checksum_complete_head(skb, dataoff + len);
+	}
+	return csum;
+};
+
 __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
-		    unsigned int dataoff, u_int8_t protocol,
+		    unsigned int dataoff, u8 protocol,
 		    unsigned short family)
 {
-	const struct nf_ipv6_ops *v6ops;
 	__sum16 csum = 0;
 
 	switch (family) {
@@ -71,9 +130,7 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
 		csum = nf_ip_checksum(skb, hook, dataoff, protocol);
 		break;
 	case AF_INET6:
-		v6ops = rcu_dereference(nf_ipv6_ops);
-		if (v6ops)
-			csum = v6ops->checksum(skb, hook, dataoff, protocol);
+		csum = nf_ip6_checksum(skb, hook, dataoff, protocol);
 		break;
 	}
 
@@ -83,9 +140,8 @@ EXPORT_SYMBOL_GPL(nf_checksum);
 
 __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
 			    unsigned int dataoff, unsigned int len,
-			    u_int8_t protocol, unsigned short family)
+			    u8 protocol, unsigned short family)
 {
-	const struct nf_ipv6_ops *v6ops;
 	__sum16 csum = 0;
 
 	switch (family) {
@@ -94,10 +150,8 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
 					      protocol);
 		break;
 	case AF_INET6:
-		v6ops = rcu_dereference(nf_ipv6_ops);
-		if (v6ops)
-			csum = v6ops->checksum_partial(skb, hook, dataoff, len,
-						       protocol);
+		csum = nf_ip6_checksum_partial(skb, hook, dataoff, len,
+					       protocol);
 		break;
 	}
 

From 60e3be94e6a1c5162a0763c9aafb5190b2b1fdce Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 25 Jun 2018 17:55:32 +0200
Subject: [PATCH 06/38] openvswitch: use nf_ct_get_tuplepr, invert_tuplepr

These versions deal with the l3proto/l4proto details internally.
It removes only caller of nf_ct_get_tuple, so make it static.

After this, l3proto->get_l4proto() can be removed in a followup patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h |  7 -------
 net/netfilter/nf_conntrack_core.c         |  3 +--
 net/openvswitch/conntrack.c               | 17 +++--------------
 3 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 9b5e7634713e..90df45022c51 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -40,13 +40,6 @@ void nf_conntrack_cleanup_start(void);
 void nf_conntrack_init_end(void);
 void nf_conntrack_cleanup_end(void);
 
-bool nf_ct_get_tuple(const struct sk_buff *skb, unsigned int nhoff,
-		     unsigned int dataoff, u_int16_t l3num, u_int8_t protonum,
-		     struct net *net,
-		     struct nf_conntrack_tuple *tuple,
-		     const struct nf_conntrack_l3proto *l3proto,
-		     const struct nf_conntrack_l4proto *l4proto);
-
 bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 			const struct nf_conntrack_tuple *orig,
 			const struct nf_conntrack_l3proto *l3proto,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 85ab2fd6a665..be0ab81e6b2c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -222,7 +222,7 @@ static u32 hash_conntrack(const struct net *net,
 	return scale_hash(hash_conntrack_raw(tuple, net));
 }
 
-bool
+static bool
 nf_ct_get_tuple(const struct sk_buff *skb,
 		unsigned int nhoff,
 		unsigned int dataoff,
@@ -244,7 +244,6 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 
 	return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
 }
-EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
 
 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 		       u_int16_t l3num,
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 284aca2a252d..e05bd3e53f0f 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -607,23 +607,12 @@ static struct nf_conn *
 ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 		     u8 l3num, struct sk_buff *skb, bool natted)
 {
-	const struct nf_conntrack_l3proto *l3proto;
-	const struct nf_conntrack_l4proto *l4proto;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
-	unsigned int dataoff;
-	u8 protonum;
 
-	l3proto = __nf_ct_l3proto_find(l3num);
-	if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
-				 &protonum) <= 0) {
-		pr_debug("ovs_ct_find_existing: Can't get protonum\n");
-		return NULL;
-	}
-	l4proto = __nf_ct_l4proto_find(l3num, protonum);
-	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
-			     protonum, net, &tuple, l3proto, l4proto)) {
+	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num,
+			       net, &tuple)) {
 		pr_debug("ovs_ct_find_existing: Can't get tuple\n");
 		return NULL;
 	}
@@ -632,7 +621,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
 	if (natted) {
 		struct nf_conntrack_tuple inverse;
 
-		if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+		if (!nf_ct_invert_tuplepr(&inverse, &tuple)) {
 			pr_debug("ovs_ct_find_existing: Inversion failed!\n");
 			return NULL;
 		}

From 7414d929bc35b9a7c3eab98ef7bd32d5ae4c2981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= <ecklm94@gmail.com>
Date: Thu, 28 Jun 2018 20:01:02 +0200
Subject: [PATCH 07/38] netfilter: Kconfig: Make NETFILTER_XT_MATCH_SOCKET
 select NF_SOCKET_IPV4/6
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of depending on it.

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index dbd7d1fad277..3ce657fbca67 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1492,8 +1492,8 @@ config NETFILTER_XT_MATCH_SOCKET
 	depends on NETFILTER_ADVANCED
 	depends on IPV6 || IPV6=n
 	depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
-	depends on NF_SOCKET_IPV4
-	depends on NF_SOCKET_IPV6
+	select NF_SOCKET_IPV4
+	select NF_SOCKET_IPV6 if IP6_NF_IPTABLES
 	select NF_DEFRAG_IPV4
 	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
 	help

From f957be9d349a3800940f823b16e12b0405cc305b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Jun 2018 07:46:44 +0200
Subject: [PATCH 08/38] netfilter: conntrack: remove ctnetlink callbacks from
 l3 protocol trackers

handle everything from ctnetlink directly.

After all these years we still only support ipv4 and ipv6, so it
seems reasonable to remove l3 protocol tracker support and instead
handle ipv4/ipv6 from a common, always builtin inet tracker.

Step 1: Get rid of all the l3proto->func() calls.

Start with ctnetlink, then move on to packet-path ones.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h     |  6 +-
 include/net/netfilter/nf_conntrack_l3proto.h  |  8 --
 .../netfilter/nf_conntrack_l3proto_ipv4.c     | 47 ---------
 .../netfilter/nf_conntrack_l3proto_ipv6.c     | 48 ----------
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c     |  1 -
 net/netfilter/nf_conntrack_expect.c           |  1 -
 net/netfilter/nf_conntrack_helper.c           |  1 -
 net/netfilter/nf_conntrack_netlink.c          | 96 ++++++++++++++-----
 net/netfilter/nf_conntrack_proto.c            |  5 +-
 net/netfilter/nf_conntrack_standalone.c       | 14 +--
 net/netfilter/nfnetlink_cttimeout.c           |  1 -
 11 files changed, 79 insertions(+), 149 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 90df45022c51..d454a53ba646 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -68,10 +68,8 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
 	return ret;
 }
 
-void
-print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
-            const struct nf_conntrack_l3proto *l3proto,
-            const struct nf_conntrack_l4proto *proto);
+void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
+		 const struct nf_conntrack_l4proto *proto);
 
 #define CONNTRACK_LOCKS 1024
 
diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index d5808f3e2715..d07b5216a925 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -46,14 +46,6 @@ struct nf_conntrack_l3proto {
 	int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff,
 			   unsigned int *dataoff, u_int8_t *protonum);
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	int (*tuple_to_nlattr)(struct sk_buff *skb,
-			       const struct nf_conntrack_tuple *t);
-	int (*nlattr_to_tuple)(struct nlattr *tb[],
-			       struct nf_conntrack_tuple *t);
-	const struct nla_policy *nla_policy;
-#endif
-
 	/* Called when netns wants to use connection tracking */
 	int (*net_ns_get)(struct net *);
 	void (*net_ns_put)(struct net *);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 9db988f9a4d7..98ed12858c52 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -274,41 +274,6 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	return -ENOENT;
 }
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
-				const struct nf_conntrack_tuple *tuple)
-{
-	if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
-	    nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
-	[CTA_IP_V4_SRC]	= { .type = NLA_U32 },
-	[CTA_IP_V4_DST]	= { .type = NLA_U32 },
-};
-
-static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
-				struct nf_conntrack_tuple *t)
-{
-	if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
-		return -EINVAL;
-
-	t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
-	t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
-
-	return 0;
-}
-#endif
-
 static struct nf_sockopt_ops so_getorigdst = {
 	.pf		= PF_INET,
 	.get_optmin	= SO_ORIGINAL_DST,
@@ -360,13 +325,6 @@ const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
 	.pkt_to_tuple	 = ipv4_pkt_to_tuple,
 	.invert_tuple	 = ipv4_invert_tuple,
 	.get_l4proto	 = ipv4_get_l4proto,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.tuple_to_nlattr = ipv4_tuple_to_nlattr,
-	.nlattr_to_tuple = ipv4_nlattr_to_tuple,
-	.nla_policy	 = ipv4_nla_policy,
-	.nla_size	 = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */
-			   NLA_ALIGN(NLA_HDRLEN + sizeof(u32)),  /* CTA_IP_V4_DST */
-#endif
 	.net_ns_get	 = ipv4_hooks_register,
 	.net_ns_put	 = ipv4_hooks_unregister,
 	.me		 = THIS_MODULE,
@@ -419,11 +377,6 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
 
 	need_conntrack();
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) !=
-	    nf_conntrack_l3proto_ipv4.nla_size))
-		return -EINVAL;
-#endif
 	ret = nf_register_sockopt(&so_getorigdst);
 	if (ret < 0) {
 		pr_err("Unable to register netfilter socket option\n");
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 663827ee3cf8..13a660ae5799 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -269,41 +269,6 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
 }
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
-				const struct nf_conntrack_tuple *tuple)
-{
-	if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
-	    nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
-		goto nla_put_failure;
-	return 0;
-
-nla_put_failure:
-	return -1;
-}
-
-static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = {
-	[CTA_IP_V6_SRC]	= { .len = sizeof(u_int32_t)*4 },
-	[CTA_IP_V6_DST]	= { .len = sizeof(u_int32_t)*4 },
-};
-
-static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
-				struct nf_conntrack_tuple *t)
-{
-	if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
-		return -EINVAL;
-
-	t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
-	t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
-
-	return 0;
-}
-#endif
-
 static int ipv6_hooks_register(struct net *net)
 {
 	struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
@@ -345,13 +310,6 @@ const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
 	.pkt_to_tuple		= ipv6_pkt_to_tuple,
 	.invert_tuple		= ipv6_invert_tuple,
 	.get_l4proto		= ipv6_get_l4proto,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	.tuple_to_nlattr	= ipv6_tuple_to_nlattr,
-	.nlattr_to_tuple	= ipv6_nlattr_to_tuple,
-	.nla_policy		= ipv6_nla_policy,
-	.nla_size		= NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) +
-				  NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])),
-#endif
 	.net_ns_get		= ipv6_hooks_register,
 	.net_ns_put		= ipv6_hooks_unregister,
 	.me			= THIS_MODULE,
@@ -409,12 +367,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void)
 
 	need_conntrack();
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) !=
-	    nf_conntrack_l3proto_ipv6.nla_size))
-		return -EINVAL;
-#endif
-
 	ret = nf_register_sockopt(&so_getorigdst6);
 	if (ret < 0) {
 		pr_err("Unable to register netfilter socket option\n");
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index c87b48359e8f..e631be25337e 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -23,7 +23,6 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 #endif
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 853b23206bb7..3f586ba23d92 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -610,7 +610,6 @@ static int exp_seq_show(struct seq_file *s, void *v)
 		   expect->tuple.src.l3num,
 		   expect->tuple.dst.protonum);
 	print_tuple(s, &expect->tuple,
-		    __nf_ct_l3proto_find(expect->tuple.src.l3num),
 		    __nf_ct_l4proto_find(expect->tuple.src.l3num,
 				       expect->tuple.dst.protonum));
 
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index a75b11c39312..a55a58c706a9 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -24,7 +24,6 @@
 #include <linux/rtnetlink.h>
 
 #include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_core.h>
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 20a2e37c76d1..40152b9ad772 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -38,7 +38,6 @@
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_acct.h>
@@ -81,9 +80,26 @@ nla_put_failure:
 	return -1;
 }
 
+static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
+				const struct nf_conntrack_tuple *tuple)
+{
+	if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+	    nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
+				const struct nf_conntrack_tuple *tuple)
+{
+	if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
+	    nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
+		return -EMSGSIZE;
+	return 0;
+}
+
 static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
-				    const struct nf_conntrack_tuple *tuple,
-				    const struct nf_conntrack_l3proto *l3proto)
+				    const struct nf_conntrack_tuple *tuple)
 {
 	int ret = 0;
 	struct nlattr *nest_parms;
@@ -92,8 +108,14 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
 	if (!nest_parms)
 		goto nla_put_failure;
 
-	if (likely(l3proto->tuple_to_nlattr))
-		ret = l3proto->tuple_to_nlattr(skb, tuple);
+	switch (tuple->src.l3num) {
+	case NFPROTO_IPV4:
+		ret = ipv4_tuple_to_nlattr(skb, tuple);
+		break;
+	case NFPROTO_IPV6:
+		ret = ipv6_tuple_to_nlattr(skb, tuple);
+		break;
+	}
 
 	nla_nest_end(skb, nest_parms);
 
@@ -106,13 +128,11 @@ nla_put_failure:
 static int ctnetlink_dump_tuples(struct sk_buff *skb,
 				 const struct nf_conntrack_tuple *tuple)
 {
-	const struct nf_conntrack_l3proto *l3proto;
 	const struct nf_conntrack_l4proto *l4proto;
 	int ret;
 
 	rcu_read_lock();
-	l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
-	ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
+	ret = ctnetlink_dump_tuples_ip(skb, tuple);
 
 	if (ret >= 0) {
 		l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
@@ -556,15 +576,20 @@ nla_put_failure:
 	return -1;
 }
 
+static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = {
+	[CTA_IP_V4_SRC]	= { .type = NLA_U32 },
+	[CTA_IP_V4_DST]	= { .type = NLA_U32 },
+	[CTA_IP_V6_SRC]	= { .len = sizeof(__be32) * 4 },
+	[CTA_IP_V6_DST]	= { .len = sizeof(__be32) * 4 },
+};
+
 #if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS)
 static size_t ctnetlink_proto_size(const struct nf_conn *ct)
 {
-	const struct nf_conntrack_l3proto *l3proto;
 	const struct nf_conntrack_l4proto *l4proto;
 	size_t len, len4 = 0;
 
-	l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
-	len = l3proto->nla_size;
+	len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1);
 	len *= 3u; /* ORIG, REPLY, MASTER */
 
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
@@ -936,29 +961,54 @@ out:
 	return skb->len;
 }
 
+static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
+				struct nf_conntrack_tuple *t)
+{
+	if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
+		return -EINVAL;
+
+	t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
+	t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+
+	return 0;
+}
+
+static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
+				struct nf_conntrack_tuple *t)
+{
+	if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
+		return -EINVAL;
+
+	t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
+	t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
+
+	return 0;
+}
+
 static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
 				    struct nf_conntrack_tuple *tuple)
 {
 	struct nlattr *tb[CTA_IP_MAX+1];
-	struct nf_conntrack_l3proto *l3proto;
 	int ret = 0;
 
 	ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL);
 	if (ret < 0)
 		return ret;
 
-	rcu_read_lock();
-	l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+	ret = nla_validate_nested(attr, CTA_IP_MAX,
+				  cta_ip_nla_policy, NULL);
+	if (ret)
+		return ret;
 
-	if (likely(l3proto->nlattr_to_tuple)) {
-		ret = nla_validate_nested(attr, CTA_IP_MAX,
-					  l3proto->nla_policy, NULL);
-		if (ret == 0)
-			ret = l3proto->nlattr_to_tuple(tb, tuple);
+	switch (tuple->src.l3num) {
+	case NFPROTO_IPV4:
+		ret = ipv4_nlattr_to_tuple(tb, tuple);
+		break;
+	case NFPROTO_IPV6:
+		ret = ipv6_nlattr_to_tuple(tb, tuple);
+		break;
 	}
 
-	rcu_read_unlock();
-
 	return ret;
 }
 
@@ -2581,7 +2631,6 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
 				   const struct nf_conntrack_tuple *tuple,
 				   const struct nf_conntrack_tuple_mask *mask)
 {
-	const struct nf_conntrack_l3proto *l3proto;
 	const struct nf_conntrack_l4proto *l4proto;
 	struct nf_conntrack_tuple m;
 	struct nlattr *nest_parms;
@@ -2597,8 +2646,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
 		goto nla_put_failure;
 
 	rcu_read_lock();
-	l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
-	ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto);
+	ret = ctnetlink_dump_tuples_ip(skb, &m);
 	if (ret >= 0) {
 		l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
 					       tuple->dst.protonum);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index d88841fbc560..859cb303bb91 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -294,10 +294,7 @@ int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto)
 
 	if (proto->l3proto >= NFPROTO_NUMPROTO)
 		return -EBUSY;
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-	if (proto->tuple_to_nlattr && proto->nla_size == 0)
-		return -EINVAL;
-#endif
+
 	mutex_lock(&nf_ct_proto_mutex);
 	old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
 					lockdep_is_held(&nf_ct_proto_mutex));
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index b642c0b2495c..47b80fd0d2c3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -24,7 +24,6 @@
 
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
@@ -38,10 +37,9 @@ MODULE_LICENSE("GPL");
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 void
 print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
-            const struct nf_conntrack_l3proto *l3proto,
             const struct nf_conntrack_l4proto *l4proto)
 {
-	switch (l3proto->l3proto) {
+	switch (tuple->src.l3num) {
 	case NFPROTO_IPV4:
 		seq_printf(s, "src=%pI4 dst=%pI4 ",
 			   &tuple->src.u3.ip, &tuple->dst.u3.ip);
@@ -282,7 +280,6 @@ static int ct_seq_show(struct seq_file *s, void *v)
 {
 	struct nf_conntrack_tuple_hash *hash = v;
 	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
-	const struct nf_conntrack_l3proto *l3proto;
 	const struct nf_conntrack_l4proto *l4proto;
 	struct net *net = seq_file_net(s);
 	int ret = 0;
@@ -303,14 +300,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (!net_eq(nf_ct_net(ct), net))
 		goto release;
 
-	l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
-	WARN_ON(!l3proto);
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 	WARN_ON(!l4proto);
 
 	ret = -ENOSPC;
 	seq_printf(s, "%-8s %u %-8s %u ",
-		   l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
+		   l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct),
 		   l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
 
 	if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
@@ -320,7 +315,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 		l4proto->print_conntrack(s, ct);
 
 	print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
-		    l3proto, l4proto);
+		    l4proto);
 
 	ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG);
 
@@ -333,8 +328,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
 	if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
 		seq_puts(s, "[UNREPLIED] ");
 
-	print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
-		    l3proto, l4proto);
+	print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto);
 
 	ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
 
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 9ee5fa551fa6..9da4b8462004 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -26,7 +26,6 @@
 #include <net/sock.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_timeout.h>

From 47a91b14de62e35d1466820cbb4c024b6c02dff1 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Jun 2018 07:46:45 +0200
Subject: [PATCH 09/38] netfilter: conntrack: remove pkt_to_tuple indirection
 from l3 protocol trackers

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h  |  7 ----
 .../netfilter/nf_conntrack_l3proto_ipv4.c     | 17 --------
 .../netfilter/nf_conntrack_l3proto_ipv6.c     | 18 ---------
 net/netfilter/nf_conntrack_core.c             | 39 ++++++++++++++++---
 net/netfilter/nf_conntrack_l3proto_generic.c  | 10 -----
 5 files changed, 33 insertions(+), 58 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index d07b5216a925..ece231450f30 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -24,13 +24,6 @@ struct nf_conntrack_l3proto {
 	/* size of tuple nlattr, fills a hole */
 	u16 nla_size;
 
-	/*
-	 * Try to fill in the third arg: nhoff is offset of l3 proto
-         * hdr.  Return true if possible.
-	 */
-	bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int nhoff,
-			     struct nf_conntrack_tuple *tuple);
-
 	/*
 	 * Invert the per-proto part of the tuple: ie. turn xmit into reply.
 	 * Some packets can't be inverted: return 0 in that case.
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 98ed12858c52..7ed56f61798b 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -38,22 +38,6 @@ struct conntrack4_net {
 	unsigned int users;
 };
 
-static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
-			      struct nf_conntrack_tuple *tuple)
-{
-	const __be32 *ap;
-	__be32 _addrs[2];
-	ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
-				sizeof(u_int32_t) * 2, _addrs);
-	if (ap == NULL)
-		return false;
-
-	tuple->src.u3.ip = ap[0];
-	tuple->dst.u3.ip = ap[1];
-
-	return true;
-}
-
 static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
 			      const struct nf_conntrack_tuple *orig)
 {
@@ -322,7 +306,6 @@ static void ipv4_hooks_unregister(struct net *net)
 
 const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
 	.l3proto	 = PF_INET,
-	.pkt_to_tuple	 = ipv4_pkt_to_tuple,
 	.invert_tuple	 = ipv4_invert_tuple,
 	.get_l4proto	 = ipv4_get_l4proto,
 	.net_ns_get	 = ipv4_hooks_register,
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 13a660ae5799..bdb1709bb951 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -41,23 +41,6 @@ struct conntrack6_net {
 	unsigned int users;
 };
 
-static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
-			      struct nf_conntrack_tuple *tuple)
-{
-	const u_int32_t *ap;
-	u_int32_t _addrs[8];
-
-	ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr),
-				sizeof(_addrs), _addrs);
-	if (ap == NULL)
-		return false;
-
-	memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
-	memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
-
-	return true;
-}
-
 static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
 			      const struct nf_conntrack_tuple *orig)
 {
@@ -307,7 +290,6 @@ static void ipv6_hooks_unregister(struct net *net)
 
 const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
 	.l3proto		= PF_INET6,
-	.pkt_to_tuple		= ipv6_pkt_to_tuple,
 	.invert_tuple		= ipv6_invert_tuple,
 	.get_l4proto		= ipv6_get_l4proto,
 	.net_ns_get		= ipv6_hooks_register,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index be0ab81e6b2c..66b2ebae2747 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -230,15 +230,43 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 		u_int8_t protonum,
 		struct net *net,
 		struct nf_conntrack_tuple *tuple,
-		const struct nf_conntrack_l3proto *l3proto,
 		const struct nf_conntrack_l4proto *l4proto)
 {
+	unsigned int size;
+	const __be32 *ap;
+	__be32 _addrs[8];
+
 	memset(tuple, 0, sizeof(*tuple));
 
 	tuple->src.l3num = l3num;
-	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
+	switch (l3num) {
+	case NFPROTO_IPV4:
+		nhoff += offsetof(struct iphdr, saddr);
+		size = 2 * sizeof(__be32);
+		break;
+	case NFPROTO_IPV6:
+		nhoff += offsetof(struct ipv6hdr, saddr);
+		size = sizeof(_addrs);
+		break;
+	default:
+		return true;
+	}
+
+	ap = skb_header_pointer(skb, nhoff, size, _addrs);
+	if (!ap)
 		return false;
 
+	switch (l3num) {
+	case NFPROTO_IPV4:
+		tuple->src.u3.ip = ap[0];
+		tuple->dst.u3.ip = ap[1];
+		break;
+	case NFPROTO_IPV6:
+		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
+		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
+		break;
+	}
+
 	tuple->dst.protonum = protonum;
 	tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 
@@ -267,7 +295,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 	l4proto = __nf_ct_l4proto_find(l3num, protonum);
 
 	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
-			      l3proto, l4proto);
+			      l4proto);
 
 	rcu_read_unlock();
 	return ret;
@@ -1318,8 +1346,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	u32 hash;
 
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
-			     dataoff, l3num, protonum, net, &tuple, l3proto,
-			     l4proto)) {
+			     dataoff, l3num, protonum, net, &tuple, l4proto)) {
 		pr_debug("Can't get tuple\n");
 		return 0;
 	}
@@ -1633,7 +1660,7 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
 	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
 
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
-			     l4num, net, &tuple, l3proto, l4proto))
+			     l4num, net, &tuple, l4proto))
 		return -1;
 
 	if (ct->status & IPS_SRC_NAT) {
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
index 397e6911214f..0b01c9970e99 100644
--- a/net/netfilter/nf_conntrack_l3proto_generic.c
+++ b/net/netfilter/nf_conntrack_l3proto_generic.c
@@ -31,15 +31,6 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 
-static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
-				 struct nf_conntrack_tuple *tuple)
-{
-	memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
-	memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
-
-	return true;
-}
-
 static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
 				 const struct nf_conntrack_tuple *orig)
 {
@@ -59,7 +50,6 @@ static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 
 struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
 	.l3proto	 = PF_UNSPEC,
-	.pkt_to_tuple	 = generic_pkt_to_tuple,
 	.invert_tuple	 = generic_invert_tuple,
 	.get_l4proto	 = generic_get_l4proto,
 };

From d1b6fe94941f43e4743d5fea953d16b0a001c2c6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Jun 2018 07:46:46 +0200
Subject: [PATCH 10/38] netfilter: conntrack: remove invert_tuple indirection
 from l3 protocol trackers

Its simpler to just handle it directly in nf_ct_invert_tuple().
Also gets rid of need to pass l3proto pointer to resolve_conntrack().

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h     |  1 -
 include/net/netfilter/nf_conntrack_l3proto.h  |  7 -----
 .../netfilter/nf_conntrack_l3proto_ipv4.c     | 10 -------
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c  |  3 +--
 .../netfilter/nf_conntrack_l3proto_ipv6.c     | 10 -------
 .../netfilter/nf_conntrack_proto_icmpv6.c     |  3 +--
 net/netfilter/nf_conntrack_core.c             | 26 ++++++++++++-------
 net/netfilter/nf_conntrack_l3proto_generic.c  | 10 -------
 8 files changed, 18 insertions(+), 52 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index d454a53ba646..35461b2d3462 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -42,7 +42,6 @@ void nf_conntrack_cleanup_end(void);
 
 bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 			const struct nf_conntrack_tuple *orig,
-			const struct nf_conntrack_l3proto *l3proto,
 			const struct nf_conntrack_l4proto *l4proto);
 
 /* Find a connection corresponding to a tuple. */
diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index ece231450f30..164641c743a5 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -24,13 +24,6 @@ struct nf_conntrack_l3proto {
 	/* size of tuple nlattr, fills a hole */
 	u16 nla_size;
 
-	/*
-	 * Invert the per-proto part of the tuple: ie. turn xmit into reply.
-	 * Some packets can't be inverted: return 0 in that case.
-	 */
-	bool (*invert_tuple)(struct nf_conntrack_tuple *inverse,
-			     const struct nf_conntrack_tuple *orig);
-
 	/*
 	 * Called before tracking. 
 	 *	*dataoff: offset of protocol header (TCP, UDP,...) in skb
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 7ed56f61798b..e10e38c443ab 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -38,15 +38,6 @@ struct conntrack4_net {
 	unsigned int users;
 };
 
-static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
-			      const struct nf_conntrack_tuple *orig)
-{
-	tuple->src.u3.ip = orig->dst.u3.ip;
-	tuple->dst.u3.ip = orig->src.u3.ip;
-
-	return true;
-}
-
 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 			    unsigned int *dataoff, u_int8_t *protonum)
 {
@@ -306,7 +297,6 @@ static void ipv4_hooks_unregister(struct net *net)
 
 const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
 	.l3proto	 = PF_INET,
-	.invert_tuple	 = ipv4_invert_tuple,
 	.get_l4proto	 = ipv4_get_l4proto,
 	.net_ns_get	 = ipv4_hooks_register,
 	.net_ns_put	 = ipv4_hooks_unregister,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 5c15beafa711..34095949a003 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -142,8 +142,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
 
 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
 	   been preserved inside the ICMP. */
-	if (!nf_ct_invert_tuple(&innertuple, &origtuple,
-				&nf_conntrack_l3proto_ipv4, innerproto)) {
+	if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
 		pr_debug("icmp_error_message: no match\n");
 		return -NF_ACCEPT;
 	}
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index bdb1709bb951..f8051fe20489 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -41,15 +41,6 @@ struct conntrack6_net {
 	unsigned int users;
 };
 
-static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
-			      const struct nf_conntrack_tuple *orig)
-{
-	memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
-	memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
-
-	return true;
-}
-
 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 			    unsigned int *dataoff, u_int8_t *protonum)
 {
@@ -290,7 +281,6 @@ static void ipv6_hooks_unregister(struct net *net)
 
 const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
 	.l3proto		= PF_INET6,
-	.invert_tuple		= ipv6_invert_tuple,
 	.get_l4proto		= ipv6_get_l4proto,
 	.net_ns_get		= ipv6_hooks_register,
 	.net_ns_put		= ipv6_hooks_unregister,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 2548e2c8aedd..8bcbc2f15bd5 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -152,8 +152,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
 
 	/* Ordinarily, we'd expect the inverted tupleproto, but it's
 	   been preserved inside the ICMP. */
-	if (!nf_ct_invert_tuple(&intuple, &origtuple,
-				&nf_conntrack_l3proto_ipv6, inproto)) {
+	if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) {
 		pr_debug("icmpv6_error: Can't invert tuple\n");
 		return -NF_ACCEPT;
 	}
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 66b2ebae2747..14c040805b32 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -305,14 +305,24 @@ EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
 bool
 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 		   const struct nf_conntrack_tuple *orig,
-		   const struct nf_conntrack_l3proto *l3proto,
 		   const struct nf_conntrack_l4proto *l4proto)
 {
 	memset(inverse, 0, sizeof(*inverse));
 
 	inverse->src.l3num = orig->src.l3num;
-	if (l3proto->invert_tuple(inverse, orig) == 0)
-		return false;
+
+	switch (orig->src.l3num) {
+	case NFPROTO_IPV4:
+		inverse->src.u3.ip = orig->dst.u3.ip;
+		inverse->dst.u3.ip = orig->src.u3.ip;
+		break;
+	case NFPROTO_IPV6:
+		inverse->src.u3.in6 = orig->dst.u3.in6;
+		inverse->dst.u3.in6 = orig->src.u3.in6;
+		break;
+	default:
+		break;
+	}
 
 	inverse->dst.dir = !orig->dst.dir;
 
@@ -1222,7 +1232,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free);
 static noinline struct nf_conntrack_tuple_hash *
 init_conntrack(struct net *net, struct nf_conn *tmpl,
 	       const struct nf_conntrack_tuple *tuple,
-	       const struct nf_conntrack_l3proto *l3proto,
 	       const struct nf_conntrack_l4proto *l4proto,
 	       struct sk_buff *skb,
 	       unsigned int dataoff, u32 hash)
@@ -1237,7 +1246,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	struct nf_conntrack_zone tmp;
 	unsigned int *timeouts;
 
-	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
+	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) {
 		pr_debug("Can't invert tuple.\n");
 		return NULL;
 	}
@@ -1334,7 +1343,6 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 		  unsigned int dataoff,
 		  u_int16_t l3num,
 		  u_int8_t protonum,
-		  const struct nf_conntrack_l3proto *l3proto,
 		  const struct nf_conntrack_l4proto *l4proto)
 {
 	const struct nf_conntrack_zone *zone;
@@ -1356,7 +1364,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	hash = hash_conntrack_raw(&tuple, net);
 	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
 	if (!h) {
-		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
+		h = init_conntrack(net, tmpl, &tuple, l4proto,
 				   skb, dataoff, hash);
 		if (!h)
 			return 0;
@@ -1439,8 +1447,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 			goto out;
 	}
 repeat:
-	ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
-				l3proto, l4proto);
+	ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto);
 	if (ret < 0) {
 		/* Too stressed to deal. */
 		NF_CT_STAT_INC_ATOMIC(net, drop);
@@ -1497,7 +1504,6 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
 
 	rcu_read_lock();
 	ret = nf_ct_invert_tuple(inverse, orig,
-				 __nf_ct_l3proto_find(orig->src.l3num),
 				 __nf_ct_l4proto_find(orig->src.l3num,
 						      orig->dst.protonum));
 	rcu_read_unlock();
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
index 0b01c9970e99..d6a8fe591ccc 100644
--- a/net/netfilter/nf_conntrack_l3proto_generic.c
+++ b/net/netfilter/nf_conntrack_l3proto_generic.c
@@ -31,15 +31,6 @@
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 
-static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
-				 const struct nf_conntrack_tuple *orig)
-{
-	memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
-	memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
-
-	return true;
-}
-
 static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 			       unsigned int *dataoff, u_int8_t *protonum)
 {
@@ -50,7 +41,6 @@ static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 
 struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
 	.l3proto	 = PF_UNSPEC,
-	.invert_tuple	 = generic_invert_tuple,
 	.get_l4proto	 = generic_get_l4proto,
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);

From 6816d931cab009024b68c11c4cf752f8bf9a1e32 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Jun 2018 07:46:47 +0200
Subject: [PATCH 11/38] netfilter: conntrack: remove get_l4proto indirection
 from l3 protocol trackers

Handle it in the core instead.

ipv6_skip_exthdr() is built-in even if ipv6 is a module, i.e. this
doesn't create an ipv6 dependency.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l3proto.h  |   8 --
 .../netfilter/nf_conntrack_l3proto_ipv4.c     |  30 -----
 .../netfilter/nf_conntrack_l3proto_ipv6.c     |  29 -----
 net/netfilter/Makefile                        |   2 +-
 net/netfilter/nf_conntrack_core.c             | 108 ++++++++++++++----
 net/netfilter/nf_conntrack_l3proto_generic.c  |  46 --------
 net/netfilter/nf_conntrack_proto.c            |   5 +
 7 files changed, 94 insertions(+), 134 deletions(-)
 delete mode 100644 net/netfilter/nf_conntrack_l3proto_generic.c

diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index 164641c743a5..5f160375c93a 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -24,14 +24,6 @@ struct nf_conntrack_l3proto {
 	/* size of tuple nlattr, fills a hole */
 	u16 nla_size;
 
-	/*
-	 * Called before tracking. 
-	 *	*dataoff: offset of protocol header (TCP, UDP,...) in skb
-	 *	*protonum: protocol number
-	 */
-	int (*get_l4proto)(const struct sk_buff *skb, unsigned int nhoff,
-			   unsigned int *dataoff, u_int8_t *protonum);
-
 	/* Called when netns wants to use connection tracking */
 	int (*net_ns_get)(struct net *);
 	void (*net_ns_put)(struct net *);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index e10e38c443ab..9fbf6c7f8ece 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -38,35 +38,6 @@ struct conntrack4_net {
 	unsigned int users;
 };
 
-static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
-			    unsigned int *dataoff, u_int8_t *protonum)
-{
-	const struct iphdr *iph;
-	struct iphdr _iph;
-
-	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
-	if (iph == NULL)
-		return -NF_ACCEPT;
-
-	/* Conntrack defragments packets, we might still see fragments
-	 * inside ICMP packets though. */
-	if (iph->frag_off & htons(IP_OFFSET))
-		return -NF_ACCEPT;
-
-	*dataoff = nhoff + (iph->ihl << 2);
-	*protonum = iph->protocol;
-
-	/* Check bogus IP headers */
-	if (*dataoff > skb->len) {
-		pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
-			 "nhoff %u, ihl %u, skblen %u\n",
-			 nhoff, iph->ihl << 2, skb->len);
-		return -NF_ACCEPT;
-	}
-
-	return NF_ACCEPT;
-}
-
 static unsigned int ipv4_helper(void *priv,
 				struct sk_buff *skb,
 				const struct nf_hook_state *state)
@@ -297,7 +268,6 @@ static void ipv4_hooks_unregister(struct net *net)
 
 const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
 	.l3proto	 = PF_INET,
-	.get_l4proto	 = ipv4_get_l4proto,
 	.net_ns_get	 = ipv4_hooks_register,
 	.net_ns_put	 = ipv4_hooks_unregister,
 	.me		 = THIS_MODULE,
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index f8051fe20489..37ab25645cf2 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -41,34 +41,6 @@ struct conntrack6_net {
 	unsigned int users;
 };
 
-static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
-			    unsigned int *dataoff, u_int8_t *protonum)
-{
-	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
-	__be16 frag_off;
-	int protoff;
-	u8 nexthdr;
-
-	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
-			  &nexthdr, sizeof(nexthdr)) != 0) {
-		pr_debug("ip6_conntrack_core: can't get nexthdr\n");
-		return -NF_ACCEPT;
-	}
-	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
-	/*
-	 * (protoff == skb->len) means the packet has not data, just
-	 * IPv6 and possibly extensions headers, but it is tracked anyway
-	 */
-	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
-		pr_debug("ip6_conntrack_core: can't find proto in pkt\n");
-		return -NF_ACCEPT;
-	}
-
-	*dataoff = protoff;
-	*protonum = nexthdr;
-	return NF_ACCEPT;
-}
-
 static unsigned int ipv6_helper(void *priv,
 				struct sk_buff *skb,
 				const struct nf_hook_state *state)
@@ -281,7 +253,6 @@ static void ipv6_hooks_unregister(struct net *net)
 
 const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
 	.l3proto		= PF_INET6,
-	.get_l4proto		= ipv6_get_l4proto,
 	.net_ns_get		= ipv6_hooks_register,
 	.net_ns_put		= ipv6_hooks_unregister,
 	.me			= THIS_MODULE,
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 44449389e527..f132ea850778 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o
 
-nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 14c040805b32..0674c6e5bfed 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -37,7 +37,6 @@
 #include <linux/rculist_nulls.h>
 
 #include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
@@ -55,6 +54,7 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netns/hash.h>
+#include <net/ip.h>
 
 #include "nf_internals.h"
 
@@ -273,21 +273,94 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 	return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
 }
 
+static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+			    u_int8_t *protonum)
+{
+	int dataoff = -1;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV4)
+	const struct iphdr *iph;
+	struct iphdr _iph;
+
+	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+	if (!iph)
+		return -1;
+
+	/* Conntrack defragments packets, we might still see fragments
+	 * inside ICMP packets though.
+	 */
+	if (iph->frag_off & htons(IP_OFFSET))
+		return -1;
+
+	dataoff = nhoff + (iph->ihl << 2);
+	*protonum = iph->protocol;
+
+	/* Check bogus IP headers */
+	if (dataoff > skb->len) {
+		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
+			 nhoff, iph->ihl << 2, skb->len);
+		return -1;
+	}
+#endif
+	return dataoff;
+}
+
+static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+			    u8 *protonum)
+{
+	int protoff = -1;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
+	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
+	__be16 frag_off;
+	u8 nexthdr;
+
+	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
+			  &nexthdr, sizeof(nexthdr)) != 0) {
+		pr_debug("can't get nexthdr\n");
+		return -1;
+	}
+	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
+	/*
+	 * (protoff == skb->len) means the packet has not data, just
+	 * IPv6 and possibly extensions headers, but it is tracked anyway
+	 */
+	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+		pr_debug("can't find proto in pkt\n");
+		return -1;
+	}
+
+	*protonum = nexthdr;
+#endif
+	return protoff;
+}
+
+static int get_l4proto(const struct sk_buff *skb,
+		       unsigned int nhoff, u8 pf, u8 *l4num)
+{
+	switch (pf) {
+	case NFPROTO_IPV4:
+		return ipv4_get_l4proto(skb, nhoff, l4num);
+	case NFPROTO_IPV6:
+		return ipv6_get_l4proto(skb, nhoff, l4num);
+	default:
+		*l4num = 0;
+		break;
+	}
+	return -1;
+}
+
 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
 		       u_int16_t l3num,
 		       struct net *net, struct nf_conntrack_tuple *tuple)
 {
-	const struct nf_conntrack_l3proto *l3proto;
 	const struct nf_conntrack_l4proto *l4proto;
-	unsigned int protoff;
-	u_int8_t protonum;
+	u8 protonum;
+	int protoff;
 	int ret;
 
 	rcu_read_lock();
 
-	l3proto = __nf_ct_l3proto_find(l3num);
-	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
-	if (ret != NF_ACCEPT) {
+	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
+	if (protoff <= 0) {
 		rcu_read_unlock();
 		return false;
 	}
@@ -1397,14 +1470,12 @@ unsigned int
 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 		struct sk_buff *skb)
 {
-	const struct nf_conntrack_l3proto *l3proto;
 	const struct nf_conntrack_l4proto *l4proto;
 	struct nf_conn *ct, *tmpl;
 	enum ip_conntrack_info ctinfo;
 	unsigned int *timeouts;
-	unsigned int dataoff;
 	u_int8_t protonum;
-	int ret;
+	int dataoff, ret;
 
 	tmpl = nf_ct_get(skb, &ctinfo);
 	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
@@ -1418,14 +1489,12 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	}
 
 	/* rcu_read_lock()ed by nf_hook_thresh */
-	l3proto = __nf_ct_l3proto_find(pf);
-	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
-				   &dataoff, &protonum);
-	if (ret <= 0) {
+	dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum);
+	if (dataoff <= 0) {
 		pr_debug("not prepared to track yet or error occurred\n");
 		NF_CT_STAT_INC_ATOMIC(net, error);
 		NF_CT_STAT_INC_ATOMIC(net, invalid);
-		ret = -ret;
+		ret = NF_ACCEPT;
 		goto out;
 	}
 
@@ -1641,14 +1710,14 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
 
 static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
 {
-	const struct nf_conntrack_l3proto *l3proto;
 	const struct nf_conntrack_l4proto *l4proto;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conntrack_tuple tuple;
 	enum ip_conntrack_info ctinfo;
 	struct nf_nat_hook *nat_hook;
-	unsigned int dataoff, status;
+	unsigned int status;
 	struct nf_conn *ct;
+	int dataoff;
 	u16 l3num;
 	u8 l4num;
 
@@ -1657,10 +1726,9 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
 		return 0;
 
 	l3num = nf_ct_l3num(ct);
-	l3proto = nf_ct_l3proto_find_get(l3num);
 
-	if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
-				 &l4num) <= 0)
+	dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
+	if (dataoff <= 0)
 		return -1;
 
 	l4proto = nf_ct_l4proto_find_get(l3num, l4num);
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
deleted file mode 100644
index d6a8fe591ccc..000000000000
--- a/net/netfilter/nf_conntrack_l3proto_generic.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
- *
- * Based largely upon the original ip_conntrack code which
- * had the following copyright information:
- *
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author:
- *	Yasuyuki Kozakai @USAGI	<yasuyuki.kozakai@toshiba.co.jp>
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/ip.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-
-static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
-			       unsigned int *dataoff, u_int8_t *protonum)
-{
-	/* Never track !!! */
-	return -NF_ACCEPT;
-}
-
-
-struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
-	.l3proto	 = PF_UNSPEC,
-	.get_l4proto	 = generic_get_l4proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 859cb303bb91..39df72bb9d56 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -35,6 +35,11 @@ EXPORT_SYMBOL_GPL(nf_ct_l3protos);
 
 static DEFINE_MUTEX(nf_ct_proto_mutex);
 
+struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
+	.l3proto	 = PF_UNSPEC,
+};
+EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
+
 #ifdef CONFIG_SYSCTL
 static int
 nf_ct_register_sysctl(struct net *net,

From 8b3892ea8718920d29432328fe9544d89a429614 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Jun 2018 07:46:48 +0200
Subject: [PATCH 12/38] netfilter: conntrack: avoid calls to l4proto
 invert_tuple

Handle the common cases (tcp, udp, etc). in the core and only
do the indirect call for the protocols that need it (GRE for instance).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h |  2 +-
 net/netfilter/nf_conntrack_core.c            |  8 +++++++-
 net/netfilter/nf_conntrack_proto_dccp.c      | 10 ----------
 net/netfilter/nf_conntrack_proto_generic.c   | 10 ----------
 net/netfilter/nf_conntrack_proto_gre.c       | 10 ----------
 net/netfilter/nf_conntrack_proto_sctp.c      | 10 ----------
 net/netfilter/nf_conntrack_proto_tcp.c       | 10 ----------
 net/netfilter/nf_conntrack_proto_udp.c       | 12 ------------
 8 files changed, 8 insertions(+), 64 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index a7220eef9aee..6a55e337a161 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -36,7 +36,7 @@ struct nf_conntrack_l4proto {
 			     struct net *net, struct nf_conntrack_tuple *tuple);
 
 	/* Invert the per-proto part of the tuple: ie. turn xmit into reply.
-	 * Some packets can't be inverted: return 0 in that case.
+	 * Only used by icmp, most protocols use a generic version.
 	 */
 	bool (*invert_tuple)(struct nf_conntrack_tuple *inverse,
 			     const struct nf_conntrack_tuple *orig);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0674c6e5bfed..92efce69b690 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -400,7 +400,13 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
 	inverse->dst.dir = !orig->dst.dir;
 
 	inverse->dst.protonum = orig->dst.protonum;
-	return l4proto->invert_tuple(inverse, orig);
+
+	if (unlikely(l4proto->invert_tuple))
+		return l4proto->invert_tuple(inverse, orig);
+
+	inverse->src.u.all = orig->dst.u.all;
+	inverse->dst.u.all = orig->src.u.all;
+	return true;
 }
 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
 
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index abe647d5b8c6..05620c03f138 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -403,14 +403,6 @@ static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	return true;
 }
 
-static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
-			      const struct nf_conntrack_tuple *tuple)
-{
-	inv->src.u.dccp.port = tuple->dst.u.dccp.port;
-	inv->dst.u.dccp.port = tuple->src.u.dccp.port;
-	return true;
-}
-
 static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 		     unsigned int dataoff, unsigned int *timeouts)
 {
@@ -865,7 +857,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
 	.l3proto		= AF_INET,
 	.l4proto		= IPPROTO_DCCP,
 	.pkt_to_tuple		= dccp_pkt_to_tuple,
-	.invert_tuple		= dccp_invert_tuple,
 	.new			= dccp_new,
 	.packet			= dccp_packet,
 	.get_timeouts		= dccp_get_timeouts,
@@ -901,7 +892,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
 	.l3proto		= AF_INET6,
 	.l4proto		= IPPROTO_DCCP,
 	.pkt_to_tuple		= dccp_pkt_to_tuple,
-	.invert_tuple		= dccp_invert_tuple,
 	.new			= dccp_new,
 	.packet			= dccp_packet,
 	.get_timeouts		= dccp_get_timeouts,
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 6c6896d21cd7..4dfe40aa9446 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -41,15 +41,6 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb,
 	return true;
 }
 
-static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
-				 const struct nf_conntrack_tuple *orig)
-{
-	tuple->src.u.all = 0;
-	tuple->dst.u.all = 0;
-
-	return true;
-}
-
 static unsigned int *generic_get_timeouts(struct net *net)
 {
 	return &(generic_pernet(net)->timeout);
@@ -168,7 +159,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
 	.l3proto		= PF_UNSPEC,
 	.l4proto		= 255,
 	.pkt_to_tuple		= generic_pkt_to_tuple,
-	.invert_tuple		= generic_invert_tuple,
 	.packet			= generic_packet,
 	.get_timeouts		= generic_get_timeouts,
 	.new			= generic_new,
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index d049ea5a3770..0bd40eb06b55 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -179,15 +179,6 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
 
 /* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
 
-/* invert gre part of tuple */
-static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
-			     const struct nf_conntrack_tuple *orig)
-{
-	tuple->dst.u.gre.key = orig->src.u.gre.key;
-	tuple->src.u.gre.key = orig->dst.u.gre.key;
-	return true;
-}
-
 /* gre hdr info to tuple */
 static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 			     struct net *net, struct nf_conntrack_tuple *tuple)
@@ -356,7 +347,6 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
 	.l3proto	 = AF_INET,
 	.l4proto	 = IPPROTO_GRE,
 	.pkt_to_tuple	 = gre_pkt_to_tuple,
-	.invert_tuple	 = gre_invert_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack = gre_print_conntrack,
 #endif
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index fb9a35d16069..148957a5cf3e 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -166,14 +166,6 @@ static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	return true;
 }
 
-static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
-			      const struct nf_conntrack_tuple *orig)
-{
-	tuple->src.u.sctp.port = orig->dst.u.sctp.port;
-	tuple->dst.u.sctp.port = orig->src.u.sctp.port;
-	return true;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 /* Print out the private part of the conntrack. */
 static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -781,7 +773,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
 	.l3proto		= PF_INET,
 	.l4proto 		= IPPROTO_SCTP,
 	.pkt_to_tuple 		= sctp_pkt_to_tuple,
-	.invert_tuple 		= sctp_invert_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack	= sctp_print_conntrack,
 #endif
@@ -818,7 +809,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
 	.l3proto		= PF_INET6,
 	.l4proto 		= IPPROTO_SCTP,
 	.pkt_to_tuple 		= sctp_pkt_to_tuple,
-	.invert_tuple 		= sctp_invert_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack	= sctp_print_conntrack,
 #endif
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 8e67910185a0..03cff1e3066a 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -293,14 +293,6 @@ static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
 	return true;
 }
 
-static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
-			     const struct nf_conntrack_tuple *orig)
-{
-	tuple->src.u.tcp.port = orig->dst.u.tcp.port;
-	tuple->dst.u.tcp.port = orig->src.u.tcp.port;
-	return true;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 /* Print out the private part of the conntrack. */
 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -1560,7 +1552,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
 	.l3proto		= PF_INET,
 	.l4proto 		= IPPROTO_TCP,
 	.pkt_to_tuple 		= tcp_pkt_to_tuple,
-	.invert_tuple 		= tcp_invert_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack 	= tcp_print_conntrack,
 #endif
@@ -1598,7 +1589,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
 	.l3proto		= PF_INET6,
 	.l4proto 		= IPPROTO_TCP,
 	.pkt_to_tuple 		= tcp_pkt_to_tuple,
-	.invert_tuple 		= tcp_invert_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack 	= tcp_print_conntrack,
 #endif
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index fe7243970aa4..6fe2233c323a 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -55,14 +55,6 @@ static bool udp_pkt_to_tuple(const struct sk_buff *skb,
 	return true;
 }
 
-static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple,
-			     const struct nf_conntrack_tuple *orig)
-{
-	tuple->src.u.udp.port = orig->dst.u.udp.port;
-	tuple->dst.u.udp.port = orig->src.u.udp.port;
-	return true;
-}
-
 static unsigned int *udp_get_timeouts(struct net *net)
 {
 	return udp_pernet(net)->timeouts;
@@ -302,7 +294,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.pkt_to_tuple		= udp_pkt_to_tuple,
-	.invert_tuple		= udp_invert_tuple,
 	.packet			= udp_packet,
 	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
@@ -334,7 +325,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
 	.pkt_to_tuple		= udp_pkt_to_tuple,
-	.invert_tuple		= udp_invert_tuple,
 	.packet			= udp_packet,
 	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
@@ -366,7 +356,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.pkt_to_tuple		= udp_pkt_to_tuple,
-	.invert_tuple		= udp_invert_tuple,
 	.packet			= udp_packet,
 	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
@@ -398,7 +387,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
 	.pkt_to_tuple		= udp_pkt_to_tuple,
-	.invert_tuple		= udp_invert_tuple,
 	.packet			= udp_packet,
 	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,

From 97e08caec33a0923385b1215c3386c9ee1d07982 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Jun 2018 07:46:49 +0200
Subject: [PATCH 13/38] netfilter: conntrack: avoid l4proto pkt_to_tuple calls

Handle common protocols (udp, tcp, ..), in the core and only
do the call if needed by the l4proto tracker.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c       | 16 +++++++++++++++-
 net/netfilter/nf_conntrack_proto_dccp.c | 17 -----------------
 net/netfilter/nf_conntrack_proto_sctp.c | 18 ------------------
 net/netfilter/nf_conntrack_proto_tcp.c  | 19 -------------------
 net/netfilter/nf_conntrack_proto_udp.c  | 23 -----------------------
 5 files changed, 15 insertions(+), 78 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 92efce69b690..994591fd9b96 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -235,6 +235,10 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 	unsigned int size;
 	const __be32 *ap;
 	__be32 _addrs[8];
+	struct {
+		__be16 sport;
+		__be16 dport;
+	} _inet_hdr, *inet_hdr;
 
 	memset(tuple, 0, sizeof(*tuple));
 
@@ -270,7 +274,17 @@ nf_ct_get_tuple(const struct sk_buff *skb,
 	tuple->dst.protonum = protonum;
 	tuple->dst.dir = IP_CT_DIR_ORIGINAL;
 
-	return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
+	if (unlikely(l4proto->pkt_to_tuple))
+		return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
+
+	/* Actually only need first 4 bytes to get ports. */
+	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
+	if (!inet_hdr)
+		return false;
+
+	tuple->src.u.udp.port = inet_hdr->sport;
+	tuple->dst.u.udp.port = inet_hdr->dport;
+	return true;
 }
 
 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 05620c03f138..abfdce7baed5 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -388,21 +388,6 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net)
 	return &net->ct.nf_ct_proto.dccp;
 }
 
-static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
-			      struct net *net, struct nf_conntrack_tuple *tuple)
-{
-	struct dccp_hdr _hdr, *dh;
-
-	/* Actually only need first 4 bytes to get ports. */
-	dh = skb_header_pointer(skb, dataoff, 4, &_hdr);
-	if (dh == NULL)
-		return false;
-
-	tuple->src.u.dccp.port = dh->dccph_sport;
-	tuple->dst.u.dccp.port = dh->dccph_dport;
-	return true;
-}
-
 static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
 		     unsigned int dataoff, unsigned int *timeouts)
 {
@@ -856,7 +841,6 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net)
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
 	.l3proto		= AF_INET,
 	.l4proto		= IPPROTO_DCCP,
-	.pkt_to_tuple		= dccp_pkt_to_tuple,
 	.new			= dccp_new,
 	.packet			= dccp_packet,
 	.get_timeouts		= dccp_get_timeouts,
@@ -891,7 +875,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
 	.l3proto		= AF_INET6,
 	.l4proto		= IPPROTO_DCCP,
-	.pkt_to_tuple		= dccp_pkt_to_tuple,
 	.new			= dccp_new,
 	.packet			= dccp_packet,
 	.get_timeouts		= dccp_get_timeouts,
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 148957a5cf3e..b4126a842bfd 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -150,22 +150,6 @@ static inline struct nf_sctp_net *sctp_pernet(struct net *net)
 	return &net->ct.nf_ct_proto.sctp;
 }
 
-static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
-			      struct net *net, struct nf_conntrack_tuple *tuple)
-{
-	const struct sctphdr *hp;
-	struct sctphdr _hdr;
-
-	/* Actually only need first 4 bytes to get ports. */
-	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
-	if (hp == NULL)
-		return false;
-
-	tuple->src.u.sctp.port = hp->source;
-	tuple->dst.u.sctp.port = hp->dest;
-	return true;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 /* Print out the private part of the conntrack. */
 static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -772,7 +756,6 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net)
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
 	.l3proto		= PF_INET,
 	.l4proto 		= IPPROTO_SCTP,
-	.pkt_to_tuple 		= sctp_pkt_to_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack	= sctp_print_conntrack,
 #endif
@@ -808,7 +791,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
 const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
 	.l3proto		= PF_INET6,
 	.l4proto 		= IPPROTO_SCTP,
-	.pkt_to_tuple 		= sctp_pkt_to_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack	= sctp_print_conntrack,
 #endif
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 03cff1e3066a..13c89fd107b2 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -276,23 +276,6 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net)
 	return &net->ct.nf_ct_proto.tcp;
 }
 
-static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
-			     struct net *net, struct nf_conntrack_tuple *tuple)
-{
-	const struct tcphdr *hp;
-	struct tcphdr _hdr;
-
-	/* Actually only need first 4 bytes to get ports. */
-	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
-	if (hp == NULL)
-		return false;
-
-	tuple->src.u.tcp.port = hp->source;
-	tuple->dst.u.tcp.port = hp->dest;
-
-	return true;
-}
-
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 /* Print out the private part of the conntrack. */
 static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -1551,7 +1534,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
 {
 	.l3proto		= PF_INET,
 	.l4proto 		= IPPROTO_TCP,
-	.pkt_to_tuple 		= tcp_pkt_to_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack 	= tcp_print_conntrack,
 #endif
@@ -1588,7 +1570,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
 {
 	.l3proto		= PF_INET6,
 	.l4proto 		= IPPROTO_TCP,
-	.pkt_to_tuple 		= tcp_pkt_to_tuple,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack 	= tcp_print_conntrack,
 #endif
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 6fe2233c323a..8b435d70ffe3 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -36,25 +36,6 @@ static inline struct nf_udp_net *udp_pernet(struct net *net)
 	return &net->ct.nf_ct_proto.udp;
 }
 
-static bool udp_pkt_to_tuple(const struct sk_buff *skb,
-			     unsigned int dataoff,
-			     struct net *net,
-			     struct nf_conntrack_tuple *tuple)
-{
-	const struct udphdr *hp;
-	struct udphdr _hdr;
-
-	/* Actually only need first 4 bytes to get ports. */
-	hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
-	if (hp == NULL)
-		return false;
-
-	tuple->src.u.udp.port = hp->source;
-	tuple->dst.u.udp.port = hp->dest;
-
-	return true;
-}
-
 static unsigned int *udp_get_timeouts(struct net *net)
 {
 	return udp_pernet(net)->timeouts;
@@ -293,7 +274,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
-	.pkt_to_tuple		= udp_pkt_to_tuple,
 	.packet			= udp_packet,
 	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
@@ -324,7 +304,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 	.l3proto		= PF_INET,
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
-	.pkt_to_tuple		= udp_pkt_to_tuple,
 	.packet			= udp_packet,
 	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
@@ -355,7 +334,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
 	.l3proto		= PF_INET6,
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
-	.pkt_to_tuple		= udp_pkt_to_tuple,
 	.packet			= udp_packet,
 	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
@@ -386,7 +364,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 	.l3proto		= PF_INET6,
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
-	.pkt_to_tuple		= udp_pkt_to_tuple,
 	.packet			= udp_packet,
 	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,

From c779e849608a875448f6ffc2a5c2a15523bdcd00 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Jun 2018 07:46:50 +0200
Subject: [PATCH 14/38] netfilter: conntrack: remove get_timeout() indirection

Not needed, we can have the l4trackers fetch it themselvs.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_l4proto.h  |  8 ++-----
 include/net/netfilter/nf_conntrack_timeout.h  | 18 ++++-----------
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c  | 16 +++++++++----
 .../netfilter/nf_conntrack_proto_icmpv6.c     | 14 +++++++----
 net/netfilter/nf_conntrack_core.c             | 16 ++-----------
 net/netfilter/nf_conntrack_proto_dccp.c       | 17 ++++++--------
 net/netfilter/nf_conntrack_proto_generic.c    | 22 ++++++++++--------
 net/netfilter/nf_conntrack_proto_gre.c        | 14 +++++++----
 net/netfilter/nf_conntrack_proto_sctp.c       | 18 +++++++--------
 net/netfilter/nf_conntrack_proto_tcp.c        | 23 +++++++++----------
 net/netfilter/nf_conntrack_proto_udp.c        | 20 ++++++++++------
 net/netfilter/nfnetlink_cttimeout.c           | 12 ++++------
 12 files changed, 94 insertions(+), 104 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 6a55e337a161..c7a0075d96df 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -45,13 +45,12 @@ struct nf_conntrack_l4proto {
 	int (*packet)(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
-		      enum ip_conntrack_info ctinfo,
-		      unsigned int *timeouts);
+		      enum ip_conntrack_info ctinfo);
 
 	/* Called when a new connection for this protocol found;
 	 * returns TRUE if it's OK.  If so, packet() called next. */
 	bool (*new)(struct nf_conn *ct, const struct sk_buff *skb,
-		    unsigned int dataoff, unsigned int *timeouts);
+		    unsigned int dataoff);
 
 	/* Called when a conntrack entry is destroyed */
 	void (*destroy)(struct nf_conn *ct);
@@ -63,9 +62,6 @@ struct nf_conntrack_l4proto {
 	/* called by gc worker if table is full */
 	bool (*can_early_drop)(const struct nf_conn *ct);
 
-	/* Return the array of timeouts for this protocol. */
-	unsigned int *(*get_timeouts)(struct net *net);
-
 	/* convert protoinfo to nfnetink attributes */
 	int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
 			 struct nf_conn *ct);
diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index 9468ab4ad12d..80ceb3d0291d 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -67,27 +67,17 @@ struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct,
 #endif
 };
 
-static inline unsigned int *
-nf_ct_timeout_lookup(struct net *net, struct nf_conn *ct,
-		     const struct nf_conntrack_l4proto *l4proto)
+static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct)
 {
+	unsigned int *timeouts = NULL;
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
 	struct nf_conn_timeout *timeout_ext;
-	unsigned int *timeouts;
 
 	timeout_ext = nf_ct_timeout_find(ct);
-	if (timeout_ext) {
+	if (timeout_ext)
 		timeouts = nf_ct_timeout_data(timeout_ext);
-		if (unlikely(!timeouts))
-			timeouts = l4proto->get_timeouts(net);
-	} else {
-		timeouts = l4proto->get_timeouts(net);
-	}
-
-	return timeouts;
-#else
-	return l4proto->get_timeouts(net);
 #endif
+	return timeouts;
 }
 
 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index 34095949a003..036670b38282 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -19,6 +19,7 @@
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_log.h>
 
@@ -80,12 +81,16 @@ static unsigned int *icmp_get_timeouts(struct net *net)
 static int icmp_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
-		       enum ip_conntrack_info ctinfo,
-		       unsigned int *timeout)
+		       enum ip_conntrack_info ctinfo)
 {
 	/* Do not immediately delete the connection after the first
 	   successful reply to avoid excessive conntrackd traffic
 	   and also to handle correctly ICMP echo reply duplicates. */
+	unsigned int *timeout = nf_ct_timeout_lookup(ct);
+
+	if (!timeout)
+		timeout = icmp_get_timeouts(nf_ct_net(ct));
+
 	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
 
 	return NF_ACCEPT;
@@ -93,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct,
 
 /* Called when a new connection for this protocol found. */
 static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		     unsigned int dataoff, unsigned int *timeouts)
+		     unsigned int dataoff)
 {
 	static const u_int8_t valid_new[] = {
 		[ICMP_ECHO] = 1,
@@ -280,9 +285,11 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
 	struct nf_icmp_net *in = icmp_pernet(net);
 
 	if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
+		if (!timeout)
+			timeout = &in->timeout;
 		*timeout =
 			ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
-	} else {
+	} else if (timeout) {
 		/* Set default ICMP timeout. */
 		*timeout = in->timeout;
 	}
@@ -357,7 +364,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
 	.pkt_to_tuple		= icmp_pkt_to_tuple,
 	.invert_tuple		= icmp_invert_tuple,
 	.packet			= icmp_packet,
-	.get_timeouts		= icmp_get_timeouts,
 	.new			= icmp_new,
 	.error			= icmp_error,
 	.destroy		= NULL,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 8bcbc2f15bd5..bed07b998a10 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -23,6 +23,7 @@
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
 #include <net/netfilter/nf_log.h>
@@ -93,9 +94,13 @@ static unsigned int *icmpv6_get_timeouts(struct net *net)
 static int icmpv6_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
-		       enum ip_conntrack_info ctinfo,
-		       unsigned int *timeout)
+		       enum ip_conntrack_info ctinfo)
 {
+	unsigned int *timeout = nf_ct_timeout_lookup(ct);
+
+	if (!timeout)
+		timeout = icmpv6_get_timeouts(nf_ct_net(ct));
+
 	/* Do not immediately delete the connection after the first
 	   successful reply to avoid excessive conntrackd traffic
 	   and also to handle correctly ICMP echo reply duplicates. */
@@ -106,7 +111,7 @@ static int icmpv6_packet(struct nf_conn *ct,
 
 /* Called when a new connection for this protocol found. */
 static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
-		       unsigned int dataoff, unsigned int *timeouts)
+		       unsigned int dataoff)
 {
 	static const u_int8_t valid_new[] = {
 		[ICMPV6_ECHO_REQUEST - 128] = 1,
@@ -280,6 +285,8 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[],
 	unsigned int *timeout = data;
 	struct nf_icmp_net *in = icmpv6_pernet(net);
 
+	if (!timeout)
+		timeout = icmpv6_get_timeouts(net);
 	if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) {
 		*timeout =
 		    ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ;
@@ -358,7 +365,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
 	.pkt_to_tuple		= icmpv6_pkt_to_tuple,
 	.invert_tuple		= icmpv6_invert_tuple,
 	.packet			= icmpv6_packet,
-	.get_timeouts		= icmpv6_get_timeouts,
 	.new			= icmpv6_new,
 	.error			= icmpv6_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 994591fd9b96..c069f2faff4c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1337,7 +1337,6 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	const struct nf_conntrack_zone *zone;
 	struct nf_conn_timeout *timeout_ext;
 	struct nf_conntrack_zone tmp;
-	unsigned int *timeouts;
 
 	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) {
 		pr_debug("Can't invert tuple.\n");
@@ -1356,15 +1355,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	}
 
 	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
-	if (timeout_ext) {
-		timeouts = nf_ct_timeout_data(timeout_ext);
-		if (unlikely(!timeouts))
-			timeouts = l4proto->get_timeouts(net);
-	} else {
-		timeouts = l4proto->get_timeouts(net);
-	}
 
-	if (!l4proto->new(ct, skb, dataoff, timeouts)) {
+	if (!l4proto->new(ct, skb, dataoff)) {
 		nf_conntrack_free(ct);
 		pr_debug("can't track with proto module\n");
 		return NULL;
@@ -1493,7 +1485,6 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
 	const struct nf_conntrack_l4proto *l4proto;
 	struct nf_conn *ct, *tmpl;
 	enum ip_conntrack_info ctinfo;
-	unsigned int *timeouts;
 	u_int8_t protonum;
 	int dataoff, ret;
 
@@ -1552,10 +1543,7 @@ repeat:
 		goto out;
 	}
 
-	/* Decide what timeout policy we want to apply to this flow. */
-	timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
-
-	ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts);
+	ret = l4proto->packet(ct, skb, dataoff, ctinfo);
 	if (ret <= 0) {
 		/* Invalid: inverse of the return code tells
 		 * the netfilter core what to do */
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index abfdce7baed5..f476d116c816 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -23,6 +23,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_log.h>
 
 /* Timeouts are based on values from RFC4340:
@@ -389,7 +390,7 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net)
 }
 
 static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		     unsigned int dataoff, unsigned int *timeouts)
+		     unsigned int dataoff)
 {
 	struct net *net = nf_ct_net(ct);
 	struct nf_dccp_net *dn;
@@ -437,19 +438,14 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
 		     ntohl(dhack->dccph_ack_nr_low);
 }
 
-static unsigned int *dccp_get_timeouts(struct net *net)
-{
-	return dccp_pernet(net)->dccp_timeout;
-}
-
 static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
-		       unsigned int dataoff, enum ip_conntrack_info ctinfo,
-		       unsigned int *timeouts)
+		       unsigned int dataoff, enum ip_conntrack_info ctinfo)
 {
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	struct dccp_hdr _dh, *dh;
 	u_int8_t type, old_state, new_state;
 	enum ct_dccp_roles role;
+	unsigned int *timeouts;
 
 	dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
 	BUG_ON(dh == NULL);
@@ -523,6 +519,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
 	if (new_state != old_state)
 		nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
 
+	timeouts = nf_ct_timeout_lookup(ct);
+	if (!timeouts)
+		timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout;
 	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
 
 	return NF_ACCEPT;
@@ -843,7 +842,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
 	.l4proto		= IPPROTO_DCCP,
 	.new			= dccp_new,
 	.packet			= dccp_packet,
-	.get_timeouts		= dccp_get_timeouts,
 	.error			= dccp_error,
 	.can_early_drop		= dccp_can_early_drop,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
@@ -877,7 +875,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
 	.l4proto		= IPPROTO_DCCP,
 	.new			= dccp_new,
 	.packet			= dccp_packet,
-	.get_timeouts		= dccp_get_timeouts,
 	.error			= dccp_error,
 	.can_early_drop		= dccp_can_early_drop,
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 4dfe40aa9446..ac4a0b296dcd 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -11,6 +11,7 @@
 #include <linux/timer.h>
 #include <linux/netfilter.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
 
 static const unsigned int nf_ct_generic_timeout = 600*HZ;
 
@@ -41,25 +42,24 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb,
 	return true;
 }
 
-static unsigned int *generic_get_timeouts(struct net *net)
-{
-	return &(generic_pernet(net)->timeout);
-}
-
 /* Returns verdict for packet, or -1 for invalid. */
 static int generic_packet(struct nf_conn *ct,
 			  const struct sk_buff *skb,
 			  unsigned int dataoff,
-			  enum ip_conntrack_info ctinfo,
-			  unsigned int *timeout)
+			  enum ip_conntrack_info ctinfo)
 {
+	const unsigned int *timeout = nf_ct_timeout_lookup(ct);
+
+	if (!timeout)
+		timeout = &generic_pernet(nf_ct_net(ct))->timeout;
+
 	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
 	return NF_ACCEPT;
 }
 
 /* Called when a new connection for this protocol found. */
 static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
-			unsigned int dataoff, unsigned int *timeouts)
+			unsigned int dataoff)
 {
 	bool ret;
 
@@ -78,8 +78,11 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
 static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
 					 struct net *net, void *data)
 {
-	unsigned int *timeout = data;
 	struct nf_generic_net *gn = generic_pernet(net);
+	unsigned int *timeout = data;
+
+	if (!timeout)
+		timeout = &gn->timeout;
 
 	if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT])
 		*timeout =
@@ -160,7 +163,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
 	.l4proto		= 255,
 	.pkt_to_tuple		= generic_pkt_to_tuple,
 	.packet			= generic_packet,
-	.get_timeouts		= generic_get_timeouts,
 	.new			= generic_new,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
 	.ctnl_timeout		= {
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 0bd40eb06b55..d1632252bf5b 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -39,6 +39,7 @@
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
 #include <linux/netfilter/nf_conntrack_proto_gre.h>
 #include <linux/netfilter/nf_conntrack_pptp.h>
 
@@ -234,8 +235,7 @@ static unsigned int *gre_get_timeouts(struct net *net)
 static int gre_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
-		      enum ip_conntrack_info ctinfo,
-		      unsigned int *timeouts)
+		      enum ip_conntrack_info ctinfo)
 {
 	/* If we've seen traffic both ways, this is a GRE connection.
 	 * Extend timeout. */
@@ -254,8 +254,13 @@ static int gre_packet(struct nf_conn *ct,
 
 /* Called when a new connection for this protocol found. */
 static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
-		    unsigned int dataoff, unsigned int *timeouts)
+		    unsigned int dataoff)
 {
+	unsigned int *timeouts = nf_ct_timeout_lookup(ct);
+
+	if (!timeouts)
+		timeouts = gre_get_timeouts(nf_ct_net(ct));
+
 	pr_debug(": ");
 	nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 
@@ -291,6 +296,8 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
 	unsigned int *timeouts = data;
 	struct netns_proto_gre *net_gre = gre_pernet(net);
 
+	if (!timeouts)
+		timeouts = gre_get_timeouts(net);
 	/* set default timeouts for GRE. */
 	timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED];
 	timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED];
@@ -350,7 +357,6 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 	.print_conntrack = gre_print_conntrack,
 #endif
-	.get_timeouts    = gre_get_timeouts,
 	.packet		 = gre_packet,
 	.new		 = gre_new,
 	.destroy	 = gre_destroy,
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index b4126a842bfd..8d1e085fc14a 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -28,6 +28,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
 
 /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
    closely.  They're more complex. --RR
@@ -272,17 +273,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
 	return sctp_conntracks[dir][i][cur_state];
 }
 
-static unsigned int *sctp_get_timeouts(struct net *net)
-{
-	return sctp_pernet(net)->timeouts;
-}
-
 /* Returns verdict for packet, or -NF_ACCEPT for invalid. */
 static int sctp_packet(struct nf_conn *ct,
 		       const struct sk_buff *skb,
 		       unsigned int dataoff,
-		       enum ip_conntrack_info ctinfo,
-		       unsigned int *timeouts)
+		       enum ip_conntrack_info ctinfo)
 {
 	enum sctp_conntrack new_state, old_state;
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -291,6 +286,7 @@ static int sctp_packet(struct nf_conn *ct,
 	const struct sctp_chunkhdr *sch;
 	struct sctp_chunkhdr _sch;
 	u_int32_t offset, count;
+	unsigned int *timeouts;
 	unsigned long map[256 / sizeof(unsigned long)] = { 0 };
 
 	sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
@@ -379,6 +375,10 @@ static int sctp_packet(struct nf_conn *ct,
 	}
 	spin_unlock_bh(&ct->lock);
 
+	timeouts = nf_ct_timeout_lookup(ct);
+	if (!timeouts)
+		timeouts = sctp_pernet(nf_ct_net(ct))->timeouts;
+
 	nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
 
 	if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
@@ -399,7 +399,7 @@ out:
 
 /* Called when a new connection for this protocol found. */
 static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		     unsigned int dataoff, unsigned int *timeouts)
+		     unsigned int dataoff)
 {
 	enum sctp_conntrack new_state;
 	const struct sctphdr *sh;
@@ -760,7 +760,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
 	.print_conntrack	= sctp_print_conntrack,
 #endif
 	.packet 		= sctp_packet,
-	.get_timeouts		= sctp_get_timeouts,
 	.new 			= sctp_new,
 	.error			= sctp_error,
 	.can_early_drop		= sctp_can_early_drop,
@@ -795,7 +794,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
 	.print_conntrack	= sctp_print_conntrack,
 #endif
 	.packet 		= sctp_packet,
-	.get_timeouts		= sctp_get_timeouts,
 	.new 			= sctp_new,
 	.error			= sctp_error,
 	.can_early_drop		= sctp_can_early_drop,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 13c89fd107b2..d80d322b9d8b 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -29,6 +29,7 @@
 #include <net/netfilter/nf_conntrack_ecache.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_log.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -768,27 +769,21 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
 	return NF_ACCEPT;
 }
 
-static unsigned int *tcp_get_timeouts(struct net *net)
-{
-	return tcp_pernet(net)->timeouts;
-}
-
 /* Returns verdict for packet, or -1 for invalid. */
 static int tcp_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
-		      enum ip_conntrack_info ctinfo,
-		      unsigned int *timeouts)
+		      enum ip_conntrack_info ctinfo)
 {
 	struct net *net = nf_ct_net(ct);
 	struct nf_tcp_net *tn = tcp_pernet(net);
 	struct nf_conntrack_tuple *tuple;
 	enum tcp_conntrack new_state, old_state;
+	unsigned int index, *timeouts;
 	enum ip_conntrack_dir dir;
 	const struct tcphdr *th;
 	struct tcphdr _tcph;
 	unsigned long timeout;
-	unsigned int index;
 
 	th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
 	BUG_ON(th == NULL);
@@ -1021,6 +1016,10 @@ static int tcp_packet(struct nf_conn *ct,
 	    && new_state == TCP_CONNTRACK_FIN_WAIT)
 		ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
 
+	timeouts = nf_ct_timeout_lookup(ct);
+	if (!timeouts)
+		timeouts = tn->timeouts;
+
 	if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
 	    timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
 		timeout = timeouts[TCP_CONNTRACK_RETRANS];
@@ -1070,7 +1069,7 @@ static int tcp_packet(struct nf_conn *ct,
 
 /* Called when a new connection for this protocol found. */
 static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		    unsigned int dataoff, unsigned int *timeouts)
+		    unsigned int dataoff)
 {
 	enum tcp_conntrack new_state;
 	const struct tcphdr *th;
@@ -1288,10 +1287,12 @@ static unsigned int tcp_nlattr_tuple_size(void)
 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
 				     struct net *net, void *data)
 {
-	unsigned int *timeouts = data;
 	struct nf_tcp_net *tn = tcp_pernet(net);
+	unsigned int *timeouts = data;
 	int i;
 
+	if (!timeouts)
+		timeouts = tn->timeouts;
 	/* set default TCP timeouts. */
 	for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
 		timeouts[i] = tn->timeouts[i];
@@ -1538,7 +1539,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
 	.print_conntrack 	= tcp_print_conntrack,
 #endif
 	.packet 		= tcp_packet,
-	.get_timeouts		= tcp_get_timeouts,
 	.new 			= tcp_new,
 	.error			= tcp_error,
 	.can_early_drop		= tcp_can_early_drop,
@@ -1574,7 +1574,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
 	.print_conntrack 	= tcp_print_conntrack,
 #endif
 	.packet 		= tcp_packet,
-	.get_timeouts		= tcp_get_timeouts,
 	.new 			= tcp_new,
 	.error			= tcp_error,
 	.can_early_drop		= tcp_can_early_drop,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 8b435d70ffe3..7a1b8988a931 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -22,6 +22,7 @@
 #include <linux/netfilter_ipv6.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
 #include <net/netfilter/nf_log.h>
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -45,9 +46,14 @@ static unsigned int *udp_get_timeouts(struct net *net)
 static int udp_packet(struct nf_conn *ct,
 		      const struct sk_buff *skb,
 		      unsigned int dataoff,
-		      enum ip_conntrack_info ctinfo,
-		      unsigned int *timeouts)
+		      enum ip_conntrack_info ctinfo)
 {
+	unsigned int *timeouts;
+
+	timeouts = nf_ct_timeout_lookup(ct);
+	if (!timeouts)
+		timeouts = udp_get_timeouts(nf_ct_net(ct));
+
 	/* If we've seen traffic both ways, this is some kind of UDP
 	   stream.  Extend timeout. */
 	if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
@@ -65,7 +71,7 @@ static int udp_packet(struct nf_conn *ct,
 
 /* Called when a new connection for this protocol found. */
 static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
-		    unsigned int dataoff, unsigned int *timeouts)
+		    unsigned int dataoff)
 {
 	return true;
 }
@@ -176,6 +182,9 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
 	unsigned int *timeouts = data;
 	struct nf_udp_net *un = udp_pernet(net);
 
+	if (!timeouts)
+		timeouts = un->timeouts;
+
 	/* set default timeouts for UDP. */
 	timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED];
 	timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED];
@@ -275,7 +284,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
 	.error			= udp_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -305,7 +313,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
 	.error			= udplite_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -335,7 +342,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
 	.l4proto		= IPPROTO_UDP,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
 	.error			= udp_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -365,7 +371,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 	.l4proto		= IPPROTO_UDPLITE,
 	.allow_clash		= true,
 	.packet			= udp_packet,
-	.get_timeouts		= udp_get_timeouts,
 	.new			= udp_new,
 	.error			= udplite_error,
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -388,3 +393,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
 };
 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
 #endif
+#include <net/netfilter/nf_conntrack_timeout.h>
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 9da4b8462004..d9d952fad3e0 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -46,7 +46,7 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
 };
 
 static int
-ctnl_timeout_parse_policy(void *timeouts,
+ctnl_timeout_parse_policy(void *timeout,
 			  const struct nf_conntrack_l4proto *l4proto,
 			  struct net *net, const struct nlattr *attr)
 {
@@ -67,7 +67,7 @@ ctnl_timeout_parse_policy(void *timeouts,
 	if (ret < 0)
 		goto err;
 
-	ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+	ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout);
 
 err:
 	kfree(tb);
@@ -372,7 +372,6 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
 				 struct netlink_ext_ack *extack)
 {
 	const struct nf_conntrack_l4proto *l4proto;
-	unsigned int *timeouts;
 	__u16 l3num;
 	__u8 l4num;
 	int ret;
@@ -392,9 +391,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
 		goto err;
 	}
 
-	timeouts = l4proto->get_timeouts(net);
-
-	ret = ctnl_timeout_parse_policy(timeouts, l4proto, net,
+	ret = ctnl_timeout_parse_policy(NULL, l4proto, net,
 					cda[CTA_TIMEOUT_DATA]);
 	if (ret < 0)
 		goto err;
@@ -431,7 +428,6 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
 
 	if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
 		struct nlattr *nest_parms;
-		unsigned int *timeouts = l4proto->get_timeouts(net);
 		int ret;
 
 		nest_parms = nla_nest_start(skb,
@@ -439,7 +435,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
 		if (!nest_parms)
 			goto nla_put_failure;
 
-		ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts);
+		ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL);
 		if (ret < 0)
 			goto nla_put_failure;
 

From a0ae2562c6c4b2721d9fddba63b7286c13517d9f Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 29 Jun 2018 07:46:51 +0200
Subject: [PATCH 15/38] netfilter: conntrack: remove l3proto abstraction

This unifies ipv4 and ipv6 protocol trackers and removes the l3proto
abstraction.

This gets rid of all l3proto indirect calls and the need to do
a lookup on the function to call for l3 demux.

It increases module size by only a small amount (12kbyte), so this reduces
size because nf_conntrack.ko is useless without either nf_conntrack_ipv4
or nf_conntrack_ipv6 module.

before:
   text    data     bss     dec     hex filename
   7357    1088       0    8445    20fd nf_conntrack_ipv4.ko
   7405    1084       4    8493    212d nf_conntrack_ipv6.ko
  72614   13689     236   86539   1520b nf_conntrack.ko
 19K nf_conntrack_ipv4.ko
 19K nf_conntrack_ipv6.ko
179K nf_conntrack.ko

after:
   text    data     bss     dec     hex filename
  79277   13937     236   93450   16d0a nf_conntrack.ko
  191K nf_conntrack.ko

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../net/netfilter/ipv4/nf_conntrack_ipv4.h    |   3 -
 include/net/netfilter/nf_conntrack.h          |   5 +
 include/net/netfilter/nf_conntrack_core.h     |   1 -
 include/net/netfilter/nf_conntrack_l3proto.h  |  54 --
 include/net/netfilter/nf_conntrack_l4proto.h  |   4 -
 net/ipv4/netfilter/Kconfig                    |  22 +-
 net/ipv4/netfilter/Makefile                   |   6 -
 .../netfilter/nf_conntrack_l3proto_ipv4.c     | 368 --------
 net/ipv6/netfilter/Kconfig                    |  27 +-
 net/ipv6/netfilter/Makefile                   |   6 -
 .../netfilter/nf_conntrack_l3proto_ipv6.c     | 355 --------
 net/netfilter/Kconfig                         |   2 +
 net/netfilter/Makefile                        |   7 +-
 net/netfilter/nf_conntrack_core.c             |  11 +-
 net/netfilter/nf_conntrack_proto.c            | 847 +++++++++++++-----
 .../netfilter/nf_conntrack_proto_icmp.c       |   0
 .../netfilter/nf_conntrack_proto_icmpv6.c     |   0
 net/netfilter/nf_conntrack_standalone.c       |  14 +-
 net/netfilter/nf_nat_core.c                   |   8 -
 19 files changed, 645 insertions(+), 1095 deletions(-)
 delete mode 100644 include/net/netfilter/nf_conntrack_l3proto.h
 delete mode 100644 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
 delete mode 100644 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
 rename net/{ipv4 => }/netfilter/nf_conntrack_proto_icmp.c (100%)
 rename net/{ipv6 => }/netfilter/nf_conntrack_proto_icmpv6.c (100%)

diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 73f825732326..c84b51682f08 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -10,9 +10,6 @@
 #ifndef _NF_CONNTRACK_IPV4_H
 #define _NF_CONNTRACK_IPV4_H
 
-
-const extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4;
-
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4;
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4;
 extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 062dc19b5840..a2b0ed025908 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -41,6 +41,11 @@ union nf_conntrack_expect_proto {
 	/* insert expect proto private data here */
 };
 
+struct nf_conntrack_net {
+	unsigned int users4;
+	unsigned int users6;
+};
+
 #include <linux/types.h>
 #include <linux/skbuff.h>
 
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 35461b2d3462..2a3e0974a6af 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -14,7 +14,6 @@
 #define _NF_CONNTRACK_CORE_H
 
 #include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_ecache.h>
 
diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
deleted file mode 100644
index 5f160375c93a..000000000000
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C)2003,2004 USAGI/WIDE Project
- *
- * Header for use in defining a given L3 protocol for connection tracking.
- *
- * Author:
- *	Yasuyuki Kozakai @USAGI	<yasuyuki.kozakai@toshiba.co.jp>
- *
- * Derived from include/netfilter_ipv4/ip_conntrack_protocol.h
- */
-
-#ifndef _NF_CONNTRACK_L3PROTO_H
-#define _NF_CONNTRACK_L3PROTO_H
-#include <linux/netlink.h>
-#include <net/netlink.h>
-#include <linux/seq_file.h>
-#include <net/netfilter/nf_conntrack.h>
-
-struct nf_conntrack_l3proto {
-	/* L3 Protocol Family number. ex) PF_INET */
-	u_int16_t l3proto;
-
-	/* size of tuple nlattr, fills a hole */
-	u16 nla_size;
-
-	/* Called when netns wants to use connection tracking */
-	int (*net_ns_get)(struct net *);
-	void (*net_ns_put)(struct net *);
-
-	/* Module (if any) which this is connected to. */
-	struct module *me;
-};
-
-extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO];
-
-/* Protocol global registration. */
-int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto);
-void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto);
-
-const struct nf_conntrack_l3proto *nf_ct_l3proto_find_get(u_int16_t l3proto);
-
-/* Existing built-in protocols */
-extern struct nf_conntrack_l3proto nf_conntrack_l3proto_generic;
-
-static inline struct nf_conntrack_l3proto *
-__nf_ct_l3proto_find(u_int16_t l3proto)
-{
-	if (unlikely(l3proto >= NFPROTO_NUMPROTO))
-		return &nf_conntrack_l3proto_generic;
-	return rcu_dereference(nf_ct_l3protos[l3proto]);
-}
-
-#endif /*_NF_CONNTRACK_L3PROTO_H*/
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index c7a0075d96df..6068c6da3eac 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -130,10 +130,6 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
 /* Protocol global registration. */
 int nf_ct_l4proto_register_one(const struct nf_conntrack_l4proto *proto);
 void nf_ct_l4proto_unregister_one(const struct nf_conntrack_l4proto *proto);
-int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const proto[],
-			   unsigned int num_proto);
-void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const proto[],
-			      unsigned int num_proto);
 
 /* Generic netlink helpers */
 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index bbfc356cb1b5..d9504adc47b3 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4
 	tristate
 	default n
 
-config NF_CONNTRACK_IPV4
-	tristate "IPv4 connection tracking support (required for NAT)"
-	depends on NF_CONNTRACK
-	default m if NETFILTER_ADVANCED=n
-	select NF_DEFRAG_IPV4
-	---help---
-	  Connection tracking keeps a record of what packets have passed
-	  through your machine, in order to figure out how they are related
-	  into connections.
-
-	  This is IPv4 support on Layer 3 independent connection tracking.
-	  Layer 3 independent connection tracking is experimental scheme
-	  which generalize ip_conntrack to support other layer 3 protocols.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config NF_SOCKET_IPV4
 	tristate "IPv4 socket lookup support"
 	help
@@ -112,7 +96,7 @@ config NF_REJECT_IPV4
 
 config NF_NAT_IPV4
 	tristate "IPv4 NAT"
-	depends on NF_CONNTRACK_IPV4
+	depends on NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
 	select NF_NAT
 	help
@@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY
 # NAT + specific targets: nf_conntrack
 config IP_NF_NAT
 	tristate "iptables NAT support"
-	depends on NF_CONNTRACK_IPV4
+	depends on NF_CONNTRACK
 	default m if NETFILTER_ADVANCED=n
 	select NF_NAT
 	select NF_NAT_IPV4
@@ -340,7 +324,7 @@ config IP_NF_MANGLE
 config IP_NF_TARGET_CLUSTERIP
 	tristate "CLUSTERIP target support"
 	depends on IP_NF_MANGLE
-	depends on NF_CONNTRACK_IPV4
+	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	select NF_CONNTRACK_MARK
 	select NETFILTER_FAMILY_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8394c17c269f..367993adf4d3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,12 +3,6 @@
 # Makefile for the netfilter modules on top of IPv4.
 #
 
-# objects for l3 independent conntrack
-nf_conntrack_ipv4-y	:=  nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
-
-# connection tracking
-obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
-
 nf_nat_ipv4-y		:= nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
 nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
 obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
deleted file mode 100644
index 9fbf6c7f8ece..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ /dev/null
@@ -1,368 +0,0 @@
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#include <net/netfilter/nf_log.h>
-
-static int conntrack4_net_id __read_mostly;
-static DEFINE_MUTEX(register_ipv4_hooks);
-
-struct conntrack4_net {
-	unsigned int users;
-};
-
-static unsigned int ipv4_helper(void *priv,
-				struct sk_buff *skb,
-				const struct nf_hook_state *state)
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	const struct nf_conn_help *help;
-	const struct nf_conntrack_helper *helper;
-
-	/* This is where we call the helper: as the packet goes out. */
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-		return NF_ACCEPT;
-
-	help = nfct_help(ct);
-	if (!help)
-		return NF_ACCEPT;
-
-	/* rcu_read_lock()ed by nf_hook_thresh */
-	helper = rcu_dereference(help->helper);
-	if (!helper)
-		return NF_ACCEPT;
-
-	return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
-			    ct, ctinfo);
-}
-
-static unsigned int ipv4_confirm(void *priv,
-				 struct sk_buff *skb,
-				 const struct nf_hook_state *state)
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-		goto out;
-
-	/* adjust seqs for loopback traffic only in outgoing direction */
-	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
-	    !nf_is_loopback_packet(skb)) {
-		if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
-			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
-			return NF_DROP;
-		}
-	}
-out:
-	/* We've seen it coming out the other side: confirm it */
-	return nf_conntrack_confirm(skb);
-}
-
-static unsigned int ipv4_conntrack_in(void *priv,
-				      struct sk_buff *skb,
-				      const struct nf_hook_state *state)
-{
-	return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
-}
-
-static unsigned int ipv4_conntrack_local(void *priv,
-					 struct sk_buff *skb,
-					 const struct nf_hook_state *state)
-{
-	if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
-		enum ip_conntrack_info ctinfo;
-		struct nf_conn *tmpl;
-
-		tmpl = nf_ct_get(skb, &ctinfo);
-		if (tmpl && nf_ct_is_template(tmpl)) {
-			/* when skipping ct, clear templates to avoid fooling
-			 * later targets/matches
-			 */
-			skb->_nfct = 0;
-			nf_ct_put(tmpl);
-		}
-		return NF_ACCEPT;
-	}
-
-	return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
-}
-
-/* Connection tracking may drop packets, but never alters them, so
-   make it the first hook. */
-static const struct nf_hook_ops ipv4_conntrack_ops[] = {
-	{
-		.hook		= ipv4_conntrack_in,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_PRE_ROUTING,
-		.priority	= NF_IP_PRI_CONNTRACK,
-	},
-	{
-		.hook		= ipv4_conntrack_local,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP_PRI_CONNTRACK,
-	},
-	{
-		.hook		= ipv4_helper,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP_PRI_CONNTRACK_HELPER,
-	},
-	{
-		.hook		= ipv4_confirm,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
-	},
-	{
-		.hook		= ipv4_helper,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP_PRI_CONNTRACK_HELPER,
-	},
-	{
-		.hook		= ipv4_confirm,
-		.pf		= NFPROTO_IPV4,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
-	},
-};
-
-/* Fast function for those who don't want to parse /proc (and I don't
-   blame them). */
-/* Reversing the socket's dst/src point of view gives us the reply
-   mapping. */
-static int
-getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	const struct nf_conntrack_tuple_hash *h;
-	struct nf_conntrack_tuple tuple;
-
-	memset(&tuple, 0, sizeof(tuple));
-
-	lock_sock(sk);
-	tuple.src.u3.ip = inet->inet_rcv_saddr;
-	tuple.src.u.tcp.port = inet->inet_sport;
-	tuple.dst.u3.ip = inet->inet_daddr;
-	tuple.dst.u.tcp.port = inet->inet_dport;
-	tuple.src.l3num = PF_INET;
-	tuple.dst.protonum = sk->sk_protocol;
-	release_sock(sk);
-
-	/* We only do TCP and SCTP at the moment: is there a better way? */
-	if (tuple.dst.protonum != IPPROTO_TCP &&
-	    tuple.dst.protonum != IPPROTO_SCTP) {
-		pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
-		return -ENOPROTOOPT;
-	}
-
-	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
-		pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
-			 *len, sizeof(struct sockaddr_in));
-		return -EINVAL;
-	}
-
-	h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
-	if (h) {
-		struct sockaddr_in sin;
-		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
-
-		sin.sin_family = AF_INET;
-		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
-			.tuple.dst.u.tcp.port;
-		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
-			.tuple.dst.u3.ip;
-		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
-
-		pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
-			 &sin.sin_addr.s_addr, ntohs(sin.sin_port));
-		nf_ct_put(ct);
-		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
-			return -EFAULT;
-		else
-			return 0;
-	}
-	pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
-		 &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
-		 &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
-	return -ENOENT;
-}
-
-static struct nf_sockopt_ops so_getorigdst = {
-	.pf		= PF_INET,
-	.get_optmin	= SO_ORIGINAL_DST,
-	.get_optmax	= SO_ORIGINAL_DST+1,
-	.get		= getorigdst,
-	.owner		= THIS_MODULE,
-};
-
-static int ipv4_hooks_register(struct net *net)
-{
-	struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
-	int err = 0;
-
-	mutex_lock(&register_ipv4_hooks);
-
-	cnet->users++;
-	if (cnet->users > 1)
-		goto out_unlock;
-
-	err = nf_defrag_ipv4_enable(net);
-	if (err) {
-		cnet->users = 0;
-		goto out_unlock;
-	}
-
-	err = nf_register_net_hooks(net, ipv4_conntrack_ops,
-				    ARRAY_SIZE(ipv4_conntrack_ops));
-
-	if (err)
-		cnet->users = 0;
- out_unlock:
-	mutex_unlock(&register_ipv4_hooks);
-	return err;
-}
-
-static void ipv4_hooks_unregister(struct net *net)
-{
-	struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
-
-	mutex_lock(&register_ipv4_hooks);
-	if (cnet->users && (--cnet->users == 0))
-		nf_unregister_net_hooks(net, ipv4_conntrack_ops,
-					ARRAY_SIZE(ipv4_conntrack_ops));
-	mutex_unlock(&register_ipv4_hooks);
-}
-
-const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
-	.l3proto	 = PF_INET,
-	.net_ns_get	 = ipv4_hooks_register,
-	.net_ns_put	 = ipv4_hooks_unregister,
-	.me		 = THIS_MODULE,
-};
-
-module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
-		  &nf_conntrack_htable_size, 0600);
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
-MODULE_ALIAS("ip_conntrack");
-MODULE_LICENSE("GPL");
-
-static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = {
-	&nf_conntrack_l4proto_tcp4,
-	&nf_conntrack_l4proto_udp4,
-	&nf_conntrack_l4proto_icmp,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	&nf_conntrack_l4proto_dccp4,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
-	&nf_conntrack_l4proto_sctp4,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-	&nf_conntrack_l4proto_udplite4,
-#endif
-};
-
-static int ipv4_net_init(struct net *net)
-{
-	return nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
-					     ARRAY_SIZE(builtin_l4proto4));
-}
-
-static void ipv4_net_exit(struct net *net)
-{
-	nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
-					ARRAY_SIZE(builtin_l4proto4));
-}
-
-static struct pernet_operations ipv4_net_ops = {
-	.init = ipv4_net_init,
-	.exit = ipv4_net_exit,
-	.id = &conntrack4_net_id,
-	.size = sizeof(struct conntrack4_net),
-};
-
-static int __init nf_conntrack_l3proto_ipv4_init(void)
-{
-	int ret = 0;
-
-	need_conntrack();
-
-	ret = nf_register_sockopt(&so_getorigdst);
-	if (ret < 0) {
-		pr_err("Unable to register netfilter socket option\n");
-		return ret;
-	}
-
-	ret = register_pernet_subsys(&ipv4_net_ops);
-	if (ret < 0) {
-		pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
-		goto cleanup_sockopt;
-	}
-
-	ret = nf_ct_l4proto_register(builtin_l4proto4,
-				     ARRAY_SIZE(builtin_l4proto4));
-	if (ret < 0)
-		goto cleanup_pernet;
-
-	ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
-	if (ret < 0) {
-		pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
-		goto cleanup_l4proto;
-	}
-
-	return ret;
-cleanup_l4proto:
-	nf_ct_l4proto_unregister(builtin_l4proto4,
-				 ARRAY_SIZE(builtin_l4proto4));
- cleanup_pernet:
-	unregister_pernet_subsys(&ipv4_net_ops);
- cleanup_sockopt:
-	nf_unregister_sockopt(&so_getorigdst);
-	return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv4_fini(void)
-{
-	synchronize_net();
-	nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
-	nf_ct_l4proto_unregister(builtin_l4proto4,
-				 ARRAY_SIZE(builtin_l4proto4));
-	unregister_pernet_subsys(&ipv4_net_ops);
-	nf_unregister_sockopt(&so_getorigdst);
-}
-
-module_init(nf_conntrack_l3proto_ipv4_init);
-module_exit(nf_conntrack_l3proto_ipv4_fini);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 37b14dc9d863..339d0762b027 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -5,26 +5,6 @@
 menu "IPv6: Netfilter Configuration"
 	depends on INET && IPV6 && NETFILTER
 
-config NF_DEFRAG_IPV6
-	tristate
-	default n
-
-config NF_CONNTRACK_IPV6
-	tristate "IPv6 connection tracking support"
-	depends on INET && IPV6 && NF_CONNTRACK
-	default m if NETFILTER_ADVANCED=n
-	select NF_DEFRAG_IPV6
-	---help---
-	  Connection tracking keeps a record of what packets have passed
-	  through your machine, in order to figure out how they are related
-	  into connections.
-
-	  This is IPv6 support on Layer 3 independent connection tracking.
-	  Layer 3 independent connection tracking is experimental scheme
-	  which generalize ip_conntrack to support other layer 3 protocols.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config NF_SOCKET_IPV6
 	tristate "IPv6 socket lookup support"
 	help
@@ -128,7 +108,7 @@ config NF_LOG_IPV6
 
 config NF_NAT_IPV6
 	tristate "IPv6 NAT"
-	depends on NF_CONNTRACK_IPV6
+	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	select NF_NAT
 	help
@@ -328,7 +308,7 @@ config IP6_NF_SECURITY
 
 config IP6_NF_NAT
 	tristate "ip6tables NAT support"
-	depends on NF_CONNTRACK_IPV6
+	depends on NF_CONNTRACK
 	depends on NETFILTER_ADVANCED
 	select NF_NAT
 	select NF_NAT_IPV6
@@ -365,6 +345,7 @@ config IP6_NF_TARGET_NPT
 endif # IP6_NF_NAT
 
 endif # IP6_NF_IPTABLES
-
 endmenu
 
+config NF_DEFRAG_IPV6
+	tristate
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 10a5a1c87320..200c0c235565 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -11,12 +11,6 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
 obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
 obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
 
-# objects for l3 independent conntrack
-nf_conntrack_ipv6-y  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
-
-# l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
-
 nf_nat_ipv6-y		:= nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
 nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
 obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
deleted file mode 100644
index 37ab25645cf2..000000000000
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * Copyright (C)2004 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author:
- *	Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- */
-
-#include <linux/types.h>
-#include <linux/ipv6.h>
-#include <linux/in6.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <net/ipv6.h>
-#include <net/inet_frag.h>
-
-#include <linux/netfilter_bridge.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
-#include <net/netfilter/nf_log.h>
-
-static int conntrack6_net_id;
-static DEFINE_MUTEX(register_ipv6_hooks);
-
-struct conntrack6_net {
-	unsigned int users;
-};
-
-static unsigned int ipv6_helper(void *priv,
-				struct sk_buff *skb,
-				const struct nf_hook_state *state)
-{
-	struct nf_conn *ct;
-	const struct nf_conn_help *help;
-	const struct nf_conntrack_helper *helper;
-	enum ip_conntrack_info ctinfo;
-	__be16 frag_off;
-	int protoff;
-	u8 nexthdr;
-
-	/* This is where we call the helper: as the packet goes out. */
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-		return NF_ACCEPT;
-
-	help = nfct_help(ct);
-	if (!help)
-		return NF_ACCEPT;
-	/* rcu_read_lock()ed by nf_hook_thresh */
-	helper = rcu_dereference(help->helper);
-	if (!helper)
-		return NF_ACCEPT;
-
-	nexthdr = ipv6_hdr(skb)->nexthdr;
-	protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
-				   &frag_off);
-	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
-		pr_debug("proto header not found\n");
-		return NF_ACCEPT;
-	}
-
-	return helper->help(skb, protoff, ct, ctinfo);
-}
-
-static unsigned int ipv6_confirm(void *priv,
-				 struct sk_buff *skb,
-				 const struct nf_hook_state *state)
-{
-	struct nf_conn *ct;
-	enum ip_conntrack_info ctinfo;
-	unsigned char pnum = ipv6_hdr(skb)->nexthdr;
-	int protoff;
-	__be16 frag_off;
-
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
-		goto out;
-
-	protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
-				   &frag_off);
-	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
-		pr_debug("proto header not found\n");
-		goto out;
-	}
-
-	/* adjust seqs for loopback traffic only in outgoing direction */
-	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
-	    !nf_is_loopback_packet(skb)) {
-		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
-			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
-			return NF_DROP;
-		}
-	}
-out:
-	/* We've seen it coming out the other side: confirm it */
-	return nf_conntrack_confirm(skb);
-}
-
-static unsigned int ipv6_conntrack_in(void *priv,
-				      struct sk_buff *skb,
-				      const struct nf_hook_state *state)
-{
-	return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
-}
-
-static unsigned int ipv6_conntrack_local(void *priv,
-					 struct sk_buff *skb,
-					 const struct nf_hook_state *state)
-{
-	return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
-}
-
-static const struct nf_hook_ops ipv6_conntrack_ops[] = {
-	{
-		.hook		= ipv6_conntrack_in,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_PRE_ROUTING,
-		.priority	= NF_IP6_PRI_CONNTRACK,
-	},
-	{
-		.hook		= ipv6_conntrack_local,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_LOCAL_OUT,
-		.priority	= NF_IP6_PRI_CONNTRACK,
-	},
-	{
-		.hook		= ipv6_helper,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP6_PRI_CONNTRACK_HELPER,
-	},
-	{
-		.hook		= ipv6_confirm,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_POST_ROUTING,
-		.priority	= NF_IP6_PRI_LAST,
-	},
-	{
-		.hook		= ipv6_helper,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP6_PRI_CONNTRACK_HELPER,
-	},
-	{
-		.hook		= ipv6_confirm,
-		.pf		= NFPROTO_IPV6,
-		.hooknum	= NF_INET_LOCAL_IN,
-		.priority	= NF_IP6_PRI_LAST-1,
-	},
-};
-
-static int
-ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
-	struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
-	const struct ipv6_pinfo *inet6 = inet6_sk(sk);
-	const struct inet_sock *inet = inet_sk(sk);
-	const struct nf_conntrack_tuple_hash *h;
-	struct sockaddr_in6 sin6;
-	struct nf_conn *ct;
-	__be32 flow_label;
-	int bound_dev_if;
-
-	lock_sock(sk);
-	tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
-	tuple.src.u.tcp.port = inet->inet_sport;
-	tuple.dst.u3.in6 = sk->sk_v6_daddr;
-	tuple.dst.u.tcp.port = inet->inet_dport;
-	tuple.dst.protonum = sk->sk_protocol;
-	bound_dev_if = sk->sk_bound_dev_if;
-	flow_label = inet6->flow_label;
-	release_sock(sk);
-
-	if (tuple.dst.protonum != IPPROTO_TCP &&
-	    tuple.dst.protonum != IPPROTO_SCTP)
-		return -ENOPROTOOPT;
-
-	if (*len < 0 || (unsigned int) *len < sizeof(sin6))
-		return -EINVAL;
-
-	h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
-	if (!h) {
-		pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
-			 &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
-			 &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
-		return -ENOENT;
-	}
-
-	ct = nf_ct_tuplehash_to_ctrack(h);
-
-	sin6.sin6_family = AF_INET6;
-	sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
-	sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
-	memcpy(&sin6.sin6_addr,
-		&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
-					sizeof(sin6.sin6_addr));
-
-	nf_ct_put(ct);
-	sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);
-	return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
-}
-
-static int ipv6_hooks_register(struct net *net)
-{
-	struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
-	int err = 0;
-
-	mutex_lock(&register_ipv6_hooks);
-	cnet->users++;
-	if (cnet->users > 1)
-		goto out_unlock;
-
-	err = nf_defrag_ipv6_enable(net);
-	if (err < 0) {
-		cnet->users = 0;
-		goto out_unlock;
-	}
-
-	err = nf_register_net_hooks(net, ipv6_conntrack_ops,
-				    ARRAY_SIZE(ipv6_conntrack_ops));
-	if (err)
-		cnet->users = 0;
- out_unlock:
-	mutex_unlock(&register_ipv6_hooks);
-	return err;
-}
-
-static void ipv6_hooks_unregister(struct net *net)
-{
-	struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
-
-	mutex_lock(&register_ipv6_hooks);
-	if (cnet->users && (--cnet->users == 0))
-		nf_unregister_net_hooks(net, ipv6_conntrack_ops,
-					ARRAY_SIZE(ipv6_conntrack_ops));
-	mutex_unlock(&register_ipv6_hooks);
-}
-
-const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
-	.l3proto		= PF_INET6,
-	.net_ns_get		= ipv6_hooks_register,
-	.net_ns_put		= ipv6_hooks_unregister,
-	.me			= THIS_MODULE,
-};
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
-
-static struct nf_sockopt_ops so_getorigdst6 = {
-	.pf		= NFPROTO_IPV6,
-	.get_optmin	= IP6T_SO_ORIGINAL_DST,
-	.get_optmax	= IP6T_SO_ORIGINAL_DST + 1,
-	.get		= ipv6_getorigdst,
-	.owner		= THIS_MODULE,
-};
-
-static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = {
-	&nf_conntrack_l4proto_tcp6,
-	&nf_conntrack_l4proto_udp6,
-	&nf_conntrack_l4proto_icmpv6,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-	&nf_conntrack_l4proto_dccp6,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
-	&nf_conntrack_l4proto_sctp6,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-	&nf_conntrack_l4proto_udplite6,
-#endif
-};
-
-static int ipv6_net_init(struct net *net)
-{
-	return nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
-					     ARRAY_SIZE(builtin_l4proto6));
-}
-
-static void ipv6_net_exit(struct net *net)
-{
-	nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
-					ARRAY_SIZE(builtin_l4proto6));
-}
-
-static struct pernet_operations ipv6_net_ops = {
-	.init = ipv6_net_init,
-	.exit = ipv6_net_exit,
-	.id = &conntrack6_net_id,
-	.size = sizeof(struct conntrack6_net),
-};
-
-static int __init nf_conntrack_l3proto_ipv6_init(void)
-{
-	int ret = 0;
-
-	need_conntrack();
-
-	ret = nf_register_sockopt(&so_getorigdst6);
-	if (ret < 0) {
-		pr_err("Unable to register netfilter socket option\n");
-		return ret;
-	}
-
-	ret = register_pernet_subsys(&ipv6_net_ops);
-	if (ret < 0)
-		goto cleanup_sockopt;
-
-	ret = nf_ct_l4proto_register(builtin_l4proto6,
-				     ARRAY_SIZE(builtin_l4proto6));
-	if (ret < 0)
-		goto cleanup_pernet;
-
-	ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
-	if (ret < 0) {
-		pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n");
-		goto cleanup_l4proto;
-	}
-	return ret;
-cleanup_l4proto:
-	nf_ct_l4proto_unregister(builtin_l4proto6,
-				 ARRAY_SIZE(builtin_l4proto6));
- cleanup_pernet:
-	unregister_pernet_subsys(&ipv6_net_ops);
- cleanup_sockopt:
-	nf_unregister_sockopt(&so_getorigdst6);
-	return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv6_fini(void)
-{
-	synchronize_net();
-	nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
-	nf_ct_l4proto_unregister(builtin_l4proto6,
-				 ARRAY_SIZE(builtin_l4proto6));
-	unregister_pernet_subsys(&ipv6_net_ops);
-	nf_unregister_sockopt(&so_getorigdst6);
-}
-
-module_init(nf_conntrack_l3proto_ipv6_init);
-module_exit(nf_conntrack_l3proto_ipv6_fini);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 3ce657fbca67..9eab519b403a 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -49,6 +49,8 @@ config NETFILTER_NETLINK_LOG
 config NF_CONNTRACK
 	tristate "Netfilter connection tracking support"
 	default m if NETFILTER_ADVANCED=n
+	select NF_DEFRAG_IPV4
+	select NF_DEFRAG_IPV6 if IPV6 != n
 	help
 	  Connection tracking keeps a record of what packets have passed
 	  through your machine, in order to figure out how they are related
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index f132ea850778..53bd1ed1228a 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,7 +1,12 @@
 # SPDX-License-Identifier: GPL-2.0
 netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o
 
-nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+nf_conntrack-y	:= nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o \
+		   nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o \
+		   nf_conntrack_proto_icmp.o \
+		   nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+
+nf_conntrack-$(subst m,y,$(CONFIG_IPV6)) += nf_conntrack_proto_icmpv6.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
 nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c069f2faff4c..5123e91b1982 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -291,7 +291,6 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 			    u_int8_t *protonum)
 {
 	int dataoff = -1;
-#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV4)
 	const struct iphdr *iph;
 	struct iphdr _iph;
 
@@ -314,15 +313,14 @@ static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 			 nhoff, iph->ihl << 2, skb->len);
 		return -1;
 	}
-#endif
 	return dataoff;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 			    u8 *protonum)
 {
 	int protoff = -1;
-#if IS_ENABLED(CONFIG_NF_CONNTRACK_IPV6)
 	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
 	__be16 frag_off;
 	u8 nexthdr;
@@ -343,9 +341,9 @@ static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
 	}
 
 	*protonum = nexthdr;
-#endif
 	return protoff;
 }
+#endif
 
 static int get_l4proto(const struct sk_buff *skb,
 		       unsigned int nhoff, u8 pf, u8 *l4num)
@@ -353,8 +351,10 @@ static int get_l4proto(const struct sk_buff *skb,
 	switch (pf) {
 	case NFPROTO_IPV4:
 		return ipv4_get_l4proto(skb, nhoff, l4num);
+#if IS_ENABLED(CONFIG_IPV6)
 	case NFPROTO_IPV6:
 		return ipv6_get_l4proto(skb, nhoff, l4num);
+#endif
 	default:
 		*l4num = 0;
 		break;
@@ -2197,9 +2197,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 
-module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
-		  &nf_conntrack_htable_size, 0600);
-
 static __always_inline unsigned int total_extension_size(void)
 {
 	/* remember to add new extensions below */
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 39df72bb9d56..803607a90102 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -1,14 +1,4 @@
-/* L3/L4 protocol support for nf_conntrack. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0
 
 #include <linux/types.h>
 #include <linux/netfilter.h>
@@ -24,22 +14,39 @@
 #include <linux/netdevice.h>
 
 #include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_l4proto.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_log.h>
 
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+extern unsigned int nf_conntrack_net_id;
+
 static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly;
-struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_l3protos);
 
 static DEFINE_MUTEX(nf_ct_proto_mutex);
 
-struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
-	.l3proto	 = PF_UNSPEC,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
-
 #ifdef CONFIG_SYSCTL
 static int
 nf_ct_register_sysctl(struct net *net,
@@ -127,137 +134,6 @@ __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
 }
 EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
 
-/* this is guaranteed to always return a valid protocol helper, since
- * it falls back to generic_protocol */
-const struct nf_conntrack_l3proto *
-nf_ct_l3proto_find_get(u_int16_t l3proto)
-{
-	struct nf_conntrack_l3proto *p;
-
-	rcu_read_lock();
-	p = __nf_ct_l3proto_find(l3proto);
-	if (!try_module_get(p->me))
-		p = &nf_conntrack_l3proto_generic;
-	rcu_read_unlock();
-
-	return p;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get);
-
-int
-nf_ct_l3proto_try_module_get(unsigned short l3proto)
-{
-	const struct nf_conntrack_l3proto *p;
-	int ret;
-
-retry:	p = nf_ct_l3proto_find_get(l3proto);
-	if (p == &nf_conntrack_l3proto_generic) {
-		ret = request_module("nf_conntrack-%d", l3proto);
-		if (!ret)
-			goto retry;
-
-		return -EPROTOTYPE;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get);
-
-void nf_ct_l3proto_module_put(unsigned short l3proto)
-{
-	struct nf_conntrack_l3proto *p;
-
-	/* rcu_read_lock not necessary since the caller holds a reference, but
-	 * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find()
-	 */
-	rcu_read_lock();
-	p = __nf_ct_l3proto_find(l3proto);
-	module_put(p->me);
-	rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
-
-static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
-{
-	const struct nf_conntrack_l3proto *l3proto;
-	int ret;
-
-	might_sleep();
-
-	ret = nf_ct_l3proto_try_module_get(nfproto);
-	if (ret < 0)
-		return ret;
-
-	/* we already have a reference, can't fail */
-	rcu_read_lock();
-	l3proto = __nf_ct_l3proto_find(nfproto);
-	rcu_read_unlock();
-
-	if (!l3proto->net_ns_get)
-		return 0;
-
-	ret = l3proto->net_ns_get(net);
-	if (ret < 0)
-		nf_ct_l3proto_module_put(nfproto);
-
-	return ret;
-}
-
-int nf_ct_netns_get(struct net *net, u8 nfproto)
-{
-	int err;
-
-	if (nfproto == NFPROTO_INET) {
-		err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
-		if (err < 0)
-			goto err1;
-		err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
-		if (err < 0)
-			goto err2;
-	} else {
-		err = nf_ct_netns_do_get(net, nfproto);
-		if (err < 0)
-			goto err1;
-	}
-	return 0;
-
-err2:
-	nf_ct_netns_put(net, NFPROTO_IPV4);
-err1:
-	return err;
-}
-EXPORT_SYMBOL_GPL(nf_ct_netns_get);
-
-static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
-{
-	const struct nf_conntrack_l3proto *l3proto;
-
-	might_sleep();
-
-	/* same as nf_conntrack_netns_get(), reference assumed */
-	rcu_read_lock();
-	l3proto = __nf_ct_l3proto_find(nfproto);
-	rcu_read_unlock();
-
-	if (WARN_ON(!l3proto))
-		return;
-
-	if (l3proto->net_ns_put)
-		l3proto->net_ns_put(net);
-
-	nf_ct_l3proto_module_put(nfproto);
-}
-
-void nf_ct_netns_put(struct net *net, uint8_t nfproto)
-{
-	if (nfproto == NFPROTO_INET) {
-		nf_ct_netns_do_put(net, NFPROTO_IPV4);
-		nf_ct_netns_do_put(net, NFPROTO_IPV6);
-	} else
-		nf_ct_netns_do_put(net, nfproto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_netns_put);
-
 const struct nf_conntrack_l4proto *
 nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
 {
@@ -279,11 +155,6 @@ void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p)
 }
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
 
-static int kill_l3proto(struct nf_conn *i, void *data)
-{
-	return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto;
-}
-
 static int kill_l4proto(struct nf_conn *i, void *data)
 {
 	const struct nf_conntrack_l4proto *l4proto;
@@ -292,49 +163,6 @@ static int kill_l4proto(struct nf_conn *i, void *data)
 	       nf_ct_l3num(i) == l4proto->l3proto;
 }
 
-int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto)
-{
-	int ret = 0;
-	struct nf_conntrack_l3proto *old;
-
-	if (proto->l3proto >= NFPROTO_NUMPROTO)
-		return -EBUSY;
-
-	mutex_lock(&nf_ct_proto_mutex);
-	old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
-					lockdep_is_held(&nf_ct_proto_mutex));
-	if (old != &nf_conntrack_l3proto_generic) {
-		ret = -EBUSY;
-		goto out_unlock;
-	}
-
-	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
-
-out_unlock:
-	mutex_unlock(&nf_ct_proto_mutex);
-	return ret;
-
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
-
-void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto)
-{
-	BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO);
-
-	mutex_lock(&nf_ct_proto_mutex);
-	BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
-					 lockdep_is_held(&nf_ct_proto_mutex)
-					 ) != proto);
-	rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
-			   &nf_conntrack_l3proto_generic);
-	mutex_unlock(&nf_ct_proto_mutex);
-
-	synchronize_rcu();
-	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_destroy(kill_l3proto, (void*)proto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
-
 static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
 				const struct nf_conntrack_l4proto *l4proto)
 {
@@ -501,8 +329,23 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net,
 }
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
 
-int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
-			   unsigned int num_proto)
+static void
+nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
+			 unsigned int num_proto)
+{
+	mutex_lock(&nf_ct_proto_mutex);
+	while (num_proto-- != 0)
+		__nf_ct_l4proto_unregister_one(l4proto[num_proto]);
+	mutex_unlock(&nf_ct_proto_mutex);
+
+	synchronize_net();
+	/* Remove all contrack entries for this protocol */
+	nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
+}
+
+static int
+nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
+		       unsigned int num_proto)
 {
 	int ret = -EINVAL, ver;
 	unsigned int i;
@@ -520,7 +363,6 @@ int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
 	}
 	return ret;
 }
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
 
 int nf_ct_l4proto_pernet_register(struct net *net,
 				  const struct nf_conntrack_l4proto *const l4proto[],
@@ -544,20 +386,6 @@ int nf_ct_l4proto_pernet_register(struct net *net,
 }
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
 
-void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
-			      unsigned int num_proto)
-{
-	mutex_lock(&nf_ct_proto_mutex);
-	while (num_proto-- != 0)
-		__nf_ct_l4proto_unregister_one(l4proto[num_proto]);
-	mutex_unlock(&nf_ct_proto_mutex);
-
-	synchronize_net();
-	/* Remove all contrack entries for this protocol */
-	nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
-
 void nf_ct_l4proto_pernet_unregister(struct net *net,
 				const struct nf_conntrack_l4proto *const l4proto[],
 				unsigned int num_proto)
@@ -567,6 +395,563 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
 }
 EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
 
+static unsigned int ipv4_helper(void *priv,
+				struct sk_buff *skb,
+				const struct nf_hook_state *state)
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	const struct nf_conn_help *help;
+	const struct nf_conntrack_helper *helper;
+
+	/* This is where we call the helper: as the packet goes out. */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		return NF_ACCEPT;
+
+	help = nfct_help(ct);
+	if (!help)
+		return NF_ACCEPT;
+
+	/* rcu_read_lock()ed by nf_hook_thresh */
+	helper = rcu_dereference(help->helper);
+	if (!helper)
+		return NF_ACCEPT;
+
+	return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+			    ct, ctinfo);
+}
+
+static unsigned int ipv4_confirm(void *priv,
+				 struct sk_buff *skb,
+				 const struct nf_hook_state *state)
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		goto out;
+
+	/* adjust seqs for loopback traffic only in outgoing direction */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_is_loopback_packet(skb)) {
+		if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
+			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+			return NF_DROP;
+		}
+	}
+out:
+	/* We've seen it coming out the other side: confirm it */
+	return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv4_conntrack_in(void *priv,
+				      struct sk_buff *skb,
+				      const struct nf_hook_state *state)
+{
+	return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
+}
+
+static unsigned int ipv4_conntrack_local(void *priv,
+					 struct sk_buff *skb,
+					 const struct nf_hook_state *state)
+{
+	if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
+		enum ip_conntrack_info ctinfo;
+		struct nf_conn *tmpl;
+
+		tmpl = nf_ct_get(skb, &ctinfo);
+		if (tmpl && nf_ct_is_template(tmpl)) {
+			/* when skipping ct, clear templates to avoid fooling
+			 * later targets/matches
+			 */
+			skb->_nfct = 0;
+			nf_ct_put(tmpl);
+		}
+		return NF_ACCEPT;
+	}
+
+	return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ * make it the first hook.
+ */
+static const struct nf_hook_ops ipv4_conntrack_ops[] = {
+	{
+		.hook		= ipv4_conntrack_in,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK,
+	},
+	{
+		.hook		= ipv4_conntrack_local,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP_PRI_CONNTRACK,
+	},
+	{
+		.hook		= ipv4_helper,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_HELPER,
+	},
+	{
+		.hook		= ipv4_confirm,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
+	},
+	{
+		.hook		= ipv4_helper,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_HELPER,
+	},
+	{
+		.hook		= ipv4_confirm,
+		.pf		= NFPROTO_IPV4,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP_PRI_CONNTRACK_CONFIRM,
+	},
+};
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ * blame them).
+ * Reversing the socket's dst/src point of view gives us the reply
+ * mapping.
+ */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct nf_conntrack_tuple_hash *h;
+	struct nf_conntrack_tuple tuple;
+
+	memset(&tuple, 0, sizeof(tuple));
+
+	lock_sock(sk);
+	tuple.src.u3.ip = inet->inet_rcv_saddr;
+	tuple.src.u.tcp.port = inet->inet_sport;
+	tuple.dst.u3.ip = inet->inet_daddr;
+	tuple.dst.u.tcp.port = inet->inet_dport;
+	tuple.src.l3num = PF_INET;
+	tuple.dst.protonum = sk->sk_protocol;
+	release_sock(sk);
+
+	/* We only do TCP and SCTP at the moment: is there a better way? */
+	if (tuple.dst.protonum != IPPROTO_TCP &&
+	    tuple.dst.protonum != IPPROTO_SCTP) {
+		pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+		return -ENOPROTOOPT;
+	}
+
+	if ((unsigned int)*len < sizeof(struct sockaddr_in)) {
+		pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
+			 *len, sizeof(struct sockaddr_in));
+		return -EINVAL;
+	}
+
+	h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
+	if (h) {
+		struct sockaddr_in sin;
+		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+		sin.sin_family = AF_INET;
+		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+			.tuple.dst.u.tcp.port;
+		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+			.tuple.dst.u3.ip;
+		memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+		pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
+			 &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+		nf_ct_put(ct);
+		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+			return -EFAULT;
+		else
+			return 0;
+	}
+	pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
+		 &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
+		 &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
+	return -ENOENT;
+}
+
+static struct nf_sockopt_ops so_getorigdst = {
+	.pf		= PF_INET,
+	.get_optmin	= SO_ORIGINAL_DST,
+	.get_optmax	= SO_ORIGINAL_DST + 1,
+	.get		= getorigdst,
+	.owner		= THIS_MODULE,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int
+ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+	struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
+	const struct ipv6_pinfo *inet6 = inet6_sk(sk);
+	const struct inet_sock *inet = inet_sk(sk);
+	const struct nf_conntrack_tuple_hash *h;
+	struct sockaddr_in6 sin6;
+	struct nf_conn *ct;
+	__be32 flow_label;
+	int bound_dev_if;
+
+	lock_sock(sk);
+	tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
+	tuple.src.u.tcp.port = inet->inet_sport;
+	tuple.dst.u3.in6 = sk->sk_v6_daddr;
+	tuple.dst.u.tcp.port = inet->inet_dport;
+	tuple.dst.protonum = sk->sk_protocol;
+	bound_dev_if = sk->sk_bound_dev_if;
+	flow_label = inet6->flow_label;
+	release_sock(sk);
+
+	if (tuple.dst.protonum != IPPROTO_TCP &&
+	    tuple.dst.protonum != IPPROTO_SCTP)
+		return -ENOPROTOOPT;
+
+	if (*len < 0 || (unsigned int)*len < sizeof(sin6))
+		return -EINVAL;
+
+	h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
+	if (!h) {
+		pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
+			 &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
+			 &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
+		return -ENOENT;
+	}
+
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	sin6.sin6_family = AF_INET6;
+	sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+	sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
+	memcpy(&sin6.sin6_addr,
+	       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
+	       sizeof(sin6.sin6_addr));
+
+	nf_ct_put(ct);
+	sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);
+	return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
+}
+
+static struct nf_sockopt_ops so_getorigdst6 = {
+	.pf		= NFPROTO_IPV6,
+	.get_optmin	= IP6T_SO_ORIGINAL_DST,
+	.get_optmax	= IP6T_SO_ORIGINAL_DST + 1,
+	.get		= ipv6_getorigdst,
+	.owner		= THIS_MODULE,
+};
+
+static unsigned int ipv6_confirm(void *priv,
+				 struct sk_buff *skb,
+				 const struct nf_hook_state *state)
+{
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+	int protoff;
+	__be16 frag_off;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		goto out;
+
+	protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+				   &frag_off);
+	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+		pr_debug("proto header not found\n");
+		goto out;
+	}
+
+	/* adjust seqs for loopback traffic only in outgoing direction */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_is_loopback_packet(skb)) {
+		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+			return NF_DROP;
+		}
+	}
+out:
+	/* We've seen it coming out the other side: confirm it */
+	return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv6_conntrack_in(void *priv,
+				      struct sk_buff *skb,
+				      const struct nf_hook_state *state)
+{
+	return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
+}
+
+static unsigned int ipv6_conntrack_local(void *priv,
+					 struct sk_buff *skb,
+					 const struct nf_hook_state *state)
+{
+	return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
+}
+
+static unsigned int ipv6_helper(void *priv,
+				struct sk_buff *skb,
+				const struct nf_hook_state *state)
+{
+	struct nf_conn *ct;
+	const struct nf_conn_help *help;
+	const struct nf_conntrack_helper *helper;
+	enum ip_conntrack_info ctinfo;
+	__be16 frag_off;
+	int protoff;
+	u8 nexthdr;
+
+	/* This is where we call the helper: as the packet goes out. */
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+		return NF_ACCEPT;
+
+	help = nfct_help(ct);
+	if (!help)
+		return NF_ACCEPT;
+	/* rcu_read_lock()ed by nf_hook_thresh */
+	helper = rcu_dereference(help->helper);
+	if (!helper)
+		return NF_ACCEPT;
+
+	nexthdr = ipv6_hdr(skb)->nexthdr;
+	protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+				   &frag_off);
+	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+		pr_debug("proto header not found\n");
+		return NF_ACCEPT;
+	}
+
+	return helper->help(skb, protoff, ct, ctinfo);
+}
+
+static const struct nf_hook_ops ipv6_conntrack_ops[] = {
+	{
+		.hook		= ipv6_conntrack_in,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_PRE_ROUTING,
+		.priority	= NF_IP6_PRI_CONNTRACK,
+	},
+	{
+		.hook		= ipv6_conntrack_local,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_OUT,
+		.priority	= NF_IP6_PRI_CONNTRACK,
+	},
+	{
+		.hook		= ipv6_helper,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP6_PRI_CONNTRACK_HELPER,
+	},
+	{
+		.hook		= ipv6_confirm,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_POST_ROUTING,
+		.priority	= NF_IP6_PRI_LAST,
+	},
+	{
+		.hook		= ipv6_helper,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_CONNTRACK_HELPER,
+	},
+	{
+		.hook		= ipv6_confirm,
+		.pf		= NFPROTO_IPV6,
+		.hooknum	= NF_INET_LOCAL_IN,
+		.priority	= NF_IP6_PRI_LAST - 1,
+	},
+};
+#endif
+
+static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
+{
+	struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+	int err = 0;
+
+	mutex_lock(&nf_ct_proto_mutex);
+
+	switch (nfproto) {
+	case NFPROTO_IPV4:
+		cnet->users4++;
+		if (cnet->users4 > 1)
+			goto out_unlock;
+		err = nf_defrag_ipv4_enable(net);
+		if (err) {
+			cnet->users4 = 0;
+			goto out_unlock;
+		}
+
+		err = nf_register_net_hooks(net, ipv4_conntrack_ops,
+					    ARRAY_SIZE(ipv4_conntrack_ops));
+		if (err)
+			cnet->users4 = 0;
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case NFPROTO_IPV6:
+		cnet->users6++;
+		if (cnet->users6 > 1)
+			goto out_unlock;
+		err = nf_defrag_ipv6_enable(net);
+		if (err < 0) {
+			cnet->users6 = 0;
+			goto out_unlock;
+		}
+
+		err = nf_register_net_hooks(net, ipv6_conntrack_ops,
+					    ARRAY_SIZE(ipv6_conntrack_ops));
+		if (err)
+			cnet->users6 = 0;
+		break;
+#endif
+	default:
+		err = -EPROTO;
+		break;
+	}
+ out_unlock:
+	mutex_unlock(&nf_ct_proto_mutex);
+	return err;
+}
+
+static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
+{
+	struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+
+	mutex_lock(&nf_ct_proto_mutex);
+	switch (nfproto) {
+	case NFPROTO_IPV4:
+		if (cnet->users4 && (--cnet->users4 == 0))
+			nf_unregister_net_hooks(net, ipv4_conntrack_ops,
+						ARRAY_SIZE(ipv4_conntrack_ops));
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case NFPROTO_IPV6:
+		if (cnet->users6 && (--cnet->users6 == 0))
+			nf_unregister_net_hooks(net, ipv6_conntrack_ops,
+						ARRAY_SIZE(ipv6_conntrack_ops));
+		break;
+#endif
+	}
+
+	mutex_unlock(&nf_ct_proto_mutex);
+}
+
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+	int err;
+
+	if (nfproto == NFPROTO_INET) {
+		err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+		if (err < 0)
+			goto err1;
+		err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
+		if (err < 0)
+			goto err2;
+	} else {
+		err = nf_ct_netns_do_get(net, nfproto);
+		if (err < 0)
+			goto err1;
+	}
+	return 0;
+
+err2:
+	nf_ct_netns_put(net, NFPROTO_IPV4);
+err1:
+	return err;
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_get);
+
+void nf_ct_netns_put(struct net *net, uint8_t nfproto)
+{
+	if (nfproto == NFPROTO_INET) {
+		nf_ct_netns_do_put(net, NFPROTO_IPV4);
+		nf_ct_netns_do_put(net, NFPROTO_IPV6);
+	} else {
+		nf_ct_netns_do_put(net, nfproto);
+	}
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+
+static const struct nf_conntrack_l4proto * const builtin_l4proto[] = {
+	&nf_conntrack_l4proto_tcp4,
+	&nf_conntrack_l4proto_udp4,
+	&nf_conntrack_l4proto_icmp,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+	&nf_conntrack_l4proto_dccp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+	&nf_conntrack_l4proto_sctp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+	&nf_conntrack_l4proto_udplite4,
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+	&nf_conntrack_l4proto_tcp6,
+	&nf_conntrack_l4proto_udp6,
+	&nf_conntrack_l4proto_icmpv6,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+	&nf_conntrack_l4proto_dccp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+	&nf_conntrack_l4proto_sctp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+	&nf_conntrack_l4proto_udplite6,
+#endif
+#endif /* CONFIG_IPV6 */
+};
+
+int nf_conntrack_proto_init(void)
+{
+	int ret = 0;
+
+	ret = nf_register_sockopt(&so_getorigdst);
+	if (ret < 0)
+		return ret;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	ret = nf_register_sockopt(&so_getorigdst6);
+	if (ret < 0)
+		goto cleanup_sockopt;
+#endif
+	ret = nf_ct_l4proto_register(builtin_l4proto,
+				     ARRAY_SIZE(builtin_l4proto));
+	if (ret < 0)
+		goto cleanup_sockopt2;
+
+	return ret;
+cleanup_sockopt2:
+	nf_unregister_sockopt(&so_getorigdst);
+#if IS_ENABLED(CONFIG_IPV6)
+cleanup_sockopt:
+	nf_unregister_sockopt(&so_getorigdst6);
+#endif
+	return ret;
+}
+
+void nf_conntrack_proto_fini(void)
+{
+	unsigned int i;
+
+	nf_ct_l4proto_unregister(builtin_l4proto,
+				 ARRAY_SIZE(builtin_l4proto));
+	nf_unregister_sockopt(&so_getorigdst);
+#if IS_ENABLED(CONFIG_IPV6)
+	nf_unregister_sockopt(&so_getorigdst6);
+#endif
+
+	/* free l3proto protocol tables */
+	for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
+		kfree(nf_ct_protos[i]);
+}
+
 int nf_conntrack_proto_pernet_init(struct net *net)
 {
 	int err;
@@ -583,6 +968,14 @@ int nf_conntrack_proto_pernet_init(struct net *net)
 	if (err < 0)
 		return err;
 
+	err = nf_ct_l4proto_pernet_register(net, builtin_l4proto,
+					    ARRAY_SIZE(builtin_l4proto));
+	if (err < 0) {
+		nf_ct_l4proto_unregister_sysctl(net, pn,
+						&nf_conntrack_l4proto_generic);
+		return err;
+	}
+
 	pn->users++;
 	return 0;
 }
@@ -592,25 +985,19 @@ void nf_conntrack_proto_pernet_fini(struct net *net)
 	struct nf_proto_net *pn = nf_ct_l4proto_net(net,
 					&nf_conntrack_l4proto_generic);
 
+	nf_ct_l4proto_pernet_unregister(net, builtin_l4proto,
+					ARRAY_SIZE(builtin_l4proto));
 	pn->users--;
 	nf_ct_l4proto_unregister_sysctl(net,
 					pn,
 					&nf_conntrack_l4proto_generic);
 }
 
-int nf_conntrack_proto_init(void)
-{
-	unsigned int i;
-	for (i = 0; i < NFPROTO_NUMPROTO; i++)
-		rcu_assign_pointer(nf_ct_l3protos[i],
-				   &nf_conntrack_l3proto_generic);
-	return 0;
-}
 
-void nf_conntrack_proto_fini(void)
-{
-	unsigned int i;
-	/* free l3proto protocol tables */
-	for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
-		kfree(nf_ct_protos[i]);
-}
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+		  &nf_conntrack_htable_size, 0600);
+
+MODULE_ALIAS("ip_conntrack");
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
similarity index 100%
rename from net/ipv4/netfilter/nf_conntrack_proto_icmp.c
rename to net/netfilter/nf_conntrack_proto_icmp.c
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
similarity index 100%
rename from net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
rename to net/netfilter/nf_conntrack_proto_icmpv6.c
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 47b80fd0d2c3..13279f683da9 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1,12 +1,4 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
+// SPDX-License-Identifier: GPL-2.0
 #include <linux/types.h>
 #include <linux/netfilter.h>
 #include <linux/slab.h>
@@ -32,7 +24,7 @@
 #include <net/netfilter/nf_conntrack_timestamp.h>
 #include <linux/rculist_nulls.h>
 
-MODULE_LICENSE("GPL");
+unsigned int nf_conntrack_net_id __read_mostly;
 
 #ifdef CONFIG_NF_CONNTRACK_PROCFS
 void
@@ -674,6 +666,8 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
 static struct pernet_operations nf_conntrack_net_ops = {
 	.init		= nf_conntrack_pernet_init,
 	.exit_batch	= nf_conntrack_pernet_exit,
+	.id		= &nf_conntrack_net_id,
+	.size = sizeof(struct nf_conntrack_net),
 };
 
 static int __init nf_conntrack_standalone_init(void)
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 86df2a1666fd..6366f0c0b8c1 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -28,7 +28,6 @@
 #include <net/netfilter/nf_nat_helper.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
 
@@ -743,12 +742,6 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
 
 int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
 {
-	int err;
-
-	err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
-	if (err < 0)
-		return err;
-
 	mutex_lock(&nf_nat_proto_mutex);
 	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
 			 &nf_nat_l4proto_tcp);
@@ -781,7 +774,6 @@ void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
 	synchronize_rcu();
 
 	nf_nat_l3proto_clean(l3proto->l3proto);
-	nf_ct_l3proto_module_put(l3proto->l3proto);
 }
 EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
 

From 5d400a4933e867dbc3706023c8ed55d364c233ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= <ecklm94@gmail.com>
Date: Tue, 10 Jul 2018 16:01:28 +0200
Subject: [PATCH 16/38] netfilter: Kconfig: Change select IPv6 dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

... from IPV6 to NF_TABLES_IPV6 and IP6_NF_IPTABLES.

In some cases module selects depend on IPV6, but this means that they
select another module even if eg. NF_TABLES_IPV6 is not set in which
case the selected module is useless due to the lack of IPv6 nf_tables
functionality.

The same applies for IP6_NF_IPTABLES and iptables.

Joint work with: Arnd Bermann <arnd@arndb.de>

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig      | 6 +++---
 net/netfilter/nft_socket.c | 4 ++--
 net/netfilter/xt_TEE.c     | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 9eab519b403a..e0ab50c58dc4 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -628,7 +628,7 @@ config NFT_SOCKET
 	tristate "Netfilter nf_tables socket match support"
 	depends on IPV6 || IPV6=n
 	select NF_SOCKET_IPV4
-	select NF_SOCKET_IPV6 if IPV6
+	select NF_SOCKET_IPV6 if NF_TABLES_IPV6
 	help
 	  This option allows matching for the presence or absence of a
 	  corresponding socket and its attributes.
@@ -894,7 +894,7 @@ config NETFILTER_XT_TARGET_LOG
 	tristate "LOG target support"
 	select NF_LOG_COMMON
 	select NF_LOG_IPV4
-	select NF_LOG_IPV6 if IPV6
+	select NF_LOG_IPV6 if IP6_NF_IPTABLES
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `LOG' target, which allows you to create rules in
@@ -986,7 +986,7 @@ config NETFILTER_XT_TARGET_TEE
 	depends on IPV6 || IPV6=n
 	depends on !NF_CONNTRACK || NF_CONNTRACK
 	select NF_DUP_IPV4
-	select NF_DUP_IPV6 if IPV6
+	select NF_DUP_IPV6 if IP6_NF_IPTABLES
 	---help---
 	This option adds a "TEE" target with which a packet can be cloned and
 	this clone be rerouted to another nexthop.
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 998c2b546f6d..e43c1939d25f 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -31,7 +31,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
 		case NFPROTO_IPV4:
 			sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
 			break;
-#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
 		case NFPROTO_IPV6:
 			sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
 			break;
@@ -77,7 +77,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
 
 	switch(ctx->family) {
 	case NFPROTO_IPV4:
-#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
 	case NFPROTO_IPV6:
 #endif
 	case NFPROTO_INET:
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 475957cfcf50..0d0d68c989df 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 static unsigned int
 tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -141,7 +141,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 		.destroy    = tee_tg_destroy,
 		.me         = THIS_MODULE,
 	},
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	{
 		.name       = "TEE",
 		.revision   = 1,

From 2a406e8ac7c3e7e96b94d6c0765d5a4641970446 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Mon, 2 Jul 2018 17:33:39 -0700
Subject: [PATCH 17/38] netfilter: nf_conncount: Early exit for garbage
 collection

This patch is originally from Florian Westphal.

We use an extra function with early exit for garbage collection.
It is not necessary to traverse the full list for every node since
it is enough to zap a couple of entries for garbage collection.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conncount.c | 39 ++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 510039862aa9..81c02185b2e8 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -189,6 +189,42 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 }
 EXPORT_SYMBOL_GPL(nf_conncount_lookup);
 
+static void nf_conncount_gc_list(struct net *net,
+				 struct nf_conncount_rb *rbconn)
+{
+	const struct nf_conntrack_tuple_hash *found;
+	struct nf_conncount_tuple *conn;
+	struct hlist_node *n;
+	struct nf_conn *found_ct;
+	unsigned int collected = 0;
+
+	hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) {
+		found = find_or_evict(net, conn);
+		if (IS_ERR(found)) {
+			if (PTR_ERR(found) == -ENOENT)
+				collected++;
+			continue;
+		}
+
+		found_ct = nf_ct_tuplehash_to_ctrack(found);
+		if (already_closed(found_ct)) {
+			/*
+			 * we do not care about connections which are
+			 * closed already -> ditch it
+			 */
+			nf_ct_put(found_ct);
+			hlist_del(&conn->node);
+			kmem_cache_free(conncount_conn_cachep, conn);
+			collected++;
+			continue;
+		}
+
+		nf_ct_put(found_ct);
+		if (collected > CONNCOUNT_GC_MAX_NODES)
+			return;
+	}
+}
+
 static void tree_nodes_free(struct rb_root *root,
 			    struct nf_conncount_rb *gc_nodes[],
 			    unsigned int gc_count)
@@ -251,8 +287,7 @@ count_tree(struct net *net, struct rb_root *root,
 		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
 			continue;
 
-		/* only used for GC on hhead, retval and 'addit' ignored */
-		nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit);
+		nf_conncount_gc_list(net, rbconn);
 		if (hlist_empty(&rbconn->hhead))
 			gc_nodes[gc_count++] = rbconn;
 	}

From cb2b36f5a97df76f547fcc4ab444a02522fb6c96 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Mon, 2 Jul 2018 17:33:40 -0700
Subject: [PATCH 18/38] netfilter: nf_conncount: Switch to plain list

Original patch is from Florian Westphal.

This patch switches from hlist to plain list to store the list of
connections with the same filtering key in nf_conncount. With the
plain list, we can insert new connections at the tail, so over time
the beginning of list holds long-running connections and those are
expired, while the newly creates ones are at the end.

Later on, we could probably move checked ones to the end of the list,
so the next run has higher chance to reclaim stale entries in the front.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h | 15 +++-
 net/netfilter/nf_conncount.c               | 83 +++++++++++++---------
 net/netfilter/nft_connlimit.c              | 24 +++----
 3 files changed, 75 insertions(+), 47 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index 3a188a0923a3..e4884e0e4f69 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -1,8 +1,15 @@
 #ifndef _NF_CONNTRACK_COUNT_H
 #define _NF_CONNTRACK_COUNT_H
 
+#include <linux/list.h>
+
 struct nf_conncount_data;
 
+struct nf_conncount_list {
+	struct list_head head;	/* connections with the same filtering key */
+	unsigned int count;	/* length of list */
+};
+
 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
 					    unsigned int keylen);
 void nf_conncount_destroy(struct net *net, unsigned int family,
@@ -14,15 +21,17 @@ unsigned int nf_conncount_count(struct net *net,
 				const struct nf_conntrack_tuple *tuple,
 				const struct nf_conntrack_zone *zone);
 
-unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
+unsigned int nf_conncount_lookup(struct net *net, struct nf_conncount_list *list,
 				 const struct nf_conntrack_tuple *tuple,
 				 const struct nf_conntrack_zone *zone,
 				 bool *addit);
 
-bool nf_conncount_add(struct hlist_head *head,
+void nf_conncount_list_init(struct nf_conncount_list *list);
+
+bool nf_conncount_add(struct nf_conncount_list *list,
 		      const struct nf_conntrack_tuple *tuple,
 		      const struct nf_conntrack_zone *zone);
 
-void nf_conncount_cache_free(struct hlist_head *hhead);
+void nf_conncount_cache_free(struct nf_conncount_list *list);
 
 #endif
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 81c02185b2e8..81b060adefef 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -44,7 +44,7 @@
 
 /* we will save the tuples of all connections we care about */
 struct nf_conncount_tuple {
-	struct hlist_node		node;
+	struct list_head		node;
 	struct nf_conntrack_tuple	tuple;
 	struct nf_conntrack_zone	zone;
 	int				cpu;
@@ -53,7 +53,7 @@ struct nf_conncount_tuple {
 
 struct nf_conncount_rb {
 	struct rb_node node;
-	struct hlist_head hhead; /* connections/hosts in same subnet */
+	struct nf_conncount_list list;
 	u32 key[MAX_KEYLEN];
 };
 
@@ -82,12 +82,15 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
 	return memcmp(a, b, klen * sizeof(u32));
 }
 
-bool nf_conncount_add(struct hlist_head *head,
+bool nf_conncount_add(struct nf_conncount_list *list,
 		      const struct nf_conntrack_tuple *tuple,
 		      const struct nf_conntrack_zone *zone)
 {
 	struct nf_conncount_tuple *conn;
 
+	if (WARN_ON_ONCE(list->count > INT_MAX))
+		return false;
+
 	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
 	if (conn == NULL)
 		return false;
@@ -95,13 +98,26 @@ bool nf_conncount_add(struct hlist_head *head,
 	conn->zone = *zone;
 	conn->cpu = raw_smp_processor_id();
 	conn->jiffies32 = (u32)jiffies;
-	hlist_add_head(&conn->node, head);
+	list_add_tail(&conn->node, &list->head);
+	list->count++;
 	return true;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_add);
 
+static void conn_free(struct nf_conncount_list *list,
+		      struct nf_conncount_tuple *conn)
+{
+	if (WARN_ON_ONCE(list->count == 0))
+		return;
+
+	list->count--;
+	list_del(&conn->node);
+	kmem_cache_free(conncount_conn_cachep, conn);
+}
+
 static const struct nf_conntrack_tuple_hash *
-find_or_evict(struct net *net, struct nf_conncount_tuple *conn)
+find_or_evict(struct net *net, struct nf_conncount_list *list,
+	      struct nf_conncount_tuple *conn)
 {
 	const struct nf_conntrack_tuple_hash *found;
 	unsigned long a, b;
@@ -121,30 +137,29 @@ find_or_evict(struct net *net, struct nf_conncount_tuple *conn)
 	 */
 	age = a - b;
 	if (conn->cpu == cpu || age >= 2) {
-		hlist_del(&conn->node);
-		kmem_cache_free(conncount_conn_cachep, conn);
+		conn_free(list, conn);
 		return ERR_PTR(-ENOENT);
 	}
 
 	return ERR_PTR(-EAGAIN);
 }
 
-unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
+unsigned int nf_conncount_lookup(struct net *net,
+				 struct nf_conncount_list *list,
 				 const struct nf_conntrack_tuple *tuple,
 				 const struct nf_conntrack_zone *zone,
 				 bool *addit)
 {
 	const struct nf_conntrack_tuple_hash *found;
-	struct nf_conncount_tuple *conn;
+	struct nf_conncount_tuple *conn, *conn_n;
 	struct nf_conn *found_ct;
-	struct hlist_node *n;
 	unsigned int length = 0;
 
 	*addit = tuple ? true : false;
 
 	/* check the saved connections */
-	hlist_for_each_entry_safe(conn, n, head, node) {
-		found = find_or_evict(net, conn);
+	list_for_each_entry_safe(conn, conn_n, &list->head, node) {
+		found = find_or_evict(net, list, conn);
 		if (IS_ERR(found)) {
 			/* Not found, but might be about to be confirmed */
 			if (PTR_ERR(found) == -EAGAIN) {
@@ -157,6 +172,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 				    nf_ct_zone_id(zone, zone->dir))
 					*addit = false;
 			}
+
 			continue;
 		}
 
@@ -176,8 +192,7 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 			 * closed already -> ditch it
 			 */
 			nf_ct_put(found_ct);
-			hlist_del(&conn->node);
-			kmem_cache_free(conncount_conn_cachep, conn);
+			conn_free(list, conn);
 			continue;
 		}
 
@@ -189,17 +204,23 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
 }
 EXPORT_SYMBOL_GPL(nf_conncount_lookup);
 
+void nf_conncount_list_init(struct nf_conncount_list *list)
+{
+	INIT_LIST_HEAD(&list->head);
+	list->count = 1;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_list_init);
+
 static void nf_conncount_gc_list(struct net *net,
-				 struct nf_conncount_rb *rbconn)
+				 struct nf_conncount_list *list)
 {
 	const struct nf_conntrack_tuple_hash *found;
-	struct nf_conncount_tuple *conn;
-	struct hlist_node *n;
+	struct nf_conncount_tuple *conn, *conn_n;
 	struct nf_conn *found_ct;
 	unsigned int collected = 0;
 
-	hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node) {
-		found = find_or_evict(net, conn);
+	list_for_each_entry_safe(conn, conn_n, &list->head, node) {
+		found = find_or_evict(net, list, conn);
 		if (IS_ERR(found)) {
 			if (PTR_ERR(found) == -ENOENT)
 				collected++;
@@ -213,8 +234,7 @@ static void nf_conncount_gc_list(struct net *net,
 			 * closed already -> ditch it
 			 */
 			nf_ct_put(found_ct);
-			hlist_del(&conn->node);
-			kmem_cache_free(conncount_conn_cachep, conn);
+			conn_free(list, conn);
 			collected++;
 			continue;
 		}
@@ -271,14 +291,14 @@ count_tree(struct net *net, struct rb_root *root,
 			/* same source network -> be counted! */
 			unsigned int count;
 
-			count = nf_conncount_lookup(net, &rbconn->hhead, tuple,
+			count = nf_conncount_lookup(net, &rbconn->list, tuple,
 						    zone, &addit);
 
 			tree_nodes_free(root, gc_nodes, gc_count);
 			if (!addit)
 				return count;
 
-			if (!nf_conncount_add(&rbconn->hhead, tuple, zone))
+			if (!nf_conncount_add(&rbconn->list, tuple, zone))
 				return 0; /* hotdrop */
 
 			return count + 1;
@@ -287,8 +307,8 @@ count_tree(struct net *net, struct rb_root *root,
 		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
 			continue;
 
-		nf_conncount_gc_list(net, rbconn);
-		if (hlist_empty(&rbconn->hhead))
+		nf_conncount_gc_list(net, &rbconn->list);
+		if (list_empty(&rbconn->list.head))
 			gc_nodes[gc_count++] = rbconn;
 	}
 
@@ -322,8 +342,8 @@ count_tree(struct net *net, struct rb_root *root,
 	conn->zone = *zone;
 	memcpy(rbconn->key, key, sizeof(u32) * keylen);
 
-	INIT_HLIST_HEAD(&rbconn->hhead);
-	hlist_add_head(&conn->node, &rbconn->hhead);
+	nf_conncount_list_init(&rbconn->list);
+	list_add(&conn->node, &rbconn->list.head);
 
 	rb_link_node(&rbconn->node, parent, rbnode);
 	rb_insert_color(&rbconn->node, root);
@@ -388,12 +408,11 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
 }
 EXPORT_SYMBOL_GPL(nf_conncount_init);
 
-void nf_conncount_cache_free(struct hlist_head *hhead)
+void nf_conncount_cache_free(struct nf_conncount_list *list)
 {
-	struct nf_conncount_tuple *conn;
-	struct hlist_node *n;
+	struct nf_conncount_tuple *conn, *conn_n;
 
-	hlist_for_each_entry_safe(conn, n, hhead, node)
+	list_for_each_entry_safe(conn, conn_n, &list->head, node)
 		kmem_cache_free(conncount_conn_cachep, conn);
 }
 EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
@@ -408,7 +427,7 @@ static void destroy_tree(struct rb_root *r)
 
 		rb_erase(node, r);
 
-		nf_conncount_cache_free(&rbconn->hhead);
+		nf_conncount_cache_free(&rbconn->list);
 
 		kmem_cache_free(conncount_rb_cachep, rbconn);
 	}
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index a832c59f0a9c..4f0491a36a1d 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,10 +14,10 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 
 struct nft_connlimit {
-	spinlock_t		lock;
-	struct hlist_head	hhead;
-	u32			limit;
-	bool			invert;
+	spinlock_t			lock;
+	struct nf_conncount_list	list;
+	u32				limit;
+	bool				invert;
 };
 
 static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
@@ -46,13 +46,13 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
 	}
 
 	spin_lock_bh(&priv->lock);
-	count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone,
+	count = nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
 				    &addit);
 
 	if (!addit)
 		goto out;
 
-	if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) {
+	if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) {
 		regs->verdict.code = NF_DROP;
 		spin_unlock_bh(&priv->lock);
 		return;
@@ -88,7 +88,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
 	}
 
 	spin_lock_init(&priv->lock);
-	INIT_HLIST_HEAD(&priv->hhead);
+	nf_conncount_list_init(&priv->list);
 	priv->limit	= limit;
 	priv->invert	= invert;
 
@@ -99,7 +99,7 @@ static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
 				     struct nft_connlimit *priv)
 {
 	nf_ct_netns_put(ctx->net, ctx->family);
-	nf_conncount_cache_free(&priv->hhead);
+	nf_conncount_cache_free(&priv->list);
 }
 
 static int nft_connlimit_do_dump(struct sk_buff *skb,
@@ -213,7 +213,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
 	struct nft_connlimit *priv_src = nft_expr_priv(src);
 
 	spin_lock_init(&priv_dst->lock);
-	INIT_HLIST_HEAD(&priv_dst->hhead);
+	nf_conncount_list_init(&priv_dst->list);
 	priv_dst->limit	 = priv_src->limit;
 	priv_dst->invert = priv_src->invert;
 
@@ -225,7 +225,7 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
 {
 	struct nft_connlimit *priv = nft_expr_priv(expr);
 
-	nf_conncount_cache_free(&priv->hhead);
+	nf_conncount_cache_free(&priv->list);
 }
 
 static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
@@ -234,9 +234,9 @@ static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
 	bool addit, ret;
 
 	spin_lock_bh(&priv->lock);
-	nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit);
+	nf_conncount_lookup(net, &priv->list, NULL, &nf_ct_zone_dflt, &addit);
 
-	ret = hlist_empty(&priv->hhead);
+	ret = list_empty(&priv->list.head);
 	spin_unlock_bh(&priv->lock);
 
 	return ret;

From 976afca1ceba53df6f4a543014e15d1c7a962571 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Mon, 2 Jul 2018 17:33:41 -0700
Subject: [PATCH 19/38] netfilter: nf_conncount: Early exit in
 nf_conncount_lookup() and cleanup

This patch is originally from Florian Westphal.

This patch does the following three tasks.

It applies the same early exit technique for nf_conncount_lookup().

Since now we keep the number of connections in 'struct nf_conncount_list',
we no longer need to return the count in nf_conncount_lookup().

Moreover, we expose the garbage collection function nf_conncount_gc_list()
for nft_connlimit.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h | 11 ++++---
 net/netfilter/nf_conncount.c               | 38 ++++++++++++----------
 net/netfilter/nft_connlimit.c              |  9 ++---
 3 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index e4884e0e4f69..dbec17f674b7 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -21,10 +21,10 @@ unsigned int nf_conncount_count(struct net *net,
 				const struct nf_conntrack_tuple *tuple,
 				const struct nf_conntrack_zone *zone);
 
-unsigned int nf_conncount_lookup(struct net *net, struct nf_conncount_list *list,
-				 const struct nf_conntrack_tuple *tuple,
-				 const struct nf_conntrack_zone *zone,
-				 bool *addit);
+void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list,
+			 const struct nf_conntrack_tuple *tuple,
+			 const struct nf_conntrack_zone *zone,
+			 bool *addit);
 
 void nf_conncount_list_init(struct nf_conncount_list *list);
 
@@ -32,6 +32,9 @@ bool nf_conncount_add(struct nf_conncount_list *list,
 		      const struct nf_conntrack_tuple *tuple,
 		      const struct nf_conntrack_zone *zone);
 
+void nf_conncount_gc_list(struct net *net,
+			  struct nf_conncount_list *list);
+
 void nf_conncount_cache_free(struct nf_conncount_list *list);
 
 #endif
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 81b060adefef..7dfd9d5e6a3e 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -144,26 +144,29 @@ find_or_evict(struct net *net, struct nf_conncount_list *list,
 	return ERR_PTR(-EAGAIN);
 }
 
-unsigned int nf_conncount_lookup(struct net *net,
-				 struct nf_conncount_list *list,
-				 const struct nf_conntrack_tuple *tuple,
-				 const struct nf_conntrack_zone *zone,
-				 bool *addit)
+void nf_conncount_lookup(struct net *net,
+			 struct nf_conncount_list *list,
+			 const struct nf_conntrack_tuple *tuple,
+			 const struct nf_conntrack_zone *zone,
+			 bool *addit)
 {
 	const struct nf_conntrack_tuple_hash *found;
 	struct nf_conncount_tuple *conn, *conn_n;
 	struct nf_conn *found_ct;
-	unsigned int length = 0;
+	unsigned int collect = 0;
 
+	/* best effort only */
 	*addit = tuple ? true : false;
 
 	/* check the saved connections */
 	list_for_each_entry_safe(conn, conn_n, &list->head, node) {
+		if (collect > CONNCOUNT_GC_MAX_NODES)
+			break;
+
 		found = find_or_evict(net, list, conn);
 		if (IS_ERR(found)) {
 			/* Not found, but might be about to be confirmed */
 			if (PTR_ERR(found) == -EAGAIN) {
-				length++;
 				if (!tuple)
 					continue;
 
@@ -171,8 +174,8 @@ unsigned int nf_conncount_lookup(struct net *net,
 				    nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
 				    nf_ct_zone_id(zone, zone->dir))
 					*addit = false;
-			}
-
+			} else if (PTR_ERR(found) == -ENOENT)
+				collect++;
 			continue;
 		}
 
@@ -181,9 +184,10 @@ unsigned int nf_conncount_lookup(struct net *net,
 		if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) &&
 		    nf_ct_zone_equal(found_ct, zone, zone->dir)) {
 			/*
-			 * Just to be sure we have it only once in the list.
 			 * We should not see tuples twice unless someone hooks
 			 * this into a table without "-p tcp --syn".
+			 *
+			 * Attempt to avoid a re-add in this case.
 			 */
 			*addit = false;
 		} else if (already_closed(found_ct)) {
@@ -193,14 +197,12 @@ unsigned int nf_conncount_lookup(struct net *net,
 			 */
 			nf_ct_put(found_ct);
 			conn_free(list, conn);
+			collect++;
 			continue;
 		}
 
 		nf_ct_put(found_ct);
-		length++;
 	}
-
-	return length;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_lookup);
 
@@ -211,8 +213,8 @@ void nf_conncount_list_init(struct nf_conncount_list *list)
 }
 EXPORT_SYMBOL_GPL(nf_conncount_list_init);
 
-static void nf_conncount_gc_list(struct net *net,
-				 struct nf_conncount_list *list)
+void nf_conncount_gc_list(struct net *net,
+			  struct nf_conncount_list *list)
 {
 	const struct nf_conntrack_tuple_hash *found;
 	struct nf_conncount_tuple *conn, *conn_n;
@@ -244,6 +246,7 @@ static void nf_conncount_gc_list(struct net *net,
 			return;
 	}
 }
+EXPORT_SYMBOL_GPL(nf_conncount_gc_list);
 
 static void tree_nodes_free(struct rb_root *root,
 			    struct nf_conncount_rb *gc_nodes[],
@@ -291,8 +294,9 @@ count_tree(struct net *net, struct rb_root *root,
 			/* same source network -> be counted! */
 			unsigned int count;
 
-			count = nf_conncount_lookup(net, &rbconn->list, tuple,
-						    zone, &addit);
+			nf_conncount_lookup(net, &rbconn->list, tuple, zone,
+					    &addit);
+			count = rbconn->list.count;
 
 			tree_nodes_free(root, gc_nodes, gc_count);
 			if (!addit)
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 4f0491a36a1d..37c52ae06741 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -46,8 +46,9 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
 	}
 
 	spin_lock_bh(&priv->lock);
-	count = nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
-				    &addit);
+	nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
+			    &addit);
+	count = priv->list.count;
 
 	if (!addit)
 		goto out;
@@ -231,10 +232,10 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
 static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
 {
 	struct nft_connlimit *priv = nft_expr_priv(expr);
-	bool addit, ret;
+	bool ret;
 
 	spin_lock_bh(&priv->lock);
-	nf_conncount_lookup(net, &priv->list, NULL, &nf_ct_zone_dflt, &addit);
+	nf_conncount_gc_list(net, &priv->list);
 
 	ret = list_empty(&priv->list.head);
 	spin_unlock_bh(&priv->lock);

From 2ba39118c10ae3a7d3411c073485bba9576684cd Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Mon, 2 Jul 2018 17:33:42 -0700
Subject: [PATCH 20/38] netfilter: nf_conncount: Move locking into count_tree()

This patch is originally from Florian Westphal.

This is a preparation patch to allow lockless traversal
of the tree via RCU.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conncount.c | 52 +++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 7dfd9d5e6a3e..d1a4fd1c0f81 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -262,18 +262,26 @@ static void tree_nodes_free(struct rb_root *root,
 }
 
 static unsigned int
-count_tree(struct net *net, struct rb_root *root,
-	   const u32 *key, u8 keylen,
+count_tree(struct net *net,
+	   struct nf_conncount_data *data,
+	   const u32 *key,
 	   const struct nf_conntrack_tuple *tuple,
 	   const struct nf_conntrack_zone *zone)
 {
 	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+	struct rb_root *root;
 	struct rb_node **rbnode, *parent;
 	struct nf_conncount_rb *rbconn;
 	struct nf_conncount_tuple *conn;
-	unsigned int gc_count;
+	unsigned int gc_count, hash;
 	bool no_gc = false;
+	unsigned int count = 0;
+	u8 keylen = data->keylen;
 
+	hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
+	root = &data->root[hash];
+
+	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
  restart:
 	gc_count = 0;
 	parent = NULL;
@@ -292,20 +300,20 @@ count_tree(struct net *net, struct rb_root *root,
 			rbnode = &((*rbnode)->rb_right);
 		} else {
 			/* same source network -> be counted! */
-			unsigned int count;
-
 			nf_conncount_lookup(net, &rbconn->list, tuple, zone,
 					    &addit);
 			count = rbconn->list.count;
 
 			tree_nodes_free(root, gc_nodes, gc_count);
 			if (!addit)
-				return count;
+				goto out_unlock;
 
 			if (!nf_conncount_add(&rbconn->list, tuple, zone))
-				return 0; /* hotdrop */
+				count = 0; /* hotdrop */
+				goto out_unlock;
 
-			return count + 1;
+			count++;
+			goto out_unlock;
 		}
 
 		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
@@ -328,18 +336,18 @@ count_tree(struct net *net, struct rb_root *root,
 		goto restart;
 	}
 
+	count = 0;
 	if (!tuple)
-		return 0;
-
+		goto out_unlock;
 	/* no match, need to insert new node */
 	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
 	if (rbconn == NULL)
-		return 0;
+		goto out_unlock;
 
 	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
 	if (conn == NULL) {
 		kmem_cache_free(conncount_rb_cachep, rbconn);
-		return 0;
+		goto out_unlock;
 	}
 
 	conn->tuple = *tuple;
@@ -348,10 +356,13 @@ count_tree(struct net *net, struct rb_root *root,
 
 	nf_conncount_list_init(&rbconn->list);
 	list_add(&conn->node, &rbconn->list.head);
+	count = 1;
 
 	rb_link_node(&rbconn->node, parent, rbnode);
 	rb_insert_color(&rbconn->node, root);
-	return 1;
+out_unlock:
+	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+	return count;
 }
 
 /* Count and return number of conntrack entries in 'net' with particular 'key'.
@@ -363,20 +374,7 @@ unsigned int nf_conncount_count(struct net *net,
 				const struct nf_conntrack_tuple *tuple,
 				const struct nf_conntrack_zone *zone)
 {
-	struct rb_root *root;
-	int count;
-	u32 hash;
-
-	hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
-	root = &data->root[hash];
-
-	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
-
-	count = count_tree(net, root, key, data->keylen, tuple, zone);
-
-	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
-
-	return count;
+	return count_tree(net, data, key, tuple, zone);
 }
 EXPORT_SYMBOL_GPL(nf_conncount_count);
 

From 34848d5c896ea1ab4e3c441b9c4fed39928ccbaf Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Mon, 2 Jul 2018 17:33:43 -0700
Subject: [PATCH 21/38] netfilter: nf_conncount: Split insert and traversal

This patch is originally from Florian Westphal.

When we have a very coarse grouping, e.g. by large subnets, zone id,
etc, it's likely that we do not need to do tree rotation because
we'll find a node where we can attach new entry.  Based on this
observation, we split tree traversal and insertion.

Later on, we can make traversal lockless (tree protected
by RCU), and add extra lock in the individual nodes to protect list
insertion/deletion, thereby allowing parallel insert/delete in different
tree nodes.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conncount.c | 87 +++++++++++++++++++++++++++---------
 1 file changed, 67 insertions(+), 20 deletions(-)

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index d1a4fd1c0f81..3f14806b7271 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -261,6 +261,71 @@ static void tree_nodes_free(struct rb_root *root,
 	}
 }
 
+static unsigned int
+insert_tree(struct rb_root *root,
+	    unsigned int hash,
+	    const u32 *key,
+	    u8 keylen,
+	    const struct nf_conntrack_tuple *tuple,
+	    const struct nf_conntrack_zone *zone)
+{
+	struct rb_node **rbnode, *parent;
+	struct nf_conncount_rb *rbconn;
+	struct nf_conncount_tuple *conn;
+	unsigned int count = 0;
+
+	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+
+	parent = NULL;
+	rbnode = &(root->rb_node);
+	while (*rbnode) {
+		int diff;
+		rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
+
+		parent = *rbnode;
+		diff = key_diff(key, rbconn->key, keylen);
+		if (diff < 0) {
+			rbnode = &((*rbnode)->rb_left);
+		} else if (diff > 0) {
+			rbnode = &((*rbnode)->rb_right);
+		} else {
+			/* unlikely: other cpu added node already */
+			if (!nf_conncount_add(&rbconn->list, tuple, zone)) {
+				count = 0; /* hotdrop */
+				goto out_unlock;
+			}
+
+			count = rbconn->list.count;
+			goto out_unlock;
+		}
+	}
+
+	/* expected case: match, insert new node */
+	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
+	if (rbconn == NULL)
+		goto out_unlock;
+
+	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
+	if (conn == NULL) {
+		kmem_cache_free(conncount_rb_cachep, rbconn);
+		goto out_unlock;
+	}
+
+	conn->tuple = *tuple;
+	conn->zone = *zone;
+	memcpy(rbconn->key, key, sizeof(u32) * keylen);
+
+	nf_conncount_list_init(&rbconn->list);
+	list_add(&conn->node, &rbconn->list.head);
+	count = 1;
+
+	rb_link_node(&rbconn->node, parent, rbnode);
+	rb_insert_color(&rbconn->node, root);
+out_unlock:
+	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+	return count;
+}
+
 static unsigned int
 count_tree(struct net *net,
 	   struct nf_conncount_data *data,
@@ -272,7 +337,6 @@ count_tree(struct net *net,
 	struct rb_root *root;
 	struct rb_node **rbnode, *parent;
 	struct nf_conncount_rb *rbconn;
-	struct nf_conncount_tuple *conn;
 	unsigned int gc_count, hash;
 	bool no_gc = false;
 	unsigned int count = 0;
@@ -339,27 +403,10 @@ count_tree(struct net *net,
 	count = 0;
 	if (!tuple)
 		goto out_unlock;
-	/* no match, need to insert new node */
-	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
-	if (rbconn == NULL)
-		goto out_unlock;
 
-	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
-	if (conn == NULL) {
-		kmem_cache_free(conncount_rb_cachep, rbconn);
-		goto out_unlock;
-	}
+	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+	return insert_tree(root, hash, key, keylen, tuple, zone);
 
-	conn->tuple = *tuple;
-	conn->zone = *zone;
-	memcpy(rbconn->key, key, sizeof(u32) * keylen);
-
-	nf_conncount_list_init(&rbconn->list);
-	list_add(&conn->node, &rbconn->list.head);
-	count = 1;
-
-	rb_link_node(&rbconn->node, parent, rbnode);
-	rb_insert_color(&rbconn->node, root);
 out_unlock:
 	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 	return count;

From 5c789e131cbb997a528451564ea4613e812fc718 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Mon, 2 Jul 2018 17:33:44 -0700
Subject: [PATCH 22/38] netfilter: nf_conncount: Add list lock and gc worker,
 and RCU for init tree search

This patch is originally from Florian Westphal.

This patch does the following 3 main tasks.

1) Add list lock to 'struct nf_conncount_list' so that we can
alter the lists containing the individual connections without holding the
main tree lock.  It would be useful when we only need to add/remove to/from
a list without allocate/remove a node in the tree.  With this change, we
update nft_connlimit accordingly since we longer need to maintain
a list lock in nft_connlimit now.

2) Use RCU for the initial tree search to improve tree look up performance.

3) Add a garbage collection worker. This worker is schedule when there
are excessive tree node that needed to be recycled.

Moreover,the rbnode reclaim logic is moved from search tree to insert tree
to avoid race condition.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_count.h |  17 +-
 net/netfilter/nf_conncount.c               | 253 +++++++++++++++------
 net/netfilter/nft_connlimit.c              |  17 +-
 3 files changed, 196 insertions(+), 91 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index dbec17f674b7..4b2b2baf8ab4 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -5,9 +5,17 @@
 
 struct nf_conncount_data;
 
+enum nf_conncount_list_add {
+	NF_CONNCOUNT_ADDED, 	/* list add was ok */
+	NF_CONNCOUNT_ERR,	/* -ENOMEM, must drop skb */
+	NF_CONNCOUNT_SKIP,	/* list is already reclaimed by gc */
+};
+
 struct nf_conncount_list {
+	spinlock_t list_lock;
 	struct list_head head;	/* connections with the same filtering key */
 	unsigned int count;	/* length of list */
+	bool dead;
 };
 
 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family,
@@ -28,11 +36,12 @@ void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list,
 
 void nf_conncount_list_init(struct nf_conncount_list *list);
 
-bool nf_conncount_add(struct nf_conncount_list *list,
-		      const struct nf_conntrack_tuple *tuple,
-		      const struct nf_conntrack_zone *zone);
+enum nf_conncount_list_add
+nf_conncount_add(struct nf_conncount_list *list,
+		 const struct nf_conntrack_tuple *tuple,
+		 const struct nf_conntrack_zone *zone);
 
-void nf_conncount_gc_list(struct net *net,
+bool nf_conncount_gc_list(struct net *net,
 			  struct nf_conncount_list *list);
 
 void nf_conncount_cache_free(struct nf_conncount_list *list);
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 3f14806b7271..02ca7df793f5 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -49,12 +49,14 @@ struct nf_conncount_tuple {
 	struct nf_conntrack_zone	zone;
 	int				cpu;
 	u32				jiffies32;
+	struct rcu_head			rcu_head;
 };
 
 struct nf_conncount_rb {
 	struct rb_node node;
 	struct nf_conncount_list list;
 	u32 key[MAX_KEYLEN];
+	struct rcu_head rcu_head;
 };
 
 static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
@@ -62,6 +64,10 @@ static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_i
 struct nf_conncount_data {
 	unsigned int keylen;
 	struct rb_root root[CONNCOUNT_SLOTS];
+	struct net *net;
+	struct work_struct gc_work;
+	unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
+	unsigned int gc_tree;
 };
 
 static u_int32_t conncount_rnd __read_mostly;
@@ -82,42 +88,70 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
 	return memcmp(a, b, klen * sizeof(u32));
 }
 
-bool nf_conncount_add(struct nf_conncount_list *list,
-		      const struct nf_conntrack_tuple *tuple,
-		      const struct nf_conntrack_zone *zone)
+enum nf_conncount_list_add
+nf_conncount_add(struct nf_conncount_list *list,
+		 const struct nf_conntrack_tuple *tuple,
+		 const struct nf_conntrack_zone *zone)
 {
 	struct nf_conncount_tuple *conn;
 
 	if (WARN_ON_ONCE(list->count > INT_MAX))
-		return false;
+		return NF_CONNCOUNT_ERR;
 
 	conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
 	if (conn == NULL)
-		return false;
+		return NF_CONNCOUNT_ERR;
+
 	conn->tuple = *tuple;
 	conn->zone = *zone;
 	conn->cpu = raw_smp_processor_id();
 	conn->jiffies32 = (u32)jiffies;
+	spin_lock(&list->list_lock);
+	if (list->dead == true) {
+		kmem_cache_free(conncount_conn_cachep, conn);
+		spin_unlock(&list->list_lock);
+		return NF_CONNCOUNT_SKIP;
+	}
 	list_add_tail(&conn->node, &list->head);
 	list->count++;
-	return true;
+	spin_unlock(&list->list_lock);
+	return NF_CONNCOUNT_ADDED;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_add);
 
-static void conn_free(struct nf_conncount_list *list,
+static void __conn_free(struct rcu_head *h)
+{
+	struct nf_conncount_tuple *conn;
+
+	conn = container_of(h, struct nf_conncount_tuple, rcu_head);
+	kmem_cache_free(conncount_conn_cachep, conn);
+}
+
+static bool conn_free(struct nf_conncount_list *list,
 		      struct nf_conncount_tuple *conn)
 {
-	if (WARN_ON_ONCE(list->count == 0))
-		return;
+	bool free_entry = false;
+
+	spin_lock(&list->list_lock);
+
+	if (list->count == 0) {
+		spin_unlock(&list->list_lock);
+                return free_entry;
+	}
 
 	list->count--;
-	list_del(&conn->node);
-	kmem_cache_free(conncount_conn_cachep, conn);
+	list_del_rcu(&conn->node);
+	if (list->count == 0)
+		free_entry = true;
+
+	spin_unlock(&list->list_lock);
+	call_rcu(&conn->rcu_head, __conn_free);
+	return free_entry;
 }
 
 static const struct nf_conntrack_tuple_hash *
 find_or_evict(struct net *net, struct nf_conncount_list *list,
-	      struct nf_conncount_tuple *conn)
+	      struct nf_conncount_tuple *conn, bool *free_entry)
 {
 	const struct nf_conntrack_tuple_hash *found;
 	unsigned long a, b;
@@ -137,7 +171,7 @@ find_or_evict(struct net *net, struct nf_conncount_list *list,
 	 */
 	age = a - b;
 	if (conn->cpu == cpu || age >= 2) {
-		conn_free(list, conn);
+		*free_entry = conn_free(list, conn);
 		return ERR_PTR(-ENOENT);
 	}
 
@@ -154,6 +188,7 @@ void nf_conncount_lookup(struct net *net,
 	struct nf_conncount_tuple *conn, *conn_n;
 	struct nf_conn *found_ct;
 	unsigned int collect = 0;
+	bool free_entry = false;
 
 	/* best effort only */
 	*addit = tuple ? true : false;
@@ -163,7 +198,7 @@ void nf_conncount_lookup(struct net *net,
 		if (collect > CONNCOUNT_GC_MAX_NODES)
 			break;
 
-		found = find_or_evict(net, list, conn);
+		found = find_or_evict(net, list, conn, &free_entry);
 		if (IS_ERR(found)) {
 			/* Not found, but might be about to be confirmed */
 			if (PTR_ERR(found) == -EAGAIN) {
@@ -208,24 +243,31 @@ EXPORT_SYMBOL_GPL(nf_conncount_lookup);
 
 void nf_conncount_list_init(struct nf_conncount_list *list)
 {
+	spin_lock_init(&list->list_lock);
 	INIT_LIST_HEAD(&list->head);
 	list->count = 1;
+	list->dead = false;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_list_init);
 
-void nf_conncount_gc_list(struct net *net,
+/* Return true if the list is empty */
+bool nf_conncount_gc_list(struct net *net,
 			  struct nf_conncount_list *list)
 {
 	const struct nf_conntrack_tuple_hash *found;
 	struct nf_conncount_tuple *conn, *conn_n;
 	struct nf_conn *found_ct;
 	unsigned int collected = 0;
+	bool free_entry = false;
 
 	list_for_each_entry_safe(conn, conn_n, &list->head, node) {
-		found = find_or_evict(net, list, conn);
+		found = find_or_evict(net, list, conn, &free_entry);
 		if (IS_ERR(found)) {
-			if (PTR_ERR(found) == -ENOENT)
+			if (PTR_ERR(found) == -ENOENT)  {
+				if (free_entry)
+					return true;
 				collected++;
+			}
 			continue;
 		}
 
@@ -236,18 +278,28 @@ void nf_conncount_gc_list(struct net *net,
 			 * closed already -> ditch it
 			 */
 			nf_ct_put(found_ct);
-			conn_free(list, conn);
+			if (conn_free(list, conn))
+				return true;
 			collected++;
 			continue;
 		}
 
 		nf_ct_put(found_ct);
 		if (collected > CONNCOUNT_GC_MAX_NODES)
-			return;
+			return false;
 	}
+	return false;
 }
 EXPORT_SYMBOL_GPL(nf_conncount_gc_list);
 
+static void __tree_nodes_free(struct rcu_head *h)
+{
+	struct nf_conncount_rb *rbconn;
+
+	rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
+	kmem_cache_free(conncount_rb_cachep, rbconn);
+}
+
 static void tree_nodes_free(struct rb_root *root,
 			    struct nf_conncount_rb *gc_nodes[],
 			    unsigned int gc_count)
@@ -256,23 +308,39 @@ static void tree_nodes_free(struct rb_root *root,
 
 	while (gc_count) {
 		rbconn = gc_nodes[--gc_count];
-		rb_erase(&rbconn->node, root);
-		kmem_cache_free(conncount_rb_cachep, rbconn);
+		spin_lock(&rbconn->list.list_lock);
+		if (rbconn->list.count == 0 && rbconn->list.dead == false) {
+			rbconn->list.dead = true;
+			rb_erase(&rbconn->node, root);
+			call_rcu(&rbconn->rcu_head, __tree_nodes_free);
+		}
+		spin_unlock(&rbconn->list.list_lock);
 	}
 }
 
+static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
+{
+	set_bit(tree, data->pending_trees);
+	schedule_work(&data->gc_work);
+}
+
 static unsigned int
-insert_tree(struct rb_root *root,
+insert_tree(struct net *net,
+	    struct nf_conncount_data *data,
+	    struct rb_root *root,
 	    unsigned int hash,
 	    const u32 *key,
 	    u8 keylen,
 	    const struct nf_conntrack_tuple *tuple,
 	    const struct nf_conntrack_zone *zone)
 {
+	enum nf_conncount_list_add ret;
+	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
 	struct rb_node **rbnode, *parent;
 	struct nf_conncount_rb *rbconn;
 	struct nf_conncount_tuple *conn;
-	unsigned int count = 0;
+	unsigned int count = 0, gc_count = 0;
+	bool node_found = false;
 
 	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 
@@ -290,16 +358,44 @@ insert_tree(struct rb_root *root,
 			rbnode = &((*rbnode)->rb_right);
 		} else {
 			/* unlikely: other cpu added node already */
-			if (!nf_conncount_add(&rbconn->list, tuple, zone)) {
+			node_found = true;
+			ret = nf_conncount_add(&rbconn->list, tuple, zone);
+			if (ret == NF_CONNCOUNT_ERR) {
 				count = 0; /* hotdrop */
-				goto out_unlock;
+			} else if (ret == NF_CONNCOUNT_ADDED) {
+				count = rbconn->list.count;
+			} else {
+				/* NF_CONNCOUNT_SKIP, rbconn is already
+				 * reclaimed by gc, insert a new tree node
+				 */
+				node_found = false;
 			}
-
-			count = rbconn->list.count;
-			goto out_unlock;
+			break;
 		}
+
+		if (gc_count >= ARRAY_SIZE(gc_nodes))
+			continue;
+
+		if (nf_conncount_gc_list(net, &rbconn->list))
+			gc_nodes[gc_count++] = rbconn;
 	}
 
+	if (gc_count) {
+		tree_nodes_free(root, gc_nodes, gc_count);
+		/* tree_node_free before new allocation permits
+		 * allocator to re-use newly free'd object.
+		 *
+		 * This is a rare event; in most cases we will find
+		 * existing node to re-use. (or gc_count is 0).
+		 */
+
+		if (gc_count >= ARRAY_SIZE(gc_nodes))
+			schedule_gc_worker(data, hash);
+	}
+
+	if (node_found)
+		goto out_unlock;
+
 	/* expected case: match, insert new node */
 	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
 	if (rbconn == NULL)
@@ -333,87 +429,97 @@ count_tree(struct net *net,
 	   const struct nf_conntrack_tuple *tuple,
 	   const struct nf_conntrack_zone *zone)
 {
-	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
+	enum nf_conncount_list_add ret;
 	struct rb_root *root;
-	struct rb_node **rbnode, *parent;
+	struct rb_node *parent;
 	struct nf_conncount_rb *rbconn;
-	unsigned int gc_count, hash;
-	bool no_gc = false;
-	unsigned int count = 0;
+	unsigned int hash;
 	u8 keylen = data->keylen;
 
 	hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
 	root = &data->root[hash];
 
-	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
- restart:
-	gc_count = 0;
-	parent = NULL;
-	rbnode = &(root->rb_node);
-	while (*rbnode) {
+	parent = rcu_dereference_raw(root->rb_node);
+	while (parent) {
 		int diff;
 		bool addit;
 
-		rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
+		rbconn = rb_entry(parent, struct nf_conncount_rb, node);
 
-		parent = *rbnode;
 		diff = key_diff(key, rbconn->key, keylen);
 		if (diff < 0) {
-			rbnode = &((*rbnode)->rb_left);
+			parent = rcu_dereference_raw(parent->rb_left);
 		} else if (diff > 0) {
-			rbnode = &((*rbnode)->rb_right);
+			parent = rcu_dereference_raw(parent->rb_right);
 		} else {
 			/* same source network -> be counted! */
 			nf_conncount_lookup(net, &rbconn->list, tuple, zone,
 					    &addit);
-			count = rbconn->list.count;
 
-			tree_nodes_free(root, gc_nodes, gc_count);
 			if (!addit)
-				goto out_unlock;
+				return rbconn->list.count;
 
-			if (!nf_conncount_add(&rbconn->list, tuple, zone))
-				count = 0; /* hotdrop */
-				goto out_unlock;
-
-			count++;
-			goto out_unlock;
+			ret = nf_conncount_add(&rbconn->list, tuple, zone);
+			if (ret == NF_CONNCOUNT_ERR) {
+				return 0; /* hotdrop */
+			} else if (ret == NF_CONNCOUNT_ADDED) {
+				return rbconn->list.count;
+			} else {
+				/* NF_CONNCOUNT_SKIP, rbconn is already
+				 * reclaimed by gc, insert a new tree node
+				 */
+				break;
+			}
 		}
+	}
 
-		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
-			continue;
+	if (!tuple)
+		return 0;
 
-		nf_conncount_gc_list(net, &rbconn->list);
-		if (list_empty(&rbconn->list.head))
+	return insert_tree(net, data, root, hash, key, keylen, tuple, zone);
+}
+
+static void tree_gc_worker(struct work_struct *work)
+{
+	struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
+	struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
+	struct rb_root *root;
+	struct rb_node *node;
+	unsigned int tree, next_tree, gc_count = 0;
+
+	tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;
+	root = &data->root[tree];
+
+	rcu_read_lock();
+	for (node = rb_first(root); node != NULL; node = rb_next(node)) {
+		rbconn = rb_entry(node, struct nf_conncount_rb, node);
+		if (nf_conncount_gc_list(data->net, &rbconn->list))
 			gc_nodes[gc_count++] = rbconn;
 	}
+	rcu_read_unlock();
+
+	spin_lock_bh(&nf_conncount_locks[tree]);
 
 	if (gc_count) {
-		no_gc = true;
 		tree_nodes_free(root, gc_nodes, gc_count);
-		/* tree_node_free before new allocation permits
-		 * allocator to re-use newly free'd object.
-		 *
-		 * This is a rare event; in most cases we will find
-		 * existing node to re-use. (or gc_count is 0).
-		 */
-		goto restart;
 	}
 
-	count = 0;
-	if (!tuple)
-		goto out_unlock;
+	clear_bit(tree, data->pending_trees);
 
-	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
-	return insert_tree(root, hash, key, keylen, tuple, zone);
+	next_tree = (tree + 1) % CONNCOUNT_SLOTS;
+	next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS);
 
-out_unlock:
-	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
-	return count;
+	if (next_tree < CONNCOUNT_SLOTS) {
+		data->gc_tree = next_tree;
+		schedule_work(work);
+	}
+
+	spin_unlock_bh(&nf_conncount_locks[tree]);
 }
 
 /* Count and return number of conntrack entries in 'net' with particular 'key'.
  * If 'tuple' is not null, insert it into the accounting data structure.
+ * Call with RCU read lock.
  */
 unsigned int nf_conncount_count(struct net *net,
 				struct nf_conncount_data *data,
@@ -452,6 +558,8 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
 		data->root[i] = RB_ROOT;
 
 	data->keylen = keylen / sizeof(u32);
+	data->net = net;
+	INIT_WORK(&data->gc_work, tree_gc_worker);
 
 	return data;
 }
@@ -487,6 +595,7 @@ void nf_conncount_destroy(struct net *net, unsigned int family,
 {
 	unsigned int i;
 
+	cancel_work_sync(&data->gc_work);
 	nf_ct_netns_put(net, family);
 
 	for (i = 0; i < ARRAY_SIZE(data->root); ++i)
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index 37c52ae06741..b90d96ba4a12 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,7 +14,6 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 
 struct nft_connlimit {
-	spinlock_t			lock;
 	struct nf_conncount_list	list;
 	u32				limit;
 	bool				invert;
@@ -45,7 +44,6 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
 		return;
 	}
 
-	spin_lock_bh(&priv->lock);
 	nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
 			    &addit);
 	count = priv->list.count;
@@ -53,14 +51,12 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
 	if (!addit)
 		goto out;
 
-	if (!nf_conncount_add(&priv->list, tuple_ptr, zone)) {
+	if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) {
 		regs->verdict.code = NF_DROP;
-		spin_unlock_bh(&priv->lock);
 		return;
 	}
 	count++;
 out:
-	spin_unlock_bh(&priv->lock);
 
 	if ((count > priv->limit) ^ priv->invert) {
 		regs->verdict.code = NFT_BREAK;
@@ -88,7 +84,6 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
 			invert = true;
 	}
 
-	spin_lock_init(&priv->lock);
 	nf_conncount_list_init(&priv->list);
 	priv->limit	= limit;
 	priv->invert	= invert;
@@ -213,7 +208,6 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
 	struct nft_connlimit *priv_dst = nft_expr_priv(dst);
 	struct nft_connlimit *priv_src = nft_expr_priv(src);
 
-	spin_lock_init(&priv_dst->lock);
 	nf_conncount_list_init(&priv_dst->list);
 	priv_dst->limit	 = priv_src->limit;
 	priv_dst->invert = priv_src->invert;
@@ -232,15 +226,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
 static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
 {
 	struct nft_connlimit *priv = nft_expr_priv(expr);
-	bool ret;
 
-	spin_lock_bh(&priv->lock);
-	nf_conncount_gc_list(net, &priv->list);
-
-	ret = list_empty(&priv->list.head);
-	spin_unlock_bh(&priv->lock);
-
-	return ret;
+	return nf_conncount_gc_list(net, &priv->list);
 }
 
 static struct nft_expr_type nft_connlimit_type;

From ed07d9a021df6da53456663a76999189badc432a Mon Sep 17 00:00:00 2001
From: Martynas Pumputis <martynas@weave.works>
Date: Mon, 2 Jul 2018 16:52:14 +0200
Subject: [PATCH 23/38] netfilter: nf_conntrack: resolve clash for matching
 conntracks

This patch enables the clash resolution for NAT (disabled in
"590b52e10d41") if clashing conntracks match (i.e. both tuples are equal)
and a protocol allows it.

The clash might happen for a connections-less protocol (e.g. UDP) when
two threads in parallel writes to the same socket and consequent calls
to "get_unique_tuple" return the same tuples (incl. reply tuples).

In this case it is safe to perform the resolution, as the losing CT
describes the same mangling as the winning CT, so no modifications to
the packet are needed, and the result of rules traversal for the loser's
packet stays valid.

Signed-off-by: Martynas Pumputis <martynas@weave.works>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_core.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 5123e91b1982..4ced7c7102b6 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -632,6 +632,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 	       net_eq(net, nf_ct_net(ct));
 }
 
+static inline bool
+nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
+{
+	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
+				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
+	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
+	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
+}
+
 /* caller must hold rcu readlock and none of the nf_conntrack_locks */
 static void nf_ct_gc_expired(struct nf_conn *ct)
 {
@@ -825,19 +837,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
 	/* This is the conntrack entry already in hashes that won race. */
 	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 	const struct nf_conntrack_l4proto *l4proto;
+	enum ip_conntrack_info oldinfo;
+	struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
 
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 	if (l4proto->allow_clash &&
-	    ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
 	    !nf_ct_is_dying(ct) &&
 	    atomic_inc_not_zero(&ct->ct_general.use)) {
-		enum ip_conntrack_info oldinfo;
-		struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
-
-		nf_ct_acct_merge(ct, ctinfo, loser_ct);
-		nf_conntrack_put(&loser_ct->ct_general);
-		nf_ct_set(skb, ct, oldinfo);
-		return NF_ACCEPT;
+		if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
+		    nf_ct_match(ct, loser_ct)) {
+			nf_ct_acct_merge(ct, ctinfo, loser_ct);
+			nf_conntrack_put(&loser_ct->ct_general);
+			nf_ct_set(skb, ct, oldinfo);
+			return NF_ACCEPT;
+		}
+		nf_ct_put(ct);
 	}
 	NF_CT_STAT_INC(net, drop);
 	return NF_DROP;

From ec1b28ca9674def4a158808a6493bdb87b993d81 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 6 Jul 2018 08:25:52 +0300
Subject: [PATCH 24/38] ipvs: provide just conn to ip_vs_state_name

In preparation for followup patches, provide just the cp
ptr to ip_vs_state_name.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h              | 2 +-
 net/netfilter/ipvs/ip_vs_conn.c  | 8 ++++----
 net/netfilter/ipvs/ip_vs_proto.c | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a0bec23c6d5e..4d76abcf1c41 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1221,7 +1221,7 @@ struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
 				  struct ip_vs_dest *dest, __u32 fwmark);
 void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
 
-const char *ip_vs_state_name(__u16 proto, int state);
+const char *ip_vs_state_name(const struct ip_vs_conn *cp);
 
 void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp);
 int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest);
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 99e0aa350dc5..de5a64e42ebd 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1107,7 +1107,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 				&cp->caddr.in6, ntohs(cp->cport),
 				&cp->vaddr.in6, ntohs(cp->vport),
 				dbuf, ntohs(cp->dport),
-				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_state_name(cp),
 				(cp->timer.expires-jiffies)/HZ, pe_data);
 		else
 #endif
@@ -1118,7 +1118,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
 				ntohl(cp->caddr.ip), ntohs(cp->cport),
 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
 				dbuf, ntohs(cp->dport),
-				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_state_name(cp),
 				(cp->timer.expires-jiffies)/HZ, pe_data);
 	}
 	return 0;
@@ -1169,7 +1169,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
 				&cp->caddr.in6, ntohs(cp->cport),
 				&cp->vaddr.in6, ntohs(cp->vport),
 				dbuf, ntohs(cp->dport),
-				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_state_name(cp),
 				ip_vs_origin_name(cp->flags),
 				(cp->timer.expires-jiffies)/HZ);
 		else
@@ -1181,7 +1181,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
 				ntohl(cp->caddr.ip), ntohs(cp->cport),
 				ntohl(cp->vaddr.ip), ntohs(cp->vport),
 				dbuf, ntohs(cp->dport),
-				ip_vs_state_name(cp->protocol, cp->state),
+				ip_vs_state_name(cp),
 				ip_vs_origin_name(cp->flags),
 				(cp->timer.expires-jiffies)/HZ);
 	}
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index ca880a3ad033..85c446621758 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -193,13 +193,13 @@ ip_vs_create_timeout_table(int *table, int size)
 }
 
 
-const char * ip_vs_state_name(__u16 proto, int state)
+const char *ip_vs_state_name(const struct ip_vs_conn *cp)
 {
-	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+	struct ip_vs_protocol *pp = ip_vs_proto_get(cp->protocol);
 
 	if (pp == NULL || pp->state_name == NULL)
-		return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
-	return pp->state_name(state);
+		return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!";
+	return pp->state_name(cp->state);
 }
 
 

From 275411430f892407b885be1de2548b2e632892c3 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 6 Jul 2018 08:25:53 +0300
Subject: [PATCH 25/38] ipvs: add assured state for conn templates

cp->state was not used for templates. Add support for state bits
and for the first "assured" bit which indicates that some
connection controlled by this template was established or assured
by the real server. In a followup patch we will use it to drop
templates under SYN attack.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h                   | 16 ++++++++++++++++
 net/netfilter/ipvs/ip_vs_proto.c      | 17 +++++++++++++++--
 net/netfilter/ipvs/ip_vs_proto_sctp.c |  2 ++
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |  2 ++
 net/netfilter/ipvs/ip_vs_proto_udp.c  |  2 ++
 net/netfilter/ipvs/ip_vs_sync.c       | 18 ++++++------------
 6 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4d76abcf1c41..a0d2e0bb9a94 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -335,6 +335,11 @@ enum ip_vs_sctp_states {
 	IP_VS_SCTP_S_LAST
 };
 
+/* Connection templates use bits from state */
+#define IP_VS_CTPL_S_NONE		0x0000
+#define IP_VS_CTPL_S_ASSURED		0x0001
+#define IP_VS_CTPL_S_LAST		0x0002
+
 /* Delta sequence info structure
  * Each ip_vs_conn has 2 (output AND input seq. changes).
  * Only used in the VS/NAT.
@@ -1289,6 +1294,17 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp)
 	atomic_inc(&ctl_cp->n_control);
 }
 
+/* Mark our template as assured */
+static inline void
+ip_vs_control_assure_ct(struct ip_vs_conn *cp)
+{
+	struct ip_vs_conn *ct = cp->control;
+
+	if (ct && !(ct->state & IP_VS_CTPL_S_ASSURED) &&
+	    (ct->flags & IP_VS_CONN_F_TEMPLATE))
+		ct->state |= IP_VS_CTPL_S_ASSURED;
+}
+
 /* IPVS netns init & cleanup functions */
 int ip_vs_estimator_net_init(struct netns_ipvs *ipvs);
 int ip_vs_control_net_init(struct netns_ipvs *ipvs);
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 85c446621758..54ee84adf0bd 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -42,6 +42,11 @@
 
 static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
 
+/* States for conn templates: NONE or words separated with ",", max 15 chars */
+static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = {
+	[IP_VS_CTPL_S_NONE]			= "NONE",
+	[IP_VS_CTPL_S_ASSURED]			= "ASSURED",
+};
 
 /*
  *	register an ipvs protocol
@@ -195,11 +200,19 @@ ip_vs_create_timeout_table(int *table, int size)
 
 const char *ip_vs_state_name(const struct ip_vs_conn *cp)
 {
-	struct ip_vs_protocol *pp = ip_vs_proto_get(cp->protocol);
+	unsigned int state = cp->state;
+	struct ip_vs_protocol *pp;
 
+	if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+
+		if (state >= IP_VS_CTPL_S_LAST)
+			return "ERR!";
+		return ip_vs_ctpl_state_name_table[state] ? : "?";
+	}
+	pp = ip_vs_proto_get(cp->protocol);
 	if (pp == NULL || pp->state_name == NULL)
 		return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!";
-	return pp->state_name(cp->state);
+	return pp->state_name(state);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 3250c4a1111e..b0cd7d08f2a7 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -461,6 +461,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
 		}
+		if (next_state == IP_VS_SCTP_S_ESTABLISHED)
+			ip_vs_control_assure_ct(cp);
 	}
 	if (likely(pd))
 		cp->timeout = pd->timeout_table[cp->state = next_state];
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 80d10ad12a15..1770fc6ce960 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -569,6 +569,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
 				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
 			}
 		}
+		if (new_state == IP_VS_TCP_S_ESTABLISHED)
+			ip_vs_control_assure_ct(cp);
 	}
 
 	if (likely(pd))
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index e0ef11c3691e..0f53c49025f8 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -460,6 +460,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction,
 	}
 
 	cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
+	if (direction == IP_VS_DIR_OUTPUT)
+		ip_vs_control_assure_ct(cp);
 }
 
 static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 001501e25625..d4020c5e831d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1003,12 +1003,9 @@ static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer
 				continue;
 			}
 		} else {
-			/* protocol in templates is not used for state/timeout */
-			if (state > 0) {
-				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
-					state);
-				state = 0;
-			}
+			if (state >= IP_VS_CTPL_S_LAST)
+				IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n",
+					  state);
 		}
 
 		ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
@@ -1166,12 +1163,9 @@ static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *m
 			goto out;
 		}
 	} else {
-		/* protocol in templates is not used for state/timeout */
-		if (state > 0) {
-			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
-				state);
-			state = 0;
-		}
+		if (state >= IP_VS_CTPL_S_LAST)
+			IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n",
+				  state);
 	}
 	if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
 				       pe_data_len, pe_name, pe_name_len)) {

From 762c40076684771c0efbce6490ded26086441ce6 Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Fri, 6 Jul 2018 08:25:54 +0300
Subject: [PATCH 26/38] ipvs: drop conn templates under attack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before now, connection templates were ignored by the random
dropentry procedure. But Michal Koutný suggests that we
should add exception for connections under SYN attack.
He provided patch that implements it for TCP:

<quote>

IPVS includes protection against filling the ip_vs_conn_tab by
dropping 1/32 of feasible entries every second. The template
entries (for persistent services) are never directly deleted by
this mechanism but when a picked TCP connection entry is being
dropped (1), the respective template entry is dropped too (realized
by expiring 60 seconds after the connection entry being dropped).

There is another mechanism that removes connection entries when they
time out (2), in this case the associated template entry is not deleted.
Under SYN flood template entries would accumulate (due to their entry
longer timeout).

The accumulation takes place also with drop_entry being enabled. Roughly
15% ((31/32)^60) of SYN_RECV connections survive the dropping mechanism
(1) and are removed by the timeout mechanism (2)(defaults to 60 seconds
for SYN_RECV), thus template entries would still accumulate.

The patch ensures that when a connection entry times out, we also remove
the template entry from the table. To prevent breaking persistent
services (since the connection may time out in already established state)
we add a new entry flag to protect templates what spawned at least one
established TCP connection.

</quote>

We already added ASSURED flag for the templates in previous patch, so
that we can use it now to decide which connection templates should be
dropped under attack. But we also have some cases that need special
handling.

We modify the dropentry procedure as follows:

- Linux timers currently use LIFO ordering but we can not rely on
this to drop controlling connections. So, set cp->timeout to 0
to indicate that connection was dropped and that on expiration we
should try to drop our controlling connections. As result, we can
now avoid the ip_vs_conn_expire_now call.

- move the cp->n_control check above, so that it avoids restarting
the timer for controlling connections when not needed.

- drop unassured connection templates here if they are not referred
by any connections.

On connection expiration: if connection was dropped (cp->timeout=0)
try to drop our controlling connection except if it is a template
in assured state.

In ip_vs_conn_flush change order of ip_vs_conn_expire_now calls
according to the LIFO timer expiration order. It should work
faster for controlling connections with single controlled one.

Suggested-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_conn.c | 59 ++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 20 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index de5a64e42ebd..0edc62910ebf 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -825,12 +825,23 @@ static void ip_vs_conn_expire(struct timer_list *t)
 
 	/* Unlink conn if not referenced anymore */
 	if (likely(ip_vs_conn_unlink(cp))) {
+		struct ip_vs_conn *ct = cp->control;
+
 		/* delete the timer if it is activated by other users */
 		del_timer(&cp->timer);
 
 		/* does anybody control me? */
-		if (cp->control)
+		if (ct) {
 			ip_vs_control_del(cp);
+			/* Drop CTL or non-assured TPL if not used anymore */
+			if (!cp->timeout && !atomic_read(&ct->n_control) &&
+			    (!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
+			     !(ct->state & IP_VS_CTPL_S_ASSURED))) {
+				IP_VS_DBG(4, "drop controlling connection\n");
+				ct->timeout = 0;
+				ip_vs_conn_expire_now(ct);
+			}
+		}
 
 		if ((cp->flags & IP_VS_CONN_F_NFCT) &&
 		    !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
@@ -872,6 +883,10 @@ static void ip_vs_conn_expire(struct timer_list *t)
 
 /* Modify timer, so that it expires as soon as possible.
  * Can be called without reference only if under RCU lock.
+ * We can have such chain of conns linked with ->control: DATA->CTL->TPL
+ * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup
+ * - cp->timeout=0 indicates all conns from chain should be dropped but
+ * TPL is not dropped if in assured state
  */
 void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 {
@@ -1197,8 +1212,11 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
 #endif
 
 
-/*
- *      Randomly drop connection entries before running out of memory
+/* Randomly drop connection entries before running out of memory
+ * Can be used for DATA and CTL conns. For TPL conns there are exceptions:
+ * - traffic for services in OPS mode increases ct->in_pkts, so it is supported
+ * - traffic for services not in OPS mode does not increase ct->in_pkts in
+ * all cases, so it is not supported
  */
 static inline int todrop_entry(struct ip_vs_conn *cp)
 {
@@ -1242,7 +1260,7 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
 void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
 {
 	int idx;
-	struct ip_vs_conn *cp, *cp_c;
+	struct ip_vs_conn *cp;
 
 	rcu_read_lock();
 	/*
@@ -1254,13 +1272,15 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
 		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
 			if (cp->ipvs != ipvs)
 				continue;
+			if (atomic_read(&cp->n_control))
+				continue;
 			if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
-				if (atomic_read(&cp->n_control) ||
-				    !ip_vs_conn_ops_mode(cp))
-					continue;
-				else
-					/* connection template of OPS */
+				/* connection template of OPS */
+				if (ip_vs_conn_ops_mode(cp))
 					goto try_drop;
+				if (!(cp->state & IP_VS_CTPL_S_ASSURED))
+					goto drop;
+				continue;
 			}
 			if (cp->protocol == IPPROTO_TCP) {
 				switch(cp->state) {
@@ -1294,15 +1314,10 @@ try_drop:
 					continue;
 			}
 
-			IP_VS_DBG(4, "del connection\n");
+drop:
+			IP_VS_DBG(4, "drop connection\n");
+			cp->timeout = 0;
 			ip_vs_conn_expire_now(cp);
-			cp_c = cp->control;
-			/* cp->control is valid only with reference to cp */
-			if (cp_c && __ip_vs_conn_get(cp)) {
-				IP_VS_DBG(4, "del conn template\n");
-				ip_vs_conn_expire_now(cp_c);
-				__ip_vs_conn_put(cp);
-			}
 		}
 		cond_resched_rcu();
 	}
@@ -1325,15 +1340,19 @@ flush_again:
 		hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
 			if (cp->ipvs != ipvs)
 				continue;
-			IP_VS_DBG(4, "del connection\n");
-			ip_vs_conn_expire_now(cp);
+			/* As timers are expired in LIFO order, restart
+			 * the timer of controlling connection first, so
+			 * that it is expired after us.
+			 */
 			cp_c = cp->control;
 			/* cp->control is valid only with reference to cp */
 			if (cp_c && __ip_vs_conn_get(cp)) {
-				IP_VS_DBG(4, "del conn template\n");
+				IP_VS_DBG(4, "del controlling connection\n");
 				ip_vs_conn_expire_now(cp_c);
 				__ip_vs_conn_put(cp);
 			}
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
 		}
 		cond_resched_rcu();
 	}

From 440534d3c56be04abfb26850ee882d19d223557a Mon Sep 17 00:00:00 2001
From: Gao Feng <gfree.wind@vip.163.com>
Date: Mon, 9 Jul 2018 18:06:33 +0800
Subject: [PATCH 27/38] netfilter: Remove useless param helper of
 nf_ct_helper_ext_add

The param helper of nf_ct_helper_ext_add is useless now, then remove
it now.

Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h | 4 +---
 net/netfilter/nf_conntrack_core.c           | 3 +--
 net/netfilter/nf_conntrack_helper.c         | 5 ++---
 net/netfilter/nf_conntrack_netlink.c        | 2 +-
 net/netfilter/nft_ct.c                      | 2 +-
 net/netfilter/xt_CT.c                       | 2 +-
 net/openvswitch/conntrack.c                 | 2 +-
 7 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 32c2a94a219d..2492120b8097 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -103,9 +103,7 @@ int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int);
 void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *,
 				     unsigned int);
 
-struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct,
-					  struct nf_conntrack_helper *helper,
-					  gfp_t gfp);
+struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp);
 
 int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 			      gfp_t flags);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 4ced7c7102b6..d97d7e9a9ee7 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1401,8 +1401,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
 			ct->master = exp->master;
 			if (exp->helper) {
-				help = nf_ct_helper_ext_add(ct, exp->helper,
-							    GFP_ATOMIC);
+				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
 				if (help)
 					rcu_assign_pointer(help->helper, exp->helper);
 			}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index a55a58c706a9..d557a425289d 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -192,8 +192,7 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper)
 EXPORT_SYMBOL_GPL(nf_conntrack_helper_put);
 
 struct nf_conn_help *
-nf_ct_helper_ext_add(struct nf_conn *ct,
-		     struct nf_conntrack_helper *helper, gfp_t gfp)
+nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
 {
 	struct nf_conn_help *help;
 
@@ -262,7 +261,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
 	}
 
 	if (help == NULL) {
-		help = nf_ct_helper_ext_add(ct, helper, flags);
+		help = nf_ct_helper_ext_add(ct, flags);
 		if (help == NULL)
 			return -ENOMEM;
 	} else {
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 40152b9ad772..f981bfa8db72 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1947,7 +1947,7 @@ ctnetlink_create_conntrack(struct net *net,
 		} else {
 			struct nf_conn_help *help;
 
-			help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);
+			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
 			if (help == NULL) {
 				err = -ENOMEM;
 				goto err2;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 1435ffc5f57e..3bc82ee5464d 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -870,7 +870,7 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj,
 	if (test_bit(IPS_HELPER_BIT, &ct->status))
 		return;
 
-	help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC);
+	help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
 	if (help) {
 		rcu_assign_pointer(help->helper, to_assign);
 		set_bit(IPS_HELPER_BIT, &ct->status);
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 03b9a50ec93b..7ba454e9e3fa 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -93,7 +93,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
 		return -ENOENT;
 	}
 
-	help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
+	help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
 	if (help == NULL) {
 		nf_conntrack_helper_put(helper);
 		return -ENOMEM;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index e05bd3e53f0f..3e33c382367f 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1303,7 +1303,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 		return -EINVAL;
 	}
 
-	help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+	help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL);
 	if (!help) {
 		nf_conntrack_helper_put(helper);
 		return -ENOMEM;

From 452238e8d5ffd8b77f92387519513839d4ca7379 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Jul 2018 13:45:10 +0200
Subject: [PATCH 28/38] netfilter: nf_tables: add and use helper for module
 autoload

module autoload is problematic, it requires dropping the mutex that
protects the transaction.  Once the mutex has been dropped, another
client can start a new transaction before we had a chance to abort
current transaction log.

This helper makes sure we first zap the transaction log, then
drop mutex for module autoload.

In case autload is successful, the caller has to reply entire
message anyway.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 81 ++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 29 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3f211e1025c1..5e95e92e547b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -455,8 +455,40 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
 	return NULL;
 }
 
+/*
+ * Loading a module requires dropping mutex that guards the
+ * transaction.
+ * We first need to abort any pending transactions as once
+ * mutex is unlocked a different client could start a new
+ * transaction.  It must not see any 'future generation'
+ * changes * as these changes will never happen.
+ */
+#ifdef CONFIG_MODULES
+static int __nf_tables_abort(struct net *net);
+
+static void nft_request_module(struct net *net, const char *fmt, ...)
+{
+	char module_name[MODULE_NAME_LEN];
+	va_list args;
+	int ret;
+
+	__nf_tables_abort(net);
+
+	va_start(args, fmt);
+	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+	va_end(args);
+	if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret))
+		return;
+
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	request_module("%s", module_name);
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+}
+#endif
+
 static const struct nft_chain_type *
-nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload)
+nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
+			    u8 family, bool autoload)
 {
 	const struct nft_chain_type *type;
 
@@ -465,10 +497,8 @@ nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload)
 		return type;
 #ifdef CONFIG_MODULES
 	if (autoload) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-chain-%u-%.*s", family,
-			       nla_len(nla), (const char *)nla_data(nla));
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		nft_request_module(net, "nft-chain-%u-%.*s", family,
+				   nla_len(nla), (const char *)nla_data(nla));
 		type = __nf_tables_chain_type_lookup(nla, family);
 		if (type != NULL)
 			return ERR_PTR(-EAGAIN);
@@ -1412,7 +1442,7 @@ static int nft_chain_parse_hook(struct net *net,
 
 	type = chain_type[family][NFT_CHAIN_T_DEFAULT];
 	if (nla[NFTA_CHAIN_TYPE]) {
-		type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
+		type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
 						   family, create);
 		if (IS_ERR(type))
 			return PTR_ERR(type);
@@ -1875,7 +1905,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family,
 	return NULL;
 }
 
-static const struct nft_expr_type *nft_expr_type_get(u8 family,
+static const struct nft_expr_type *nft_expr_type_get(struct net *net,
+						     u8 family,
 						     struct nlattr *nla)
 {
 	const struct nft_expr_type *type;
@@ -1889,17 +1920,13 @@ static const struct nft_expr_type *nft_expr_type_get(u8 family,
 
 #ifdef CONFIG_MODULES
 	if (type == NULL) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-expr-%u-%.*s", family,
-			       nla_len(nla), (char *)nla_data(nla));
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		nft_request_module(net, "nft-expr-%u-%.*s", family,
+				   nla_len(nla), (char *)nla_data(nla));
 		if (__nft_expr_type_get(family, nla))
 			return ERR_PTR(-EAGAIN);
 
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-expr-%.*s",
-			       nla_len(nla), (char *)nla_data(nla));
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		nft_request_module(net, "nft-expr-%.*s",
+				   nla_len(nla), (char *)nla_data(nla));
 		if (__nft_expr_type_get(family, nla))
 			return ERR_PTR(-EAGAIN);
 	}
@@ -1968,7 +1995,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
-	type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
+	type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]);
 	if (IS_ERR(type))
 		return PTR_ERR(type);
 
@@ -2744,9 +2771,7 @@ nft_select_set_ops(const struct nft_ctx *ctx,
 
 #ifdef CONFIG_MODULES
 	if (list_empty(&nf_tables_set_types)) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-set");
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		nft_request_module(ctx->net, "nft-set");
 		if (!list_empty(&nf_tables_set_types))
 			return ERR_PTR(-EAGAIN);
 	}
@@ -4779,7 +4804,8 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
 	return NULL;
 }
 
-static const struct nft_object_type *nft_obj_type_get(u32 objtype)
+static const struct nft_object_type *
+nft_obj_type_get(struct net *net, u32 objtype)
 {
 	const struct nft_object_type *type;
 
@@ -4789,9 +4815,7 @@ static const struct nft_object_type *nft_obj_type_get(u32 objtype)
 
 #ifdef CONFIG_MODULES
 	if (type == NULL) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nft-obj-%u", objtype);
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		nft_request_module(net, "nft-obj-%u", objtype);
 		if (__nft_obj_type_get(objtype))
 			return ERR_PTR(-EAGAIN);
 	}
@@ -4843,7 +4867,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
 
 	nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
 
-	type = nft_obj_type_get(objtype);
+	type = nft_obj_type_get(net, objtype);
 	if (IS_ERR(type))
 		return PTR_ERR(type);
 
@@ -5339,7 +5363,8 @@ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
 	return NULL;
 }
 
-static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
+static const struct nf_flowtable_type *
+nft_flowtable_type_get(struct net *net, u8 family)
 {
 	const struct nf_flowtable_type *type;
 
@@ -5349,9 +5374,7 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
 
 #ifdef CONFIG_MODULES
 	if (type == NULL) {
-		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-		request_module("nf-flowtable-%u", family);
-		nfnl_lock(NFNL_SUBSYS_NFTABLES);
+		nft_request_module(net, "nf-flowtable-%u", family);
 		if (__nft_flowtable_type_get(family))
 			return ERR_PTR(-EAGAIN);
 	}
@@ -5431,7 +5454,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
 		goto err1;
 	}
 
-	type = nft_flowtable_type_get(family);
+	type = nft_flowtable_type_get(net, family);
 	if (IS_ERR(type)) {
 		err = PTR_ERR(type);
 		goto err2;

From ca2f18be792fddd0db2bbf6cbe1ec12d1bb32dd7 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Jul 2018 13:45:11 +0200
Subject: [PATCH 29/38] netfilter: nf_tables: make valid_genid callback
 mandatory

always call this function, followup patch can use this to
aquire a per-netns transaction log to guard the entire batch
instead of using the nfnl susbsys mutex (which is shared among all
namespaces).

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 2 +-
 net/netfilter/nfnetlink.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 5e95e92e547b..594b395442d6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6591,7 +6591,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 
 static bool nf_tables_valid_genid(struct net *net, u32 genid)
 {
-	return net->nft.base_seq == genid;
+	return genid == 0 || net->nft.base_seq == genid;
 }
 
 static const struct nfnetlink_subsystem nf_tables_subsys = {
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e1b6be29848d..94f9bcaa0799 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -331,13 +331,13 @@ replay:
 		}
 	}
 
-	if (!ss->commit || !ss->abort) {
+	if (!ss->valid_genid || !ss->commit || !ss->abort) {
 		nfnl_unlock(subsys_id);
 		netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
 		return kfree_skb(skb);
 	}
 
-	if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
+	if (!ss->valid_genid(net, genid)) {
 		nfnl_unlock(subsys_id);
 		netlink_ack(oskb, nlh, -ERESTART, NULL);
 		return kfree_skb(skb);

From be2ab5b4d5c0bf041a34ec2e1397d50afbfb095e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Jul 2018 13:45:12 +0200
Subject: [PATCH 30/38] netfilter: nf_tables: take module reference when
 starting a batch

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/nfnetlink.h | 1 +
 net/netfilter/nf_tables_api.c       | 1 +
 net/netfilter/nfnetlink.c           | 9 +++++++++
 3 files changed, 11 insertions(+)

diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 3ecc3050be0e..4a520d3304a2 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -29,6 +29,7 @@ struct nfnetlink_subsystem {
 	__u8 subsys_id;			/* nfnetlink subsystem ID */
 	__u8 cb_count;			/* number of callbacks */
 	const struct nfnl_callback *cb;	/* callback for individual types */
+	struct module *owner;
 	int (*commit)(struct net *net, struct sk_buff *skb);
 	int (*abort)(struct net *net, struct sk_buff *skb);
 	void (*cleanup)(struct net *net);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 594b395442d6..c16c481fc52a 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6603,6 +6603,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 	.abort		= nf_tables_abort,
 	.cleanup	= nf_tables_cleanup,
 	.valid_genid	= nf_tables_valid_genid,
+	.owner		= THIS_MODULE,
 };
 
 int nft_chain_validate_dependency(const struct nft_chain *chain,
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 94f9bcaa0799..dd1d7bc23b03 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -337,7 +337,14 @@ replay:
 		return kfree_skb(skb);
 	}
 
+	if (!try_module_get(ss->owner)) {
+		nfnl_unlock(subsys_id);
+		netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
+		return kfree_skb(skb);
+	}
+
 	if (!ss->valid_genid(net, genid)) {
+		module_put(ss->owner);
 		nfnl_unlock(subsys_id);
 		netlink_ack(oskb, nlh, -ERESTART, NULL);
 		return kfree_skb(skb);
@@ -472,6 +479,7 @@ done:
 		nfnl_err_reset(&err_list);
 		nfnl_unlock(subsys_id);
 		kfree_skb(skb);
+		module_put(ss->owner);
 		goto replay;
 	} else if (status == NFNL_BATCH_DONE) {
 		err = ss->commit(net, oskb);
@@ -491,6 +499,7 @@ done:
 	nfnl_err_deliver(&err_list, oskb);
 	nfnl_unlock(subsys_id);
 	kfree_skb(skb);
+	module_put(ss->owner);
 }
 
 static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {

From 2a43ecf96ba6a6eed70dbcd99d0888fc0ad3b82b Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Jul 2018 13:45:13 +0200
Subject: [PATCH 31/38] netfilter: nf_tables: avoid global info storage

This works because all accesses are currently serialized by nfnl
nf_tables subsys mutex.

If we want to have per-netns locking, we need to make this scratch
area pernetns or allocate it on demand.

This does the latter, its ~28kbyte but we can fallback to vmalloc
so it should be fine.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c16c481fc52a..68436edd9cdf 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2454,8 +2454,6 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
 
 #define NFT_RULE_MAXEXPRS	128
 
-static struct nft_expr_info *info;
-
 static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 			     struct sk_buff *skb, const struct nlmsghdr *nlh,
 			     const struct nlattr * const nla[],
@@ -2463,6 +2461,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
 	u8 genmask = nft_genmask_next(net);
+	struct nft_expr_info *info = NULL;
 	int family = nfmsg->nfgen_family;
 	struct nft_table *table;
 	struct nft_chain *chain;
@@ -2533,6 +2532,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	n = 0;
 	size = 0;
 	if (nla[NFTA_RULE_EXPRESSIONS]) {
+		info = kvmalloc_array(NFT_RULE_MAXEXPRS,
+				      sizeof(struct nft_expr_info),
+				      GFP_KERNEL);
+		if (!info)
+			return -ENOMEM;
+
 		nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
 			err = -EINVAL;
 			if (nla_type(tmp) != NFTA_LIST_ELEM)
@@ -2625,6 +2630,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 				list_add_rcu(&rule->list, &chain->rules);
 		}
 	}
+	kvfree(info);
 	chain->use++;
 
 	if (net->nft.validate_state == NFT_VALIDATE_DO)
@@ -2638,6 +2644,7 @@ err1:
 		if (info[i].ops != NULL)
 			module_put(info[i].ops->type->owner);
 	}
+	kvfree(info);
 	return err;
 }
 
@@ -7203,29 +7210,19 @@ static int __init nf_tables_module_init(void)
 
 	nft_chain_filter_init();
 
-	info = kmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info),
-			     GFP_KERNEL);
-	if (info == NULL) {
-		err = -ENOMEM;
-		goto err1;
-	}
-
 	err = nf_tables_core_module_init();
 	if (err < 0)
-		goto err2;
+		return err;
 
 	err = nfnetlink_subsys_register(&nf_tables_subsys);
 	if (err < 0)
-		goto err3;
+		goto err;
 
 	register_netdevice_notifier(&nf_tables_flowtable_notifier);
 
 	return register_pernet_subsys(&nf_tables_net_ops);
-err3:
+err:
 	nf_tables_core_module_exit();
-err2:
-	kfree(info);
-err1:
 	return err;
 }
 
@@ -7237,7 +7234,6 @@ static void __exit nf_tables_module_exit(void)
 	unregister_pernet_subsys(&nf_tables_net_ops);
 	rcu_barrier();
 	nf_tables_core_module_exit();
-	kfree(info);
 }
 
 module_init(nf_tables_module_init);

From f102d66b335a417d4848da9441f585695a838934 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 11 Jul 2018 13:45:14 +0200
Subject: [PATCH 32/38] netfilter: nf_tables: use dedicated mutex to guard
 transactions

Continue to use nftnl subsys mutex to protect (un)registration of hook types,
expressions and so on, but force batch operations to do their own
locking.

This allows distinct net namespaces to perform transactions in parallel.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/nftables.h     |  1 +
 net/netfilter/nf_tables_api.c    | 88 +++++++++++++++++++++++++-------
 net/netfilter/nfnetlink.c        | 10 ++--
 net/netfilter/nft_chain_filter.c |  4 +-
 net/netfilter/nft_dynset.c       |  2 +
 5 files changed, 77 insertions(+), 28 deletions(-)

diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index 94767ea3a490..286fd960896f 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -7,6 +7,7 @@
 struct netns_nftables {
 	struct list_head	tables;
 	struct list_head	commit_list;
+	struct mutex		commit_mutex;
 	unsigned int		base_seq;
 	u8			gencursor;
 	u8			validate_state;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 68436edd9cdf..c0fb2bcd30fe 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -480,12 +480,19 @@ static void nft_request_module(struct net *net, const char *fmt, ...)
 	if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret))
 		return;
 
-	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	mutex_unlock(&net->nft.commit_mutex);
 	request_module("%s", module_name);
-	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	mutex_lock(&net->nft.commit_mutex);
 }
 #endif
 
+static void lockdep_nfnl_nft_mutex_not_held(void)
+{
+#ifdef CONFIG_PROVE_LOCKING
+	WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+#endif
+}
+
 static const struct nft_chain_type *
 nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
 			    u8 family, bool autoload)
@@ -495,6 +502,8 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
 	type = __nf_tables_chain_type_lookup(nla, family);
 	if (type != NULL)
 		return type;
+
+	lockdep_nfnl_nft_mutex_not_held();
 #ifdef CONFIG_MODULES
 	if (autoload) {
 		nft_request_module(net, "nft-chain-%u-%.*s", family,
@@ -802,6 +811,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
 	struct nft_ctx ctx;
 	int err;
 
+	lockdep_assert_held(&net->nft.commit_mutex);
 	attr = nla[NFTA_TABLE_NAME];
 	table = nft_table_lookup(net, attr, family, genmask);
 	if (IS_ERR(table)) {
@@ -1042,7 +1052,17 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
 	return ERR_PTR(-ENOENT);
 }
 
-static struct nft_chain *nft_chain_lookup(struct nft_table *table,
+static bool lockdep_commit_lock_is_held(struct net *net)
+{
+#ifdef CONFIG_PROVE_LOCKING
+	return lockdep_is_held(&net->nft.commit_mutex);
+#else
+	return true;
+#endif
+}
+
+static struct nft_chain *nft_chain_lookup(struct net *net,
+					  struct nft_table *table,
 					  const struct nlattr *nla, u8 genmask)
 {
 	char search[NFT_CHAIN_MAXNAMELEN + 1];
@@ -1055,7 +1075,7 @@ static struct nft_chain *nft_chain_lookup(struct nft_table *table,
 	nla_strlcpy(search, nla, sizeof(search));
 
 	WARN_ON(!rcu_read_lock_held() &&
-		!lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+		!lockdep_commit_lock_is_held(net));
 
 	chain = ERR_PTR(-ENOENT);
 	rcu_read_lock();
@@ -1295,7 +1315,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 	}
 
-	chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+	chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask);
 	if (IS_ERR(chain)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
 		return PTR_ERR(chain);
@@ -1428,6 +1448,9 @@ static int nft_chain_parse_hook(struct net *net,
 	struct net_device *dev;
 	int err;
 
+	lockdep_assert_held(&net->nft.commit_mutex);
+	lockdep_nfnl_nft_mutex_not_held();
+
 	err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
 			       nft_hook_policy, NULL);
 	if (err < 0)
@@ -1662,7 +1685,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
 	    nla[NFTA_CHAIN_NAME]) {
 		struct nft_chain *chain2;
 
-		chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+		chain2 = nft_chain_lookup(ctx->net, table,
+					  nla[NFTA_CHAIN_NAME], genmask);
 		if (!IS_ERR(chain2))
 			return -EEXIST;
 	}
@@ -1724,6 +1748,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
+	lockdep_assert_held(&net->nft.commit_mutex);
+
 	table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
 	if (IS_ERR(table)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
@@ -1742,7 +1768,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
 		}
 		attr = nla[NFTA_CHAIN_HANDLE];
 	} else {
-		chain = nft_chain_lookup(table, attr, genmask);
+		chain = nft_chain_lookup(net, table, attr, genmask);
 		if (IS_ERR(chain)) {
 			if (PTR_ERR(chain) != -ENOENT) {
 				NL_SET_BAD_ATTR(extack, attr);
@@ -1820,7 +1846,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
 		chain = nft_chain_lookup_byhandle(table, handle, genmask);
 	} else {
 		attr = nla[NFTA_CHAIN_NAME];
-		chain = nft_chain_lookup(table, attr, genmask);
+		chain = nft_chain_lookup(net, table, attr, genmask);
 	}
 	if (IS_ERR(chain)) {
 		NL_SET_BAD_ATTR(extack, attr);
@@ -1918,6 +1944,7 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
 	if (type != NULL && try_module_get(type->owner))
 		return type;
 
+	lockdep_nfnl_nft_mutex_not_held();
 #ifdef CONFIG_MODULES
 	if (type == NULL) {
 		nft_request_module(net, "nft-expr-%u-%.*s", family,
@@ -2352,7 +2379,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 	}
 
-	chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+	chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
 	if (IS_ERR(chain)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
 		return PTR_ERR(chain);
@@ -2386,6 +2413,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
 {
 	struct nft_expr *expr;
 
+	lockdep_assert_held(&ctx->net->nft.commit_mutex);
 	/*
 	 * Careful: some expressions might not be initialized in case this
 	 * is called on error from nf_tables_newrule().
@@ -2476,6 +2504,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 	bool create;
 	u64 handle, pos_handle;
 
+	lockdep_assert_held(&net->nft.commit_mutex);
+
 	create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
 
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
@@ -2484,7 +2514,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 	}
 
-	chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+	chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
 	if (IS_ERR(chain)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
 		return PTR_ERR(chain);
@@ -2684,7 +2714,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
 	}
 
 	if (nla[NFTA_RULE_CHAIN]) {
-		chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+		chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
+					 genmask);
 		if (IS_ERR(chain)) {
 			NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
 			return PTR_ERR(chain);
@@ -2776,6 +2807,8 @@ nft_select_set_ops(const struct nft_ctx *ctx,
 	const struct nft_set_type *type;
 	u32 flags = 0;
 
+	lockdep_assert_held(&ctx->net->nft.commit_mutex);
+	lockdep_nfnl_nft_mutex_not_held();
 #ifdef CONFIG_MODULES
 	if (list_empty(&nf_tables_set_types)) {
 		nft_request_module(ctx->net, "nft-set");
@@ -4820,6 +4853,7 @@ nft_obj_type_get(struct net *net, u32 objtype)
 	if (type != NULL && try_module_get(type->owner))
 		return type;
 
+	lockdep_nfnl_nft_mutex_not_held();
 #ifdef CONFIG_MODULES
 	if (type == NULL) {
 		nft_request_module(net, "nft-obj-%u", objtype);
@@ -5379,6 +5413,7 @@ nft_flowtable_type_get(struct net *net, u8 family)
 	if (type != NULL && try_module_get(type->owner))
 		return type;
 
+	lockdep_nfnl_nft_mutex_not_held();
 #ifdef CONFIG_MODULES
 	if (type == NULL) {
 		nft_request_module(net, "nf-flowtable-%u", family);
@@ -6232,9 +6267,9 @@ static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *cha
 	next_genbit = nft_gencursor_next(net);
 
 	g0 = rcu_dereference_protected(chain->rules_gen_0,
-				       lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+				       lockdep_commit_lock_is_held(net));
 	g1 = rcu_dereference_protected(chain->rules_gen_1,
-				       lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+				       lockdep_commit_lock_is_held(net));
 
 	/* No changes to this chain? */
 	if (chain->rules_next == NULL) {
@@ -6442,6 +6477,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
 
 	nf_tables_commit_release(net);
 	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
+	mutex_unlock(&net->nft.commit_mutex);
 
 	return 0;
 }
@@ -6593,12 +6629,25 @@ static void nf_tables_cleanup(struct net *net)
 
 static int nf_tables_abort(struct net *net, struct sk_buff *skb)
 {
-	return __nf_tables_abort(net);
+	int ret = __nf_tables_abort(net);
+
+	mutex_unlock(&net->nft.commit_mutex);
+
+	return ret;
 }
 
 static bool nf_tables_valid_genid(struct net *net, u32 genid)
 {
-	return genid == 0 || net->nft.base_seq == genid;
+	bool genid_ok;
+
+	mutex_lock(&net->nft.commit_mutex);
+
+	genid_ok = genid == 0 || net->nft.base_seq == genid;
+	if (!genid_ok)
+		mutex_unlock(&net->nft.commit_mutex);
+
+	/* else, commit mutex has to be released by commit or abort function */
+	return genid_ok;
 }
 
 static const struct nfnetlink_subsystem nf_tables_subsys = {
@@ -6937,8 +6986,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
 	case NFT_GOTO:
 		if (!tb[NFTA_VERDICT_CHAIN])
 			return -EINVAL;
-		chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN],
-					 genmask);
+		chain = nft_chain_lookup(ctx->net, ctx->table,
+					 tb[NFTA_VERDICT_CHAIN], genmask);
 		if (IS_ERR(chain))
 			return PTR_ERR(chain);
 		if (nft_is_base_chain(chain))
@@ -7183,6 +7232,7 @@ static int __net_init nf_tables_init_net(struct net *net)
 {
 	INIT_LIST_HEAD(&net->nft.tables);
 	INIT_LIST_HEAD(&net->nft.commit_list);
+	mutex_init(&net->nft.commit_mutex);
 	net->nft.base_seq = 1;
 	net->nft.validate_state = NFT_VALIDATE_SKIP;
 
@@ -7191,11 +7241,11 @@ static int __net_init nf_tables_init_net(struct net *net)
 
 static void __net_exit nf_tables_exit_net(struct net *net)
 {
-	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	mutex_lock(&net->nft.commit_mutex);
 	if (!list_empty(&net->nft.commit_list))
 		__nf_tables_abort(net);
 	__nft_release_tables(net);
-	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	mutex_unlock(&net->nft.commit_mutex);
 	WARN_ON_ONCE(!list_empty(&net->nft.tables));
 }
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index dd1d7bc23b03..916913454624 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -350,6 +350,8 @@ replay:
 		return kfree_skb(skb);
 	}
 
+	nfnl_unlock(subsys_id);
+
 	while (skb->len >= nlmsg_total_size(0)) {
 		int msglen, type;
 
@@ -471,13 +473,8 @@ ack:
 	}
 done:
 	if (status & NFNL_BATCH_REPLAY) {
-		const struct nfnetlink_subsystem *ss2;
-
-		ss2 = nfnl_dereference_protected(subsys_id);
-		if (ss2 == ss)
-			ss->abort(net, oskb);
+		ss->abort(net, oskb);
 		nfnl_err_reset(&err_list);
-		nfnl_unlock(subsys_id);
 		kfree_skb(skb);
 		module_put(ss->owner);
 		goto replay;
@@ -497,7 +494,6 @@ done:
 		ss->cleanup(net);
 
 	nfnl_err_deliver(&err_list, oskb);
-	nfnl_unlock(subsys_id);
 	kfree_skb(skb);
 	module_put(ss->owner);
 }
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index d21834bed805..ea5b7c4944f6 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -322,7 +322,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
 	if (!ctx.net)
 		return NOTIFY_DONE;
 
-	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	mutex_lock(&ctx.net->nft.commit_mutex);
 	list_for_each_entry(table, &ctx.net->nft.tables, list) {
 		if (table->family != NFPROTO_NETDEV)
 			continue;
@@ -337,7 +337,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
 			nft_netdev_event(event, dev, &ctx);
 		}
 	}
-	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+	mutex_unlock(&ctx.net->nft.commit_mutex);
 	put_net(ctx.net);
 
 	return NOTIFY_DONE;
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 27d7e4598ab6..81184c244d1a 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -118,6 +118,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	u64 timeout;
 	int err;
 
+	lockdep_assert_held(&ctx->net->nft.commit_mutex);
+
 	if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
 	    tb[NFTA_DYNSET_OP] == NULL ||
 	    tb[NFTA_DYNSET_SREG_KEY] == NULL)

From 06ff4aa252303bd2a5d706008210bb49d9889b9d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 13 Jul 2018 14:54:43 +0200
Subject: [PATCH 33/38] netfilter: nf_osf: add nf_osf_match_one()

This new function allows us to check if there is TCP syn packet matching
with a given fingerprint that can be reused from the upcoming new
nf_osf_find() function.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_osf.c | 207 ++++++++++++++++++++++-------------------
 1 file changed, 111 insertions(+), 96 deletions(-)

diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
index 5ba5c7bef2f9..bd7b34dd7d87 100644
--- a/net/netfilter/nf_osf.c
+++ b/net/netfilter/nf_osf.c
@@ -21,15 +21,14 @@
 #include <linux/netfilter/nf_osf.h>
 
 static inline int nf_osf_ttl(const struct sk_buff *skb,
-			     const struct nf_osf_info *info,
-			     unsigned char f_ttl)
+			     int ttl_check, unsigned char f_ttl)
 {
 	const struct iphdr *ip = ip_hdr(skb);
 
-	if (info->flags & NF_OSF_TTL) {
-		if (info->ttl == NF_OSF_TTL_TRUE)
+	if (ttl_check != -1) {
+		if (ttl_check == NF_OSF_TTL_TRUE)
 			return ip->ttl == f_ttl;
-		if (info->ttl == NF_OSF_TTL_NOCHECK)
+		if (ttl_check == NF_OSF_TTL_NOCHECK)
 			return 1;
 		else if (ip->ttl <= f_ttl)
 			return 1;
@@ -52,6 +51,104 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
 	return ip->ttl == f_ttl;
 }
 
+static bool nf_osf_match_one(const struct sk_buff *skb,
+			     const struct nf_osf_user_finger *f,
+			     int ttl_check, u16 totlen, u16 window,
+			     const unsigned char *optp,
+			     unsigned int optsize)
+{
+	unsigned int check_WSS = 0;
+	int fmatch = FMATCH_WRONG;
+	int foptsize, optnum;
+	u16 mss = 0;
+
+	if (totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl))
+		return false;
+
+	/*
+	 * Should not happen if userspace parser was written correctly.
+	 */
+	if (f->wss.wc >= OSF_WSS_MAX)
+		return false;
+
+	/* Check options */
+
+	foptsize = 0;
+	for (optnum = 0; optnum < f->opt_num; ++optnum)
+		foptsize += f->opt[optnum].length;
+
+	if (foptsize > MAX_IPOPTLEN ||
+	    optsize > MAX_IPOPTLEN ||
+	    optsize != foptsize)
+		return false;
+
+	check_WSS = f->wss.wc;
+
+	for (optnum = 0; optnum < f->opt_num; ++optnum) {
+		if (f->opt[optnum].kind == (*optp)) {
+			__u32 len = f->opt[optnum].length;
+			const __u8 *optend = optp + len;
+
+			fmatch = FMATCH_OK;
+
+			switch (*optp) {
+			case OSFOPT_MSS:
+				mss = optp[3];
+				mss <<= 8;
+				mss |= optp[2];
+
+				mss = ntohs((__force __be16)mss);
+				break;
+			case OSFOPT_TS:
+				break;
+			}
+
+			optp = optend;
+		} else
+			fmatch = FMATCH_OPT_WRONG;
+
+		if (fmatch != FMATCH_OK)
+			break;
+	}
+
+	if (fmatch != FMATCH_OPT_WRONG) {
+		fmatch = FMATCH_WRONG;
+
+		switch (check_WSS) {
+		case OSF_WSS_PLAIN:
+			if (f->wss.val == 0 || window == f->wss.val)
+				fmatch = FMATCH_OK;
+			break;
+		case OSF_WSS_MSS:
+			/*
+			 * Some smart modems decrease mangle MSS to
+			 * SMART_MSS_2, so we check standard, decreased
+			 * and the one provided in the fingerprint MSS
+			 * values.
+			 */
+#define SMART_MSS_1	1460
+#define SMART_MSS_2	1448
+			if (window == f->wss.val * mss ||
+			    window == f->wss.val * SMART_MSS_1 ||
+			    window == f->wss.val * SMART_MSS_2)
+				fmatch = FMATCH_OK;
+			break;
+		case OSF_WSS_MTU:
+			if (window == f->wss.val * (mss + 40) ||
+			    window == f->wss.val * (SMART_MSS_1 + 40) ||
+			    window == f->wss.val * (SMART_MSS_2 + 40))
+				fmatch = FMATCH_OK;
+			break;
+		case OSF_WSS_MODULO:
+			if ((window % f->wss.val) == 0)
+				fmatch = FMATCH_OK;
+			break;
+		}
+	}
+
+	return fmatch == FMATCH_OK;
+}
+
 bool
 nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 	     int hooknum, struct net_device *in, struct net_device *out,
@@ -59,15 +156,16 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 	     const struct list_head *nf_osf_fingers)
 {
 	const unsigned char *optp = NULL, *_optp = NULL;
-	unsigned int optsize = 0, check_WSS = 0;
-	int fmatch = FMATCH_WRONG, fcount = 0;
 	const struct iphdr *ip = ip_hdr(skb);
 	const struct nf_osf_user_finger *f;
 	unsigned char opts[MAX_IPOPTLEN];
 	const struct nf_osf_finger *kf;
-	u16 window, totlen, mss = 0;
+	int fcount = 0, ttl_check;
+	int fmatch = FMATCH_WRONG;
+	unsigned int optsize = 0;
 	const struct tcphdr *tcp;
 	struct tcphdr _tcph;
+	u16 window, totlen;
 	bool df;
 
 	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
@@ -88,103 +186,20 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 				sizeof(struct tcphdr), optsize, opts);
 	}
 
+	ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1;
+
 	list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
-		int foptsize, optnum;
 
 		f = &kf->finger;
 
 		if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
 			continue;
 
-		optp = _optp;
-		fmatch = FMATCH_WRONG;
-
-		if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
+		if (!nf_osf_match_one(skb, f,
+				      ttl_check, totlen, window, optp, optsize))
 			continue;
 
-		/*
-		 * Should not happen if userspace parser was written correctly.
-		 */
-		if (f->wss.wc >= OSF_WSS_MAX)
-			continue;
-
-		/* Check options */
-
-		foptsize = 0;
-		for (optnum = 0; optnum < f->opt_num; ++optnum)
-			foptsize += f->opt[optnum].length;
-
-		if (foptsize > MAX_IPOPTLEN ||
-		    optsize > MAX_IPOPTLEN ||
-		    optsize != foptsize)
-			continue;
-
-		check_WSS = f->wss.wc;
-
-		for (optnum = 0; optnum < f->opt_num; ++optnum) {
-			if (f->opt[optnum].kind == (*optp)) {
-				__u32 len = f->opt[optnum].length;
-				const __u8 *optend = optp + len;
-
-				fmatch = FMATCH_OK;
-
-				switch (*optp) {
-				case OSFOPT_MSS:
-					mss = optp[3];
-					mss <<= 8;
-					mss |= optp[2];
-
-					mss = ntohs((__force __be16)mss);
-					break;
-				case OSFOPT_TS:
-					break;
-				}
-
-				optp = optend;
-			} else
-				fmatch = FMATCH_OPT_WRONG;
-
-			if (fmatch != FMATCH_OK)
-				break;
-		}
-
-		if (fmatch != FMATCH_OPT_WRONG) {
-			fmatch = FMATCH_WRONG;
-
-			switch (check_WSS) {
-			case OSF_WSS_PLAIN:
-				if (f->wss.val == 0 || window == f->wss.val)
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MSS:
-				/*
-				 * Some smart modems decrease mangle MSS to
-				 * SMART_MSS_2, so we check standard, decreased
-				 * and the one provided in the fingerprint MSS
-				 * values.
-				 */
-#define SMART_MSS_1	1460
-#define SMART_MSS_2	1448
-				if (window == f->wss.val * mss ||
-				    window == f->wss.val * SMART_MSS_1 ||
-				    window == f->wss.val * SMART_MSS_2)
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MTU:
-				if (window == f->wss.val * (mss + 40) ||
-				    window == f->wss.val * (SMART_MSS_1 + 40) ||
-				    window == f->wss.val * (SMART_MSS_2 + 40))
-					fmatch = FMATCH_OK;
-				break;
-			case OSF_WSS_MODULO:
-				if ((window % f->wss.val) == 0)
-					fmatch = FMATCH_OK;
-				break;
-			}
-		}
-
-		if (fmatch != FMATCH_OK)
-			continue;
+		fmatch = FMATCH_OK;
 
 		fcount++;
 

From 31a9c29210e2d8129d2e81acb89babb56916c6c9 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 13 Jul 2018 14:54:44 +0200
Subject: [PATCH 34/38] netfilter: nf_osf: add struct nf_osf_hdr_ctx

Wrap context that allow us to guess the OS into a structure.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_osf.c | 105 ++++++++++++++++++++++++-----------------
 1 file changed, 62 insertions(+), 43 deletions(-)

diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
index bd7b34dd7d87..b44d62d5d9a9 100644
--- a/net/netfilter/nf_osf.c
+++ b/net/netfilter/nf_osf.c
@@ -51,18 +51,25 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
 	return ip->ttl == f_ttl;
 }
 
+struct nf_osf_hdr_ctx {
+	bool			df;
+	u16			window;
+	u16			totlen;
+	const unsigned char	*optp;
+	unsigned int		optsize;
+};
+
 static bool nf_osf_match_one(const struct sk_buff *skb,
 			     const struct nf_osf_user_finger *f,
-			     int ttl_check, u16 totlen, u16 window,
-			     const unsigned char *optp,
-			     unsigned int optsize)
+			     int ttl_check,
+			     struct nf_osf_hdr_ctx *ctx)
 {
 	unsigned int check_WSS = 0;
 	int fmatch = FMATCH_WRONG;
 	int foptsize, optnum;
 	u16 mss = 0;
 
-	if (totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl))
+	if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl))
 		return false;
 
 	/*
@@ -78,24 +85,24 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 		foptsize += f->opt[optnum].length;
 
 	if (foptsize > MAX_IPOPTLEN ||
-	    optsize > MAX_IPOPTLEN ||
-	    optsize != foptsize)
+	    ctx->optsize > MAX_IPOPTLEN ||
+	    ctx->optsize != foptsize)
 		return false;
 
 	check_WSS = f->wss.wc;
 
 	for (optnum = 0; optnum < f->opt_num; ++optnum) {
-		if (f->opt[optnum].kind == (*optp)) {
+		if (f->opt[optnum].kind == *ctx->optp) {
 			__u32 len = f->opt[optnum].length;
-			const __u8 *optend = optp + len;
+			const __u8 *optend = ctx->optp + len;
 
 			fmatch = FMATCH_OK;
 
-			switch (*optp) {
+			switch (*ctx->optp) {
 			case OSFOPT_MSS:
-				mss = optp[3];
+				mss = ctx->optp[3];
 				mss <<= 8;
-				mss |= optp[2];
+				mss |= ctx->optp[2];
 
 				mss = ntohs((__force __be16)mss);
 				break;
@@ -103,7 +110,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 				break;
 			}
 
-			optp = optend;
+			ctx->optp = optend;
 		} else
 			fmatch = FMATCH_OPT_WRONG;
 
@@ -116,7 +123,7 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 
 		switch (check_WSS) {
 		case OSF_WSS_PLAIN:
-			if (f->wss.val == 0 || window == f->wss.val)
+			if (f->wss.val == 0 || ctx->window == f->wss.val)
 				fmatch = FMATCH_OK;
 			break;
 		case OSF_WSS_MSS:
@@ -128,19 +135,19 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 			 */
 #define SMART_MSS_1	1460
 #define SMART_MSS_2	1448
-			if (window == f->wss.val * mss ||
-			    window == f->wss.val * SMART_MSS_1 ||
-			    window == f->wss.val * SMART_MSS_2)
+			if (ctx->window == f->wss.val * mss ||
+			    ctx->window == f->wss.val * SMART_MSS_1 ||
+			    ctx->window == f->wss.val * SMART_MSS_2)
 				fmatch = FMATCH_OK;
 			break;
 		case OSF_WSS_MTU:
-			if (window == f->wss.val * (mss + 40) ||
-			    window == f->wss.val * (SMART_MSS_1 + 40) ||
-			    window == f->wss.val * (SMART_MSS_2 + 40))
+			if (ctx->window == f->wss.val * (mss + 40) ||
+			    ctx->window == f->wss.val * (SMART_MSS_1 + 40) ||
+			    ctx->window == f->wss.val * (SMART_MSS_2 + 40))
 				fmatch = FMATCH_OK;
 			break;
 		case OSF_WSS_MODULO:
-			if ((window % f->wss.val) == 0)
+			if ((ctx->window % f->wss.val) == 0)
 				fmatch = FMATCH_OK;
 			break;
 		}
@@ -149,54 +156,66 @@ static bool nf_osf_match_one(const struct sk_buff *skb,
 	return fmatch == FMATCH_OK;
 }
 
+static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
+						const struct sk_buff *skb,
+						const struct iphdr *ip,
+						unsigned char *opts)
+{
+	const struct tcphdr *tcp;
+	struct tcphdr _tcph;
+
+	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+	if (!tcp)
+		return NULL;
+
+	if (!tcp->syn)
+		return NULL;
+
+	ctx->totlen = ntohs(ip->tot_len);
+	ctx->df = ntohs(ip->frag_off) & IP_DF;
+	ctx->window = ntohs(tcp->window);
+
+	if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+		ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+		ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+				sizeof(struct tcphdr), ctx->optsize, opts);
+	}
+
+	return tcp;
+}
+
 bool
 nf_osf_match(const struct sk_buff *skb, u_int8_t family,
 	     int hooknum, struct net_device *in, struct net_device *out,
 	     const struct nf_osf_info *info, struct net *net,
 	     const struct list_head *nf_osf_fingers)
 {
-	const unsigned char *optp = NULL, *_optp = NULL;
 	const struct iphdr *ip = ip_hdr(skb);
 	const struct nf_osf_user_finger *f;
 	unsigned char opts[MAX_IPOPTLEN];
 	const struct nf_osf_finger *kf;
 	int fcount = 0, ttl_check;
 	int fmatch = FMATCH_WRONG;
-	unsigned int optsize = 0;
+	struct nf_osf_hdr_ctx ctx;
 	const struct tcphdr *tcp;
-	struct tcphdr _tcph;
-	u16 window, totlen;
-	bool df;
 
-	tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+	memset(&ctx, 0, sizeof(ctx));
+
+	tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
 	if (!tcp)
 		return false;
 
-	if (!tcp->syn)
-		return false;
-
-	totlen = ntohs(ip->tot_len);
-	df = ntohs(ip->frag_off) & IP_DF;
-	window = ntohs(tcp->window);
-
-	if (tcp->doff * 4 > sizeof(struct tcphdr)) {
-		optsize = tcp->doff * 4 - sizeof(struct tcphdr);
-
-		_optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
-				sizeof(struct tcphdr), optsize, opts);
-	}
-
 	ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1;
 
-	list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
+	list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
 
 		f = &kf->finger;
 
 		if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
 			continue;
 
-		if (!nf_osf_match_one(skb, f,
-				      ttl_check, totlen, window, optp, optsize))
+		if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
 			continue;
 
 		fmatch = FMATCH_OK;

From 365b5a36f352e9884e85c47aa33026fd4df18633 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= <ecklm94@gmail.com>
Date: Thu, 12 Jul 2018 17:18:46 +0200
Subject: [PATCH 35/38] netfilter: nft_socket: Break evaluation if no socket
 found
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Actual implementation stores 0 in the destination register if no socket
is found by the lookup, but that is not intentional as it is not really
a value of any socket metadata.

This patch fixes this and breaks rule evaluation in this case.

Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching")
Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index e43c1939d25f..622ac2012a40 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -43,7 +43,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
 		}
 
 	if (!sk) {
-		nft_reg_store8(dest, 0);
+		regs->verdict.code = NFT_BREAK;
 		return;
 	}
 

From 7d25f8851a2c03319bfa8e56bb40bde2c4621392 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Eckl?= <ecklm94@gmail.com>
Date: Thu, 12 Jul 2018 17:48:06 +0200
Subject: [PATCH 36/38] netfilter: nft_socket: Expose socket mark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  4 +++-
 net/netfilter/nft_socket.c               | 11 +++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 89438e68dc03..f466860bcf75 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -921,10 +921,12 @@ enum nft_socket_attributes {
 /*
  * enum nft_socket_keys - nf_tables socket expression keys
  *
- * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option_
+ * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option
+ * @NFT_SOCKET_MARK: Value of the socket mark
  */
 enum nft_socket_keys {
 	NFT_SOCKET_TRANSPARENT,
+	NFT_SOCKET_MARK,
 	__NFT_SOCKET_MAX
 };
 #define NFT_SOCKET_MAX	(__NFT_SOCKET_MAX - 1)
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 622ac2012a40..d7f3776dfd71 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -54,6 +54,14 @@ static void nft_socket_eval(const struct nft_expr *expr,
 	case NFT_SOCKET_TRANSPARENT:
 		nft_reg_store8(dest, inet_sk_transparent(sk));
 		break;
+	case NFT_SOCKET_MARK:
+		if (sk_fullsock(sk)) {
+			*dest = sk->sk_mark;
+		} else {
+			regs->verdict.code = NFT_BREAK;
+			return;
+		}
+		break;
 	default:
 		WARN_ON(1);
 		regs->verdict.code = NFT_BREAK;
@@ -91,6 +99,9 @@ static int nft_socket_init(const struct nft_ctx *ctx,
 	case NFT_SOCKET_TRANSPARENT:
 		len = sizeof(u8);
 		break;
+	case NFT_SOCKET_MARK:
+		len = sizeof(u32);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}

From 70b095c84326640eeacfd69a411db8fc36e8ab1a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Sat, 14 Jul 2018 01:14:01 +0200
Subject: [PATCH 37/38] ipv6: remove dependency of nf_defrag_ipv6 on ipv6
 module

IPV6=m
DEFRAG_IPV6=m
CONNTRACK=y yields:

net/netfilter/nf_conntrack_proto.o: In function `nf_ct_netns_do_get':
net/netfilter/nf_conntrack_proto.c:802: undefined reference to `nf_defrag_ipv6_enable'
net/netfilter/nf_conntrack_proto.o:(.rodata+0x640): undefined reference to `nf_conntrack_l4proto_icmpv6'

Setting DEFRAG_IPV6=y causes undefined references to ip6_rhash_params
ip6_frag_init and ip6_expire_frag_queue so it would be needed to force
IPV6=y too.

This patch gets rid of the 'followup linker error' by removing
the dependency of ipv6.ko symbols from netfilter ipv6 defrag.

Shared code is placed into a header, then used from both.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ipv6.h                        |  28 ------
 include/net/ipv6_frag.h                   | 104 ++++++++++++++++++++++
 net/ieee802154/6lowpan/reassembly.c       |   2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c   |  17 ++--
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c |   3 +-
 net/ipv6/reassembly.c                     |  92 ++-----------------
 net/openvswitch/conntrack.c               |   1 +
 7 files changed, 126 insertions(+), 121 deletions(-)
 create mode 100644 include/net/ipv6_frag.h

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index aa6fd11a887c..3720958cd4e1 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -581,34 +581,6 @@ static inline bool ipv6_prefix_equal(const struct in6_addr *addr1,
 }
 #endif
 
-struct inet_frag_queue;
-
-enum ip6_defrag_users {
-	IP6_DEFRAG_LOCAL_DELIVER,
-	IP6_DEFRAG_CONNTRACK_IN,
-	__IP6_DEFRAG_CONNTRACK_IN	= IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX,
-	IP6_DEFRAG_CONNTRACK_OUT,
-	__IP6_DEFRAG_CONNTRACK_OUT	= IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
-	IP6_DEFRAG_CONNTRACK_BRIDGE_IN,
-	__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
-};
-
-void ip6_frag_init(struct inet_frag_queue *q, const void *a);
-extern const struct rhashtable_params ip6_rhash_params;
-
-/*
- *	Equivalent of ipv4 struct ip
- */
-struct frag_queue {
-	struct inet_frag_queue	q;
-
-	int			iif;
-	__u16			nhoffset;
-	u8			ecn;
-};
-
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
-
 static inline bool ipv6_addr_any(const struct in6_addr *a)
 {
 #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
new file mode 100644
index 000000000000..6ced1e6899b6
--- /dev/null
+++ b/include/net/ipv6_frag.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IPV6_FRAG_H
+#define _IPV6_FRAG_H
+#include <linux/kernel.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+enum ip6_defrag_users {
+	IP6_DEFRAG_LOCAL_DELIVER,
+	IP6_DEFRAG_CONNTRACK_IN,
+	__IP6_DEFRAG_CONNTRACK_IN	= IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX,
+	IP6_DEFRAG_CONNTRACK_OUT,
+	__IP6_DEFRAG_CONNTRACK_OUT	= IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX,
+	IP6_DEFRAG_CONNTRACK_BRIDGE_IN,
+	__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
+};
+
+/*
+ *	Equivalent of ipv4 struct ip
+ */
+struct frag_queue {
+	struct inet_frag_queue	q;
+
+	int			iif;
+	__u16			nhoffset;
+	u8			ecn;
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline void ip6frag_init(struct inet_frag_queue *q, const void *a)
+{
+	struct frag_queue *fq = container_of(q, struct frag_queue, q);
+	const struct frag_v6_compare_key *key = a;
+
+	q->key.v6 = *key;
+	fq->ecn = 0;
+}
+
+static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed)
+{
+	return jhash2(data,
+		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
+}
+
+static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct inet_frag_queue *fq = data;
+
+	return jhash2((const u32 *)&fq->key.v6,
+		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
+}
+
+static inline int
+ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct frag_v6_compare_key *key = arg->key;
+	const struct inet_frag_queue *fq = ptr;
+
+	return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+static inline void
+ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
+{
+	struct net_device *dev = NULL;
+	struct sk_buff *head;
+
+	rcu_read_lock();
+	spin_lock(&fq->q.lock);
+
+	if (fq->q.flags & INET_FRAG_COMPLETE)
+		goto out;
+
+	inet_frag_kill(&fq->q);
+
+	dev = dev_get_by_index_rcu(net, fq->iif);
+	if (!dev)
+		goto out;
+
+	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
+	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+
+	/* Don't send error if the first segment did not arrive. */
+	head = fq->q.fragments;
+	if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
+		goto out;
+
+	head->dev = dev;
+	skb_get(head);
+	spin_unlock(&fq->q.lock);
+
+	icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
+	kfree_skb(head);
+	goto out_rcu_unlock;
+
+out:
+	spin_unlock(&fq->q.lock);
+out_rcu_unlock:
+	rcu_read_unlock();
+	inet_frag_put(&fq->q);
+}
+#endif
+#endif
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 2cc224106b69..ec7a5da56129 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -25,7 +25,7 @@
 
 #include <net/ieee802154_netdev.h>
 #include <net/6lowpan.h>
-#include <net/ipv6.h>
+#include <net/ipv6_frag.h>
 #include <net/inet_frag.h>
 
 #include "6lowpan_i.h"
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index a452d99c9f52..333ee3256964 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -33,9 +33,8 @@
 
 #include <net/sock.h>
 #include <net/snmp.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
 
-#include <net/ipv6.h>
 #include <net/protocol.h>
 #include <net/transp_v6.h>
 #include <net/rawv6.h>
@@ -151,7 +150,7 @@ static void nf_ct_frag6_expire(struct timer_list *t)
 	fq = container_of(frag, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, nf_frag.frags);
 
-	ip6_expire_frag_queue(net, fq);
+	ip6frag_expire_frag_queue(net, fq);
 }
 
 /* Creation primitives. */
@@ -622,16 +621,24 @@ static struct pernet_operations nf_ct_net_ops = {
 	.exit = nf_ct_net_exit,
 };
 
+static const struct rhashtable_params nfct_rhash_params = {
+	.head_offset		= offsetof(struct inet_frag_queue, node),
+	.hashfn			= ip6frag_key_hashfn,
+	.obj_hashfn		= ip6frag_obj_hashfn,
+	.obj_cmpfn		= ip6frag_obj_cmpfn,
+	.automatic_shrinking	= true,
+};
+
 int nf_ct_frag6_init(void)
 {
 	int ret = 0;
 
-	nf_frags.constructor = ip6_frag_init;
+	nf_frags.constructor = ip6frag_init;
 	nf_frags.destructor = NULL;
 	nf_frags.qsize = sizeof(struct frag_queue);
 	nf_frags.frag_expire = nf_ct_frag6_expire;
 	nf_frags.frags_cache_name = nf_frags_cache_name;
-	nf_frags.rhash_params = ip6_rhash_params;
+	nf_frags.rhash_params = nfct_rhash_params;
 	ret = inet_frags_init(&nf_frags);
 	if (ret)
 		goto out;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index e631be25337e..72dd3e202375 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -14,8 +14,7 @@
 #include <linux/skbuff.h>
 #include <linux/icmp.h>
 #include <linux/sysctl.h>
-#include <net/ipv6.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
 
 #include <linux/netfilter_ipv6.h>
 #include <linux/netfilter_bridge.h>
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b939b94e7e91..6edd2ac8ae4b 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -57,7 +57,7 @@
 #include <net/rawv6.h>
 #include <net/ndisc.h>
 #include <net/addrconf.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
 #include <net/inet_ecn.h>
 
 static const char ip6_frag_cache_name[] = "ip6-frags";
@@ -72,61 +72,6 @@ static struct inet_frags ip6_frags;
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 			  struct net_device *dev);
 
-void ip6_frag_init(struct inet_frag_queue *q, const void *a)
-{
-	struct frag_queue *fq = container_of(q, struct frag_queue, q);
-	const struct frag_v6_compare_key *key = a;
-
-	q->key.v6 = *key;
-	fq->ecn = 0;
-}
-EXPORT_SYMBOL(ip6_frag_init);
-
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
-{
-	struct net_device *dev = NULL;
-	struct sk_buff *head;
-
-	rcu_read_lock();
-	spin_lock(&fq->q.lock);
-
-	if (fq->q.flags & INET_FRAG_COMPLETE)
-		goto out;
-
-	inet_frag_kill(&fq->q);
-
-	dev = dev_get_by_index_rcu(net, fq->iif);
-	if (!dev)
-		goto out;
-
-	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
-	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
-
-	/* Don't send error if the first segment did not arrive. */
-	head = fq->q.fragments;
-	if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
-		goto out;
-
-	/* But use as source device on which LAST ARRIVED
-	 * segment was received. And do not use fq->dev
-	 * pointer directly, device might already disappeared.
-	 */
-	head->dev = dev;
-	skb_get(head);
-	spin_unlock(&fq->q.lock);
-
-	icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
-	kfree_skb(head);
-	goto out_rcu_unlock;
-
-out:
-	spin_unlock(&fq->q.lock);
-out_rcu_unlock:
-	rcu_read_unlock();
-	inet_frag_put(&fq->q);
-}
-EXPORT_SYMBOL(ip6_expire_frag_queue);
-
 static void ip6_frag_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
@@ -136,7 +81,7 @@ static void ip6_frag_expire(struct timer_list *t)
 	fq = container_of(frag, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, ipv6.frags);
 
-	ip6_expire_frag_queue(net, fq);
+	ip6frag_expire_frag_queue(net, fq);
 }
 
 static struct frag_queue *
@@ -696,42 +641,19 @@ static struct pernet_operations ip6_frags_ops = {
 	.exit = ipv6_frags_exit_net,
 };
 
-static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
-{
-	return jhash2(data,
-		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
-}
-
-static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
-{
-	const struct inet_frag_queue *fq = data;
-
-	return jhash2((const u32 *)&fq->key.v6,
-		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
-}
-
-static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
-{
-	const struct frag_v6_compare_key *key = arg->key;
-	const struct inet_frag_queue *fq = ptr;
-
-	return !!memcmp(&fq->key, key, sizeof(*key));
-}
-
-const struct rhashtable_params ip6_rhash_params = {
+static const struct rhashtable_params ip6_rhash_params = {
 	.head_offset		= offsetof(struct inet_frag_queue, node),
-	.hashfn			= ip6_key_hashfn,
-	.obj_hashfn		= ip6_obj_hashfn,
-	.obj_cmpfn		= ip6_obj_cmpfn,
+	.hashfn			= ip6frag_key_hashfn,
+	.obj_hashfn		= ip6frag_obj_hashfn,
+	.obj_cmpfn		= ip6frag_obj_cmpfn,
 	.automatic_shrinking	= true,
 };
-EXPORT_SYMBOL(ip6_rhash_params);
 
 int __init ipv6_frag_init(void)
 {
 	int ret;
 
-	ip6_frags.constructor = ip6_frag_init;
+	ip6_frags.constructor = ip6frag_init;
 	ip6_frags.destructor = NULL;
 	ip6_frags.qsize = sizeof(struct frag_queue);
 	ip6_frags.frag_expire = ip6_frag_expire;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 3e33c382367f..86a75105af1a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -26,6 +26,7 @@
 #include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/ipv6_frag.h>
 
 #ifdef CONFIG_NF_NAT_NEEDED
 #include <linux/netfilter/nf_nat.h>

From 24c458c485c87eef97e91d2e180f222555528b11 Mon Sep 17 00:00:00 2001
From: Fernando Fernandez Mancera <ffmancera@riseup.net>
Date: Sat, 14 Jul 2018 16:50:59 +0200
Subject: [PATCH 38/38] netfilter: nf_osf: add missing definitions to header
 file

Add missing definitions from nf_osf.h in order to extract Passive OS
fingerprint infrastructure from xt_osf.

Signed-off-by: Fernando Fernandez Mancera <ffmancera@riseup.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_osf.h | 11 +++++++++++
 include/uapi/linux/netfilter/xt_osf.h | 10 ++--------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_osf.h b/include/uapi/linux/netfilter/nf_osf.h
index 8f2f2f403183..3738116b2bbe 100644
--- a/include/uapi/linux/netfilter/nf_osf.h
+++ b/include/uapi/linux/netfilter/nf_osf.h
@@ -16,9 +16,14 @@
 
 #define NF_OSF_TTL_TRUE			0	/* True ip and fingerprint TTL comparison */
 
+/* Check if ip TTL is less than fingerprint one */
+#define NF_OSF_TTL_LESS			1
+
 /* Do not compare ip and fingerprint TTL at all */
 #define NF_OSF_TTL_NOCHECK		2
 
+#define NF_OSF_FLAGMASK		(NF_OSF_GENRE | NF_OSF_TTL | \
+				 NF_OSF_LOG | NF_OSF_INVERT)
 /* Wildcard MSS (kind of).
  * It is used to implement a state machine for the different wildcard values
  * of the MSS and window sizes.
@@ -83,4 +88,10 @@ enum iana_options {
 	OSFOPT_EMPTY = 255,
 };
 
+enum nf_osf_attr_type {
+	OSF_ATTR_UNSPEC,
+	OSF_ATTR_FINGER,
+	OSF_ATTR_MAX,
+};
+
 #endif /* _NF_OSF_H */
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index 72956eceeb09..b189007f4f28 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -37,8 +37,7 @@
 
 #define XT_OSF_TTL_TRUE		NF_OSF_TTL_TRUE
 #define XT_OSF_TTL_NOCHECK	NF_OSF_TTL_NOCHECK
-
-#define XT_OSF_TTL_LESS	1	/* Check if ip TTL is less than fingerprint one */
+#define XT_OSF_TTL_LESS		NF_OSF_TTL_LESS
 
 #define xt_osf_wc		nf_osf_wc
 #define xt_osf_opt		nf_osf_opt
@@ -47,6 +46,7 @@
 #define xt_osf_finger		nf_osf_finger
 #define xt_osf_nlmsg		nf_osf_nlmsg
 
+#define xt_osf_attr_type	nf_osf_attr_type
 /*
  * Add/remove fingerprint from the kernel.
  */
@@ -56,10 +56,4 @@ enum xt_osf_msg_types {
 	OSF_MSG_MAX,
 };
 
-enum xt_osf_attr_type {
-	OSF_ATTR_UNSPEC,
-	OSF_ATTR_FINGER,
-	OSF_ATTR_MAX,
-};
-
 #endif				/* _XT_OSF_H */