diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 22b4c6df1d2b..94568e022001 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -31,18 +31,16 @@ struct ping_group_range { struct inet_hashinfo; struct inet_timewait_death_row { - atomic_t tw_count; - char tw_pad[L1_CACHE_BYTES - sizeof(atomic_t)]; + refcount_t tw_refcount; - struct inet_hashinfo *hashinfo; + struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; int sysctl_max_tw_buckets; }; struct tcp_fastopen_context; struct netns_ipv4 { - /* Please keep tcp_death_row at first field in netns_ipv4 */ - struct inet_timewait_death_row tcp_death_row ____cacheline_aligned_in_smp; + struct inet_timewait_death_row *tcp_death_row; #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 91e7a2202697..64d805b27add 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -22,6 +22,7 @@ #include "feat.h" struct inet_timewait_death_row dccp_death_row = { + .tw_refcount = REFCOUNT_INIT(1), .sysctl_max_tw_buckets = NR_FILE * 2, .hashinfo = &dccp_hashinfo, }; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 71808c7a7025..9e0bbd026560 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -58,7 +58,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); - atomic_dec(&tw->tw_dr->tw_count); + if (refcount_dec_and_test(&tw->tw_dr->tw_refcount)) + kfree(tw->tw_dr); + inet_twsk_put(tw); } @@ -157,7 +159,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, { struct inet_timewait_sock *tw; - if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets) + if (refcount_read(&dr->tw_refcount) - 1 >= dr->sysctl_max_tw_buckets) return NULL; tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, @@ -249,7 +251,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED : LINUX_MIB_TIMEWAITED); BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); - atomic_inc(&tw->tw_dr->tw_count); + refcount_inc(&tw->tw_dr->tw_refcount); } else { mod_timer_pending(&tw->tw_timer, jiffies + timeo); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f30273afb539..28836071f0a6 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,8 +59,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, - atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets, - proto_memory_allocated(&tcp_prot)); + refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1, + sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), proto_memory_allocated(&udp_prot)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 97eb54774924..1cae27b5dcd8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -589,6 +589,14 @@ static struct ctl_table ipv4_table[] = { }; static struct ctl_table ipv4_net_table[] = { + /* tcp_max_tw_buckets must be first in this table. */ + { + .procname = "tcp_max_tw_buckets", +/* .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */ + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, @@ -1000,13 +1008,6 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, - { - .procname = "tcp_max_tw_buckets", - .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, @@ -1400,7 +1401,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!table) goto err_alloc; - for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) { + /* skip first entry (sysctl_max_tw_buckets) */ + for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) { if (table[i].data) { /* Update the variables to point into * the current struct net @@ -1415,6 +1417,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) } } + table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets; + net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (!net->ipv4.ipv4_hdr) goto err_reg; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a7d83ceea420..00cd6ccf3ab4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -208,7 +208,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct rtable *rt; int err; struct ip_options_rcu *inet_opt; - struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; @@ -3117,9 +3117,13 @@ EXPORT_SYMBOL(tcp_prot); static void __net_exit tcp_sk_exit(struct net *net) { + struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row; + if (net->ipv4.tcp_congestion_control) bpf_module_put(net->ipv4.tcp_congestion_control, net->ipv4.tcp_congestion_control->owner); + if (refcount_dec_and_test(&tcp_death_row->tw_refcount)) + kfree(tcp_death_row); } static int __net_init tcp_sk_init(struct net *net) @@ -3151,9 +3155,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tw_reuse = 2; net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; + net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL); + if (!net->ipv4.tcp_death_row) + return -ENOMEM; + refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1); cnt = tcp_hashinfo.ehash_mask + 1; - net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; - net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; + net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2; + net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo; net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); net->ipv4.sysctl_tcp_sack = 1; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c2d3ac2363a..3977257f80d9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -248,7 +248,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct inet_timewait_sock *tw; - struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row; tw = inet_twsk_alloc(sk, tcp_death_row, state); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1e55ee98dfed..0c648bf07f39 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -148,6 +148,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; @@ -156,7 +157,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct dst_entry *dst; int addr_type; int err; - struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -308,6 +308,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); + tcp_death_row = sock_net(sk)->ipv4.tcp_death_row; err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure;