tcp: no longer hold ehash lock while calling tcp_get_info()

We have had various problems in tcp_get_info() in the past, and used
dedicated synchronization to avoid deadlocks.

We would like to add more instrumentation points for TCP, and
avoiding grabbing the socket lock in tcp_get_info() was becoming too costly.

Being able to lock the socket allows us to provide a consistent set
of fields.
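
For illustration, a minimal sketch of the snapshot pattern tcp_get_info()
uses after this patch (see the tcp.c hunk below); lock_sock_fast() returns
true when it had to fall back to the full, slow lock_sock() path:

    bool slow = lock_sock_fast(sk);     /* true if the slow path was taken */

    /* These fields only change with the socket lock held, so the
     * three values below form one coherent snapshot.
     */
    put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
    put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
    info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);

    unlock_sock_fast(sk, slow);         /* releases whichever lock was taken */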

inet_diag_dump_icsk() can make sure the ehash locks are no longer
held when tcp_get_info() is called.
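
The dump path now batches sockets per ehash bucket: it grabs references
under the bucket spinlock, drops the lock, and only then builds the
netlink replies. A condensed sketch of that scheme, where wanted() and
dump_one() are hypothetical stand-ins for the existing filter checks and
the sk_diag_fill() call:

    struct sock *sk_arr[SKARR_SZ];      /* SKARR_SZ is 16 in this patch */
    int accum = 0, idx;

    spin_lock_bh(lock);                 /* bucket lock held: no sleeping */
    sk_nulls_for_each(sk, node, &head->chain) {
        if (!wanted(sk))                /* hypothetical state/port filters */
            continue;
        sock_hold(sk);                  /* keep sk alive past the unlock */
        sk_arr[accum] = sk;
        if (++accum == SKARR_SZ)
            break;                      /* chunk full: dump, then rescan */
    }
    spin_unlock_bh(lock);

    for (idx = 0; idx < accum; idx++) {
        dump_one(sk_arr[idx]);          /* may sleep and lock the socket */
        sock_gen_put(sk_arr[idx]);      /* drop the reference taken above */
    }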

We can remove the syncp added in commit d654976cbf
("tcp: fix a potential deadlock in tcp_get_info()"), but we need
to use lock_sock_fast() instead of spin_lock_bh(), since the TCP input
path can now run from process context.
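
No extra writer-side protection is needed: tcp_snd_una_update() and
tcp_rcv_nxt_update() already run with the socket owned, which the patch
documents with sock_owned_by_me(). In post-patch form:

    static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
    {
        u32 delta = ack - tp->snd_una;

        /* The writer holds the socket lock and the reader now takes
         * it via lock_sock_fast(), so tp->syncp is no longer needed.
         */
        sock_owned_by_me((struct sock *)tp);
        tp->bytes_acked += delta;
        tp->snd_una = ack;
    }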

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Eric Dumazet authored 2016-11-04 11:54:32 -07:00; committed by David S. Miller
Parent: ccbf3bfaee
Commit: 67db3e4bfb
4 changed files: 43 additions and 33 deletions

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h

@@ -176,8 +176,6 @@ struct tcp_sock {
				 * sum(delta(snd_una)), or how many bytes
				 * were acked.
				 */
-	struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */
-
	u32	snd_una;	/* First byte we want an ack for	*/
	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c

@@ -861,10 +861,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 			    struct netlink_callback *cb,
 			    const struct inet_diag_req_v2 *r, struct nlattr *bc)
 {
-	struct net *net = sock_net(skb->sk);
-	int i, num, s_i, s_num;
-	u32 idiag_states = r->idiag_states;
 	bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+	struct net *net = sock_net(skb->sk);
+	u32 idiag_states = r->idiag_states;
+	int i, num, s_i, s_num;
+	struct sock *sk;
 
 	if (idiag_states & TCPF_SYN_RECV)
 		idiag_states |= TCPF_NEW_SYN_RECV;
@@ -877,7 +878,6 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 
 	for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
 		struct inet_listen_hashbucket *ilb;
-		struct sock *sk;
 
 		num = 0;
 		ilb = &hashinfo->listening_hash[i];
@@ -922,13 +922,14 @@ skip_listen_ht:
 	if (!(idiag_states & ~TCPF_LISTEN))
 		goto out;
 
+#define SKARR_SZ 16
 	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
 		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
 		struct hlist_nulls_node *node;
-		struct sock *sk;
-
-		num = 0;
+		struct sock *sk_arr[SKARR_SZ];
+		int num_arr[SKARR_SZ];
+		int idx, accum, res;
 
 		if (hlist_nulls_empty(&head->chain))
 			continue;
@@ -936,9 +937,12 @@ skip_listen_ht:
 		if (i > s_i)
 			s_num = 0;
 
+next_chunk:
+		num = 0;
+		accum = 0;
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &head->chain) {
-			int state, res;
+			int state;
 
 			if (!net_eq(sock_net(sk), net))
 				continue;
@@ -962,21 +966,35 @@ skip_listen_ht:
 			if (!inet_diag_bc_sk(bc, sk))
 				goto next_normal;
 
-			res = sk_diag_fill(sk, skb, r,
+			sock_hold(sk);
+			num_arr[accum] = num;
+			sk_arr[accum] = sk;
+			if (++accum == SKARR_SZ)
+				break;
+next_normal:
+			++num;
+		}
+		spin_unlock_bh(lock);
+		res = 0;
+		for (idx = 0; idx < accum; idx++) {
+			if (res >= 0) {
+				res = sk_diag_fill(sk_arr[idx], skb, r,
 					   sk_user_ns(NETLINK_CB(cb->skb).sk),
 					   NETLINK_CB(cb->skb).portid,
 					   cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					   cb->nlh, net_admin);
-			if (res < 0) {
-				spin_unlock_bh(lock);
-				goto done;
+				if (res < 0)
+					num = num_arr[idx];
 			}
-next_normal:
-			++num;
+			sock_gen_put(sk_arr[idx]);
 		}
-
-		spin_unlock_bh(lock);
+		if (res < 0)
+			break;
 		cond_resched();
+		if (accum == SKARR_SZ) {
+			s_num = num + 1;
+			goto next_chunk;
+		}
 	}
 
 done:

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c

@@ -405,7 +405,6 @@ void tcp_init_sock(struct sock *sk)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_clamp = ~0;
 	tp->mss_cache = TCP_MSS_DEFAULT;
-	u64_stats_init(&tp->syncp);
 
 	tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
@@ -2710,9 +2709,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp, intv;
-	unsigned int start;
-	int notsent_bytes;
 	u64 rate64;
+	bool slow;
 	u32 rate;
 
 	memset(info, 0, sizeof(*info));
@@ -2792,17 +2790,17 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_total_retrans = tp->total_retrans;
 
-	do {
-		start = u64_stats_fetch_begin_irq(&tp->syncp);
-		put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
-		put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
-	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
+	slow = lock_sock_fast(sk);
+
+	put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+	put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
+	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+
+	unlock_sock_fast(sk, slow);
+
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;
 
-	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
-	info->tcpi_notsent_bytes = max(0, notsent_bytes);
-
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
 	info->tcpi_data_segs_in = tp->data_segs_in;
 	info->tcpi_data_segs_out = tp->data_segs_out;

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c

@@ -3351,9 +3351,7 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 	u32 delta = ack - tp->snd_una;
 
 	sock_owned_by_me((struct sock *)tp);
-	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_acked += delta;
-	u64_stats_update_end_raw(&tp->syncp);
 	tp->snd_una = ack;
 }
 
@@ -3363,9 +3361,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
 	u32 delta = seq - tp->rcv_nxt;
 
 	sock_owned_by_me((struct sock *)tp);
-	u64_stats_update_begin_raw(&tp->syncp);
 	tp->bytes_received += delta;
-	u64_stats_update_end_raw(&tp->syncp);
 	tp->rcv_nxt = seq;
 }