From f4dae54e486d528d4dd98df116e7a522bbf12667 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Mar 2021 12:35:04 -0800 Subject: [PATCH 1/3] tcp: plug skb_still_in_host_queue() to TSQ Jakub and Neil reported an increase of RTO timers whenever TX completions are delayed a bit more (by increasing NIC TX coalescing parameters) Main issue is that TCP stack has a logic preventing a packet being retransmit if the prior clone has not yet been orphaned or freed. This logic came with commit 1f3279ae0c13 ("tcp: avoid retransmits of TCP packets hanging in host queues") Thankfully, in the case skb_still_in_host_queue() detects the initial clone is still in flight, it can use TSQ logic that will eventually retry later, at the moment the clone is freed or orphaned. Signed-off-by: Eric Dumazet Reported-by: Neil Spring Reported-by: Jakub Kicinski Cc: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/ipv4/tcp_output.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0503c917d773..483e89348f78 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1140,7 +1140,7 @@ static inline bool skb_fclone_busy(const struct sock *sk, return skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) > 1 && - fclones->skb2.sk == sk; + READ_ONCE(fclones->skb2.sk) == sk; } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fbf140a770d8..0dbf208a4f2f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2775,13 +2775,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ -static bool skb_still_in_host_queue(const struct sock *sk, +static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); - return true; + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + smp_mb__after_atomic(); + if (skb_fclone_busy(sk, skb)) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } } return false; } From a7abf3cd76e1e190a2c22afe5ffd0f695c7641b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Mar 2021 12:35:05 -0800 Subject: [PATCH 2/3] tcp: consider using standard rtx logic in tcp_rcv_fastopen_synack() Jakub reported Data included in a Fastopen SYN that had to be retransmit would have to wait for an RTO if TX completions are slow, even with prior fix. This is because tcp_rcv_fastopen_synack() does not use standard rtx logic, meaning TSQ handler exits early in tcp_tsq_write() because tp->lost_out == tp->retrans_out Lets make tcp_rcv_fastopen_synack() use standard rtx logic, by using tcp_mark_skb_lost() on the skb thats needs to be sent again. Not this raised a warning in tcp_fastretrans_alert() during my tests since we consider the data not being aknowledged by the receiver does not mean packet was lost on the network. Signed-off-by: Eric Dumazet Reported-by: Jakub Kicinski Cc: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 69a545db80d2..4cf4dd532d1c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2914,7 +2914,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { - WARN_ON(tp->retrans_out != 0); + WARN_ON(tp->retrans_out != 0 && !tp->syn_data); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { switch (icsk->icsk_ca_state) { @@ -5994,11 +5994,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED; else tp->fastopen_client_fail = TFO_DATA_NOT_ACKED; - skb_rbtree_walk_from(data) { - if (__tcp_retransmit_skb(sk, data, 1)) - break; - } - tcp_rearm_rto(sk); + skb_rbtree_walk_from(data) + tcp_mark_skb_lost(sk, data); + tcp_xmit_retransmit_queue(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; From ac3959fd0dcc0e49298ea5cadfe257db0e58ef8b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Mar 2021 12:35:06 -0800 Subject: [PATCH 3/3] tcp: remove obsolete check in __tcp_retransmit_skb() TSQ provides a nice way to avoid bufferbloat on individual socket, including retransmit packets. We can get rid of the old heuristic: /* Do not sent more than we queued. 1/4 is reserved for possible * copying overhead: fragmentation, tunneling, mangling etc. */ if (refcount_read(&sk->sk_wmem_alloc) > min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) return -EAGAIN; This heuristic was giving false positives according to Jakub, whenever TX completions are delayed above RTT. (Ack packets are processed by TCP stack before clones are orphaned/freed) Signed-off-by: Eric Dumazet Reported-by: Jakub Kicinski Cc: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0dbf208a4f2f..bde781f46b41 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3151,14 +3151,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; - /* Do not sent more than we queued. 1/4 is reserved for possible - * copying overhead: fragmentation, tunneling, mangling etc. - */ - if (refcount_read(&sk->sk_wmem_alloc) > - min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), - sk->sk_sndbuf)) - return -EAGAIN; - if (skb_still_in_host_queue(sk, skb)) return -EBUSY;