diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r-- | net/ipv4/tcp_input.c | 234 |
1 files changed, 186 insertions, 48 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8afb0950a697..90de838a2745 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -693,6 +693,23 @@ new_measure: tp->rcv_rtt_est.time = tp->tcp_mstamp; } +static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp) +{ + u32 delta, delta_us; + + delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; + if (tp->tcp_usec_ts) + return delta; + + if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { + if (!delta) + delta = 1; + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); + return delta_us; + } + return -1; +} + static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { @@ -704,15 +721,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { - u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - u32 delta_us; - - if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { - if (!delta) - delta = 1; - delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - tcp_rcv_rtt_update(tp, delta_us, 0); - } + s32 delta = tcp_rtt_tsopt_us(tp); + + if (delta >= 0) + tcp_rcv_rtt_update(tp, delta, 0); } } @@ -778,6 +790,16 @@ new_measure: tp->rcvq_space.time = tp->tcp_mstamp; } +static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet_connection_sock *icsk = inet_csk(sk); + + if (skb->protocol == htons(ETH_P_IPV6)) + icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb))); +#endif +} + /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The @@ -826,6 +848,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } } icsk->icsk_ack.lrcvtime = now; + tcp_save_lrcv_flowlabel(sk, skb); tcp_ecn_check_ce(sk, skb); @@ -940,8 +963,8 @@ static void tcp_update_pacing_rate(struct sock *sk) * without any lock. We want to make sure compiler wont store * intermediate values in this location. */ - WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, - sk->sk_max_pacing_rate)); + WRITE_ONCE(sk->sk_pacing_rate, + min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate))); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -2101,6 +2124,10 @@ void tcp_clear_retrans(struct tcp_sock *tp) tp->undo_marker = 0; tp->undo_retrans = -1; tp->sacked_out = 0; + tp->rto_stamp = 0; + tp->total_rto = 0; + tp->total_rto_recoveries = 0; + tp->total_rto_time = 0; } static inline void tcp_init_undo(struct tcp_sock *tp) @@ -2207,16 +2234,17 @@ void tcp_enter_loss(struct sock *sk) * restore sanity to the SACK scoreboard. If the apparent reneging * persists until this RTO then we'll clear the SACK scoreboard. */ -static bool tcp_check_sack_reneging(struct sock *sk, int flag) +static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) { - if (flag & FLAG_SACK_RENEGING && - flag & FLAG_SND_UNA_ADVANCED) { + if (*ack_flag & FLAG_SACK_RENEGING && + *ack_flag & FLAG_SND_UNA_ADVANCED) { struct tcp_sock *tp = tcp_sk(sk); unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); + *ack_flag &= ~FLAG_SET_XMIT_TIMER; return true; } return false; @@ -2427,7 +2455,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, const struct sk_buff *skb) { return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) && - tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb)); + tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); } /* Nothing was retransmitted or returned timestamp is less @@ -2838,6 +2866,14 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) tcp_set_ca_state(sk, TCP_CA_Recovery); } +static void tcp_update_rto_time(struct tcp_sock *tp) +{ + if (tp->rto_stamp) { + tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp; + tp->rto_stamp = 0; + } +} + /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ @@ -2986,7 +3022,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tcp_check_sack_reneging(sk, flag)) + if (tcp_check_sack_reneging(sk, ack_flag)) return; /* C. Check consistency of the current state. */ @@ -3042,6 +3078,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, break; case TCP_CA_Loss: tcp_process_loss(sk, flag, num_dupack, rexmit); + if (icsk->icsk_ca_state != TCP_CA_Loss) + tcp_update_rto_time(tp); tcp_identify_packet_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) @@ -3121,17 +3159,10 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && - flag & FLAG_ACKED) { - u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; - - if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { - if (!delta) - delta = 1; - seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); - ca_rtt_us = seq_rtt_us; - } - } + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && + tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) + seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp); + rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ if (seq_rtt_us < 0) return false; @@ -3541,6 +3572,21 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp, (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin)); } +static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack) +{ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao; + + if (!static_branch_unlikely(&tcp_ao_needed.key)) + return; + + ao = rcu_dereference_protected(tp->ao_info, + lockdep_sock_is_held((struct sock *)tp)); + if (ao && ack < tp->snd_una) + ao->snd_sne++; +#endif +} + /* If we update tp->snd_una, also update tp->bytes_acked */ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) { @@ -3548,9 +3594,25 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) sock_owned_by_me((struct sock *)tp); tp->bytes_acked += delta; + tcp_snd_sne_update(tp, ack); tp->snd_una = ack; } +static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq) +{ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao; + + if (!static_branch_unlikely(&tcp_ao_needed.key)) + return; + + ao = rcu_dereference_protected(tp->ao_info, + lockdep_sock_is_held((struct sock *)tp)); + if (ao && seq < tp->rcv_nxt) + ao->rcv_sne++; +#endif +} + /* If we update tp->rcv_nxt, also update tp->bytes_received */ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { @@ -3558,6 +3620,7 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) sock_owned_by_me((struct sock *)tp); tp->bytes_received += delta; + tcp_rcv_sne_update(tp, seq); WRITE_ONCE(tp->rcv_nxt, seq); } @@ -3808,8 +3871,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * then we can probably ignore it. */ if (before(ack, prior_snd_una)) { + u32 max_window; + + /* do not accept ACK for bytes we never sent. */ + max_window = min_t(u64, tp->max_window, tp->bytes_acked); /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ - if (before(ack, prior_snd_una - tp->max_window)) { + if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) tcp_send_challenge_ack(sk); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; @@ -4224,39 +4291,58 @@ static bool tcp_fast_parse_options(const struct net *net, return true; } -#ifdef CONFIG_TCP_MD5SIG +#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* - * Parse MD5 Signature option + * Parse Signature options */ -const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) +int tcp_do_parse_auth_options(const struct tcphdr *th, + const u8 **md5_hash, const u8 **ao_hash) { int length = (th->doff << 2) - sizeof(*th); const u8 *ptr = (const u8 *)(th + 1); + unsigned int minlen = TCPOLEN_MD5SIG; + + if (IS_ENABLED(CONFIG_TCP_AO)) + minlen = sizeof(struct tcp_ao_hdr) + 1; + + *md5_hash = NULL; + *ao_hash = NULL; /* If not enough data remaining, we can short cut */ - while (length >= TCPOLEN_MD5SIG) { + while (length >= minlen) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: - return NULL; + return 0; case TCPOPT_NOP: length--; continue; default: opsize = *ptr++; if (opsize < 2 || opsize > length) - return NULL; - if (opcode == TCPOPT_MD5SIG) - return opsize == TCPOLEN_MD5SIG ? ptr : NULL; + return -EINVAL; + if (opcode == TCPOPT_MD5SIG) { + if (opsize != TCPOLEN_MD5SIG) + return -EINVAL; + if (unlikely(*md5_hash || *ao_hash)) + return -EEXIST; + *md5_hash = ptr; + } else if (opcode == TCPOPT_AO) { + if (opsize <= sizeof(struct tcp_ao_hdr)) + return -EINVAL; + if (unlikely(*md5_hash || *ao_hash)) + return -EEXIST; + *ao_hash = ptr; + } } ptr += opsize - 2; length -= opsize; } - return NULL; + return 0; } -EXPORT_SYMBOL(tcp_parse_md5sig_option); +EXPORT_SYMBOL(tcp_do_parse_auth_options); #endif /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM @@ -4499,12 +4585,23 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) { /* When the ACK path fails or drops most ACKs, the sender would * timeout and spuriously retransmit the same segment repeatedly. - * The receiver remembers and reflects via DSACKs. Leverage the - * DSACK state and change the txhash to re-route speculatively. + * If it seems our ACKs are not reaching the other side, + * based on receiving a duplicate data segment with new flowlabel + * (suggesting the sender suffered an RTO), and we are not already + * repathing due to our own RTO, then rehash the socket to repath our + * packets. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && +#if IS_ENABLED(CONFIG_IPV6) + if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss && + skb->protocol == htons(ETH_P_IPV6) && + (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel != + ntohl(ip6_flowlabel(ipv6_hdr(skb)))) && sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + + /* Save last flowlabel after a spurious retrans. */ + tcp_save_lrcv_flowlabel(sk, skb); +#endif } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -4821,6 +4918,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) u32 seq, end_seq; bool fragstolen; + tcp_save_lrcv_flowlabel(sk, skb); tcp_ecn_check_ce(sk, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { @@ -5566,6 +5664,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { + /* If we are running from __release_sock() in user context, + * Defer the ack until tcp_release_cb(). + */ + if (sock_owned_by_user_nocheck(sk) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) { + set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags); + return; + } send_now: tcp_send_ack(sk); return; @@ -6100,6 +6206,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + tcp_ao_finish_connect(sk, skb); tcp_set_state(sk, TCP_ESTABLISHED); icsk->icsk_ack.lrcvtime = tcp_jiffies32; @@ -6248,7 +6355,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, - tcp_time_stamp(tp))) { + tcp_time_stamp_ts(tp))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; @@ -6385,6 +6492,16 @@ consume: * simultaneous connect with crossed SYNs. * Particularly, it can be connect to self. */ +#ifdef CONFIG_TCP_AO + struct tcp_ao_info *ao; + + ao = rcu_dereference_protected(tp->ao_info, + lockdep_sock_is_held(sk)); + if (ao) { + WRITE_ONCE(ao->risn, th->seq); + ao->rcv_sne = 0; + } +#endif tcp_set_state(sk, TCP_SYN_RECV); if (tp->rx_opt.saw_tstamp) { @@ -6449,22 +6566,24 @@ reset_and_undo: static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct request_sock *req; /* If we are still handling the SYNACK RTO, see if timestamp ECR allows * undo. If peer SACKs triggered fast recovery, we can't undo here. */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) - tcp_try_undo_loss(sk, false); + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out) + tcp_try_undo_recovery(sk); /* Reset rtx states to prevent spurious retransmits_timed_out() */ - tcp_sk(sk)->retrans_stamp = 0; + tcp_update_rto_time(tp); + tp->retrans_stamp = 0; inet_csk(sk)->icsk_retransmits = 0; /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, * we no longer need req so release it. */ - req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, + req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); reqsk_fastopen_remove(sk, req, false); @@ -6595,6 +6714,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) skb); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); } + tcp_ao_established(sk); smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); @@ -6971,6 +7091,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, struct flowi fl; u8 syncookies; +#ifdef CONFIG_TCP_AO + const struct tcp_ao_hdr *aoh; +#endif + syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies); /* TW buckets are converted to open requests without @@ -6995,6 +7119,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, req->syncookie = want_cookie; tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; + tcp_rsk(req)->req_usec_ts = false; #if IS_ENABLED(CONFIG_MPTCP) tcp_rsk(req)->is_mptcp = 0; #endif @@ -7022,9 +7147,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (!dst) goto drop_and_free; - if (tmp_opt.tstamp_ok) + if (tmp_opt.tstamp_ok) { + tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst); tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb); - + } if (!want_cookie && !isn) { int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog); @@ -7056,6 +7182,18 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, inet_rsk(req)->ecn_ok = 0; } +#ifdef CONFIG_TCP_AO + if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) + goto drop_and_release; /* Invalid TCP options */ + if (aoh) { + tcp_rsk(req)->used_tcp_ao = true; + tcp_rsk(req)->ao_rcv_next = aoh->keyid; + tcp_rsk(req)->ao_keyid = aoh->rnext_keyid; + + } else { + tcp_rsk(req)->used_tcp_ao = false; + } +#endif tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; |