Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 311
1 file changed, 215 insertions, 96 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f70586b50838..efc6fef692ff 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -110,8 +110,38 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
 
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 {
+	const struct inet_timewait_sock *tw = inet_twsk(sktw);
 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 	struct tcp_sock *tp = tcp_sk(sk);
+	int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
+
+	if (reuse == 2) {
+		/* Still does not detect *everything* that goes through
+		 * lo, since we require a loopback src or dst address
+		 * or direct binding to 'lo' interface.
+		 */
+		bool loopback = false;
+		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
+			loopback = true;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (tw->tw_family == AF_INET6) {
+			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
+			    (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
+			     (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
+			    (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
+			     (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+				loopback = true;
+		} else
+#endif
+		{
+			if (ipv4_is_loopback(tw->tw_daddr) ||
+			    ipv4_is_loopback(tw->tw_rcv_saddr))
+				loopback = true;
+		}
+		if (!loopback)
+			reuse = 0;
+	}
 
 	/* With PAWS, it is safe from the viewpoint
 	   of data integrity. Even without PAWS it is safe provided sequence
@@ -125,13 +155,26 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 	   and use initial timestamp retrieved from peer table.
 	 */
 	if (tcptw->tw_ts_recent_stamp &&
-	    (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
-			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
-		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
-		if (tp->write_seq == 0)
-			tp->write_seq = 1;
-		tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
-		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+	    (!twp || (reuse && time_after32(ktime_get_seconds(),
+					    tcptw->tw_ts_recent_stamp)))) {
+		/* In case of repair and re-using TIME-WAIT sockets we still
+		 * want to be sure that it is safe as above but honor the
+		 * sequence numbers and time stamps set as part of the repair
+		 * process.
+		 *
+		 * Without this check re-using a TIME-WAIT socket with TCP
+		 * repair would accumulate a -1 on the repair assigned
+		 * sequence number. The first time it is reused the sequence
+		 * is -1, the second time -2, etc. This fixes that issue
+		 * without appearing to create any others.
+		 */
+		if (likely(!tp->repair)) {
+			tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+			if (tp->write_seq == 0)
+				tp->write_seq = 1;
+			tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+			tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+		}
 		sock_hold(sktw);
 		return 1;
 	}
@@ -380,7 +423,7 @@ EXPORT_SYMBOL(tcp_req_err);
  *
  */
 
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 {
 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
@@ -403,20 +446,21 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 			       inet_iif(icmp_skb), 0);
 	if (!sk) {
 		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return;
+		return -ENOENT;
 	}
 	if (sk->sk_state == TCP_TIME_WAIT) {
 		inet_twsk_put(inet_twsk(sk));
-		return;
+		return 0;
 	}
 	seq = ntohl(th->seq);
-	if (sk->sk_state == TCP_NEW_SYN_RECV)
-		return tcp_req_err(sk, seq,
-				   type == ICMP_PARAMETERPROB ||
-				   type == ICMP_TIME_EXCEEDED ||
-				   (type == ICMP_DEST_UNREACH &&
-				    (code == ICMP_NET_UNREACH ||
-				     code == ICMP_HOST_UNREACH)));
+	if (sk->sk_state == TCP_NEW_SYN_RECV) {
+		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+				     type == ICMP_TIME_EXCEEDED ||
+				     (type == ICMP_DEST_UNREACH &&
+				      (code == ICMP_NET_UNREACH ||
+				       code == ICMP_HOST_UNREACH)));
+		return 0;
+	}
 
 	bh_lock_sock(sk);
 	/* If too many ICMPs get dropped on busy
@@ -498,10 +542,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 		icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 
 		skb = tcp_rtx_queue_head(sk);
-		BUG_ON(!skb);
 
 		tcp_mstamp_refresh(tp);
-		delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
+		delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
 		remaining = icsk->icsk_rto -
 			    usecs_to_jiffies(delta_us);
@@ -570,6 +613,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 out:
 	bh_unlock_sock(sk);
 	sock_put(sk);
+	return 0;
 }
 
 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
@@ -621,6 +665,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	struct sock *sk1 = NULL;
 #endif
 	struct net *net;
+	struct sock *ctl_sk;
 
 	/* Never send a reset in response to a reset. */
 	if (th->rst)
@@ -723,11 +768,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	arg.tos = ip_hdr(skb)->tos;
 	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+	if (sk)
+		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
+	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
 	local_bh_enable();
@@ -759,6 +809,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	} rep;
 	struct net *net = sock_net(sk);
 	struct ip_reply_arg arg;
+	struct sock *ctl_sk;
 
 	memset(&rep.th, 0, sizeof(struct tcphdr));
 	memset(&arg, 0, sizeof(arg));
@@ -809,11 +860,16 @@ static void tcp_v4_send_ack(const struct sock *sk,
 	arg.tos = tos;
 	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
 	local_bh_disable();
-	ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
+	ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
+	if (sk)
+		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+				   inet_twsk(sk)->tw_mark : sk->sk_mark;
+	ip_send_unicast_reply(ctl_sk,
 			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
 			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
 			      &arg, arg.iov[0].iov_len);
 
+	ctl_sk->sk_mark = 0;
 	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 	local_bh_enable();
 }
@@ -888,9 +944,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 
+		rcu_read_lock();
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
-					    ireq_opt_deref(ireq));
+					    rcu_dereference(ireq->ireq_opt));
+		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
 
@@ -912,10 +970,13 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
  * We need to maintain these in the sk structure.
  */
 
+struct static_key tcp_md5_needed __read_mostly;
+EXPORT_SYMBOL(tcp_md5_needed);
+
 /* Find the Key structure for an address.  */
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
-					 const union tcp_md5_addr *addr,
-					 int family)
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk,
+					   const union tcp_md5_addr *addr,
+					   int family)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_md5sig_key *key;
@@ -953,7 +1014,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
 	}
 	return best_match;
 }
-EXPORT_SYMBOL(tcp_md5_do_lookup);
+EXPORT_SYMBOL(__tcp_md5_do_lookup);
 
 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
 						      const union tcp_md5_addr *addr,
@@ -1474,7 +1535,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 				sk->sk_rx_dst = NULL;
 			}
 		}
-		tcp_rcv_established(sk, skb, tcp_hdr(skb));
+		tcp_rcv_established(sk, skb);
 		return 0;
 	}
 
@@ -1561,12 +1622,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 {
 	u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
-	/* Only socket owner can try to collapse/prune rx queues
-	 * to reduce memory overhead, so add a little headroom here.
-	 * Few sockets backlog are possibly concurrently non empty.
-	 */
-	limit += 64*1024;
+	struct skb_shared_info *shinfo;
+	const struct tcphdr *th;
+	struct tcphdr *thtail;
+	struct sk_buff *tail;
+	unsigned int hdrlen;
+	bool fragstolen;
+	u32 gso_segs;
+	int delta;
 
 	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
 	 * we can fix skb->truesize to its real value to avoid future drops.
@@ -1576,6 +1639,86 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
 	 */
 	skb_condense(skb);
 
+	skb_dst_drop(skb);
+
+	if (unlikely(tcp_checksum_complete(skb))) {
+		bh_unlock_sock(sk);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+		__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+		return true;
+	}
+
+	/* Attempt coalescing to last skb in backlog, even if we are
+	 * above the limits.
+	 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+	 */
+	th = (const struct tcphdr *)skb->data;
+	hdrlen = th->doff * 4;
+	shinfo = skb_shinfo(skb);
+
+	if (!shinfo->gso_size)
+		shinfo->gso_size = skb->len - hdrlen;
+
+	if (!shinfo->gso_segs)
+		shinfo->gso_segs = 1;
+
+	tail = sk->sk_backlog.tail;
+	if (!tail)
+		goto no_coalesce;
+	thtail = (struct tcphdr *)tail->data;
+
+	if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+	    TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+	    ((TCP_SKB_CB(tail)->tcp_flags |
+	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
+	    ((TCP_SKB_CB(tail)->tcp_flags ^
+	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
+#ifdef CONFIG_TLS_DEVICE
+	    tail->decrypted != skb->decrypted ||
+#endif
+	    thtail->doff != th->doff ||
+	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+		goto no_coalesce;
+
+	__skb_pull(skb, hdrlen);
+	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+		thtail->window = th->window;
+
+		TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
+			TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+
+		TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+		if (TCP_SKB_CB(skb)->has_rxtstamp) {
+			TCP_SKB_CB(tail)->has_rxtstamp = true;
+			tail->tstamp = skb->tstamp;
+			skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+		}
+
+		/* Not as strict as GRO. We only need to carry mss max value */
+		skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+						 skb_shinfo(tail)->gso_size);
+
+		gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+		skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+
+		sk->sk_backlog.len += delta;
+		__NET_INC_STATS(sock_net(sk),
+				LINUX_MIB_TCPBACKLOGCOALESCE);
+		kfree_skb_partial(skb, fragstolen);
+		return false;
+	}
+	__skb_push(skb, hdrlen);
+
+no_coalesce:
+	/* Only socket owner can try to collapse/prune rx queues
+	 * to reduce memory overhead, so add a little headroom here.
+	 * Few sockets backlog are possibly concurrently non empty.
+	 */
+	limit += 64*1024;
+
 	if (unlikely(sk_add_backlog(sk, skb, limit))) {
 		bh_unlock_sock(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
@@ -1689,6 +1832,10 @@ process:
 			reqsk_put(req);
 			goto discard_it;
 		}
+		if (tcp_checksum_complete(skb)) {
+			reqsk_put(req);
+			goto csum_error;
+		}
 		if (unlikely(sk->sk_state != TCP_LISTEN)) {
 			inet_csk_reqsk_queue_drop_and_put(sk, req);
 			goto lookup;
@@ -1961,6 +2108,7 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
  */
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
+	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct inet_listen_hashbucket *ilb;
@@ -1983,7 +2131,7 @@ get_sk:
 	sk_for_each_from(sk) {
 		if (!net_eq(sock_net(sk), net))
 			continue;
-		if (sk->sk_family == st->family)
+		if (sk->sk_family == afinfo->family)
 			return sk;
 	}
 	spin_unlock(&ilb->lock);
@@ -2020,6 +2168,7 @@ static inline bool empty_bucket(const struct tcp_iter_state *st)
  */
static void *established_get_first(struct seq_file *seq)
 {
+	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct tcp_iter_state *st = seq->private;
 	struct net *net = seq_file_net(seq);
 	void *rc = NULL;
@@ -2036,7 +2185,7 @@ static void *established_get_first(struct seq_file *seq)
 		spin_lock_bh(lock);
 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
-			if (sk->sk_family != st->family ||
+			if (sk->sk_family != afinfo->family ||
 			    !net_eq(sock_net(sk), net)) {
 				continue;
 			}
@@ -2051,6 +2200,7 @@ out:
 
 static void *established_get_next(struct seq_file *seq, void *cur)
 {
+	struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
 	struct sock *sk = cur;
 	struct hlist_nulls_node *node;
 	struct tcp_iter_state *st = seq->private;
@@ -2062,7 +2212,8 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	sk = sk_nulls_next(sk);
 
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
+		if (sk->sk_family == afinfo->family &&
+		    net_eq(sock_net(sk), net))
 			return sk;
 	}
 
@@ -2135,7 +2286,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
 	return rc;
 }
 
-static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct tcp_iter_state *st = seq->private;
 	void *rc;
@@ -2156,8 +2307,9 @@ out:
 	st->last_pos = *pos;
 	return rc;
 }
+EXPORT_SYMBOL(tcp_seq_start);
 
-static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct tcp_iter_state *st = seq->private;
 	void *rc = NULL;
@@ -2186,8 +2338,9 @@ out:
 	st->last_pos = *pos;
 	return rc;
 }
+EXPORT_SYMBOL(tcp_seq_next);
 
-static void tcp_seq_stop(struct seq_file *seq, void *v)
+void tcp_seq_stop(struct seq_file *seq, void *v)
 {
 	struct tcp_iter_state *st = seq->private;
 
@@ -2202,47 +2355,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
 		break;
 	}
 }
-
-int tcp_seq_open(struct inode *inode, struct file *file)
-{
-	struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
-	struct tcp_iter_state *s;
-	int err;
-
-	err = seq_open_net(inode, file, &afinfo->seq_ops,
-			  sizeof(struct tcp_iter_state));
-	if (err < 0)
-		return err;
-
-	s = ((struct seq_file *)file->private_data)->private;
-	s->family = afinfo->family;
-	s->last_pos = 0;
-	return 0;
-}
-EXPORT_SYMBOL(tcp_seq_open);
-
-int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
-{
-	int rc = 0;
-	struct proc_dir_entry *p;
-
-	afinfo->seq_ops.start = tcp_seq_start;
-	afinfo->seq_ops.next = tcp_seq_next;
-	afinfo->seq_ops.stop = tcp_seq_stop;
-
-	p = proc_create_data(afinfo->name, 0444, net->proc_net,
-			     afinfo->seq_fops, afinfo);
-	if (!p)
-		rc = -ENOMEM;
-	return rc;
-}
-EXPORT_SYMBOL(tcp_proc_register);
-
-void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
-{
-	remove_proc_entry(afinfo->name, net->proc_net);
-}
-EXPORT_SYMBOL(tcp_proc_unregister);
+EXPORT_SYMBOL(tcp_seq_stop);
 
 static void get_openreq4(const struct request_sock *req, struct seq_file *f,
 			 int i)
@@ -2377,30 +2490,28 @@ out:
 	return 0;
 }
 
-static const struct file_operations tcp_afinfo_seq_fops = {
-	.open = tcp_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release_net
+static const struct seq_operations tcp4_seq_ops = {
+	.show = tcp4_seq_show,
+	.start = tcp_seq_start,
+	.next = tcp_seq_next,
+	.stop = tcp_seq_stop,
 };
 
 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
-	.name = "tcp",
 	.family = AF_INET,
-	.seq_fops = &tcp_afinfo_seq_fops,
-	.seq_ops = {
-		.show = tcp4_seq_show,
-	},
 };
 
 static int __net_init tcp4_proc_init_net(struct net *net)
 {
-	return tcp_proc_register(net, &tcp4_seq_afinfo);
+	if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
+			sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
+		return -ENOMEM;
+	return 0;
 }
 
 static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
-	tcp_proc_unregister(net, &tcp4_seq_afinfo);
+	remove_proc_entry("tcp", net->proc_net);
 }
 
 static struct pernet_operations tcp4_net_ops = {
@@ -2494,6 +2605,12 @@ static int __net_init tcp_sk_init(struct net *net)
 		if (res)
 			goto fail;
 		sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+		/* Please enforce IP_DF and IPID==0 for RST and
+		 * ACK sent in SYN-RECV and TIME-WAIT state.
+		 */
+		inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
 		*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
 	}
 
@@ -2517,10 +2634,10 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_orphan_retries = 0;
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
-	net->ipv4.sysctl_tcp_tw_reuse = 0;
+	net->ipv4.sysctl_tcp_tw_reuse = 2;
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
-	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
+	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
 	net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
 
 	net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
@@ -2542,8 +2659,8 @@ static int __net_init tcp_sk_init(struct net *net)
 	 * which are too large can cause TCP streams to be bursty.
 	 */
 	net->ipv4.sysctl_tcp_tso_win_divisor = 3;
-	/* Default TSQ limit of four TSO segments */
-	net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
+	/* Default TSQ limit of 16 TSO segments */
+	net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
 	/* rfc5961 challenge ack rate limiting */
 	net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
 	net->ipv4.sysctl_tcp_min_tso_segs = 2;
@@ -2560,6 +2677,8 @@ static int __net_init tcp_sk_init(struct net *net)
 		       init_net.ipv4.sysctl_tcp_wmem,
 		       sizeof(init_net.ipv4.sysctl_tcp_wmem));
 	}
+	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
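
The first hunk extends the tcp_tw_reuse sysctl with a third value: 2 allows a TIME-WAIT socket to be reused only when the old connection was clearly loopback traffic (a loopback source or destination address, or a socket bound to the lo interface), and the tcp_sk_init() hunk makes 2 the per-namespace default instead of 0. The following userspace sketch only inspects the setting; the interpretation of the three values mirrors this diff, and the program itself is illustrative, not part of the commit.

/* Illustrative sketch: read net.ipv4.tcp_tw_reuse and report what the value
 * means after this change (0 = never reuse, 1 = reuse for new outgoing
 * connections, 2 = reuse for loopback traffic only, the new default).
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "r");
	int val;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &val) != 1) {
		fclose(f);
		fprintf(stderr, "unexpected contents\n");
		return 1;
	}
	fclose(f);

	switch (val) {
	case 0: puts("0: TIME-WAIT reuse disabled"); break;
	case 1: puts("1: TIME-WAIT reuse enabled for new outgoing connections"); break;
	case 2: puts("2: TIME-WAIT reuse enabled for loopback traffic only"); break;
	default: printf("%d: unknown value\n", val); break;
	}
	return 0;
}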
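The same function also replaces the get_seconds() - tcptw->tw_ts_recent_stamp > 1 test with time_after32(ktime_get_seconds(), tcptw->tw_ts_recent_stamp), which stays correct when the 32-bit seconds value wraps. The sketch below reimplements an equivalent comparison locally for illustration; after32 here is a stand-in written for this example, not the kernel macro itself.

/* Wraparound-safe "a is after b" on 32-bit timestamps, equivalent in spirit
 * to the kernel's time_after32() used in the hunk above.
 */
#include <stdint.h>
#include <stdio.h>

#define after32(a, b)	((int32_t)((uint32_t)(b) - (uint32_t)(a)) < 0)

int main(void)
{
	uint32_t stamp = 0xfffffff0u;	/* just before the 32-bit wrap */
	uint32_t now   = 0x00000010u;	/* just after the wrap */

	/* A plain ">" gets the wrapped case wrong; the signed difference does not. */
	printf("naive now > stamp : %d\n", now > stamp);	  /* prints 0 */
	printf("after32(now, stamp): %d\n", after32(now, stamp)); /* prints 1 */
	return 0;
}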
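The tcp_v4_send_reset() and tcp_v4_send_ack() hunks copy the originating socket's mark (tw_mark for TIME-WAIT sockets, sk_mark otherwise) onto the per-CPU control socket before the reply is emitted, so kernel-generated RSTs and ACKs can match mark-based routing and netfilter rules. Below is a hedged sketch of how an application would give a connection such a mark; SO_MARK normally requires CAP_NET_ADMIN, and the destination address and port are placeholders.

/* Sketch: mark a TCP socket with SO_MARK. After this change, RST/ACK
 * segments the kernel emits on the connection's behalf (including from
 * TIME-WAIT) carry the same mark. 192.0.2.1:80 is a placeholder peer.
 */
#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	unsigned int mark = 0x42;
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port = htons(80),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr) != 1)
		return 1;
	if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0) {
		perror("SO_MARK");	/* typically EPERM without CAP_NET_ADMIN */
		close(fd);
		return 1;
	}
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("connect");
	close(fd);
	return 0;
}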