diff options
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 205 |
1 files changed, 107 insertions, 98 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5ab2aac5ca19..85164d4d3e53 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -80,12 +80,11 @@ #include <linux/stddef.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/inetdevice.h> #include <crypto/hash.h> #include <linux/scatterlist.h> -int sysctl_tcp_low_latency __read_mostly; - #ifdef CONFIG_TCP_MD5SIG static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th); @@ -102,10 +101,9 @@ static u32 tcp_v4_init_seq(const struct sk_buff *skb) tcp_hdr(skb)->source); } -static u32 tcp_v4_init_ts_off(const struct sk_buff *skb) +static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb) { - return secure_tcp_ts_off(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr); + return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -242,7 +240,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, usin->sin_port); - tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr, + tp->tsoffset = secure_tcp_ts_off(sock_net(sk), + inet->inet_saddr, inet->inet_daddr); } @@ -376,14 +375,15 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) struct sock *sk; struct sk_buff *skb; struct request_sock *fastopen; - __u32 seq, snd_una; - __u32 remaining; + u32 seq, snd_una; + s32 remaining; + u32 delta_us; int err; struct net *net = dev_net(icmp_skb->dev); sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb)); + inet_iif(icmp_skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; @@ -483,11 +483,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) skb = tcp_write_queue_head(sk); BUG_ON(!skb); + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp); remaining = icsk->icsk_rto - - min(icsk->icsk_rto, - tcp_time_stamp - tcp_skb_timestamp(skb)); + usecs_to_jiffies(delta_us); - if (remaining) { + if (remaining > 0) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); } else { @@ -658,7 +659,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, - ntohs(th->source), inet_iif(skb)); + ntohs(th->source), inet_iif(skb), + tcp_v4_sdif(skb)); /* don't send rst if it can't find key */ if (!sk1) goto out; @@ -811,7 +813,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) tcp_v4_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcp_time_stamp + tcptw->tw_ts_offset, + tcp_time_stamp_raw() + tcptw->tw_ts_offset, tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw), @@ -839,7 +841,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, - tcp_time_stamp + tcp_rsk(req)->ts_off, + tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr, @@ -904,6 +906,48 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; + const struct tcp_md5sig_info *md5sig; + __be32 mask; + struct tcp_md5sig_key *best_match = NULL; + bool match; + + /* caller either holds rcu_read_lock() or socket lock */ + md5sig = rcu_dereference_check(tp->md5sig_info, + lockdep_sock_is_held(sk)); + if (!md5sig) + return NULL; + + hlist_for_each_entry_rcu(key, &md5sig->head, node) { + if (key->family != family) + continue; + + if (family == AF_INET) { + mask = inet_make_mask(key->prefixlen); + match = (key->addr.a4.s_addr & mask) == + (addr->a4.s_addr & mask); +#if IS_ENABLED(CONFIG_IPV6) + } else if (family == AF_INET6) { + match = ipv6_prefix_equal(&key->addr.a6, &addr->a6, + key->prefixlen); +#endif + } else { + match = false; + } + + if (match && (!best_match || + key->prefixlen > best_match->prefixlen)) + best_match = key; + } + return best_match; +} +EXPORT_SYMBOL(tcp_md5_do_lookup); + +static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, + const union tcp_md5_addr *addr, + int family, u8 prefixlen) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *key; unsigned int size = sizeof(struct in_addr); const struct tcp_md5sig_info *md5sig; @@ -919,12 +963,12 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; - if (!memcmp(&key->addr, addr, size)) + if (!memcmp(&key->addr, addr, size) && + key->prefixlen == prefixlen) return key; } return NULL; } -EXPORT_SYMBOL(tcp_md5_do_lookup); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) @@ -938,14 +982,15 @@ EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) + int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, + gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup(sk, addr, family); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); if (key) { /* Pre-existing entry - just update that one. */ memcpy(key->key, newkey, newkeylen); @@ -976,6 +1021,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, memcpy(key->key, newkey, newkeylen); key->keylen = newkeylen; key->family = family; + key->prefixlen = prefixlen; memcpy(&key->addr, addr, (family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -984,11 +1030,12 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } EXPORT_SYMBOL(tcp_md5_do_add); -int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family) +int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, + u8 prefixlen) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup(sk, addr, family); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1014,11 +1061,12 @@ static void tcp_clear_md5_list(struct sock *sk) } } -static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, - int optlen) +static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, + char __user *optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; + u8 prefixlen = 32; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1029,15 +1077,22 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, if (sin->sin_family != AF_INET) return -EINVAL; + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { + prefixlen = cmd.tcpm_prefixlen; + if (prefixlen > 32) + return -EINVAL; + } + if (!cmd.tcpm_keylen) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET); + AF_INET, prefixlen); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, cmd.tcpm_key, cmd.tcpm_keylen, + AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -1213,7 +1268,7 @@ static void tcp_v4_init_req(struct request_sock *req, sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->opt = tcp_v4_save_options(skb); + ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb); } static struct dst_entry *tcp_v4_route_req(const struct sock *sk, @@ -1340,7 +1395,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * across. Shucks. */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET, key->key, key->keylen, GFP_ATOMIC); + AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif @@ -1402,7 +1457,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len); + tcp_rcv_established(sk, skb, tcp_hdr(skb)); return 0; } @@ -1448,28 +1503,28 @@ csum_err: } EXPORT_SYMBOL(tcp_v4_do_rcv); -void tcp_v4_early_demux(struct sk_buff *skb) +int tcp_v4_early_demux(struct sk_buff *skb) { const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) - return; + return 0; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) - return; + return 0; iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) - return; + return 0; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), - skb->skb_iif); + skb->skb_iif, inet_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; @@ -1483,63 +1538,9 @@ void tcp_v4_early_demux(struct sk_buff *skb) skb_dst_set_noref(skb, dst); } } + return 0; } -/* Packet is added to VJ-style prequeue for processing in process - * context, if a reader task is waiting. Apparently, this exciting - * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) - * failed somewhere. Latency? Burstiness? Well, at least now we will - * see, why it failed. 8)8) --ANK - * - */ -bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (sysctl_tcp_low_latency || !tp->ucopy.task) - return false; - - if (skb->len <= tcp_hdrlen(skb) && - skb_queue_len(&tp->ucopy.prequeue) == 0) - return false; - - /* Before escaping RCU protected region, we need to take care of skb - * dst. Prequeue is only enabled for established sockets. - * For such sockets, we might need the skb dst only to set sk->sk_rx_dst - * Instead of doing full sk_rx_dst validity here, let's perform - * an optimistic check. - */ - if (likely(sk->sk_rx_dst)) - skb_dst_drop(skb); - else - skb_dst_force_safe(skb); - - __skb_queue_tail(&tp->ucopy.prequeue, skb); - tp->ucopy.memory += skb->truesize; - if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || - tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { - struct sk_buff *skb1; - - BUG_ON(sock_owned_by_user(sk)); - __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, - skb_queue_len(&tp->ucopy.prequeue)); - - while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk_backlog_rcv(sk, skb1); - - tp->ucopy.memory = 0; - } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { - wake_up_interruptible_sync_poll(sk_sleep(sk), - POLLIN | POLLRDNORM | POLLRDBAND); - if (!inet_csk_ack_scheduled(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - (3 * tcp_rto_min(sk)) / 4, - TCP_RTO_MAX); - } - return true; -} -EXPORT_SYMBOL(tcp_prequeue); - bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf; @@ -1589,6 +1590,7 @@ EXPORT_SYMBOL(tcp_filter); int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + int sdif = inet_sdif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1636,10 +1638,12 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; + TCP_SKB_CB(skb)->has_rxtstamp = + skb->tstamp || skb_hwtstamps(skb)->hwtstamp; lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, - th->dest, &refcounted); + th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -1666,7 +1670,9 @@ process: */ sock_hold(sk); refcounted = true; - nsk = tcp_check_req(sk, skb, req, false); + nsk = NULL; + if (!tcp_filter(sk, skb)) + nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); goto discard_and_relse; @@ -1712,8 +1718,7 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v4_do_rcv(sk, skb); + ret = tcp_v4_do_rcv(sk, skb); } else if (tcp_add_backlog(sk, skb)) { goto discard_and_relse; } @@ -1766,7 +1771,8 @@ do_time_wait: __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, - inet_iif(skb)); + inet_iif(skb), + sdif); if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; @@ -1858,6 +1864,8 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_cleanup_congestion_control(sk); + tcp_cleanup_ulp(sk); + /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); @@ -1876,9 +1884,6 @@ void tcp_v4_destroy_sock(struct sock *sk) } #endif - /* Clean prequeue, it must be empty really */ - __skb_queue_purge(&tp->ucopy.prequeue); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); @@ -2263,7 +2268,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), icsk->icsk_probes_out, sock_i_ino(sk), - atomic_read(&sk->sk_refcnt), sk, + refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, @@ -2289,7 +2294,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw, " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK", i, src, srcp, dest, destp, tw->tw_substate, 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, - atomic_read(&tw->tw_refcnt), tw); + refcount_read(&tw->tw_refcnt), tw); } #define TMPSZ 150 @@ -2385,6 +2390,7 @@ struct proto tcp_prot = { .unhash = inet_unhash, .get_port = inet_csk_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, + .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, @@ -2463,6 +2469,9 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); + net->ipv4.sysctl_tcp_sack = 1; + net->ipv4.sysctl_tcp_window_scaling = 1; + net->ipv4.sysctl_tcp_timestamps = 1; return 0; fail: |