aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_ipv4.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_ipv4.c')
-rw-r--r--net/ipv4/tcp_ipv4.c205
1 files changed, 107 insertions, 98 deletions
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5ab2aac5ca19..85164d4d3e53 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -80,12 +80,11 @@
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/inetdevice.h>
#include <crypto/hash.h>
#include <linux/scatterlist.h>
-int sysctl_tcp_low_latency __read_mostly;
-
#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th);
@@ -102,10 +101,9 @@ static u32 tcp_v4_init_seq(const struct sk_buff *skb)
tcp_hdr(skb)->source);
}
-static u32 tcp_v4_init_ts_off(const struct sk_buff *skb)
+static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
- return secure_tcp_ts_off(ip_hdr(skb)->daddr,
- ip_hdr(skb)->saddr);
+ return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
@@ -242,7 +240,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_daddr,
inet->inet_sport,
usin->sin_port);
- tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr,
+ tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
+ inet->inet_saddr,
inet->inet_daddr);
}
@@ -376,14 +375,15 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
struct sock *sk;
struct sk_buff *skb;
struct request_sock *fastopen;
- __u32 seq, snd_una;
- __u32 remaining;
+ u32 seq, snd_una;
+ s32 remaining;
+ u32 delta_us;
int err;
struct net *net = dev_net(icmp_skb->dev);
sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
th->dest, iph->saddr, ntohs(th->source),
- inet_iif(icmp_skb));
+ inet_iif(icmp_skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return;
@@ -483,11 +483,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
skb = tcp_write_queue_head(sk);
BUG_ON(!skb);
+ tcp_mstamp_refresh(tp);
+ delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
remaining = icsk->icsk_rto -
- min(icsk->icsk_rto,
- tcp_time_stamp - tcp_skb_timestamp(skb));
+ usecs_to_jiffies(delta_us);
- if (remaining) {
+ if (remaining > 0) {
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
remaining, TCP_RTO_MAX);
} else {
@@ -658,7 +659,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
- ntohs(th->source), inet_iif(skb));
+ ntohs(th->source), inet_iif(skb),
+ tcp_v4_sdif(skb));
/* don't send rst if it can't find key */
if (!sk1)
goto out;
@@ -811,7 +813,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
tcp_v4_send_ack(sk, skb,
tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
- tcp_time_stamp + tcptw->tw_ts_offset,
+ tcp_time_stamp_raw() + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
tw->tw_bound_dev_if,
tcp_twsk_md5_key(tcptw),
@@ -839,7 +841,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp + tcp_rsk(req)->ts_off,
+ tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
req->ts_recent,
0,
tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
@@ -904,6 +906,48 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
+ const struct tcp_md5sig_info *md5sig;
+ __be32 mask;
+ struct tcp_md5sig_key *best_match = NULL;
+ bool match;
+
+ /* caller either holds rcu_read_lock() or socket lock */
+ md5sig = rcu_dereference_check(tp->md5sig_info,
+ lockdep_sock_is_held(sk));
+ if (!md5sig)
+ return NULL;
+
+ hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ if (key->family != family)
+ continue;
+
+ if (family == AF_INET) {
+ mask = inet_make_mask(key->prefixlen);
+ match = (key->addr.a4.s_addr & mask) ==
+ (addr->a4.s_addr & mask);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
+ key->prefixlen);
+#endif
+ } else {
+ match = false;
+ }
+
+ if (match && (!best_match ||
+ key->prefixlen > best_match->prefixlen))
+ best_match = key;
+ }
+ return best_match;
+}
+EXPORT_SYMBOL(tcp_md5_do_lookup);
+
+static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
+ const union tcp_md5_addr *addr,
+ int family, u8 prefixlen)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
unsigned int size = sizeof(struct in_addr);
const struct tcp_md5sig_info *md5sig;
@@ -919,12 +963,12 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
hlist_for_each_entry_rcu(key, &md5sig->head, node) {
if (key->family != family)
continue;
- if (!memcmp(&key->addr, addr, size))
+ if (!memcmp(&key->addr, addr, size) &&
+ key->prefixlen == prefixlen)
return key;
}
return NULL;
}
-EXPORT_SYMBOL(tcp_md5_do_lookup);
struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
@@ -938,14 +982,15 @@ EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
- int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
+ int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
+ gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup(sk, addr, family);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
if (key) {
/* Pre-existing entry - just update that one. */
memcpy(key->key, newkey, newkeylen);
@@ -976,6 +1021,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
memcpy(key->key, newkey, newkeylen);
key->keylen = newkeylen;
key->family = family;
+ key->prefixlen = prefixlen;
memcpy(&key->addr, addr,
(family == AF_INET6) ? sizeof(struct in6_addr) :
sizeof(struct in_addr));
@@ -984,11 +1030,12 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
EXPORT_SYMBOL(tcp_md5_do_add);
-int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
+int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
+ u8 prefixlen)
{
struct tcp_md5sig_key *key;
- key = tcp_md5_do_lookup(sk, addr, family);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
@@ -1014,11 +1061,12 @@ static void tcp_clear_md5_list(struct sock *sk)
}
}
-static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
- int optlen)
+static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
+ char __user *optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+ u8 prefixlen = 32;
if (optlen < sizeof(cmd))
return -EINVAL;
@@ -1029,15 +1077,22 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
if (sin->sin_family != AF_INET)
return -EINVAL;
+ if (optname == TCP_MD5SIG_EXT &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
+ prefixlen = cmd.tcpm_prefixlen;
+ if (prefixlen > 32)
+ return -EINVAL;
+ }
+
if (!cmd.tcpm_keylen)
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET);
+ AF_INET, prefixlen);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
+ AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
GFP_KERNEL);
}
@@ -1213,7 +1268,7 @@ static void tcp_v4_init_req(struct request_sock *req,
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
- ireq->opt = tcp_v4_save_options(skb);
+ ireq->opt = tcp_v4_save_options(sock_net(sk_listener), skb);
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
@@ -1340,7 +1395,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
* across. Shucks.
*/
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET, key->key, key->keylen, GFP_ATOMIC);
+ AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
}
#endif
@@ -1402,7 +1457,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
sk->sk_rx_dst = NULL;
}
}
- tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
+ tcp_rcv_established(sk, skb, tcp_hdr(skb));
return 0;
}
@@ -1448,28 +1503,28 @@ csum_err:
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
-void tcp_v4_early_demux(struct sk_buff *skb)
+int tcp_v4_early_demux(struct sk_buff *skb)
{
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
if (skb->pkt_type != PACKET_HOST)
- return;
+ return 0;
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
- return;
+ return 0;
iph = ip_hdr(skb);
th = tcp_hdr(skb);
if (th->doff < sizeof(struct tcphdr) / 4)
- return;
+ return 0;
sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
- skb->skb_iif);
+ skb->skb_iif, inet_sdif(skb));
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
@@ -1483,63 +1538,9 @@ void tcp_v4_early_demux(struct sk_buff *skb)
skb_dst_set_noref(skb, dst);
}
}
+ return 0;
}
-/* Packet is added to VJ-style prequeue for processing in process
- * context, if a reader task is waiting. Apparently, this exciting
- * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
- * failed somewhere. Latency? Burstiness? Well, at least now we will
- * see, why it failed. 8)8) --ANK
- *
- */
-bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (sysctl_tcp_low_latency || !tp->ucopy.task)
- return false;
-
- if (skb->len <= tcp_hdrlen(skb) &&
- skb_queue_len(&tp->ucopy.prequeue) == 0)
- return false;
-
- /* Before escaping RCU protected region, we need to take care of skb
- * dst. Prequeue is only enabled for established sockets.
- * For such sockets, we might need the skb dst only to set sk->sk_rx_dst
- * Instead of doing full sk_rx_dst validity here, let's perform
- * an optimistic check.
- */
- if (likely(sk->sk_rx_dst))
- skb_dst_drop(skb);
- else
- skb_dst_force_safe(skb);
-
- __skb_queue_tail(&tp->ucopy.prequeue, skb);
- tp->ucopy.memory += skb->truesize;
- if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
- tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
- struct sk_buff *skb1;
-
- BUG_ON(sock_owned_by_user(sk));
- __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
- skb_queue_len(&tp->ucopy.prequeue));
-
- while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
- sk_backlog_rcv(sk, skb1);
-
- tp->ucopy.memory = 0;
- } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
- wake_up_interruptible_sync_poll(sk_sleep(sk),
- POLLIN | POLLRDNORM | POLLRDBAND);
- if (!inet_csk_ack_scheduled(sk))
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- (3 * tcp_rto_min(sk)) / 4,
- TCP_RTO_MAX);
- }
- return true;
-}
-EXPORT_SYMBOL(tcp_prequeue);
-
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
@@ -1589,6 +1590,7 @@ EXPORT_SYMBOL(tcp_filter);
int tcp_v4_rcv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ int sdif = inet_sdif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
bool refcounted;
@@ -1636,10 +1638,12 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->tcp_tw_isn = 0;
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
+ TCP_SKB_CB(skb)->has_rxtstamp =
+ skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
- th->dest, &refcounted);
+ th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -1666,7 +1670,9 @@ process:
*/
sock_hold(sk);
refcounted = true;
- nsk = tcp_check_req(sk, skb, req, false);
+ nsk = NULL;
+ if (!tcp_filter(sk, skb))
+ nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_and_relse;
@@ -1712,8 +1718,7 @@ process:
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
- if (!tcp_prequeue(sk, skb))
- ret = tcp_v4_do_rcv(sk, skb);
+ ret = tcp_v4_do_rcv(sk, skb);
} else if (tcp_add_backlog(sk, skb)) {
goto discard_and_relse;
}
@@ -1766,7 +1771,8 @@ do_time_wait:
__tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
- inet_iif(skb));
+ inet_iif(skb),
+ sdif);
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
@@ -1858,6 +1864,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_cleanup_congestion_control(sk);
+ tcp_cleanup_ulp(sk);
+
/* Cleanup up the write buffer. */
tcp_write_queue_purge(sk);
@@ -1876,9 +1884,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
}
#endif
- /* Clean prequeue, it must be empty really */
- __skb_queue_purge(&tp->ucopy.prequeue);
-
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
@@ -2263,7 +2268,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
icsk->icsk_probes_out,
sock_i_ino(sk),
- atomic_read(&sk->sk_refcnt), sk,
+ refcount_read(&sk->sk_refcnt), sk,
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
@@ -2289,7 +2294,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
- atomic_read(&tw->tw_refcnt), tw);
+ refcount_read(&tw->tw_refcnt), tw);
}
#define TMPSZ 150
@@ -2385,6 +2390,7 @@ struct proto tcp_prot = {
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
+ .leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
@@ -2463,6 +2469,9 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
+ net->ipv4.sysctl_tcp_sack = 1;
+ net->ipv4.sysctl_tcp_window_scaling = 1;
+ net->ipv4.sysctl_tcp_timestamps = 1;
return 0;
fail: