From 1946e672c173559155a3e210fe95dbf8b7b8ddf7 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 28 Dec 2016 17:52:32 +0800 Subject: ipv4: Namespaceify tcp_tw_recycle and tcp_max_tw_buckets knob Different namespace application might require fast recycling TIME-WAIT sockets independently of the host. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 22cbd61079b5..66f8f1b1dc78 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -289,13 +289,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_max_tw_buckets", - .data = &tcp_death_row.sysctl_max_tw_buckets, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_fastopen", .data = &sysctl_tcp_fastopen, @@ -309,13 +302,6 @@ static struct ctl_table ipv4_table[] = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), .proc_handler = proc_tcp_fastopen_key, }, - { - .procname = "tcp_tw_recycle", - .data = &tcp_death_row.sysctl_tw_recycle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, @@ -960,6 +946,20 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_max_tw_buckets", + .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_tw_recycle", + .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", -- cgit From fee83d097b1620530f23bf6063f4ea251ba9c8c7 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Wed, 28 Dec 2016 17:52:33 +0800 Subject: ipv4: Namespaceify tcp_max_syn_backlog knob Different namespace application might require different maximal number of remembered connection requests. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + include/net/request_sock.h | 4 +--- net/core/request_sock.c | 2 -- net/ipv4/sysctl_net_ipv4.c | 14 +++++++------- net/ipv4/tcp.c | 2 -- net/ipv4/tcp_input.c | 4 ++-- net/ipv4/tcp_ipv4.c | 7 +++++-- 7 files changed, 16 insertions(+), 18 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index fffd38453985..8e3f5b6f26d5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -122,6 +122,7 @@ struct netns_ipv4 { unsigned int sysctl_tcp_notsent_lowat; int sysctl_tcp_tw_reuse; struct inet_timewait_death_row tcp_death_row; + int sysctl_max_syn_backlog; int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 6ebe13eb1c4c..a12a5d25b27e 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -1,7 +1,7 @@ /* * NET Generic infrastructure for Network protocols. * - * Definitions for request_sock + * Definitions for request_sock * * Authors: Arnaldo Carvalho de Melo * @@ -123,8 +123,6 @@ static inline void reqsk_put(struct request_sock *req) reqsk_free(req); } -extern int sysctl_max_syn_backlog; - /* * For a TCP Fast Open listener - * lock - protects the access to all the reqsk, which is co-owned by diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 5d26056b6d8f..9b8727c67b58 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -34,8 +34,6 @@ * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ -int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); void reqsk_queue_alloc(struct request_sock_queue *queue) { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 66f8f1b1dc78..134d8e191366 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -323,13 +323,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_max_syn_backlog", - .data = &sysctl_max_syn_backlog, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, @@ -960,6 +953,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_max_syn_backlog", + .data = &init_net.ipv4.sysctl_max_syn_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_IP_ROUTE_MULTIPATH { .procname = "fib_multipath_use_neigh", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7f0d81c090ce..2e3807d8eba8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3378,9 +3378,7 @@ void __init tcp_init(void) cnt = tcp_hashinfo.ehash_mask + 1; - sysctl_tcp_max_orphans = cnt / 2; - sysctl_max_syn_backlog = max(128, cnt / 256); tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c61480249835..ec6d84363024 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6377,8 +6377,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } /* Kill the following clause, if you dislike this way. */ else if (!net->ipv4.sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (sysctl_max_syn_backlog >> 2)) && + (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (net->ipv4.sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, tmp_opt.saw_tstamp)) { /* Without syncookies last quarter of diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 56b5f49e3f97..7e4be4f361f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2419,7 +2419,7 @@ static void __net_exit tcp_sk_exit(struct net *net) static int __net_init tcp_sk_init(struct net *net) { - int res, cpu; + int res, cpu, cnt; net->ipv4.tcp_sk = alloc_percpu(struct sock *); if (!net->ipv4.tcp_sk) @@ -2458,10 +2458,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 0; + cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_tw_recycle = 0; - net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (tcp_hashinfo.ehash_mask + 1) / 2; + net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; + net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256); + return 0; fail: tcp_sk_exit(net); -- cgit From 4a7f6009441144783e5925551c72e3f2e1b0839b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jan 2017 22:11:41 -0800 Subject: tcp: remove thin_dupack feature Thin stream DUPACK is to start fast recovery on only one DUPACK provided the connection is a thin stream (i.e., low inflight). But this older feature is now subsumed with RACK. If a connection receives only a single DUPACK, RACK would arm a reordering timer and soon starts fast recovery instead of timeout if no further ACKs are received. The socket option (THIN_DUPACK) is kept as a nop for compatibility. Note that this patch does not change another thin-stream feature which enables linear RTO. Although it might be good to generalize that in the future (i.e., linear RTO for the first say 3 retries). Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 12 ------------ include/linux/tcp.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 7 ------- net/ipv4/tcp.c | 6 ++---- net/ipv4/tcp_input.c | 13 ------------- 5 files changed, 3 insertions(+), 37 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 7de2cf79e16f..aa1bb49f1dc6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -703,18 +703,6 @@ tcp_thin_linear_timeouts - BOOLEAN Documentation/networking/tcp-thin.txt Default: 0 -tcp_thin_dupack - BOOLEAN - Enable dynamic triggering of retransmissions after one dupACK - for thin streams. If set, a check is performed upon reception - of a dupACK to determine if the stream is thin (less than 4 - packets in flight). As long as the stream is found to be thin, - data is retransmitted on the first received dupACK. This - improves retransmission latency for non-aggressive thin - streams, often found to be time-dependent. - For more information on thin streams, see - Documentation/networking/tcp-thin.txt - Default: 0 - tcp_limit_output_bytes - INTEGER Controls TCP Small Queue limit per tcp socket. TCP bulk sender tends to increase packets in flight until it diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4733368f953a..6c22332afb75 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -220,7 +220,7 @@ struct tcp_sock { unused:5; u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ - thin_dupack : 1,/* Fast retransmit on first dupack */ + unused1 : 1, repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0f2d37e8e983..c8d283615c6f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -536,13 +536,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_thin_dupack", - .data = &sysctl_tcp_thin_dupack, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_early_retrans", .data = &sysctl_tcp_early_retrans, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d9023e8ed53e..aba6ea76338e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2474,9 +2474,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; - else { - tp->thin_dupack = val; - } break; case TCP_REPAIR: @@ -2966,8 +2963,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; + case TCP_THIN_DUPACK: - val = tp->thin_dupack; + val = 0; break; case TCP_REPAIR: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87315ab1ab1a..39ebc20ca1b2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -95,9 +95,6 @@ int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_min_rtt_wlen __read_mostly = 300; - -int sysctl_tcp_thin_dupack __read_mostly; - int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; @@ -2170,16 +2167,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) if (tcp_dupack_heuristics(tp) > tp->reordering) return true; - /* If a thin stream is detected, retransmit after first - * received dupack. Employ only if SACK is supported in order - * to avoid possible corner-case series of spurious retransmissions - * Use only if there are no unsent data. - */ - if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && - tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && - tcp_is_sack(tp) && !tcp_send_head(sk)) - return true; - return false; } -- cgit From 4548b683b78137f8eadeb312b94e20bb0d4a7141 Mon Sep 17 00:00:00 2001 From: Krister Johansen Date: Fri, 20 Jan 2017 17:49:11 -0800 Subject: Introduce a sysctl that modifies the value of PROT_SOCK. Add net.ipv4.ip_unprivileged_port_start, which is a per namespace sysctl that denotes the first unprivileged inet port in the namespace. To disable all privileged ports set this to zero. It also checks for overlap with the local port range. The privileged and local range may not overlap. The use case for this change is to allow containerized processes to bind to priviliged ports, but prevent them from ever being allowed to modify their container's network configuration. The latter is accomplished by ensuring that the network namespace is not a child of the user namespace. This modification was needed to allow the container manager to disable a namespace's priviliged port restrictions without exposing control of the network namespace to processes in the user namespace. Signed-off-by: Krister Johansen Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 9 ++++++ include/net/ip.h | 10 +++++++ include/net/netns/ipv4.h | 1 + net/ipv4/af_inet.c | 5 +++- net/ipv4/sysctl_net_ipv4.c | 50 +++++++++++++++++++++++++++++++++- net/ipv6/af_inet6.c | 3 +- net/netfilter/ipvs/ip_vs_ctl.c | 7 ++--- net/sctp/socket.c | 10 ++++--- security/selinux/hooks.c | 3 +- 9 files changed, 86 insertions(+), 12 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index aa1bb49f1dc6..17f2e7791042 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -822,6 +822,15 @@ ip_local_reserved_ports - list of comma separated ranges Default: Empty +ip_unprivileged_port_start - INTEGER + This is a per-namespace sysctl. It defines the first + unprivileged port in the network namespace. Privileged ports + require root or CAP_NET_BIND_SERVICE in order to bind to them. + To disable all privileged ports, set this to 0. It may not + overlap with the ip_local_reserved_ports range. + + Default: 1024 + ip_nonlocal_bind - BOOLEAN If set, allows processes to bind() to non-local IP addresses, which can be quite useful - but may break some applications. diff --git a/include/net/ip.h b/include/net/ip.h index ab6761a7c883..bf264a8db1ce 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -263,11 +263,21 @@ static inline bool sysctl_dev_name_is_allowed(const char *name) return strcmp(name, "default") != 0 && strcmp(name, "all") != 0; } +static inline int inet_prot_sock(struct net *net) +{ + return net->ipv4.sysctl_ip_prot_sock; +} + #else static inline int inet_is_local_reserved_port(struct net *net, int port) { return 0; } + +static inline int inet_prot_sock(struct net *net) +{ + return PROT_SOCK; +} #endif __be32 inet_current_timestamp(void); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 8e3f5b6f26d5..e365732b8051 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,7 @@ struct netns_ipv4 { #ifdef CONFIG_SYSCTL unsigned long *sysctl_local_reserved_ports; + int sysctl_ip_prot_sock; #endif #ifdef CONFIG_IP_MROUTE diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index aae410bb655a..28fe8da4e1ac 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -479,7 +479,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && snum < PROT_SOCK && + if (snum && snum < inet_prot_sock(net) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; @@ -1700,6 +1700,9 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; +#ifdef CONFIG_SYSCTL + net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; +#endif return 0; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c8d283615c6f..1b861997fdc5 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int ip_privileged_port_min; +static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; static int ip_ttl_max = 255; static int tcp_syn_retries_min = 1; @@ -79,7 +81,12 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { - if (range[1] < range[0]) + /* Ensure that the upper limit is not smaller than the lower, + * and that the lower does not encroach upon the privileged + * port limit. + */ + if ((range[1] < range[0]) || + (range[0] < net->ipv4.sysctl_ip_prot_sock)) ret = -EINVAL; else set_local_port_range(net, range); @@ -88,6 +95,40 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, return ret; } +/* Validate changes from /proc interface. */ +static int ipv4_privileged_ports(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_ip_prot_sock); + int ret; + int pports; + int range[2]; + struct ctl_table tmp = { + .data = &pports, + .maxlen = sizeof(pports), + .mode = table->mode, + .extra1 = &ip_privileged_port_min, + .extra2 = &ip_privileged_port_max, + }; + + pports = net->ipv4.sysctl_ip_prot_sock; + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) { + inet_get_local_port_range(net, &range[0], &range[1]); + /* Ensure that the local port range doesn't overlap with the + * privileged port range. + */ + if (range[0] < pports) + ret = -EINVAL; + else + net->ipv4.sysctl_ip_prot_sock = pports; + } + + return ret; +} static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high) { @@ -964,6 +1005,13 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = &one, }, #endif + { + .procname = "ip_unprivileged_port_start", + .maxlen = sizeof(int), + .data = &init_net.ipv4.sysctl_ip_prot_sock, + .mode = 0644, + .proc_handler = ipv4_privileged_ports, + }, { } }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index aa42123bc301..04db40620ea6 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -302,7 +302,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && snum < PROT_SOCK && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) + if (snum && snum < inet_prot_sock(net) && + !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; lock_sock(sk); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 55e0169caa4c..8b7416f4e01a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -426,10 +426,9 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); - if (svc == NULL - && protocol == IPPROTO_TCP - && atomic_read(&ipvs->ftpsvc_counter) - && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + if (!svc && protocol == IPPROTO_TCP && + atomic_read(&ipvs->ftpsvc_counter) && + (vport == FTPDATA || ntohs(vport) >= inet_prot_sock(ipvs->net))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bee4dd3feabb..d699d2cbf275 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -360,7 +360,7 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) } } - if (snum && snum < PROT_SOCK && + if (snum && snum < inet_prot_sock(net) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -1152,8 +1152,10 @@ static int __sctp_connect(struct sock *sk, * accept new associations, but it SHOULD NOT * be permitted to open new associations. */ - if (ep->base.bind_addr.port < PROT_SOCK && - !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { + if (ep->base.bind_addr.port < + inet_prot_sock(net) && + !ns_capable(net->user_ns, + CAP_NET_BIND_SERVICE)) { err = -EACCES; goto out_free; } @@ -1818,7 +1820,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * but it SHOULD NOT be permitted to open new * associations. */ - if (ep->base.bind_addr.port < PROT_SOCK && + if (ep->base.bind_addr.port < inet_prot_sock(net) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { err = -EACCES; goto out_unlock; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c7c6619431d5..53cb6da5f1c6 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4365,7 +4365,8 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in inet_get_local_port_range(sock_net(sk), &low, &high); - if (snum < max(PROT_SOCK, low) || snum > high) { + if (snum < max(inet_prot_sock(sock_net(sk)), low) || + snum > high) { err = sel_netport_sid(sk->sk_protocol, snum, &sid); if (err) -- cgit From 63a6fff353d01da5a22b72670c434bf12fa0e3b8 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Thu, 26 Jan 2017 18:02:24 +0000 Subject: net: Avoid receiving packets with an l3mdev on unbound UDP sockets Packets arriving in a VRF currently are delivered to UDP sockets that aren't bound to any interface. TCP defaults to not delivering packets arriving in a VRF to unbound sockets. IP route lookup and socket transmit both assume that unbound means using the default table and UDP applications that haven't been changed to be aware of VRFs may not function correctly in this case since they may not be able to handle overlapping IP address ranges, or be able to send packets back to the original sender if required. So add a sysctl, udp_l3mdev_accept, to control this behaviour with it being analgous to the existing tcp_l3mdev_accept, namely to allow a process to have a VRF-global listen socket. Have this default to off as this is the behaviour that users will expect, given that there is no explicit mechanism to set unmodified VRF-unaware application into a default VRF. Signed-off-by: Robert Shearman Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ Documentation/networking/vrf.txt | 7 ++++--- include/net/netns/ipv4.h | 4 ++++ net/ipv4/sysctl_net_ipv4.c | 11 +++++++++++ net/ipv4/udp.c | 27 ++++++++++++++++++++------- net/ipv6/udp.c | 27 ++++++++++++++++++++------- 6 files changed, 66 insertions(+), 17 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 17f2e7791042..fc73eeb7b3b8 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -721,6 +721,13 @@ tcp_challenge_ack_limit - INTEGER UDP variables: +udp_l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + udp_mem - vector of 3 INTEGERs: min, pressure, max Number of pages allowed for queueing by all UDP sockets. diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index 755dab856392..3918dae964d4 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -98,10 +98,11 @@ VRF device: or to specify the output device using cmsg and IP_PKTINFO. -TCP services running in the default VRF context (ie., not bound to any VRF -device) can work across all VRF domains by enabling the tcp_l3mdev_accept -sysctl option: +TCP & UDP services running in the default VRF context (ie., not bound +to any VRF device) can work across all VRF domains by enabling the +tcp_l3mdev_accept and udp_l3mdev_accept sysctl options: sysctl -w net.ipv4.tcp_l3mdev_accept=1 + sysctl -w net.ipv4.udp_l3mdev_accept=1 netfilter rules on the VRF device can be used to limit access to services running in the default VRF context as well. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e365732b8051..622d2da27135 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -124,6 +124,10 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; +#ifdef CONFIG_NET_L3_MASTER_DEV + int sysctl_udp_l3mdev_accept; +#endif + int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; int sysctl_igmp_llm_reports; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1b861997fdc5..d6880a6149ee 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1012,6 +1012,17 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_privileged_ports, }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "udp_l3mdev_accept", + .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d6dddcf59e79..cf6ba3387401 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -134,6 +134,17 @@ EXPORT_SYMBOL(udp_memory_allocated); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) +/* IPCB reference means this can not be used from early demux */ +static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_udp_l3mdev_accept && + skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) + return true; +#endif + return false; +} + static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -369,7 +380,8 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, - __be32 daddr, unsigned short hnum, int dif) + __be32 daddr, unsigned short hnum, int dif, + bool exact_dif) { int score; struct inet_sock *inet; @@ -400,7 +412,7 @@ static int compute_score(struct sock *sk, struct net *net, score += 4; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score += 4; @@ -425,7 +437,7 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, - __be32 daddr, unsigned int hnum, int dif, + __be32 daddr, unsigned int hnum, int dif, bool exact_dif, struct udp_hslot *hslot2, struct sk_buff *skb) { @@ -437,7 +449,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -472,6 +484,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + bool exact_dif = udp_lib_exact_dif_match(net, skb); int score, badness, matches = 0, reuseport = 0; u32 hash = 0; @@ -484,7 +497,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, skb); + exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); @@ -499,7 +512,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, skb); + exact_dif, hslot2, skb); } return result; } @@ -508,7 +521,7 @@ begin: badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 05d69324862e..b4c6516a3a0c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -55,6 +55,16 @@ #include #include "udp_impl.h" +static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb) +{ +#if defined(CONFIG_NET_L3_MASTER_DEV) + if (!net->ipv4.sysctl_udp_l3mdev_accept && + skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) + return true; +#endif + return false; +} + static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -118,7 +128,7 @@ static void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif) + int dif, bool exact_dif) { int score; struct inet_sock *inet; @@ -149,7 +159,7 @@ static int compute_score(struct sock *sk, struct net *net, score++; } - if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if || exact_dif) { if (sk->sk_bound_dev_if != dif) return -1; score++; @@ -165,7 +175,7 @@ static int compute_score(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, + bool exact_dif, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; @@ -176,7 +186,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif); + daddr, hnum, dif, exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { @@ -212,6 +222,7 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; + bool exact_dif = udp6_lib_exact_dif_match(net, skb); int score, badness, matches = 0, reuseport = 0; u32 hash = 0; @@ -223,7 +234,7 @@ struct sock *__udp6_lib_lookup(struct net *net, goto begin; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, + daddr, hnum, dif, exact_dif, hslot2, skb); if (!result) { unsigned int old_slot2 = slot2; @@ -239,7 +250,8 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, skb); + exact_dif, hslot2, + skb); } return result; } @@ -247,7 +259,8 @@ begin: result = NULL; badness = -1; sk_for_each_rcu(sk, &hslot->head) { - score = compute_score(sk, net, saddr, sport, daddr, hnum, dif); + score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, + exact_dif); if (score > badness) { reuseport = sk->sk_reuseport; if (reuseport) { -- cgit