aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <[email protected]>2020-03-12 12:08:10 -0700
committerDavid S. Miller <[email protected]>2020-03-12 12:08:10 -0700
commit93e616131a3825189570524d4792b5223bf1d68c (patch)
tree46fadd2182ac940b8f992ca32d32eb017eca43b4
parentaf91fd7e17f12119763b95ccf526bce4b0693144 (diff)
parent7f204a7de8b08542aca3c1daa96ed20e1177ba87 (diff)
Merge branch 'bind_addr_zero'
Kuniyuki Iwashima says: ==================== Improve bind(addr, 0) behaviour. Currently we fail to bind sockets to ephemeral ports when all of the ports are exhausted even if all sockets have SO_REUSEADDR enabled. In this case, we still have a chance to connect to the different remote hosts. These patches add net.ipv4.ip_autobind_reuse option and fix the behaviour to fully utilize all space of the local (addr, port) tuples. Changes in v5: - Add more description to documents. - Fix sysctl option to use proc_dointvec_minmax. - Remove the Fixes: tag and squash two commits. Changes in v4: - Add net.ipv4.ip_autobind_reuse option to not change the current behaviour. - Modify .gitignore for test. https://lore.kernel.org/netdev/[email protected]/ Changes in v3: - Change the title and write more specific description of the 3rd patch. - Add a test in tools/testing/selftests/net/ as the 4th patch. https://lore.kernel.org/netdev/[email protected]/ Changes in v2: - Change the description of the 2nd patch ('localhost' -> 'address'). - Correct the description and the if statement of the 3rd patch. https://lore.kernel.org/netdev/[email protected]/ v1 with tests: https://lore.kernel.org/netdev/[email protected]/ ==================== Reviewed-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]>
-rw-r--r--Documentation/networking/ip-sysctl.txt9
-rw-r--r--include/net/netns/ipv4.h1
-rw-r--r--net/ipv4/inet_connection_sock.c36
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--tools/testing/selftests/net/.gitignore1
-rw-r--r--tools/testing/selftests/net/Makefile2
-rw-r--r--tools/testing/selftests/net/reuseaddr_ports_exhausted.c162
-rwxr-xr-xtools/testing/selftests/net/reuseaddr_ports_exhausted.sh35
8 files changed, 243 insertions, 12 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 5f53faff4e25..ee961d322d93 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -958,6 +958,15 @@ ip_nonlocal_bind - BOOLEAN
which can be quite useful - but may break some applications.
Default: 0
+ip_autobind_reuse - BOOLEAN
+ By default, bind() does not select the ports automatically even if
+ the new socket and all sockets bound to the port have SO_REUSEADDR.
+ ip_autobind_reuse allows bind() to reuse the port and this is useful
+ when you use bind()+connect(), but may break some applications.
+ The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this
+ option should only be set by experts.
+ Default: 0
+
ip_dynaddr - BOOLEAN
If set non-zero, enables support for dynamic addresses.
If set to a non-zero value larger than 1, a kernel log
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 08b98414d94e..154b8f01499b 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -101,6 +101,7 @@ struct netns_ipv4 {
int sysctl_ip_fwd_use_pmtu;
int sysctl_ip_fwd_update_priority;
int sysctl_ip_nonlocal_bind;
+ int sysctl_ip_autobind_reuse;
/* Shall we try to damage output packets if routing dev changes? */
int sysctl_ip_dynaddr;
int sysctl_ip_early_demux;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index a4db79b1b643..3b4f81790e3e 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -131,7 +131,7 @@ static int inet_csk_bind_conflict(const struct sock *sk,
{
struct sock *sk2;
bool reuse = sk->sk_reuse;
- bool reuseport = !!sk->sk_reuseport && reuseport_ok;
+ bool reuseport = !!sk->sk_reuseport;
kuid_t uid = sock_i_uid((struct sock *)sk);
/*
@@ -146,17 +146,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
(!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if ((!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- (!reuseport || !sk2->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- (sk2->sk_state != TCP_TIME_WAIT &&
- !uid_eq(uid, sock_i_uid(sk2))))) {
- if (inet_rcv_saddr_equal(sk, sk2, true))
- break;
- }
- if (!relax && reuse && sk2->sk_reuse &&
+ if (reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN) {
+ if ((!relax ||
+ (!reuseport_ok &&
+ reuseport && sk2->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ (sk2->sk_state == TCP_TIME_WAIT ||
+ uid_eq(uid, sock_i_uid(sk2))))) &&
+ inet_rcv_saddr_equal(sk, sk2, true))
+ break;
+ } else if (!reuseport_ok ||
+ !reuseport || !sk2->sk_reuseport ||
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sock_i_uid(sk2)))) {
if (inet_rcv_saddr_equal(sk, sk2, true))
break;
}
@@ -176,12 +180,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
int port = 0;
struct inet_bind_hashbucket *head;
struct net *net = sock_net(sk);
+ bool relax = false;
int i, low, high, attempt_half;
struct inet_bind_bucket *tb;
u32 remaining, offset;
int l3mdev;
l3mdev = inet_sk_bound_l3mdev(sk);
+ports_exhausted:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
inet_get_local_port_range(net, &low, &high);
@@ -219,7 +225,7 @@ other_parity_scan:
inet_bind_bucket_for_each(tb, &head->chain)
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
tb->port == port) {
- if (!inet_csk_bind_conflict(sk, tb, false, false))
+ if (!inet_csk_bind_conflict(sk, tb, relax, false))
goto success;
goto next_port;
}
@@ -239,6 +245,12 @@ next_port:
attempt_half = 2;
goto other_half_scan;
}
+
+ if (net->ipv4.sysctl_ip_autobind_reuse && !relax) {
+ /* We still have a chance to connect to different destinations */
+ relax = true;
+ goto ports_exhausted;
+ }
return NULL;
success:
*port_ret = port;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d9531b4b33f2..81b267e990a1 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -764,6 +764,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_autobind_reuse",
+ .data = &init_net.ipv4.sysctl_ip_autobind_reuse,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "fwmark_reflect",
.data = &init_net.ipv4.sysctl_fwmark_reflect,
.maxlen = sizeof(int),
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index ecc52d4c034d..91f9aea853b1 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -23,3 +23,4 @@ so_txtime
tcp_fastopen_backup_key
nettest
fin_ack_lat
+reuseaddr_ports_exhausted \ No newline at end of file
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 287ae916ec0b..48063fd69924 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -12,6 +12,7 @@ TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_a
TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh
TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh
TEST_PROGS += fin_ack_lat.sh
+TEST_PROGS += reuseaddr_ports_exhausted.sh
TEST_PROGS_EXTENDED := in_netns.sh
TEST_GEN_FILES = socket nettest
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
@@ -22,6 +23,7 @@ TEST_GEN_FILES += tcp_fastopen_backup_key
TEST_GEN_FILES += fin_ack_lat
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
+TEST_GEN_FILES += reuseaddr_ports_exhausted
KSFT_KHDR_INSTALL := 1
include ../lib.mk
diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
new file mode 100644
index 000000000000..7b01b7c2ec10
--- /dev/null
+++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Check if we can fully utilize 4-tuples for connect().
+ *
+ * Rules to bind sockets to the same port when all ephemeral ports are
+ * exhausted.
+ *
+ * 1. if there are TCP_LISTEN sockets on the port, fail to bind.
+ * 2. if there are sockets without SO_REUSEADDR, fail to bind.
+ * 3. if SO_REUSEADDR is disabled, fail to bind.
+ * 4. if SO_REUSEADDR is enabled and SO_REUSEPORT is disabled,
+ * succeed to bind.
+ * 5. if SO_REUSEADDR and SO_REUSEPORT are enabled and
+ * there is no socket having the both options and the same EUID,
+ * succeed to bind.
+ * 6. fail to bind.
+ *
+ * Author: Kuniyuki Iwashima <[email protected]>
+ */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+
+struct reuse_opts {
+ int reuseaddr[2];
+ int reuseport[2];
+};
+
+struct reuse_opts unreusable_opts[12] = {
+ {0, 0, 0, 0},
+ {0, 0, 0, 1},
+ {0, 0, 1, 0},
+ {0, 0, 1, 1},
+ {0, 1, 0, 0},
+ {0, 1, 0, 1},
+ {0, 1, 1, 0},
+ {0, 1, 1, 1},
+ {1, 0, 0, 0},
+ {1, 0, 0, 1},
+ {1, 0, 1, 0},
+ {1, 0, 1, 1},
+};
+
+struct reuse_opts reusable_opts[4] = {
+ {1, 1, 0, 0},
+ {1, 1, 0, 1},
+ {1, 1, 1, 0},
+ {1, 1, 1, 1},
+};
+
+int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport)
+{
+ struct sockaddr_in local_addr;
+ int len = sizeof(local_addr);
+ int fd, ret;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_NE(-1, fd) TH_LOG("failed to open socket.");
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(int));
+ ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEADDR.");
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(int));
+ ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEPORT.");
+
+ local_addr.sin_family = AF_INET;
+ local_addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+ local_addr.sin_port = 0;
+
+ if (bind(fd, (struct sockaddr *)&local_addr, len) == -1) {
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+TEST(reuseaddr_ports_exhausted_unreusable)
+{
+ struct reuse_opts *opts;
+ int i, j, fd[2];
+
+ for (i = 0; i < 12; i++) {
+ opts = &unreusable_opts[i];
+
+ for (j = 0; j < 2; j++)
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+ EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind.");
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST(reuseaddr_ports_exhausted_reusable_same_euid)
+{
+ struct reuse_opts *opts;
+ int i, j, fd[2];
+
+ for (i = 0; i < 4; i++) {
+ opts = &reusable_opts[i];
+
+ for (j = 0; j < 2; j++)
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+
+ if (opts->reuseport[0] && opts->reuseport[1]) {
+ EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened.");
+ } else {
+ EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations.");
+ }
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST(reuseaddr_ports_exhausted_reusable_different_euid)
+{
+ struct reuse_opts *opts;
+ int i, j, ret, fd[2];
+ uid_t euid[2] = {10, 20};
+
+ for (i = 0; i < 4; i++) {
+ opts = &reusable_opts[i];
+
+ for (j = 0; j < 2; j++) {
+ ret = seteuid(euid[j]);
+ ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: %d.", euid[j]);
+
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ret = seteuid(0);
+ ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: 0.");
+ }
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+ EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind because one socket can be bound in each euid.");
+
+ if (fd[1] != -1) {
+ ret = listen(fd[0], 5);
+ ASSERT_EQ(0, ret) TH_LOG("failed to listen.");
+
+ ret = listen(fd[1], 5);
+ EXPECT_EQ(-1, ret) TH_LOG("should fail to listen because only one uid reserves the port in TCP_LISTEN.");
+ }
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh
new file mode 100755
index 000000000000..20e3a2913d06
--- /dev/null
+++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run tests when all ephemeral ports are exhausted.
+#
+# Author: Kuniyuki Iwashima <[email protected]>
+
+set +x
+set -e
+
+readonly NETNS="ns-$(mktemp -u XXXXXX)"
+
+setup() {
+ ip netns add "${NETNS}"
+ ip -netns "${NETNS}" link set lo up
+ ip netns exec "${NETNS}" \
+ sysctl -w net.ipv4.ip_local_port_range="32768 32768" \
+ > /dev/null 2>&1
+ ip netns exec "${NETNS}" \
+ sysctl -w net.ipv4.ip_autobind_reuse=1 > /dev/null 2>&1
+}
+
+cleanup() {
+ ip netns del "${NETNS}"
+}
+
+trap cleanup EXIT
+setup
+
+do_test() {
+ ip netns exec "${NETNS}" ./reuseaddr_ports_exhausted
+}
+
+do_test
+echo "tests done"