diff options
author | Jakub Kicinski <[email protected]> | 2021-12-17 19:27:07 -0800 |
---|---|---|
committer | Jakub Kicinski <[email protected]> | 2021-12-17 19:27:07 -0800 |
commit | 7e1c5d7b6926f63d3750a273486a143e50d216ad (patch) | |
tree | 45a328723f709840ef9fa7e0961c35dfa2e8b7d5 | |
parent | ab9d0e2171be56bb3bcd0fe4bd8ae5d2f24e5a80 (diff) | |
parent | 59060a47ca50bbdb1d863b73667a1065873ecc06 (diff) |
Merge branch 'mptcp-miscellaneous-changes-for-5-17'
Mat Martineau says:
====================
mptcp: Miscellaneous changes for 5.17
These are three unrelated patches that we've been testing in the MPTCP
tree.
Patch 1 modifies the packet scheduler that picks which TCP subflow is
used for each chunk of outgoing data. The updated scheduler improves
throughput on multiple-subflow connections.
Patch 2 updates a selftest to verify recent TCP_ULP sockopt changes
on MPTCP fallback sockets.
Patch 3 cleans up some unnecessary comparisons with an 8-bit value.
====================
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
-rw-r--r-- | net/mptcp/pm_netlink.c | 8 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 72 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 1 | ||||
-rw-r--r-- | tools/testing/selftests/net/mptcp/mptcp_connect.c | 97 | ||||
-rwxr-xr-x | tools/testing/selftests/net/mptcp/mptcp_connect.sh | 20 |
5 files changed, 103 insertions, 95 deletions
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 3186d33b5208..6cde58c259a8 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -38,7 +38,8 @@ struct mptcp_pm_add_entry { u8 retrans_times; }; -#define MAX_ADDR_ID 255 +/* max value of mptcp_addr_info.id */ +#define MAX_ADDR_ID U8_MAX #define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG) struct pm_nl_pernet { @@ -825,14 +826,13 @@ find_next: entry->addr.id = find_next_zero_bit(pernet->id_bitmap, MAX_ADDR_ID + 1, pernet->next_id); - if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) && - pernet->next_id != 1) { + if (!entry->addr.id && pernet->next_id != 1) { pernet->next_id = 1; goto find_next; } } - if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID) + if (!entry->addr.id) goto out; __set_bit(entry->addr.id, pernet->id_bitmap); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3e549f6190c0..df5a0cf431c1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1372,7 +1372,7 @@ out: struct subflow_send_info { struct sock *ssk; - u64 ratio; + u64 linger_time; }; void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) @@ -1397,20 +1397,24 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) return __mptcp_subflow_active(subflow); } +#define SSK_MODE_ACTIVE 0 +#define SSK_MODE_BACKUP 1 +#define SSK_MODE_MAX 2 + /* implement the mptcp packet scheduler; * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { - struct subflow_send_info send_info[2]; + struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; + u32 pace, burst, wmem; int i, nr_active = 0; struct sock *ssk; + u64 linger_time; long tout = 0; - u64 ratio; - u32 pace; sock_owned_by_me(sk); @@ -1429,10 +1433,11 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } /* pick the subflow with the lower wmem/wspace ratio */ - for (i = 0; i < 2; ++i) { + for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; - send_info[i].ratio = -1; + send_info[i].linger_time = -1; } + mptcp_for_each_subflow(msk, subflow) { trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); @@ -1441,34 +1446,51 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !subflow->backup; - if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) - continue; - - pace = READ_ONCE(ssk->sk_pacing_rate); - if (!pace) - continue; + pace = subflow->avg_pacing_rate; + if (unlikely(!pace)) { + /* init pacing rate from socket */ + subflow->avg_pacing_rate = READ_ONCE(ssk->sk_pacing_rate); + pace = subflow->avg_pacing_rate; + if (!pace) + continue; + } - ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, - pace); - if (ratio < send_info[subflow->backup].ratio) { + linger_time = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32, pace); + if (linger_time < send_info[subflow->backup].linger_time) { send_info[subflow->backup].ssk = ssk; - send_info[subflow->backup].ratio = ratio; + send_info[subflow->backup].linger_time = linger_time; } } __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) - send_info[0].ssk = send_info[1].ssk; - - if (send_info[0].ssk) { - msk->last_snd = send_info[0].ssk; - msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE, - tcp_sk(msk->last_snd)->snd_wnd); - return msk->last_snd; - } + send_info[SSK_MODE_ACTIVE].ssk = send_info[SSK_MODE_BACKUP].ssk; + + /* According to the blest algorithm, to avoid HoL blocking for the + * faster flow, we need to: + * - estimate the faster flow linger time + * - use the above to estimate the amount of byte transferred + * by the faster flow + * - check that the amount of queued data is greter than the above, + * otherwise do not use the picked, slower, subflow + * We select the subflow with the shorter estimated time to flush + * the queued mem, which basically ensure the above. We just need + * to check that subflow has a non empty cwin. + */ + ssk = send_info[SSK_MODE_ACTIVE].ssk; + if (!ssk || !sk_stream_memory_free(ssk) || !tcp_sk(ssk)->snd_wnd) + return NULL; - return NULL; + burst = min_t(int, MPTCP_SEND_BURST_SIZE, tcp_sk(ssk)->snd_wnd); + wmem = READ_ONCE(ssk->sk_wmem_queued); + subflow = mptcp_subflow_ctx(ssk); + subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + + READ_ONCE(ssk->sk_pacing_rate) * burst, + burst + wmem); + msk->last_snd = ssk; + msk->snd_burst = burst; + return ssk; } static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e1469155fb15..0486c9f5b38b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -395,6 +395,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); /* MPTCP subflow context */ struct mptcp_subflow_context { struct list_head node;/* conn_list of subflows */ + unsigned long avg_pacing_rate; /* protected by msk socket lock */ u64 local_key; u64 remote_key; u64 idsn; diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 98de28ac3ba8..a30e93c5c549 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -59,7 +59,6 @@ static enum cfg_peek cfg_peek = CFG_NONE_PEEK; static const char *cfg_host; static const char *cfg_port = "12000"; static int cfg_sock_proto = IPPROTO_MPTCP; -static bool tcpulp_audit; static int pf = AF_INET; static int cfg_sndbuf; static int cfg_rcvbuf; @@ -103,7 +102,6 @@ static void die_usage(void) fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n"); fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n"); fprintf(stderr, "\t-M mark -- set socket packet mark\n"); - fprintf(stderr, "\t-u -- check mptcp ulp\n"); fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n"); fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n"); fprintf(stderr, "\t-o option -- test sockopt <option>\n"); @@ -215,6 +213,42 @@ static void set_transparent(int fd, int pf) } } +static int do_ulp_so(int sock, const char *name) +{ + return setsockopt(sock, IPPROTO_TCP, TCP_ULP, name, strlen(name)); +} + +#define X(m) xerror("%s:%u: %s: failed for proto %d at line %u", __FILE__, __LINE__, (m), proto, line) +static void sock_test_tcpulp(int sock, int proto, unsigned int line) +{ + socklen_t buflen = 8; + char buf[8] = ""; + int ret = getsockopt(sock, IPPROTO_TCP, TCP_ULP, buf, &buflen); + + if (ret != 0) + X("getsockopt"); + + if (buflen > 0) { + if (strcmp(buf, "mptcp") != 0) + xerror("unexpected ULP '%s' for proto %d at line %u", buf, proto, line); + ret = do_ulp_so(sock, "tls"); + if (ret == 0) + X("setsockopt"); + } else if (proto == IPPROTO_MPTCP) { + ret = do_ulp_so(sock, "tls"); + if (ret != -1) + X("setsockopt"); + } + + ret = do_ulp_so(sock, "mptcp"); + if (ret != -1) + X("setsockopt"); + +#undef X +} + +#define SOCK_TEST_TCPULP(s, p) sock_test_tcpulp((s), (p), __LINE__) + static int sock_listen_mptcp(const char * const listenaddr, const char * const port) { @@ -238,6 +272,8 @@ static int sock_listen_mptcp(const char * const listenaddr, if (sock < 0) continue; + SOCK_TEST_TCPULP(sock, cfg_sock_proto); + if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one))) perror("setsockopt"); @@ -260,50 +296,17 @@ static int sock_listen_mptcp(const char * const listenaddr, return sock; } + SOCK_TEST_TCPULP(sock, cfg_sock_proto); + if (listen(sock, 20)) { perror("listen"); close(sock); return -1; } - return sock; -} + SOCK_TEST_TCPULP(sock, cfg_sock_proto); -static bool sock_test_tcpulp(const char * const remoteaddr, - const char * const port) -{ - struct addrinfo hints = { - .ai_protocol = IPPROTO_TCP, - .ai_socktype = SOCK_STREAM, - }; - struct addrinfo *a, *addr; - int sock = -1, ret = 0; - bool test_pass = false; - - hints.ai_family = AF_INET; - - xgetaddrinfo(remoteaddr, port, &hints, &addr); - for (a = addr; a; a = a->ai_next) { - sock = socket(a->ai_family, a->ai_socktype, IPPROTO_TCP); - if (sock < 0) { - perror("socket"); - continue; - } - ret = setsockopt(sock, IPPROTO_TCP, TCP_ULP, "mptcp", - sizeof("mptcp")); - if (ret == -1 && errno == EOPNOTSUPP) - test_pass = true; - close(sock); - - if (test_pass) - break; - if (!ret) - fprintf(stderr, - "setsockopt(TCP_ULP) returned 0\n"); - else - perror("setsockopt(TCP_ULP)"); - } - return test_pass; + return sock; } static int sock_connect_mptcp(const char * const remoteaddr, @@ -326,6 +329,8 @@ static int sock_connect_mptcp(const char * const remoteaddr, continue; } + SOCK_TEST_TCPULP(sock, proto); + if (cfg_mark) set_mark(sock, cfg_mark); @@ -338,6 +343,8 @@ static int sock_connect_mptcp(const char * const remoteaddr, } freeaddrinfo(addr); + if (sock != -1) + SOCK_TEST_TCPULP(sock, proto); return sock; } @@ -954,6 +961,8 @@ int main_loop_s(int listensock) check_sockaddr(pf, &ss, salen); check_getpeername(remotesock, &ss, salen); + SOCK_TEST_TCPULP(remotesock, 0); + return copyfd_io(0, remotesock, 1); } @@ -1059,6 +1068,8 @@ int main_loop(void) check_getpeername_connect(fd); + SOCK_TEST_TCPULP(fd, cfg_sock_proto); + if (cfg_rcvbuf) set_rcvbuf(fd, cfg_rcvbuf); if (cfg_sndbuf) @@ -1151,7 +1162,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6jr:lp:s:hut:T:m:S:R:w:M:P:c:o:")) != -1) { + while ((c = getopt(argc, argv, "6jr:lp:s:ht:T:m:S:R:w:M:P:c:o:")) != -1) { switch (c) { case 'j': cfg_join = true; @@ -1177,9 +1188,6 @@ static void parse_opts(int argc, char **argv) case 'h': die_usage(); break; - case 'u': - tcpulp_audit = true; - break; case '6': pf = AF_INET6; break; @@ -1233,9 +1241,6 @@ int main(int argc, char *argv[]) signal(SIGUSR1, handle_signal); parse_opts(argc, argv); - if (tcpulp_audit) - return sock_test_tcpulp(cfg_host, cfg_port) ? 0 : 1; - if (listen_mode) { int fd = sock_listen_mptcp(cfg_host, cfg_port); diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index a4226b608c68..ee28f82a6c89 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -296,24 +296,6 @@ check_mptcp_disabled() return 0 } -check_mptcp_ulp_setsockopt() -{ - local t retval - t="ns_ulp-$sech-$(mktemp -u XXXXXX)" - - ip netns add ${t} || exit $ksft_skip - if ! ip netns exec ${t} ./mptcp_connect -u -p 10000 -s TCP 127.0.0.1 2>&1; then - printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) allowed\t[ FAIL ]\n" - retval=1 - ret=$retval - else - printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) blocked\t[ OK ]\n" - retval=0 - fi - ip netns del ${t} - return $retval -} - # $1: IP address is_v6() { @@ -780,8 +762,6 @@ make_file "$sin" "server" check_mptcp_disabled -check_mptcp_ulp_setsockopt - stop_if_error "The kernel configuration is not valid for MPTCP" echo "INFO: validating network environment with pings" |