diff options
-rw-r--r-- | Documentation/networking/mptcp-sysctl.rst | 8 | ||||
-rw-r--r-- | include/net/mptcp.h | 21 | ||||
-rw-r--r-- | net/mptcp/Makefile | 2 | ||||
-rw-r--r-- | net/mptcp/ctrl.c | 14 | ||||
-rw-r--r-- | net/mptcp/pm.c | 9 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 3 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 277 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 18 | ||||
-rw-r--r-- | net/mptcp/sched.c | 173 |
9 files changed, 393 insertions, 132 deletions
diff --git a/Documentation/networking/mptcp-sysctl.rst b/Documentation/networking/mptcp-sysctl.rst index 213510698014..15f1919d640c 100644 --- a/Documentation/networking/mptcp-sysctl.rst +++ b/Documentation/networking/mptcp-sysctl.rst @@ -74,3 +74,11 @@ stale_loss_cnt - INTEGER This is a per-namespace sysctl. Default: 4 + +scheduler - STRING + Select the scheduler of your choice. + + Support for selection of different schedulers. This is a per-namespace + sysctl. + + Default: "default" diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3c5c68618fcc..fb996124b3d5 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -96,6 +96,27 @@ struct mptcp_out_options { #endif }; +#define MPTCP_SCHED_NAME_MAX 16 +#define MPTCP_SUBFLOWS_MAX 8 + +struct mptcp_sched_data { + bool reinject; + u8 subflows; + struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; +}; + +struct mptcp_sched_ops { + int (*get_subflow)(struct mptcp_sock *msk, + struct mptcp_sched_data *data); + + char name[MPTCP_SCHED_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + #ifdef CONFIG_MPTCP void mptcp_init(void); diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index a3829ce548f9..84e531f86b82 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o + mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index ae20b7d92e28..c46c22a84d23 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -32,6 +32,7 @@ struct mptcp_pernet { u8 checksum_enabled; u8 allow_join_initial_addr_port; u8 pm_type; + char scheduler[MPTCP_SCHED_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) @@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net) return mptcp_get_pernet(net)->pm_type; } +const char *mptcp_get_scheduler(const struct net *net) +{ + return mptcp_get_pernet(net)->scheduler; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; @@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; + strcpy(pernet->scheduler, "default"); } #ifdef CONFIG_SYSCTL @@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, + { + .procname = "scheduler", + .maxlen = MPTCP_SCHED_NAME_MAX, + .mode = 0644, + .proc_handler = proc_dostring, + }, {} }; @@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; table[5].data = &pernet->pm_type; + table[6].data = &pernet->scheduler; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 7dbbad1e4f55..d8da5374d9e1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup) pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup); msk = mptcp_sk(sk); - if (subflow->backup != bkup) { + if (subflow->backup != bkup) subflow->backup = bkup; - mptcp_data_lock(sk); - if (!sock_owned_by_user(sk)) - msk->last_snd = NULL; - else - __set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags); - mptcp_data_unlock(sk); - } mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC); } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index c75d9d88a053..9661f3812682 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -472,9 +472,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con slow = lock_sock_fast(ssk); if (prio) { - if (subflow->backup != backup) - msk->last_snd = NULL; - subflow->send_mp_prio = 1; subflow->backup = backup; subflow->request_bkup = backup; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 6019a3cf1625..933b257eee02 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1366,7 +1366,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) * returns the subflow that will transmit the next DSS * additionally updates the rtx timeout */ -static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[SSK_MODE_MAX]; struct mptcp_subflow_context *subflow; @@ -1377,23 +1377,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) u64 linger_time; long tout = 0; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) { - if (!msk->first) - return NULL; - return __tcp_can_send(msk->first) && - sk_stream_memory_free(msk->first) ? msk->first : NULL; - } - - /* re-use last subflow, if the burst allow that */ - if (msk->last_snd && msk->snd_burst > 0 && - sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { - mptcp_set_timeout(sk); - return msk->last_snd; - } - /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < SSK_MODE_MAX; ++i) { send_info[i].ssk = NULL; @@ -1446,16 +1429,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); - if (!burst) { - msk->last_snd = NULL; + if (!burst) return ssk; - } subflow = mptcp_subflow_ctx(ssk); subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem + READ_ONCE(ssk->sk_pacing_rate) * burst, burst + wmem); - msk->last_snd = ssk; msk->snd_burst = burst; return ssk; } @@ -1499,64 +1479,106 @@ void mptcp_check_and_set_pending(struct sock *sk) mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); } -void __mptcp_push_pending(struct sock *sk, unsigned int flags) +static int __subflow_push_pending(struct sock *sk, struct sock *ssk, + struct mptcp_sendmsg_info *info) { - struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_sendmsg_info info = { - .flags = flags, - }; - bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len; + int len, copied = 0, err = 0; while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; + info->sent = dfrag->already_sent; + info->limit = dfrag->data_len; len = dfrag->data_len - dfrag->already_sent; while (len > 0) { int ret = 0; - prev_ssk = ssk; - ssk = mptcp_subflow_get_send(msk); - - /* First check. If the ssk has changed since - * the last round, release prev_ssk - */ - if (ssk != prev_ssk && prev_ssk) - mptcp_push_release(prev_ssk, &info); - if (!ssk) - goto out; - - /* Need to lock the new subflow only if different - * from the previous one, otherwise we are still - * helding the relevant lock - */ - if (ssk != prev_ssk) - lock_sock(ssk); - - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info); if (ret <= 0) { - if (ret == -EAGAIN) - continue; - mptcp_push_release(ssk, &info); + err = copied ? : ret; goto out; } - do_check_data_fin = true; - info.sent += ret; + info->sent += ret; + copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + + if (msk->snd_burst <= 0 || + !sk_stream_memory_free(ssk) || + !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) { + err = copied; + goto out; + } + mptcp_set_timeout(sk); + } + err = copied; + +out: + return err; +} + +void __mptcp_push_pending(struct sock *sk, unsigned int flags) +{ + struct sock *prev_ssk = NULL, *ssk = NULL; + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_sendmsg_info info = { + .flags = flags, + }; + bool do_check_data_fin = false; + int push_count = 1; + + while (mptcp_send_head(sk) && (push_count > 0)) { + struct mptcp_subflow_context *subflow; + int ret = 0; + + if (mptcp_sched_get_send(msk)) + break; + + push_count = 0; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); + + prev_ssk = ssk; + ssk = mptcp_subflow_tcp_sock(subflow); + if (ssk != prev_ssk) { + /* First check. If the ssk has changed since + * the last round, release prev_ssk + */ + if (prev_ssk) + mptcp_push_release(prev_ssk, &info); + + /* Need to lock the new subflow only if different + * from the previous one, otherwise we are still + * helding the relevant lock + */ + lock_sock(ssk); + } + + push_count++; + + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) { + if (ret != -EAGAIN || + (1 << ssk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE)) + push_count--; + continue; + } + do_check_data_fin = true; + } + } } /* at this point we held the socket lock for the last subflow we used */ if (ssk) mptcp_push_release(ssk, &info); -out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); @@ -1570,42 +1592,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool struct mptcp_sendmsg_info info = { .data_lock_held = true, }; - struct mptcp_data_frag *dfrag; + bool keep_pushing = true; struct sock *xmit_ssk; - int len, copied = 0; + int copied = 0; info.flags = 0; - while ((dfrag = mptcp_send_head(sk))) { - info.sent = dfrag->already_sent; - info.limit = dfrag->data_len; - len = dfrag->data_len - dfrag->already_sent; - while (len > 0) { - int ret = 0; - - /* check for a different subflow usage only after - * spooling the first chunk of data - */ - xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk); - if (!xmit_ssk) - goto out; - if (xmit_ssk != ssk) { - mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), - MPTCP_DELEGATE_SEND); - goto out; - } + while (mptcp_send_head(sk) && keep_pushing) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + int ret = 0; - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + /* check for a different subflow usage only after + * spooling the first chunk of data + */ + if (first) { + mptcp_subflow_set_scheduled(subflow, false); + ret = __subflow_push_pending(sk, ssk, &info); + first = false; if (ret <= 0) - goto out; + break; + copied += ret; + continue; + } + + if (mptcp_sched_get_send(msk)) + goto out; - info.sent += ret; + if (READ_ONCE(subflow->scheduled)) { + mptcp_subflow_set_scheduled(subflow, false); + ret = __subflow_push_pending(sk, ssk, &info); + if (ret <= 0) + keep_pushing = false; copied += ret; - len -= ret; - first = false; + } - mptcp_update_post_push(msk, dfrag, ret); + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + xmit_ssk = mptcp_subflow_tcp_sock(subflow); + if (xmit_ssk != ssk) { + mptcp_subflow_delegate(subflow, + MPTCP_DELEGATE_SEND); + keep_pushing = false; + } + } } - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } out: @@ -2198,17 +2227,12 @@ static void mptcp_timeout_timer(struct timer_list *t) * * A backup subflow is returned only if that is the only kind available. */ -static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) +struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; int min_stale_count = INT_MAX; - msk_owned_by_me(msk); - - if (__mptcp_check_fallback(msk)) - return NULL; - mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -2370,9 +2394,6 @@ out_release: WRITE_ONCE(msk->first, NULL); out: - if (ssk == msk->last_snd) - msk->last_snd = NULL; - if (need_push) __mptcp_push_pending(sk, 0); } @@ -2489,16 +2510,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) static void __mptcp_retrans(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; struct mptcp_sendmsg_info info = {}; struct mptcp_data_frag *dfrag; - size_t copied = 0; struct sock *ssk; - int ret; + int ret, err; + u16 len = 0; mptcp_clean_una_wakeup(sk); /* first check ssk: need to kick "stale" logic */ - ssk = mptcp_subflow_get_retrans(msk); + err = mptcp_sched_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { @@ -2517,32 +2539,45 @@ static void __mptcp_retrans(struct sock *sk) goto reset_timer; } - if (!ssk) + if (err) goto reset_timer; - lock_sock(ssk); + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) { + u16 copied = 0; - /* limit retransmission to the bytes already sent on some subflows */ - info.sent = 0; - info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; - while (info.sent < info.limit) { - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); - if (ret <= 0) - break; + mptcp_subflow_set_scheduled(subflow, false); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); - copied += ret; - info.sent += ret; - } - if (copied) { - dfrag->already_sent = max(dfrag->already_sent, info.sent); - msk->bytes_retrans += copied; - tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, - info.size_goal); - WRITE_ONCE(msk->allow_infinite_fallback, false); + ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* limit retransmission to the bytes already sent on some subflows */ + info.sent = 0; + info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : + dfrag->already_sent; + while (info.sent < info.limit) { + ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); + if (ret <= 0) + break; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); + copied += ret; + info.sent += ret; + } + if (copied) { + len = max(copied, len); + tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, + info.size_goal); + WRITE_ONCE(msk->allow_infinite_fallback, false); + } + + release_sock(ssk); + } } - release_sock(ssk); + msk->bytes_retrans += len; + dfrag->already_sent = max(dfrag->already_sent, len); reset_timer: mptcp_check_and_set_pending(sk); @@ -2694,6 +2729,7 @@ static void mptcp_ca_reset(struct sock *sk) static int mptcp_init_sock(struct sock *sk) { struct net *net = sock_net(sk); + int ret; __mptcp_init_sock(sk); @@ -2703,6 +2739,11 @@ static int mptcp_init_sock(struct sock *sk) if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; + ret = mptcp_init_sched(mptcp_sk(sk), + mptcp_sched_find(mptcp_get_scheduler(net))); + if (ret) + return ret; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will @@ -2848,6 +2889,7 @@ static void __mptcp_destroy_sock(struct sock *sk) mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); msk->pm.status = 0; + mptcp_release_sched(msk); sk->sk_prot->destroy(sk); @@ -3037,7 +3079,6 @@ static int mptcp_disconnect(struct sock *sk, int flags) * subflow */ mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); - msk->last_snd = NULL; WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->push_pending = 0; @@ -3103,6 +3144,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, msk->snd_una = msk->write_seq; msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; + mptcp_init_sched(msk, mptcp_sk(sk)->sched); /* passive msk is created after the first/MPC subflow */ msk->subflow_id = 2; @@ -3307,8 +3349,6 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_set_connected(sk); if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags)) __mptcp_error_report(sk); - if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags)) - msk->last_snd = NULL; } __mptcp_update_rmem(sk); @@ -3925,6 +3965,7 @@ void __init mptcp_proto_init(void) mptcp_subflow_init(); mptcp_pm_init(); + mptcp_sched_init(); mptcp_token_init(); if (proto_register(&mptcp_prot, 1) != 0) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 38c7ea013361..7254b3562575 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -123,7 +123,6 @@ #define MPTCP_RETRANSMIT 4 #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_CONNECTED 6 -#define MPTCP_RESET_SCHEDULER 7 struct mptcp_skb_cb { u64 map_seq; @@ -269,7 +268,6 @@ struct mptcp_sock { u64 rcv_data_fin_seq; u64 bytes_retrans; int rmem_fwd_alloc; - struct sock *last_snd; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; @@ -314,6 +312,7 @@ struct mptcp_sock { * lock as such sock is freed after close(). */ struct mptcp_pm_data pm; + struct mptcp_sched_ops *sched; struct { u32 space; /* bytes copied in last measurement window */ u32 copied; /* bytes copied in this measurement window */ @@ -492,6 +491,7 @@ struct mptcp_subflow_context { is_mptfo : 1, /* subflow is doing TFO */ __unused : 9; enum mptcp_data_avail data_avail; + bool scheduled; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -625,6 +625,7 @@ int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); +const char *mptcp_get_scheduler(const struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); @@ -657,6 +658,19 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); +struct mptcp_sched_ops *mptcp_sched_find(const char *name); +int mptcp_register_scheduler(struct mptcp_sched_ops *sched); +void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); +void mptcp_sched_init(void); +int mptcp_init_sched(struct mptcp_sock *msk, + struct mptcp_sched_ops *sched); +void mptcp_release_sched(struct mptcp_sock *msk); +void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, + bool scheduled); +struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk); +struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk); +int mptcp_sched_get_send(struct mptcp_sock *msk); +int mptcp_sched_get_retrans(struct mptcp_sock *msk); static inline bool __tcp_can_send(const struct sock *ssk) { diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c new file mode 100644 index 000000000000..4ab0693c069c --- /dev/null +++ b/net/mptcp/sched.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2022, SUSE. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> +#include "protocol.h" + +static DEFINE_SPINLOCK(mptcp_sched_list_lock); +static LIST_HEAD(mptcp_sched_list); + +static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + struct sock *ssk; + + ssk = data->reinject ? mptcp_subflow_get_retrans(msk) : + mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; +} + +static struct mptcp_sched_ops mptcp_sched_default = { + .get_subflow = mptcp_sched_default_get_subflow, + .name = "default", + .owner = THIS_MODULE, +}; + +/* Must be called with rcu read lock held */ +struct mptcp_sched_ops *mptcp_sched_find(const char *name) +{ + struct mptcp_sched_ops *sched, *ret = NULL; + + list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { + if (!strcmp(sched->name, name)) { + ret = sched; + break; + } + } + + return ret; +} + +int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +{ + if (!sched->get_subflow) + return -EINVAL; + + spin_lock(&mptcp_sched_list_lock); + if (mptcp_sched_find(sched->name)) { + spin_unlock(&mptcp_sched_list_lock); + return -EEXIST; + } + list_add_tail_rcu(&sched->list, &mptcp_sched_list); + spin_unlock(&mptcp_sched_list_lock); + + pr_debug("%s registered", sched->name); + return 0; +} + +void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) +{ + if (sched == &mptcp_sched_default) + return; + + spin_lock(&mptcp_sched_list_lock); + list_del_rcu(&sched->list); + spin_unlock(&mptcp_sched_list_lock); +} + +void mptcp_sched_init(void) +{ + mptcp_register_scheduler(&mptcp_sched_default); +} + +int mptcp_init_sched(struct mptcp_sock *msk, + struct mptcp_sched_ops *sched) +{ + if (!sched) + sched = &mptcp_sched_default; + + if (!bpf_try_module_get(sched, sched->owner)) + return -EBUSY; + + msk->sched = sched; + if (msk->sched->init) + msk->sched->init(msk); + + pr_debug("sched=%s", msk->sched->name); + + return 0; +} + +void mptcp_release_sched(struct mptcp_sock *msk) +{ + struct mptcp_sched_ops *sched = msk->sched; + + if (!sched) + return; + + msk->sched = NULL; + if (sched->release) + sched->release(msk); + + bpf_module_put(sched, sched->owner); +} + +void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, + bool scheduled) +{ + WRITE_ONCE(subflow->scheduled, scheduled); +} + +int mptcp_sched_get_send(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sched_data data; + + msk_owned_by_me(msk); + + /* the following check is moved out of mptcp_subflow_get_send */ + if (__mptcp_check_fallback(msk)) { + if (msk->first && + __tcp_can_send(msk->first) && + sk_stream_memory_free(msk->first)) { + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true); + return 0; + } + return -EINVAL; + } + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) + return 0; + } + + data.reinject = false; + if (msk->sched == &mptcp_sched_default || !msk->sched) + return mptcp_sched_default_get_subflow(msk, &data); + return msk->sched->get_subflow(msk, &data); +} + +int mptcp_sched_get_retrans(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sched_data data; + + msk_owned_by_me(msk); + + /* the following check is moved out of mptcp_subflow_get_retrans */ + if (__mptcp_check_fallback(msk)) + return -EINVAL; + + mptcp_for_each_subflow(msk, subflow) { + if (READ_ONCE(subflow->scheduled)) + return 0; + } + + data.reinject = true; + if (msk->sched == &mptcp_sched_default || !msk->sched) + return mptcp_sched_default_get_subflow(msk, &data); + return msk->sched->get_subflow(msk, &data); +} |