diff options
author | Jakub Kicinski <[email protected]> | 2024-01-26 20:34:28 -0800 |
---|---|---|
committer | Jakub Kicinski <[email protected]> | 2024-01-26 20:34:28 -0800 |
commit | 1cf05e2508ab70ec1af56d1e62e79520d7b8dd6e (patch) | |
tree | 4345fae53e80824da6164e33c5f2577ffb92616d | |
parent | fb4bb62aaac715e50c7c007714af19a2698db88b (diff) | |
parent | d9f21b3613337b55cc9d4a6ead484dca68475143 (diff) |
Merge branch 'af_unix-random-improvements-for-gc'
Kuniyuki Iwashima says:
====================
af_unix: Random improvements for GC.
If more than 16000 inflight AF_UNIX sockets exist on a host, each
sendmsg() will be forced to wait for unix_gc() even if a process
is not sending any FD.
This series tries not to impose such a penalty on sane users who
do not send AF_UNIX FDs or do not have inflight sockets more than
SCM_MAX_FD * 8.
The first patch can be backported to -stable.
Cleanup patches for commit 69db702c8387 ("io_uring/af_unix: disable
sending io_uring over sockets") and large refactoring of GC will
be followed later.
v4: https://lore.kernel.org/netdev/[email protected]/
v3: https://lore.kernel.org/netdev/[email protected]/
v2: https://lore.kernel.org/netdev/[email protected]/
v1: https://lore.kernel.org/netdev/[email protected]/
====================
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
-rw-r--r-- | include/net/af_unix.h | 14 | ||||
-rw-r--r-- | include/net/scm.h | 1 | ||||
-rw-r--r-- | net/core/scm.c | 5 | ||||
-rw-r--r-- | net/unix/af_unix.c | 10 | ||||
-rw-r--r-- | net/unix/garbage.c | 98 | ||||
-rw-r--r-- | net/unix/scm.c | 27 |
6 files changed, 85 insertions, 70 deletions
diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 49c4640027d8..f045bbd9017d 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -8,13 +8,21 @@ #include <linux/refcount.h> #include <net/sock.h> +#if IS_ENABLED(CONFIG_UNIX) +struct unix_sock *unix_get_socket(struct file *filp); +#else +static inline struct unix_sock *unix_get_socket(struct file *filp) +{ + return NULL; +} +#endif + void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); void unix_destruct_scm(struct sk_buff *skb); void io_uring_destruct_scm(struct sk_buff *skb); void unix_gc(void); -void wait_for_unix_gc(void); -struct sock *unix_get_socket(struct file *filp); +void wait_for_unix_gc(struct scm_fp_list *fpl); struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) @@ -61,7 +69,7 @@ struct unix_sock { struct mutex iolock, bindlock; struct sock *peer; struct list_head link; - atomic_long_t inflight; + unsigned long inflight; spinlock_t lock; unsigned long gc_flags; #define UNIX_GC_CANDIDATE 0 diff --git a/include/net/scm.h b/include/net/scm.h index cf68acec4d70..92276a2c5543 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -25,6 +25,7 @@ struct scm_creds { struct scm_fp_list { short count; + short count_unix; short max; struct user_struct *user; struct file *fp[SCM_MAX_FD]; diff --git a/net/core/scm.c b/net/core/scm.c index d0e0852a24d5..9cd4b0a01cd6 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -36,6 +36,7 @@ #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> +#include <net/af_unix.h> /* @@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; } @@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fput(file); return -EINVAL; } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ac1f2bc18fc9..1720419d93d6 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -993,11 +993,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; - u = unix_sk(sk); + u = unix_sk(sk); + u->inflight = 0; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ @@ -1923,11 +1923,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -2199,11 +2200,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2405f0f9af31..4046c606f0e6 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -86,7 +86,6 @@ /* Internal data structures and random procedures: */ static LIST_HEAD(gc_candidates); -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) @@ -105,20 +104,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), while (nfd--) { /* Get the socket the fd matches if it indeed does so */ - struct sock *sk = unix_get_socket(*fp++); + struct unix_sock *u = unix_get_socket(*fp++); - if (sk) { - struct unix_sock *u = unix_sk(sk); + /* Ignore non-candidates, they could have been added + * to the queues after starting the garbage collection + */ + if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { + hit = true; - /* Ignore non-candidates, they could - * have been added to the queues after - * starting the garbage collection - */ - if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } + func(u); } } if (hit && hitlist != NULL) { @@ -166,17 +160,18 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), static void dec_inflight(struct unix_sock *usk) { - atomic_long_dec(&usk->inflight); + usk->inflight--; } static void inc_inflight(struct unix_sock *usk) { - atomic_long_inc(&usk->inflight); + usk->inflight++; } static void inc_inflight_move_tail(struct unix_sock *u) { - atomic_long_inc(&u->inflight); + u->inflight++; + /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over @@ -186,23 +181,8 @@ static void inc_inflight_move_tail(struct unix_sock *u) } static bool gc_in_progress; -#define UNIX_INFLIGHT_TRIGGER_GC 16000 -void wait_for_unix_gc(void) -{ - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight() and gc_in_progress(). - */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); - wait_event(unix_gc_wait, gc_in_progress == false); -} - -/* The external entry point: unix_gc() */ -void unix_gc(void) +static void __unix_gc(struct work_struct *work) { struct sk_buff *next_skb, *skb; struct unix_sock *u; @@ -213,13 +193,6 @@ void unix_gc(void) spin_lock(&unix_gc_lock); - /* Avoid a recursive GC. */ - if (gc_in_progress) - goto out; - - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ - WRITE_ONCE(gc_in_progress, true); - /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. @@ -237,14 +210,12 @@ void unix_gc(void) */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; - long inflight_refs; total_refs = file_count(u->sk.sk_socket->file); - inflight_refs = atomic_long_read(&u->inflight); - BUG_ON(inflight_refs < 1); - BUG_ON(total_refs < inflight_refs); - if (total_refs == inflight_refs) { + BUG_ON(!u->inflight); + BUG_ON(total_refs < u->inflight); + if (total_refs == u->inflight) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); @@ -271,7 +242,7 @@ void unix_gc(void) /* Move cursor to after the current position. */ list_move(&cursor, &u->link); - if (atomic_long_read(&u->inflight) > 0) { + if (u->inflight) { list_move_tail(&u->link, ¬_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); @@ -328,8 +299,39 @@ void unix_gc(void) /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); - wake_up(&unix_gc_wait); - - out: spin_unlock(&unix_gc_lock); } + +static DECLARE_WORK(unix_gc_work, __unix_gc); + +void unix_gc(void) +{ + WRITE_ONCE(gc_in_progress, true); + queue_work(system_unbound_wq, &unix_gc_work); +} + +#define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) + +void wait_for_unix_gc(struct scm_fp_list *fpl) +{ + /* If number of inflight sockets is insane, + * force a garbage collect right now. + * + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight(), and __unix_gc(). + */ + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) + unix_gc(); + + /* Penalise users who want to send AF_UNIX sockets + * but whose sockets have not been received yet. + */ + if (!fpl || !fpl->count_unix || + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + return; + + if (READ_ONCE(gc_in_progress)) + flush_work(&unix_gc_work); +} diff --git a/net/unix/scm.c b/net/unix/scm.c index 822ce0d0d791..b5ae5ab16777 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list); DEFINE_SPINLOCK(unix_gc_lock); EXPORT_SYMBOL(unix_gc_lock); -struct sock *unix_get_socket(struct file *filp) +struct unix_sock *unix_get_socket(struct file *filp) { - struct sock *u_sock = NULL; struct inode *inode = file_inode(filp); /* Socket ? */ @@ -34,10 +33,10 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && ops && ops->family == PF_UNIX) - u_sock = s; + return unix_sk(s); } - return u_sock; + return NULL; } EXPORT_SYMBOL(unix_get_socket); @@ -46,19 +45,18 @@ EXPORT_SYMBOL(unix_get_socket); */ void unix_inflight(struct user_struct *user, struct file *fp) { - struct sock *s = unix_get_socket(fp); + struct unix_sock *u = unix_get_socket(fp); spin_lock(&unix_gc_lock); - if (s) { - struct unix_sock *u = unix_sk(s); - - if (atomic_long_inc_return(&u->inflight) == 1) { + if (u) { + if (!u->inflight) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); } else { BUG_ON(list_empty(&u->link)); } + u->inflight++; /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } @@ -68,17 +66,16 @@ void unix_inflight(struct user_struct *user, struct file *fp) void unix_notinflight(struct user_struct *user, struct file *fp) { - struct sock *s = unix_get_socket(fp); + struct unix_sock *u = unix_get_socket(fp); spin_lock(&unix_gc_lock); - if (s) { - struct unix_sock *u = unix_sk(s); - - BUG_ON(!atomic_long_read(&u->inflight)); + if (u) { + BUG_ON(!u->inflight); BUG_ON(list_empty(&u->link)); - if (atomic_long_dec_and_test(&u->inflight)) + u->inflight--; + if (!u->inflight) list_del_init(&u->link); /* Paired with READ_ONCE() in wait_for_unix_gc() */ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); |