aboutsummaryrefslogtreecommitdiff
path: root/net/unix/af_unix.c
diff options
context:
space:
mode:
authorLinus Torvalds <[email protected]>2022-08-03 16:29:08 -0700
committerLinus Torvalds <[email protected]>2022-08-03 16:29:08 -0700
commitf86d1fbbe7858884d6754534a0afbb74fc30bc26 (patch)
treef61796870edefbe77d495e9d719c68af1d14275b /net/unix/af_unix.c
parent526942b8134cc34d25d27f95dfff98b8ce2f6fcd (diff)
parent7c6327c77d509e78bff76f2a4551fcfee851682e (diff)
Merge tag 'net-next-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking changes from Paolo Abeni: "Core: - Refactor the forward memory allocation to better cope with memory pressure with many open sockets, moving from a per socket cache to a per-CPU one - Replace rwlocks with RCU for better fairness in ping, raw sockets and IP multicast router. - Network-side support for IO uring zero-copy send. - A few skb drop reason improvements, including codegen the source file with string mapping instead of using macro magic. - Rename reference tracking helpers to a more consistent netdev_* schema. - Adapt u64_stats_t type to address load/store tearing issues. - Refine debug helper usage to reduce the log noise caused by bots. BPF: - Improve socket map performance, avoiding skb cloning on read operation. - Add support for 64 bits enum, to match types exposed by kernel. - Introduce support for sleepable uprobes program. - Introduce support for enum textual representation in libbpf. - New helpers to implement synproxy with eBPF/XDP. - Improve loop performances, inlining indirect calls when possible. - Removed all the deprecated libbpf APIs. - Implement new eBPF-based LSM flavor. - Add type match support, which allow accurate queries to the eBPF used types. - A few TCP congetsion control framework usability improvements. - Add new infrastructure to manipulate CT entries via eBPF programs. - Allow for livepatch (KLP) and BPF trampolines to attach to the same kernel function. Protocols: - Introduce per network namespace lookup tables for unix sockets, increasing scalability and reducing contention. - Preparation work for Wi-Fi 7 Multi-Link Operation (MLO) support. - Add support to forciby close TIME_WAIT TCP sockets via user-space tools. - Significant performance improvement for the TLS 1.3 receive path, both for zero-copy and not-zero-copy. - Support for changing the initial MTPCP subflow priority/backup status - Introduce virtually contingus buffers for sockets over RDMA, to cope better with memory pressure. - Extend CAN ethtool support with timestamping capabilities - Refactor CAN build infrastructure to allow building only the needed features. Driver API: - Remove devlink mutex to allow parallel commands on multiple links. - Add support for pause stats in distributed switch. - Implement devlink helpers to query and flash line cards. - New helper for phy mode to register conversion. New hardware / drivers: - Ethernet DSA driver for the rockchip mt7531 on BPI-R2 Pro. - Ethernet DSA driver for the Renesas RZ/N1 A5PSW switch. - Ethernet DSA driver for the Microchip LAN937x switch. - Ethernet PHY driver for the Aquantia AQR113C EPHY. - CAN driver for the OBD-II ELM327 interface. - CAN driver for RZ/N1 SJA1000 CAN controller. - Bluetooth: Infineon CYW55572 Wi-Fi plus Bluetooth combo device. Drivers: - Intel Ethernet NICs: - i40e: add support for vlan pruning - i40e: add support for XDP framented packets - ice: improved vlan offload support - ice: add support for PPPoE offload - Mellanox Ethernet (mlx5) - refactor packet steering offload for performance and scalability - extend support for TC offload - refactor devlink code to clean-up the locking schema - support stacked vlans for bridge offloads - use TLS objects pool to improve connection rate - Netronome Ethernet NICs (nfp): - extend support for IPv6 fields mangling offload - add support for vepa mode in HW bridge - better support for virtio data path acceleration (VDPA) - enable TSO by default - Microsoft vNIC driver (mana) - add support for XDP redirect - Others Ethernet drivers: - bonding: add per-port priority support - microchip lan743x: extend phy support - Fungible funeth: support UDP segmentation offload and XDP xmit - Solarflare EF100: add support for virtual function representors - MediaTek SoC: add XDP support - Mellanox Ethernet/IB switch (mlxsw): - dropped support for unreleased H/W (XM router). - improved stats accuracy - unified bridge model coversion improving scalability (parts 1-6) - support for PTP in Spectrum-2 asics - Broadcom PHYs - add PTP support for BCM54210E - add support for the BCM53128 internal PHY - Marvell Ethernet switches (prestera): - implement support for multicast forwarding offload - Embedded Ethernet switches: - refactor OcteonTx MAC filter for better scalability - improve TC H/W offload for the Felix driver - refactor the Microchip ksz8 and ksz9477 drivers to share the probe code (parts 1, 2), add support for phylink mac configuration - Other WiFi: - Microchip wilc1000: diable WEP support and enable WPA3 - Atheros ath10k: encapsulation offload support Old code removal: - Neterion vxge ethernet driver: this is untouched since more than 10 years" * tag 'net-next-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1890 commits) doc: sfp-phylink: Fix a broken reference wireguard: selftests: support UML wireguard: allowedips: don't corrupt stack when detecting overflow wireguard: selftests: update config fragments wireguard: ratelimiter: use hrtimer in selftest net/mlx5e: xsk: Discard unaligned XSK frames on striding RQ net: usb: ax88179_178a: Bind only to vendor-specific interface selftests: net: fix IOAM test skip return code net: usb: make USB_RTL8153_ECM non user configurable net: marvell: prestera: remove reduntant code octeontx2-pf: Reduce minimum mtu size to 60 net: devlink: Fix missing mutex_unlock() call net/tls: Remove redundant workqueue flush before destroy net: txgbe: Fix an error handling path in txgbe_probe() net: dsa: Fix spelling mistakes and cleanup code Documentation: devlink: add add devlink-selftests to the table of contents dccp: put dccp_qpolicy_full() and dccp_qpolicy_push() in the same lock net: ionic: fix error check for vlan flags in ionic_set_nic_features() net: ice: fix error NETIF_F_HW_VLAN_CTAG_FILTER check in ice_vsi_sync_fltr() nfp: flower: add support for tunnel offload without key ID ...
Diffstat (limited to 'net/unix/af_unix.c')
-rw-r--r--net/unix/af_unix.c294
1 files changed, 167 insertions, 127 deletions
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2206e6f8902d..bf338b782fc4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -118,15 +118,13 @@
#include "scm.h"
-spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
-EXPORT_SYMBOL_GPL(unix_table_locks);
-struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
-EXPORT_SYMBOL_GPL(unix_socket_table);
static atomic_long_t unix_nr_socks;
+static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
+static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
/* SMP locking strategy:
- * hash table is protected with spinlock unix_table_locks
- * each socket state is protected by separate spin lock.
+ * hash table is protected with spinlock.
+ * each socket state is protected by separate spinlock.
*/
static unsigned int unix_unbound_hash(struct sock *sk)
@@ -137,12 +135,12 @@ static unsigned int unix_unbound_hash(struct sock *sk)
hash ^= hash >> 8;
hash ^= sk->sk_type;
- return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1));
+ return hash & UNIX_HASH_MOD;
}
static unsigned int unix_bsd_hash(struct inode *i)
{
- return i->i_ino & (UNIX_HASH_SIZE - 1);
+ return i->i_ino & UNIX_HASH_MOD;
}
static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
@@ -155,26 +153,34 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
hash ^= hash >> 8;
hash ^= type;
- return hash & (UNIX_HASH_SIZE - 1);
+ return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}
-static void unix_table_double_lock(unsigned int hash1, unsigned int hash2)
+static void unix_table_double_lock(struct net *net,
+ unsigned int hash1, unsigned int hash2)
{
- /* hash1 and hash2 is never the same because
- * one is between 0 and UNIX_HASH_SIZE - 1, and
- * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2.
- */
+ if (hash1 == hash2) {
+ spin_lock(&net->unx.table.locks[hash1]);
+ return;
+ }
+
if (hash1 > hash2)
swap(hash1, hash2);
- spin_lock(&unix_table_locks[hash1]);
- spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING);
+ spin_lock(&net->unx.table.locks[hash1]);
+ spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
}
-static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2)
+static void unix_table_double_unlock(struct net *net,
+ unsigned int hash1, unsigned int hash2)
{
- spin_unlock(&unix_table_locks[hash1]);
- spin_unlock(&unix_table_locks[hash2]);
+ if (hash1 == hash2) {
+ spin_unlock(&net->unx.table.locks[hash1]);
+ return;
+ }
+
+ spin_unlock(&net->unx.table.locks[hash1]);
+ spin_unlock(&net->unx.table.locks[hash2]);
}
#ifdef CONFIG_SECURITY_NETWORK
@@ -300,34 +306,52 @@ static void __unix_remove_socket(struct sock *sk)
sk_del_node_init(sk);
}
-static void __unix_insert_socket(struct sock *sk)
+static void __unix_insert_socket(struct net *net, struct sock *sk)
{
- WARN_ON(!sk_unhashed(sk));
- sk_add_node(sk, &unix_socket_table[sk->sk_hash]);
+ DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
+ sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
}
-static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
- unsigned int hash)
+static void __unix_set_addr_hash(struct net *net, struct sock *sk,
+ struct unix_address *addr, unsigned int hash)
{
__unix_remove_socket(sk);
smp_store_release(&unix_sk(sk)->addr, addr);
sk->sk_hash = hash;
- __unix_insert_socket(sk);
+ __unix_insert_socket(net, sk);
}
-static void unix_remove_socket(struct sock *sk)
+static void unix_remove_socket(struct net *net, struct sock *sk)
{
- spin_lock(&unix_table_locks[sk->sk_hash]);
+ spin_lock(&net->unx.table.locks[sk->sk_hash]);
__unix_remove_socket(sk);
- spin_unlock(&unix_table_locks[sk->sk_hash]);
+ spin_unlock(&net->unx.table.locks[sk->sk_hash]);
+}
+
+static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
+{
+ spin_lock(&net->unx.table.locks[sk->sk_hash]);
+ __unix_insert_socket(net, sk);
+ spin_unlock(&net->unx.table.locks[sk->sk_hash]);
+}
+
+static void unix_insert_bsd_socket(struct sock *sk)
+{
+ spin_lock(&bsd_socket_locks[sk->sk_hash]);
+ sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
+ spin_unlock(&bsd_socket_locks[sk->sk_hash]);
}
-static void unix_insert_unbound_socket(struct sock *sk)
+static void unix_remove_bsd_socket(struct sock *sk)
{
- spin_lock(&unix_table_locks[sk->sk_hash]);
- __unix_insert_socket(sk);
- spin_unlock(&unix_table_locks[sk->sk_hash]);
+ if (!hlist_unhashed(&sk->sk_bind_node)) {
+ spin_lock(&bsd_socket_locks[sk->sk_hash]);
+ __sk_del_bind_node(sk);
+ spin_unlock(&bsd_socket_locks[sk->sk_hash]);
+
+ sk_node_init(&sk->sk_bind_node);
+ }
}
static struct sock *__unix_find_socket_byname(struct net *net,
@@ -336,12 +360,9 @@ static struct sock *__unix_find_socket_byname(struct net *net,
{
struct sock *s;
- sk_for_each(s, &unix_socket_table[hash]) {
+ sk_for_each(s, &net->unx.table.buckets[hash]) {
struct unix_sock *u = unix_sk(s);
- if (!net_eq(sock_net(s), net))
- continue;
-
if (u->addr->len == len &&
!memcmp(u->addr->name, sunname, len))
return s;
@@ -355,11 +376,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
{
struct sock *s;
- spin_lock(&unix_table_locks[hash]);
+ spin_lock(&net->unx.table.locks[hash]);
s = __unix_find_socket_byname(net, sunname, len, hash);
if (s)
sock_hold(s);
- spin_unlock(&unix_table_locks[hash]);
+ spin_unlock(&net->unx.table.locks[hash]);
return s;
}
@@ -368,17 +389,17 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
unsigned int hash = unix_bsd_hash(i);
struct sock *s;
- spin_lock(&unix_table_locks[hash]);
- sk_for_each(s, &unix_socket_table[hash]) {
+ spin_lock(&bsd_socket_locks[hash]);
+ sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
struct dentry *dentry = unix_sk(s)->path.dentry;
if (dentry && d_backing_inode(dentry) == i) {
sock_hold(s);
- spin_unlock(&unix_table_locks[hash]);
+ spin_unlock(&bsd_socket_locks[hash]);
return s;
}
}
- spin_unlock(&unix_table_locks[hash]);
+ spin_unlock(&bsd_socket_locks[hash]);
return NULL;
}
@@ -554,9 +575,9 @@ static void unix_sock_destructor(struct sock *sk)
u->oob_skb = NULL;
}
#endif
- WARN_ON(refcount_read(&sk->sk_wmem_alloc));
- WARN_ON(!sk_unhashed(sk));
- WARN_ON(sk->sk_socket);
+ DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
+ DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
+ DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
if (!sock_flag(sk, SOCK_DEAD)) {
pr_info("Attempt to release alive unix socket: %p\n", sk);
return;
@@ -576,12 +597,13 @@ static void unix_sock_destructor(struct sock *sk)
static void unix_release_sock(struct sock *sk, int embrion)
{
struct unix_sock *u = unix_sk(sk);
- struct path path;
struct sock *skpair;
struct sk_buff *skb;
+ struct path path;
int state;
- unix_remove_socket(sk);
+ unix_remove_socket(sock_net(sk), sk);
+ unix_remove_bsd_socket(sk);
/* Clear state */
unix_state_lock(sk);
@@ -741,10 +763,8 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
unsigned int flags);
static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
-static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor);
-static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor);
+static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
+static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
int, int);
static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
@@ -798,7 +818,7 @@ static const struct proto_ops unix_stream_ops = {
.shutdown = unix_shutdown,
.sendmsg = unix_stream_sendmsg,
.recvmsg = unix_stream_recvmsg,
- .read_sock = unix_stream_read_sock,
+ .read_skb = unix_stream_read_skb,
.mmap = sock_no_mmap,
.sendpage = unix_stream_sendpage,
.splice_read = unix_stream_splice_read,
@@ -823,7 +843,7 @@ static const struct proto_ops unix_dgram_ops = {
.listen = sock_no_listen,
.shutdown = unix_shutdown,
.sendmsg = unix_dgram_sendmsg,
- .read_sock = unix_read_sock,
+ .read_skb = unix_read_skb,
.recvmsg = unix_dgram_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
@@ -930,9 +950,9 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
init_waitqueue_head(&u->peer_wait);
init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
memset(&u->scm_stat, 0, sizeof(struct scm_stat));
- unix_insert_unbound_socket(sk);
+ unix_insert_unbound_socket(net, sk);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
return sk;
@@ -993,8 +1013,8 @@ static int unix_release(struct socket *sock)
return 0;
}
-static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr,
- int addr_len, int type)
+static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
+ int type)
{
struct inode *inode;
struct path path;
@@ -1063,7 +1083,7 @@ static struct sock *unix_find_other(struct net *net,
struct sock *sk;
if (sunaddr->sun_path[0])
- sk = unix_find_bsd(net, sunaddr, addr_len, type);
+ sk = unix_find_bsd(sunaddr, addr_len, type);
else
sk = unix_find_abstract(net, sunaddr, addr_len, type);
@@ -1074,6 +1094,7 @@ static int unix_autobind(struct sock *sk)
{
unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ struct net *net = sock_net(sk);
struct unix_address *addr;
u32 lastnum, ordernum;
int err;
@@ -1102,11 +1123,10 @@ retry:
sprintf(addr->name->sun_path + 1, "%05x", ordernum);
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
- unix_table_double_lock(old_hash, new_hash);
+ unix_table_double_lock(net, old_hash, new_hash);
- if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
- new_hash)) {
- unix_table_double_unlock(old_hash, new_hash);
+ if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
+ unix_table_double_unlock(net, old_hash, new_hash);
/* __unix_find_socket_byname() may take long time if many names
* are already in use.
@@ -1123,8 +1143,8 @@ retry:
goto retry;
}
- __unix_set_addr_hash(sk, addr, new_hash);
- unix_table_double_unlock(old_hash, new_hash);
+ __unix_set_addr_hash(net, sk, addr, new_hash);
+ unix_table_double_unlock(net, old_hash, new_hash);
err = 0;
out: mutex_unlock(&u->bindlock);
@@ -1138,6 +1158,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
(SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ struct net *net = sock_net(sk);
struct user_namespace *ns; // barf...
struct unix_address *addr;
struct dentry *dentry;
@@ -1178,11 +1199,12 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
goto out_unlock;
new_hash = unix_bsd_hash(d_backing_inode(dentry));
- unix_table_double_lock(old_hash, new_hash);
+ unix_table_double_lock(net, old_hash, new_hash);
u->path.mnt = mntget(parent.mnt);
u->path.dentry = dget(dentry);
- __unix_set_addr_hash(sk, addr, new_hash);
- unix_table_double_unlock(old_hash, new_hash);
+ __unix_set_addr_hash(net, sk, addr, new_hash);
+ unix_table_double_unlock(net, old_hash, new_hash);
+ unix_insert_bsd_socket(sk);
mutex_unlock(&u->bindlock);
done_path_create(&parent, dentry);
return 0;
@@ -1205,6 +1227,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
{
unsigned int new_hash, old_hash = sk->sk_hash;
struct unix_sock *u = unix_sk(sk);
+ struct net *net = sock_net(sk);
struct unix_address *addr;
int err;
@@ -1222,19 +1245,18 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
}
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
- unix_table_double_lock(old_hash, new_hash);
+ unix_table_double_lock(net, old_hash, new_hash);
- if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
- new_hash))
+ if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
goto out_spin;
- __unix_set_addr_hash(sk, addr, new_hash);
- unix_table_double_unlock(old_hash, new_hash);
+ __unix_set_addr_hash(net, sk, addr, new_hash);
+ unix_table_double_unlock(net, old_hash, new_hash);
mutex_unlock(&u->bindlock);
return 0;
out_spin:
- unix_table_double_unlock(old_hash, new_hash);
+ unix_table_double_unlock(net, old_hash, new_hash);
err = -EADDRINUSE;
out_mutex:
mutex_unlock(&u->bindlock);
@@ -1293,9 +1315,8 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
- struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
+ struct sock *sk = sock->sk;
struct sock *other;
int err;
@@ -1316,7 +1337,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
}
restart:
- other = unix_find_other(net, sunaddr, alen, sock->type);
+ other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
if (IS_ERR(other)) {
err = PTR_ERR(other);
goto out;
@@ -1404,15 +1425,13 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
- struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
+ struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
struct unix_sock *u = unix_sk(sk), *newu, *otheru;
- struct sock *newsk = NULL;
- struct sock *other = NULL;
+ struct net *net = sock_net(sk);
struct sk_buff *skb = NULL;
- int st;
- int err;
long timeo;
+ int err;
+ int st;
err = unix_validate_addr(sunaddr, addr_len);
if (err)
@@ -1432,7 +1451,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
*/
/* create new sock for complete connection */
- newsk = unix_create1(sock_net(sk), NULL, 0, sock->type);
+ newsk = unix_create1(net, NULL, 0, sock->type);
if (IS_ERR(newsk)) {
err = PTR_ERR(newsk);
newsk = NULL;
@@ -1541,9 +1560,9 @@ restart:
*
* The contents of *(otheru->addr) and otheru->path
* are seen fully set up here, since we have found
- * otheru in hash under unix_table_locks. Insertion
- * into the hash chain we'd found it in had been done
- * in an earlier critical area protected by unix_table_locks,
+ * otheru in hash under its lock. Insertion into the
+ * hash chain we'd found it in had been done in an
+ * earlier critical area protected by the chain's lock,
* the same one where we'd set *(otheru->addr) contents,
* as well as otheru->path and otheru->addr itself.
*
@@ -1840,17 +1859,15 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
- struct sock *sk = sock->sk;
- struct net *net = sock_net(sk);
- struct unix_sock *u = unix_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
- struct sock *other = NULL;
- int err;
- struct sk_buff *skb;
- long timeo;
+ struct sock *sk = sock->sk, *other = NULL;
+ struct unix_sock *u = unix_sk(sk);
struct scm_cookie scm;
+ struct sk_buff *skb;
int data_len = 0;
int sk_locked;
+ long timeo;
+ int err;
wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
@@ -1917,7 +1934,7 @@ restart:
if (sunaddr == NULL)
goto out_free;
- other = unix_find_other(net, sunaddr, msg->msg_namelen,
+ other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
sk->sk_type);
if (IS_ERR(other)) {
err = PTR_ERR(other);
@@ -2487,8 +2504,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si
return __unix_dgram_recvmsg(sk, msg, size, flags);
}
-static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor)
+static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
int copied = 0;
@@ -2503,7 +2519,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
if (!skb)
return err;
- used = recv_actor(desc, skb, 0, skb->len);
+ used = recv_actor(sk, skb);
if (used <= 0) {
if (!copied)
copied = used;
@@ -2514,8 +2530,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
}
kfree_skb(skb);
- if (!desc->count)
- break;
+ break;
}
return copied;
@@ -2650,13 +2665,12 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
}
#endif
-static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor)
+static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
if (unlikely(sk->sk_state != TCP_ESTABLISHED))
return -ENOTCONN;
- return unix_read_sock(sk, desc, recv_actor);
+ return unix_read_skb(sk, recv_actor);
}
static int unix_stream_read_generic(struct unix_stream_read_state *state,
@@ -3226,12 +3240,11 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
unsigned long offset = get_offset(*pos);
unsigned long bucket = get_bucket(*pos);
- struct sock *sk;
unsigned long count = 0;
+ struct sock *sk;
- for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
- if (sock_net(sk) != seq_file_net(seq))
- continue;
+ for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
+ sk; sk = sk_next(sk)) {
if (++count == offset)
break;
}
@@ -3242,16 +3255,17 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
unsigned long bucket = get_bucket(*pos);
+ struct net *net = seq_file_net(seq);
struct sock *sk;
- while (bucket < ARRAY_SIZE(unix_socket_table)) {
- spin_lock(&unix_table_locks[bucket]);
+ while (bucket < UNIX_HASH_SIZE) {
+ spin_lock(&net->unx.table.locks[bucket]);
sk = unix_from_bucket(seq, pos);
if (sk)
return sk;
- spin_unlock(&unix_table_locks[bucket]);
+ spin_unlock(&net->unx.table.locks[bucket]);
*pos = set_bucket_offset(++bucket, 1);
}
@@ -3264,11 +3278,12 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
{
unsigned long bucket = get_bucket(*pos);
- for (sk = sk_next(sk); sk; sk = sk_next(sk))
- if (sock_net(sk) == seq_file_net(seq))
- return sk;
+ sk = sk_next(sk);
+ if (sk)
+ return sk;
- spin_unlock(&unix_table_locks[bucket]);
+
+ spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
*pos = set_bucket_offset(++bucket, 1);
@@ -3298,7 +3313,7 @@ static void unix_seq_stop(struct seq_file *seq, void *v)
struct sock *sk = v;
if (sk)
- spin_unlock(&unix_table_locks[sk->sk_hash]);
+ spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}
static int unix_seq_show(struct seq_file *seq, void *v)
@@ -3323,7 +3338,7 @@ static int unix_seq_show(struct seq_file *seq, void *v)
(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
sock_i_ino(s));
- if (u->addr) { // under unix_table_locks here
+ if (u->addr) { // under a hash table lock here
int i, len;
seq_putc(seq, ' ');
@@ -3393,9 +3408,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
iter->batch[iter->end_sk++] = start_sk;
for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
- if (sock_net(sk) != seq_file_net(seq))
- continue;
-
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
iter->batch[iter->end_sk++] = sk;
@@ -3404,7 +3416,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
expected++;
}
- spin_unlock(&unix_table_locks[start_sk->sk_hash]);
+ spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
return expected;
}
@@ -3564,7 +3576,7 @@ static const struct net_proto_family unix_family_ops = {
static int __net_init unix_net_init(struct net *net)
{
- int error = -ENOMEM;
+ int i;
net->unx.sysctl_max_dgram_qlen = 10;
if (unix_sysctl_register(net))
@@ -3572,18 +3584,44 @@ static int __net_init unix_net_init(struct net *net)
#ifdef CONFIG_PROC_FS
if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
- sizeof(struct seq_net_private))) {
- unix_sysctl_unregister(net);
- goto out;
+ sizeof(struct seq_net_private)))
+ goto err_sysctl;
+#endif
+
+ net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
+ sizeof(spinlock_t), GFP_KERNEL);
+ if (!net->unx.table.locks)
+ goto err_proc;
+
+ net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!net->unx.table.buckets)
+ goto free_locks;
+
+ for (i = 0; i < UNIX_HASH_SIZE; i++) {
+ spin_lock_init(&net->unx.table.locks[i]);
+ INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
}
+
+ return 0;
+
+free_locks:
+ kvfree(net->unx.table.locks);
+err_proc:
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("unix", net->proc_net);
+err_sysctl:
#endif
- error = 0;
+ unix_sysctl_unregister(net);
out:
- return error;
+ return -ENOMEM;
}
static void __net_exit unix_net_exit(struct net *net)
{
+ kvfree(net->unx.table.buckets);
+ kvfree(net->unx.table.locks);
unix_sysctl_unregister(net);
remove_proc_entry("unix", net->proc_net);
}
@@ -3671,8 +3709,10 @@ static int __init af_unix_init(void)
BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
- for (i = 0; i < 2 * UNIX_HASH_SIZE; i++)
- spin_lock_init(&unix_table_locks[i]);
+ for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
+ spin_lock_init(&bsd_socket_locks[i]);
+ INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
+ }
rc = proto_register(&unix_dgram_proto, 1);
if (rc != 0) {