Diffstat (limited to 'net/core/sock.c')
 net/core/sock.c | 429 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 314 insertions(+), 115 deletions(-)
diff --git a/net/core/sock.c b/net/core/sock.c
index 4eca27dc5c94..ac2a404c73eb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -102,6 +102,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
@@ -138,10 +139,7 @@
#include <trace/events/sock.h>
-#ifdef CONFIG_INET
#include <net/tcp.h>
-#endif
-
#include <net/busy_poll.h>
static DEFINE_MUTEX(proto_list_mutex);
@@ -197,73 +195,117 @@ EXPORT_SYMBOL(sk_net_capable);
/*
* Each address family might have different locking rules, so we have
- * one slock key per address family:
+ * one slock key per address family and separate keys for internal and
+ * userspace sockets.
*/
static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
+static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
* locks is fast):
*/
+
+#define _sock_locks(x) \
+ x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
+ x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
+ x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
+ x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
+ x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
+ x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
+ x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
+ x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
+ x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
+ x "27" , x "28" , x "AF_CAN" , \
+ x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
+ x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
+ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
+ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
+ x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX"
+
static const char *const af_family_key_strings[AF_MAX+1] = {
- "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
- "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
- "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
- "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
- "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
- "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
- "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
- "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
- "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
- "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
- "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
- "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
- "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
- "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
- "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX"
+ _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
- "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
- "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
- "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
- "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
- "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
- "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
- "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
- "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
- "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
- "slock-27" , "slock-28" , "slock-AF_CAN" ,
- "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
- "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
- "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
- "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
- "slock-AF_QIPCRTR", "slock-AF_MAX"
+ _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
- "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
- "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
- "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
- "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
- "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
- "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
- "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
- "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
- "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
- "clock-27" , "clock-28" , "clock-AF_CAN" ,
- "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
- "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
- "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
- "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
- "clock-AF_QIPCRTR", "clock-AF_MAX"
+ _sock_locks("clock-")
+};
+
+static const char *const af_family_kern_key_strings[AF_MAX+1] = {
+ _sock_locks("k-sk_lock-")
+};
+static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-slock-")
+};
+static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
+ _sock_locks("k-clock-")
+};
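
The _sock_locks() macro above relies on C's compile-time concatenation of
adjacent string literals: the prefix passed as x is pasted in front of every
address-family name, so a single table of names yields all six differently
prefixed arrays. A minimal sketch of the same trick, with hypothetical names:

    #define _names(x)  x "ALPHA", x "BETA", x "GAMMA"

    /* expands to { "lock-ALPHA", "lock-BETA", "lock-GAMMA" } */
    static const char *const lock_names[] = { _names("lock-") };
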
+static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
+ "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" ,
+ "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK",
+ "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" ,
+ "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" ,
+ "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" ,
+ "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" ,
+ "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" ,
+ "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" ,
+ "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" ,
+ "rlock-27" , "rlock-28" , "rlock-AF_CAN" ,
+ "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" ,
+ "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
+ "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
+ "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
+ "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX"
+};
+static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
+ "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
+ "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK",
+ "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" ,
+ "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" ,
+ "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" ,
+ "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" ,
+ "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" ,
+ "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" ,
+ "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" ,
+ "wlock-27" , "wlock-28" , "wlock-AF_CAN" ,
+ "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" ,
+ "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
+ "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
+ "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
+ "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX"
+};
+static const char *const af_family_elock_key_strings[AF_MAX+1] = {
+ "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
+ "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK",
+ "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" ,
+ "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" ,
+ "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" ,
+ "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" ,
+ "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" ,
+ "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" ,
+ "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" ,
+ "elock-27" , "elock-28" , "elock-AF_CAN" ,
+ "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" ,
+ "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
+ "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
+ "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
+ "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX"
};
/*
- * sk_callback_lock locking rules are per-address-family,
+ * sk_callback_lock and sk queues locking rules are per-address-family,
* so split the lock classes by using a per-AF key:
*/
static struct lock_class_key af_callback_keys[AF_MAX];
+static struct lock_class_key af_rlock_keys[AF_MAX];
+static struct lock_class_key af_wlock_keys[AF_MAX];
+static struct lock_class_key af_elock_keys[AF_MAX];
+static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Take into consideration the size of the struct sk_buff overhead in the
* determination of these values, since that is non-constant across
@@ -328,14 +370,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc);
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
int ret;
- unsigned long pflags = current->flags;
+ unsigned int noreclaim_flag;
/* these should have been dropped before queueing */
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
ret = sk->sk_backlog_rcv(sk, skb);
- tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ memalloc_noreclaim_restore(noreclaim_flag);
return ret;
}
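
memalloc_noreclaim_save()/memalloc_noreclaim_restore() come from the
<linux/sched/mm.h> header added at the top of this diff; they wrap the old
open-coded PF_MEMALLOC save/restore dance. The pattern, roughly:

    unsigned int flags;

    flags = memalloc_noreclaim_save();      /* sets PF_MEMALLOC */
    /* work that must not recurse into memory reclaim */
    memalloc_noreclaim_restore(flags);      /* puts the old state back */
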
@@ -367,7 +409,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
if (tv.tv_sec == 0 && tv.tv_usec == 0)
return 0;
if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
- *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
+ *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
return 0;
}
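
DIV_ROUND_UP() replaces the hand-rolled rounding above with the standard
kernel helper; in <linux/kernel.h> it is defined essentially as:

    /* round n up to the next multiple of d, then divide */
    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    /* e.g. with HZ == 250, USEC_PER_SEC / HZ == 4000, so a 1us timeout
     * still rounds up to one full jiffy: DIV_ROUND_UP(1, 4000) == 1
     */
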
@@ -502,6 +544,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
+ sk->sk_dst_pending_confirm = 0;
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -762,11 +805,8 @@ set_rcvbuf:
goto set_rcvbuf;
case SO_KEEPALIVE:
-#ifdef CONFIG_INET
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM)
- tcp_set_keepalive(sk, valbool);
-#endif
+ if (sk->sk_prot->keepalive)
+ sk->sk_prot->keepalive(sk, valbool);
sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
break;
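
Routing SO_KEEPALIVE through sk->sk_prot->keepalive removes a CONFIG_INET
special case from core code: each protocol opts in by filling the callback
in its struct proto. A hedged sketch of the pattern (the my_proto names are
illustrative, not from this diff):

    static void my_keepalive(struct sock *sk, int enable)
    {
            /* arm or disarm this protocol's keepalive timers */
    }

    static struct proto my_proto = {
            .name      = "MYPROTO",
            .keepalive = my_keepalive,  /* optional; core checks for NULL */
    };
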
@@ -998,6 +1038,10 @@ set_rcvbuf:
#endif
case SO_MAX_PACING_RATE:
+ if (val != ~0U)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
sk->sk_max_pacing_rate = val;
sk->sk_pacing_rate = min(sk->sk_pacing_rate,
sk->sk_max_pacing_rate);
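
The cmpxchg() makes SK_PACING_NONE -> SK_PACING_NEEDED a one-way latch: it
fires only when a finite rate is requested, and it never clobbers a status
someone else has already set, because the compare then fails. The latch
pattern in isolation, with hypothetical states:

    enum { STATE_NONE, STATE_NEEDED, STATE_ACTIVE };

    /* flip NONE -> NEEDED exactly once; an ACTIVE status is left alone */
    cmpxchg(&obj->status, STATE_NONE, STATE_NEEDED);
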
@@ -1034,6 +1078,18 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
+static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ int i;
+
+ for (i = 0; i < src->ngroups; i++)
+ if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ return -EFAULT;
+
+ return 0;
+}
+
int sock_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -1041,6 +1097,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
union {
int val;
+ u64 val64;
struct linger ling;
struct timeval tm;
} v;
@@ -1148,7 +1205,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.tm.tv_usec = 0;
} else {
v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+ v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
}
break;
@@ -1159,7 +1216,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.tm.tv_usec = 0;
} else {
v.tm.tv_sec = sk->sk_sndtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+ v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
}
break;
@@ -1186,6 +1243,27 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
goto lenout;
}
+ case SO_PEERGROUPS:
+ {
+ int ret, n;
+
+ if (!sk->sk_peer_cred)
+ return -ENODATA;
+
+ n = sk->sk_peer_cred->group_info->ngroups;
+ if (len < n * sizeof(gid_t)) {
+ len = n * sizeof(gid_t);
+ return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ }
+ len = n * sizeof(gid_t);
+
+ ret = groups_to_user((gid_t __user *)optval,
+ sk->sk_peer_cred->group_info);
+ if (ret)
+ return ret;
+ goto lenout;
+ }
+
case SO_PEERNAME:
{
char address[128];
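
SO_PEERGROUPS follows the usual getsockopt() length negotiation: if the
supplied buffer is too small, the kernel writes the required size to *optlen
and fails with -ERANGE. A sketch of the matching userspace sequence (error
handling trimmed; needs <sys/socket.h>, <stdlib.h>, <errno.h> and headers
recent enough to define SO_PEERGROUPS):

    socklen_t len = 0;
    gid_t *gids = NULL;

    /* probe with len == 0: on ERANGE, len now holds the needed size */
    if (getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, NULL, &len) < 0 &&
        errno == ERANGE) {
            gids = malloc(len);
            getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, gids, &len);
            /* len / sizeof(gid_t) supplementary groups of the peer */
    }
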
@@ -1271,6 +1349,40 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = sk->sk_incoming_cpu;
break;
+ case SO_MEMINFO:
+ {
+ u32 meminfo[SK_MEMINFO_VARS];
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ sk_get_meminfo(sk, meminfo);
+
+ len = min_t(unsigned int, len, sizeof(meminfo));
+ if (copy_to_user(optval, &meminfo, len))
+ return -EFAULT;
+
+ goto lenout;
+ }
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_INCOMING_NAPI_ID:
+ v.val = READ_ONCE(sk->sk_napi_id);
+
+ /* aggregate non-NAPI IDs down to 0 */
+ if (v.val < MIN_NAPI_ID)
+ v.val = 0;
+
+ break;
+#endif
+
+ case SO_COOKIE:
+ lv = sizeof(u64);
+ if (len < lv)
+ return -EINVAL;
+ v.val64 = sock_gen_cookie(sk);
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
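
SO_COOKIE returns the socket's 64-bit cookie (the same identifier sock_diag
uses, allocated on first use), which is why the union above gained a val64
member. From userspace, roughly:

    __u64 cookie;
    socklen_t len = sizeof(cookie);

    /* stable per-socket identifier */
    getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &len);
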
@@ -1295,7 +1407,16 @@ lenout:
*/
static inline void sock_lock_init(struct sock *sk)
{
- sock_lock_init_class_and_name(sk,
+ if (sk->sk_kern_sock)
+ sock_lock_init_class_and_name(
+ sk,
+ af_family_kern_slock_key_strings[sk->sk_family],
+ af_family_kern_slock_keys + sk->sk_family,
+ af_family_kern_key_strings[sk->sk_family],
+ af_family_kern_keys + sk->sk_family);
+ else
+ sock_lock_init_class_and_name(
+ sk,
af_family_slock_key_strings[sk->sk_family],
af_family_slock_keys + sk->sk_family,
af_family_key_strings[sk->sk_family],
@@ -1401,12 +1522,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
+ sk->sk_kern_sock = kern;
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt))
get_net(net);
sock_net_set(sk, net);
- atomic_set(&sk->sk_wmem_alloc, 1);
+ refcount_set(&sk->sk_wmem_alloc, 1);
mem_cgroup_sk_alloc(sk);
cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -1430,7 +1552,7 @@ static void __sk_destruct(struct rcu_head *head)
sk->sk_destruct(sk);
filter = rcu_dereference_check(sk->sk_filter,
- atomic_read(&sk->sk_wmem_alloc) == 0);
+ refcount_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
@@ -1444,6 +1566,11 @@ static void __sk_destruct(struct rcu_head *head)
pr_debug("%s: optmem leakage (%d bytes) detected\n",
__func__, atomic_read(&sk->sk_omem_alloc));
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ }
+
if (sk->sk_peer_cred)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
@@ -1475,11 +1602,32 @@ void sk_free(struct sock *sk)
* some packets are still in some tx queue.
* If not null, sock_wfree() will call __sk_free(sk) later
*/
- if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+ if (refcount_dec_and_test(&sk->sk_wmem_alloc))
__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
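
The atomic_t -> refcount_t conversions here and below swap a plain counter
for a type that saturates instead of wrapping on overflow/underflow, so a
reference-count bug becomes a WARN rather than a use-after-free. The core
API in a minimal sketch:

    struct obj {
            refcount_t ref;
    };

    static void obj_put(struct obj *o)
    {
            if (refcount_dec_and_test(&o->ref))     /* last put frees */
                    kfree(o);
    }

    /* refcount_set(&o->ref, 1) at creation; refcount_inc() per new user */
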
+static void sk_init_common(struct sock *sk)
+{
+ skb_queue_head_init(&sk->sk_receive_queue);
+ skb_queue_head_init(&sk->sk_write_queue);
+ skb_queue_head_init(&sk->sk_error_queue);
+
+ rwlock_init(&sk->sk_callback_lock);
+ lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
+ af_rlock_keys + sk->sk_family,
+ af_family_rlock_key_strings[sk->sk_family]);
+ lockdep_set_class_and_name(&sk->sk_write_queue.lock,
+ af_wlock_keys + sk->sk_family,
+ af_family_wlock_key_strings[sk->sk_family]);
+ lockdep_set_class_and_name(&sk->sk_error_queue.lock,
+ af_elock_keys + sk->sk_family,
+ af_family_elock_key_strings[sk->sk_family]);
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_callback_keys + sk->sk_family,
+ af_family_clock_key_strings[sk->sk_family]);
+}
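
sk_init_common() gives each receive/write/error queue lock its own
per-family lockdep class; without this, lockdep would fold every socket's
queue locks into one class and report false deadlocks across unrelated
address families. The underlying primitive, sketched on a generic lock:

    struct my_obj {
            spinlock_t lock;
    };

    static struct lock_class_key my_key;

    static void my_obj_init(struct my_obj *o)
    {
            spin_lock_init(&o->lock);
            /* re-home the lock into its own class, with a readable name */
            lockdep_set_class_and_name(&o->lock, &my_key, "my-subsys-lock");
    }
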
+
/**
* sk_clone_lock - clone a socket, and lock its clone
* @sk: the socket to clone
@@ -1511,17 +1659,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
/*
* sk_wmem_alloc set to one (see sk_free() and sock_wfree())
*/
- atomic_set(&newsk->sk_wmem_alloc, 1);
+ refcount_set(&newsk->sk_wmem_alloc, 1);
atomic_set(&newsk->sk_omem_alloc, 0);
- skb_queue_head_init(&newsk->sk_receive_queue);
- skb_queue_head_init(&newsk->sk_write_queue);
-
- rwlock_init(&newsk->sk_callback_lock);
- lockdep_set_class_and_name(&newsk->sk_callback_lock,
- af_callback_keys + newsk->sk_family,
- af_family_clock_key_strings[newsk->sk_family]);
+ sk_init_common(newsk);
newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
atomic_set(&newsk->sk_drops, 0);
@@ -1529,7 +1672,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
sock_reset_flag(newsk, SOCK_DONE);
- skb_queue_head_init(&newsk->sk_error_queue);
filter = rcu_dereference_protected(newsk->sk_filter, 1);
if (filter != NULL)
@@ -1540,11 +1682,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
is_charged = sk_filter_charge(newsk, filter);
if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- bh_unlock_sock(newsk);
- sk_free(newsk);
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
+ */
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
+ sk_free_unlock_clone(newsk);
newsk = NULL;
goto out;
}
@@ -1564,7 +1708,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
- atomic_set(&newsk->sk_refcnt, 2);
+ refcount_set(&newsk->sk_refcnt, 2);
/*
* Increment the counter in the same struct proto as the master
@@ -1593,6 +1737,16 @@ out:
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
+void sk_free_unlock_clone(struct sock *sk)
+{
+ /* It is still raw copy of parent, so invalidate
+ * destructor and make plain sk_free() */
+ sk->sk_destruct = NULL;
+ bh_unlock_sock(sk);
+ sk_free(sk);
+}
+EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
u32 max_segs = 1;
@@ -1633,7 +1787,7 @@ void sock_wfree(struct sk_buff *skb)
* Keep a reference on sk_wmem_alloc, this will be released
* after sk_write_space() call
*/
- atomic_sub(len - 1, &sk->sk_wmem_alloc);
+ WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
sk->sk_write_space(sk);
len = 1;
}
@@ -1641,7 +1795,7 @@ void sock_wfree(struct sk_buff *skb)
* if sk_wmem_alloc reaches 0, we must finish what sk_free()
* could not do because of in-flight packets
*/
- if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
+ if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
@@ -1653,7 +1807,7 @@ void __sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
- if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+ if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
__sk_free(sk);
}
@@ -1675,7 +1829,7 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
* is enough to guarantee sk_free() wont free this sock until
* all in-flight packets are completed
*/
- atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+ refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
@@ -1683,28 +1837,24 @@ EXPORT_SYMBOL(skb_set_owner_w);
* delay queue. We want to allow the owner socket to send more
* packets, as if they were already TX completed by a typical driver.
* But we also want to keep skb->sk set because some packet schedulers
- * rely on it (sch_fq for example). So we set skb->truesize to a small
- * amount (1) and decrease sk_wmem_alloc accordingly.
+ * rely on it (sch_fq for example).
*/
void skb_orphan_partial(struct sk_buff *skb)
{
- /* If this skb is a TCP pure ACK or already went here,
- * we have nothing to do. 2 is already a very small truesize.
- */
- if (skb->truesize <= 2)
+ if (skb_is_tcp_pure_ack(skb))
return;
- /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
- * so we do not completely orphan skb, but transfert all
- * accounted bytes but one, to avoid unexpected reorders.
- */
if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
|| skb->destructor == tcp_wfree
#endif
) {
- atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
- skb->truesize = 1;
+ struct sock *sk = skb->sk;
+
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
+ skb->destructor = sock_efree;
+ }
} else {
skb_orphan(skb);
}
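
The rewritten skb_orphan_partial() no longer shrinks skb->truesize to 1; it
drops the full wmem charge and instead pins the socket itself, switching the
destructor to sock_efree() (which does a sock_put()). The
refcount_inc_not_zero() guard is the standard way to take a reference on an
object whose count may be racing to zero:

    if (refcount_inc_not_zero(&obj->ref)) {
            /* safe: we now hold our own reference */
            use(obj);
            obj_put(obj);
    } /* else: the last reference was already being dropped; hands off */
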
@@ -1762,7 +1912,7 @@ EXPORT_SYMBOL(sock_i_ino);
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
struct sk_buff *skb = alloc_skb(size, priority);
if (skb) {
skb_set_owner_w(skb, sk);
@@ -1837,7 +1987,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
break;
if (sk->sk_shutdown & SEND_SHUTDOWN)
break;
@@ -1959,6 +2109,26 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
}
EXPORT_SYMBOL(sock_cmsg_send);
+static void sk_enter_memory_pressure(struct sock *sk)
+{
+ if (!sk->sk_prot->enter_memory_pressure)
+ return;
+
+ sk->sk_prot->enter_memory_pressure(sk);
+}
+
+static void sk_leave_memory_pressure(struct sock *sk)
+{
+ if (sk->sk_prot->leave_memory_pressure) {
+ sk->sk_prot->leave_memory_pressure(sk);
+ } else {
+ unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
+
+ if (memory_pressure && *memory_pressure)
+ *memory_pressure = 0;
+ }
+}
+
/* On 32bit arches, an skb frag is limited to 2^15 */
#define SKB_FRAG_PAGE_ORDER get_order(32768)
@@ -2140,7 +2310,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
if (sk->sk_type == SOCK_STREAM) {
if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
return 1;
- } else if (atomic_read(&sk->sk_wmem_alloc) <
+ } else if (refcount_read(&sk->sk_wmem_alloc) <
prot->sysctl_wmem[0])
return 1;
}
@@ -2272,7 +2442,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
}
EXPORT_SYMBOL(sock_no_socketpair);
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
+int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern)
{
return -EOPNOTSUPP;
}
@@ -2406,7 +2577,7 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
@@ -2449,10 +2620,7 @@ EXPORT_SYMBOL(sk_stop_timer);
void sock_init_data(struct socket *sock, struct sock *sk)
{
- skb_queue_head_init(&sk->sk_receive_queue);
- skb_queue_head_init(&sk->sk_write_queue);
- skb_queue_head_init(&sk->sk_error_queue);
-
+ sk_init_common(sk);
sk->sk_send_head = NULL;
init_timer(&sk->sk_timer);
@@ -2476,7 +2644,14 @@ void sock_init_data(struct socket *sock, struct sock *sk)
}
rwlock_init(&sk->sk_callback_lock);
- lockdep_set_class_and_name(&sk->sk_callback_lock,
+ if (sk->sk_kern_sock)
+ lockdep_set_class_and_name(
+ &sk->sk_callback_lock,
+ af_kern_callback_keys + sk->sk_family,
+ af_family_kern_clock_key_strings[sk->sk_family]);
+ else
+ lockdep_set_class_and_name(
+ &sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
@@ -2497,7 +2672,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
- sk->sk_stamp = ktime_set(-1L, 0);
+ sk->sk_stamp = SK_DEFAULT_STAMP;
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
@@ -2512,7 +2687,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
* (Documentation/RCU/rculist_nulls.txt for details)
*/
smp_wmb();
- atomic_set(&sk->sk_refcnt, 1);
+ refcount_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
@@ -2557,9 +2732,12 @@ EXPORT_SYMBOL(release_sock);
* @sk: socket
*
* This version should be used for very small section, where process wont block
- * return false if fast path is taken
+ * return false if fast path is taken:
+ *
* sk_lock.slock locked, owned = 0, BH disabled
- * return true if slow path is taken
+ *
+ * return true if slow path is taken:
+ *
* sk_lock.slock unlocked, owned = 1, BH enabled
*/
bool lock_sock_fast(struct sock *sk)
@@ -2774,15 +2952,25 @@ void sk_common_release(struct sock *sk)
sk_refcnt_debug_release(sk);
- if (sk->sk_frag.page) {
- put_page(sk->sk_frag.page);
- sk->sk_frag.page = NULL;
- }
-
sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
+void sk_get_meminfo(const struct sock *sk, u32 *mem)
+{
+ memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
+
+ mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+ mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+ mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+ mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+}
+
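
sk_get_meminfo() now backs both the sock_diag netlink dump and the new
SO_MEMINFO getsockopt above. Userspace reads the result as a u32 array
indexed by the SK_MEMINFO_* constants; a sketch (needs <stdio.h> and
<sys/socket.h>, plus headers new enough to define SO_MEMINFO):

    #include <linux/sock_diag.h>    /* SK_MEMINFO_* indices */

    __u32 mem[SK_MEMINFO_VARS];
    socklen_t len = sizeof(mem);

    if (getsockopt(fd, SOL_SOCKET, SO_MEMINFO, mem, &len) == 0)
            printf("rmem=%u rcvbuf=%u drops=%u\n",
                   mem[SK_MEMINFO_RMEM_ALLOC], mem[SK_MEMINFO_RCVBUF],
                   mem[SK_MEMINFO_DROPS]);
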
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR 64 /* should be enough for the first time */
struct prot_inuse {
@@ -3123,3 +3311,14 @@ static int __init proto_init(void)
subsys_initcall(proto_init);
#endif /* PROC_FS */
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+bool sk_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct sock *sk = p;
+
+ return !skb_queue_empty(&sk->sk_receive_queue) ||
+ sk_busy_loop_timeout(sk, start_time);
+}
+EXPORT_SYMBOL(sk_busy_loop_end);
+#endif /* CONFIG_NET_RX_BUSY_POLL */
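
sk_busy_loop_end() is the termination predicate for the busy-poll loop: spin
until data lands on sk_receive_queue or the socket's busy-poll budget runs
out. The caller side lives in <net/busy_poll.h> and looks roughly like:

    static inline void sk_busy_loop(struct sock *sk, int nonblock)
    {
            unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

            if (napi_id >= MIN_NAPI_ID)
                    napi_busy_loop(napi_id,
                                   nonblock ? NULL : sk_busy_loop_end, sk);
    }
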