about summary refs log tree commit diff
path: root/include/net/sock.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/net/sock.h')
-rw-r--r-- include/net/sock.h 173
1 files changed, 130 insertions, 43 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 92b269709b9a..9ccefa5c5487 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -70,6 +70,7 @@
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
+#include <net/smc.h>
/*
* This structure really needs to be cleaned up.
@@ -239,6 +240,7 @@ struct sock_common {
* @sk_wq: sock wait queue and async head
* @sk_rx_dst: receive input route used by early demux
* @sk_dst_cache: destination cache
+ * @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
@@ -343,6 +345,9 @@ struct sock {
#define sk_rxhash __sk_common.skc_rxhash
socket_lock_t sk_lock;
+ atomic_t sk_drops;
+ int sk_rcvlowat;
+ struct sk_buff_head sk_error_queue;
struct sk_buff_head sk_receive_queue;
/*
* The backlog queue is special, it is always used with
@@ -359,14 +364,13 @@ struct sock {
struct sk_buff *tail;
} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc
- int sk_forward_alloc;
- __u32 sk_txhash;
+ int sk_forward_alloc;
#ifdef CONFIG_NET_RX_BUSY_POLL
- unsigned int sk_napi_id;
unsigned int sk_ll_usec;
+ /* ===== mostly read cache line ===== */
+ unsigned int sk_napi_id;
#endif
- atomic_t sk_drops;
int sk_rcvbuf;
struct sk_filter __rcu *sk_filter;
@@ -379,16 +383,52 @@ struct sock {
#endif
struct dst_entry *sk_rx_dst;
struct dst_entry __rcu *sk_dst_cache;
- /* Note: 32bit hole on 64bit arches */
- atomic_t sk_wmem_alloc;
atomic_t sk_omem_alloc;
int sk_sndbuf;
+
+ /* ===== cache line for TX ===== */
+ int sk_wmem_queued;
+ atomic_t sk_wmem_alloc;
+ unsigned long sk_tsq_flags;
+ struct sk_buff *sk_send_head;
struct sk_buff_head sk_write_queue;
+ __s32 sk_peek_off;
+ int sk_write_pending;
+ __u32 sk_dst_pending_confirm;
+ /* Note: 32bit hole on 64bit arches */
+ long sk_sndtimeo;
+ struct timer_list sk_timer;
+ __u32 sk_priority;
+ __u32 sk_mark;
+ u32 sk_pacing_rate; /* bytes per second */
+ u32 sk_max_pacing_rate;
+ struct page_frag sk_frag;
+ netdev_features_t sk_route_caps;
+ netdev_features_t sk_route_nocaps;
+ int sk_gso_type;
+ unsigned int sk_gso_max_size;
+ gfp_t sk_allocation;
+ __u32 sk_txhash;
/*
* Because of non atomicity rules, all
* changes are protected by socket lock.
*/
+ unsigned int __sk_flags_offset[0];
+#ifdef __BIG_ENDIAN_BITFIELD
+#define SK_FL_PROTO_SHIFT 16
+#define SK_FL_PROTO_MASK 0x00ff0000
+
+#define SK_FL_TYPE_SHIFT 0
+#define SK_FL_TYPE_MASK 0x0000ffff
+#else
+#define SK_FL_PROTO_SHIFT 8
+#define SK_FL_PROTO_MASK 0x0000ff00
+
+#define SK_FL_TYPE_SHIFT 16
+#define SK_FL_TYPE_MASK 0xffff0000
+#endif
+
kmemcheck_bitfield_begin(flags);
unsigned int sk_padding : 2,
sk_no_check_tx : 1,
@@ -399,41 +439,24 @@ struct sock {
#define SK_PROTOCOL_MAX U8_MAX
kmemcheck_bitfield_end(flags);
- int sk_wmem_queued;
- gfp_t sk_allocation;
- u32 sk_pacing_rate; /* bytes per second */
- u32 sk_max_pacing_rate;
- netdev_features_t sk_route_caps;
- netdev_features_t sk_route_nocaps;
- int sk_gso_type;
- unsigned int sk_gso_max_size;
u16 sk_gso_max_segs;
- int sk_rcvlowat;
unsigned long sk_lingertime;
- struct sk_buff_head sk_error_queue;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
int sk_err,
sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
- __u32 sk_priority;
- __u32 sk_mark;
+ kuid_t sk_uid;
struct pid *sk_peer_pid;
const struct cred *sk_peer_cred;
long sk_rcvtimeo;
- long sk_sndtimeo;
- struct timer_list sk_timer;
ktime_t sk_stamp;
u16 sk_tsflags;
u8 sk_shutdown;
u32 sk_tskey;
struct socket *sk_socket;
void *sk_user_data;
- struct page_frag sk_frag;
- struct sk_buff *sk_send_head;
- __s32 sk_peek_off;
- int sk_write_pending;
#ifdef CONFIG_SECURITY
void *sk_security;
#endif
@@ -524,8 +547,7 @@ static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
static inline struct sock *sk_next(const struct sock *sk)
{
- return sk->sk_node.next ?
- hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL;
+ return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node);
}
static inline struct sock *sk_nulls_next(const struct sock *sk)
@@ -894,7 +916,20 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
- sock_rps_record_flow_hash(sk->sk_rxhash);
+ if (static_key_false(&rfs_needed)) {
+ /* Reading sk->sk_rxhash might incur an expensive cache line
+ * miss.
+ *
+ * TCP_ESTABLISHED covers almost all states where RFS
+ * might be useful, and is cheaper [1] than testing:
+ * IPv4: inet_sk(sk)->inet_daddr
+ * IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+ * OR an additional socket flag
+ * [1] : sk_state and sk_prot are in the same cache line.
+ */
+ if (sk->sk_state == TCP_ESTABLISHED)
+ sock_rps_record_flow_hash(sk->sk_rxhash);
+ }
#endif
}
@@ -914,14 +949,16 @@ static inline void sock_rps_reset_rxhash(struct sock *sk)
#endif
}
-#define sk_wait_event(__sk, __timeo, __condition) \
+#define sk_wait_event(__sk, __timeo, __condition, __wait) \
({ int __rc; \
release_sock(__sk); \
__rc = __condition; \
if (!__rc) { \
- *(__timeo) = schedule_timeout(*(__timeo)); \
+ *(__timeo) = wait_woken(__wait, \
+ TASK_INTERRUPTIBLE, \
+ *(__timeo)); \
} \
- sched_annotate_sleep(); \
+ sched_annotate_sleep(); \
lock_sock(__sk); \
__rc = __condition; \
__rc; \
@@ -952,6 +989,7 @@ struct request_sock_ops;
struct timewait_sock_ops;
struct inet_hashinfo;
struct raw_hashinfo;
+struct smc_hashinfo;
struct module;
/*
@@ -990,6 +1028,7 @@ struct proto {
int (*getsockopt)(struct sock *sk, int level,
int optname, char __user *optval,
int __user *option);
+ void (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
int (*compat_setsockopt)(struct sock *sk,
int level,
@@ -1059,6 +1098,7 @@ struct proto {
struct inet_hashinfo *hashinfo;
struct udp_table *udp_table;
struct raw_hashinfo *raw_hash;
+ struct smc_hashinfo *smc_hash;
} h;
struct module *owner;
@@ -1162,11 +1202,6 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
sk->sk_prot->enter_memory_pressure(sk);
}
-static inline long sk_prot_mem_limits(const struct sock *sk, int index)
-{
- return sk->sk_prot->sysctl_mem[index];
-}
-
static inline long
sk_memory_allocated(const struct sock *sk)
{
@@ -1276,14 +1311,32 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
/*
* Functions for memory accounting
*/
+int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
+void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);
-#define SK_MEM_QUANTUM ((int)PAGE_SIZE)
+/* We used to have PAGE_SIZE here, but systems with 64KB pages
+ * do not necessarily have 16 times more memory than 4KB ones.
+ */
+#define SK_MEM_QUANTUM 4096
#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
#define SK_MEM_SEND 0
#define SK_MEM_RECV 1
+/* sysctl_mem values are in pages; we convert them to SK_MEM_QUANTUM units */
+static inline long sk_prot_mem_limits(const struct sock *sk, int index)
+{
+ long val = sk->sk_prot->sysctl_mem[index];
+
+#if PAGE_SIZE > SK_MEM_QUANTUM
+ val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
+#elif PAGE_SIZE < SK_MEM_QUANTUM
+ val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
+#endif
+ return val;
+}
+
static inline int sk_mem_pages(int amt)
{
return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
@@ -1484,7 +1537,7 @@ void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
void sock_edemux(struct sk_buff *skb);
#else
-#define sock_edemux(skb) sock_efree(skb)
+#define sock_edemux sock_efree
#endif
int sock_setsockopt(struct socket *sock, int level, int op,
@@ -1651,6 +1704,7 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
sk->sk_wq = parent->wq;
parent->sk = sk;
sk_set_socket(sk, parent);
+ sk->sk_uid = SOCK_INODE(parent)->i_uid;
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
@@ -1658,6 +1712,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
kuid_t sock_i_uid(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+ return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
static inline u32 net_tx_rndhash(void)
{
u32 v = prandom_u32();
@@ -1708,6 +1767,7 @@ static inline void dst_negative_advice(struct sock *sk)
if (ndst != dst) {
rcu_assign_pointer(sk->sk_dst_cache, ndst);
sk_tx_queue_clear(sk);
+ sk->sk_dst_pending_confirm = 0;
}
}
}
@@ -1718,6 +1778,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old_dst;
sk_tx_queue_clear(sk);
+ sk->sk_dst_pending_confirm = 0;
/*
* This can be called while sk is owned by the caller only,
* with no state that can be checked in a rcu_dereference_check() cond
@@ -1733,6 +1794,7 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old_dst;
sk_tx_queue_clear(sk);
+ sk->sk_dst_pending_confirm = 0;
old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst);
dst_release(old_dst);
}
@@ -1753,6 +1815,26 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
+static inline void sk_dst_confirm(struct sock *sk)
+{
+ if (!sk->sk_dst_pending_confirm)
+ sk->sk_dst_pending_confirm = 1;
+}
+
+static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
+{
+ if (skb_get_dst_pending_confirm(skb)) {
+ struct sock *sk = skb->sk;
+ unsigned long now = jiffies;
+
+ /* avoid dirtying neighbour: only store when the value changes */
+ if (n->confirmed != now)
+ n->confirmed = now;
+ if (sk && sk->sk_dst_pending_confirm)
+ sk->sk_dst_pending_confirm = 0;
+ }
+}
+
bool sk_mc_loop(struct sock *sk);
static inline bool sk_can_gso(const struct sock *sk)
@@ -1783,13 +1865,13 @@ static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
{
if (skb->ip_summed == CHECKSUM_NONE) {
__wsum csum = 0;
- if (csum_and_copy_from_iter(to, copy, &csum, from) != copy)
+ if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
return -EFAULT;
skb->csum = csum_block_add(skb->csum, csum, offset);
} else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
- if (copy_from_iter_nocache(to, copy, from) != copy)
+ if (!copy_from_iter_full_nocache(to, copy, from))
return -EFAULT;
- } else if (copy_from_iter(to, copy, from) != copy)
+ } else if (!copy_from_iter_full(to, copy, from))
return -EFAULT;
return 0;
@@ -1952,6 +2034,10 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer,
void sk_stop_timer(struct sock *sk, struct timer_list *timer);
+int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags,
+ void (*destructor)(struct sock *sk,
+ struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
@@ -2108,7 +2194,8 @@ struct sock_skb_cb {
static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
- SOCK_SKB_CB(skb)->dropcount = atomic_read(&sk->sk_drops);
+ SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
+ atomic_read(&sk->sk_drops) : 0;
}
static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
@@ -2137,8 +2224,8 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
*/
if (sock_flag(sk, SOCK_RCVTSTAMP) ||
(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
- (kt.tv64 && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
- (hwtstamps->hwtstamp.tv64 &&
+ (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
+ (hwtstamps->hwtstamp &&
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
__sock_recv_timestamp(msg, sk, skb);
else