aboutsummaryrefslogtreecommitdiff
path: root/include/net/sock.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/net/sock.h')
-rw-r--r--include/net/sock.h172
1 files changed, 81 insertions, 91 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index c4b91fc19b9c..a7273b289188 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -161,9 +161,6 @@ typedef __u64 __bitwise __addrpair;
* for struct sock and struct inet_timewait_sock.
*/
struct sock_common {
- /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned
- * address on 64bit arches : cf INET_MATCH()
- */
union {
__addrpair skc_addrpair;
struct {
@@ -292,7 +289,6 @@ struct sk_filter;
* @sk_pacing_shift: scaling factor for TCP Small Queues
* @sk_lingertime: %SO_LINGER l_linger setting
* @sk_backlog: always used with the per-socket spinlock held
- * @defer_list: head of llist storing skbs to be freed
* @sk_callback_lock: used with the callbacks in the end of this struct
* @sk_error_queue: rarely used
* @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
@@ -417,7 +413,6 @@ struct sock {
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog;
- struct llist_head defer_list;
#define sk_rmem_alloc sk_backlog.rmem_alloc
@@ -614,7 +609,7 @@ void sock_net_set(struct sock *sk, struct net *net)
int sk_set_peek_off(struct sock *sk, int val);
-static inline int sk_peek_offset(struct sock *sk, int flags)
+static inline int sk_peek_offset(const struct sock *sk, int flags)
{
if (unlikely(flags & MSG_PEEK)) {
return READ_ONCE(sk->sk_peek_off);
@@ -854,7 +849,7 @@ static inline void sk_add_bind_node(struct sock *sk,
({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \
pos = rcu_dereference(hlist_next_rcu(pos)))
-static inline struct user_namespace *sk_user_ns(struct sock *sk)
+static inline struct user_namespace *sk_user_ns(const struct sock *sk)
{
/* Careful only use this in a context where these parameters
* can not change and must all be valid, such as recvmsg from
@@ -895,11 +890,12 @@ enum sock_flags {
SOCK_TXTIME,
SOCK_XDP, /* XDP is attached */
SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
+ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
};
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
+static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk)
{
nsk->sk_flags = osk->sk_flags;
}
@@ -1202,8 +1198,7 @@ struct proto {
int (*sendmsg)(struct sock *sk, struct msghdr *msg,
size_t len);
int (*recvmsg)(struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags,
- int *addr_len);
+ size_t len, int flags, int *addr_len);
int (*sendpage)(struct sock *sk, struct page *page,
int offset, size_t size, int flags);
int (*bind)(struct sock *sk,
@@ -1245,6 +1240,7 @@ struct proto {
void (*enter_memory_pressure)(struct sock *sk);
void (*leave_memory_pressure)(struct sock *sk);
atomic_long_t *memory_allocated; /* Current allocated memory. */
+ int __percpu *per_cpu_fw_alloc;
struct percpu_counter *sockets_allocated; /* Current number of sockets. */
/*
@@ -1388,21 +1384,46 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
}
static inline long
-sk_memory_allocated(const struct sock *sk)
+proto_memory_allocated(const struct proto *prot)
{
- return atomic_long_read(sk->sk_prot->memory_allocated);
+ return max(0L, atomic_long_read(prot->memory_allocated));
}
static inline long
+sk_memory_allocated(const struct sock *sk)
+{
+ return proto_memory_allocated(sk->sk_prot);
+}
+
+/* 1 MB per cpu, in page units */
+#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
+
+static inline void
sk_memory_allocated_add(struct sock *sk, int amt)
{
- return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
+ int local_reserve;
+
+ preempt_disable();
+ local_reserve = __this_cpu_add_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+ if (local_reserve >= SK_MEMORY_PCPU_RESERVE) {
+ __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+ atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+ }
+ preempt_enable();
}
static inline void
sk_memory_allocated_sub(struct sock *sk, int amt)
{
- atomic_long_sub(amt, sk->sk_prot->memory_allocated);
+ int local_reserve;
+
+ preempt_disable();
+ local_reserve = __this_cpu_sub_return(*sk->sk_prot->per_cpu_fw_alloc, amt);
+ if (local_reserve <= -SK_MEMORY_PCPU_RESERVE) {
+ __this_cpu_sub(*sk->sk_prot->per_cpu_fw_alloc, local_reserve);
+ atomic_long_add(local_reserve, sk->sk_prot->memory_allocated);
+ }
+ preempt_enable();
}
#define SK_ALLOC_PERCPU_COUNTER_BATCH 16
@@ -1431,12 +1452,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
return percpu_counter_sum_positive(prot->sockets_allocated);
}
-static inline long
-proto_memory_allocated(struct proto *prot)
-{
- return atomic_long_read(prot->memory_allocated);
-}
-
static inline bool
proto_memory_pressure(struct proto *prot)
{
@@ -1523,30 +1538,18 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);
-/* We used to have PAGE_SIZE here, but systems with 64KB pages
- * do not necessarily have 16x time more memory than 4KB ones.
- */
-#define SK_MEM_QUANTUM 4096
-#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
#define SK_MEM_SEND 0
#define SK_MEM_RECV 1
-/* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
+/* sysctl_mem values are in pages */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
- long val = sk->sk_prot->sysctl_mem[index];
-
-#if PAGE_SIZE > SK_MEM_QUANTUM
- val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
-#elif PAGE_SIZE < SK_MEM_QUANTUM
- val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT;
-#endif
- return val;
+ return READ_ONCE(sk->sk_prot->sysctl_mem[index]);
}
static inline int sk_mem_pages(int amt)
{
- return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT;
+ return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}
static inline bool sk_has_account(struct sock *sk)
@@ -1557,19 +1560,23 @@ static inline bool sk_has_account(struct sock *sk)
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
+ int delta;
+
if (!sk_has_account(sk))
return true;
- return size <= sk->sk_forward_alloc ||
- __sk_mem_schedule(sk, size, SK_MEM_SEND);
+ delta = size - sk->sk_forward_alloc;
+ return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_SEND);
}
static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
{
+ int delta;
+
if (!sk_has_account(sk))
return true;
- return size <= sk->sk_forward_alloc ||
- __sk_mem_schedule(sk, size, SK_MEM_RECV) ||
+ delta = size - sk->sk_forward_alloc;
+ return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) ||
skb_pfmemalloc(skb);
}
@@ -1595,7 +1602,7 @@ static inline void sk_mem_reclaim(struct sock *sk)
reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
- if (reclaimable >= SK_MEM_QUANTUM)
+ if (reclaimable >= (int)PAGE_SIZE)
__sk_mem_reclaim(sk, reclaimable);
}
@@ -1605,19 +1612,6 @@ static inline void sk_mem_reclaim_final(struct sock *sk)
sk_mem_reclaim(sk);
}
-static inline void sk_mem_reclaim_partial(struct sock *sk)
-{
- int reclaimable;
-
- if (!sk_has_account(sk))
- return;
-
- reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
-
- if (reclaimable > SK_MEM_QUANTUM)
- __sk_mem_reclaim(sk, reclaimable - 1);
-}
-
static inline void sk_mem_charge(struct sock *sk, int size)
{
if (!sk_has_account(sk))
@@ -1625,29 +1619,12 @@ static inline void sk_mem_charge(struct sock *sk, int size)
sk->sk_forward_alloc -= size;
}
-/* the following macros control memory reclaiming in sk_mem_uncharge()
- */
-#define SK_RECLAIM_THRESHOLD (1 << 21)
-#define SK_RECLAIM_CHUNK (1 << 20)
-
static inline void sk_mem_uncharge(struct sock *sk, int size)
{
- int reclaimable;
-
if (!sk_has_account(sk))
return;
sk->sk_forward_alloc += size;
- reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
-
- /* Avoid a possible overflow.
- * TCP send queues can make this happen, if sk_mem_reclaim()
- * is not called and more than 2 GBytes are released at once.
- *
- * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
- * no need to hold that much forward allocation anyway.
- */
- if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
- __sk_mem_reclaim(sk, SK_RECLAIM_CHUNK);
+ sk_mem_reclaim(sk);
}
/*
@@ -1825,11 +1802,17 @@ int sock_getsockopt(struct socket *sock, int level, int op,
char __user *optval, int __user *optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
bool timeval, bool time32);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
- int noblock, int *errcode);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
unsigned long data_len, int noblock,
int *errcode, int max_page_order);
+
+static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
+ unsigned long size,
+ int noblock, int *errcode)
+{
+ return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
+}
+
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
@@ -2231,9 +2214,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
if (err)
return err;
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
+ skb_len_add(skb, copy);
sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
return 0;
@@ -2392,7 +2373,14 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
void (*destructor)(struct sock *sk,
struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason);
+
+static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ return sock_queue_rcv_skb_reason(sk, skb, NULL);
+}
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);
@@ -2643,20 +2631,21 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
__sock_recv_wifi_status(msg, sk, skb);
}
-void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
- struct sk_buff *skb);
+void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb);
#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
-static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
- struct sk_buff *skb)
+static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb)
{
-#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \
- (1UL << SOCK_RCVTSTAMP))
+#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVMARK))
#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \
SOF_TIMESTAMPING_RAW_HARDWARE)
- if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
- __sock_recv_ts_and_drops(msg, sk, skb);
+ if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY)
+ __sock_recv_cmsgs(msg, sk, skb);
else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
sock_write_timestamp(sk, skb->tstamp);
else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP))
@@ -2834,18 +2823,18 @@ static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_wmem ? */
if (proto->sysctl_wmem_offset)
- return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset));
- return *proto->sysctl_wmem;
+ return READ_ONCE(*proto->sysctl_wmem);
}
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_rmem ? */
if (proto->sysctl_rmem_offset)
- return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset));
- return *proto->sysctl_rmem;
+ return READ_ONCE(*proto->sysctl_rmem);
}
/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
@@ -2866,13 +2855,14 @@ static inline void sk_pacing_shift_update(struct sock *sk, int val)
*/
static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
{
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
int mdif;
- if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)
+ if (!bound_dev_if || bound_dev_if == dif)
return true;
mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif);
- if (mdif && mdif == sk->sk_bound_dev_if)
+ if (mdif && mdif == bound_dev_if)
return true;
return false;