From 98ba0bd5505dcbb90322a4be07bcfe6b8a18c73f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:37 -0400 Subject: sock: allocate skbs from optmem Add sock_omalloc and sock_ofree to be able to allocate control skbs, for instance for looping errors onto sk_error_queue. The transmit budget (sk_wmem_alloc) is involved in transmit skb shaping, most notably in TCP Small Queues. Using this budget for control packets would impact transmission. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 393c38e9f6aa..0f778d3c4300 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1531,6 +1531,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); void __sock_wfree(struct sk_buff *skb); void sock_wfree(struct sk_buff *skb); +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, + gfp_t priority); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); diff --git a/net/core/sock.c b/net/core/sock.c index 742f68c9c84a..1261880bdcc8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1923,6 +1923,33 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, } EXPORT_SYMBOL(sock_wmalloc); +static void sock_ofree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_omem_alloc); +} + +struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, + gfp_t priority) +{ + struct sk_buff *skb; + + /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ + if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > + sysctl_optmem_max) + return NULL; + + skb = alloc_skb(size, priority); + if (!skb) + return NULL; + + atomic_add(skb->truesize, &sk->sk_omem_alloc); + skb->sk = sk; + skb->destructor = sock_ofree; + return skb; +} + /* * Allocate a memory block from the socket's option memory buffer. */ -- cgit From 3ece782693c4b64d588dd217868558ab9a19bfe7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:38 -0400 Subject: sock: skb_copy_ubufs support for compound pages Refine skb_copy_ubufs to support compound pages. With upcoming TCP zerocopy sendmsg, such fragments may appear. The existing code replaces each page one for one. Splitting each compound page into an independent number of regular pages can result in exceeding limit MAX_SKB_FRAGS if data is not exactly page aligned. Instead, fill all destination pages but the last to PAGE_SIZE. Split the existing alloc + copy loop into separate stages: 1. compute bytelength and minimum number of pages to store this. 2. allocate 3. copy, filling each page except the last to PAGE_SIZE bytes 4. update skb frag array Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++-- net/core/skbuff.c | 53 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index be76082f48aa..2f64e2bbb592 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1796,13 +1796,18 @@ static inline unsigned int skb_headlen(const struct sk_buff *skb) return skb->len - skb->data_len; } -static inline unsigned int skb_pagelen(const struct sk_buff *skb) +static inline unsigned int __skb_pagelen(const struct sk_buff *skb) { unsigned int i, len = 0; for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--) len += skb_frag_size(&skb_shinfo(skb)->frags[i]); - return len + skb_headlen(skb); + return len; +} + +static inline unsigned int skb_pagelen(const struct sk_buff *skb) +{ + return skb_headlen(skb) + __skb_pagelen(skb); } /** diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0f0933b338d7..a95877a8ac8b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -932,17 +932,20 @@ EXPORT_SYMBOL_GPL(skb_morph); */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { - int i; + struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; - struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; + int i, new_frags; + u32 d_off; - for (i = 0; i < num_frags; i++) { - skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - u32 p_off, p_len, copied; - struct page *p; - u8 *vaddr; + if (!num_frags) + return 0; + + if (skb_shared(skb) || skb_unclone(skb, gfp_mask)) + return -EINVAL; + new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < new_frags; i++) { page = alloc_page(gfp_mask); if (!page) { while (head) { @@ -952,17 +955,36 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } return -ENOMEM; } + set_page_private(page, (unsigned long)head); + head = page; + } + + page = head; + d_off = 0; + for (i = 0; i < num_frags; i++) { + skb_frag_t *f = &skb_shinfo(skb)->frags[i]; + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f), p, p_off, p_len, copied) { + u32 copy, done = 0; vaddr = kmap_atomic(p); - memcpy(page_address(page) + copied, vaddr + p_off, - p_len); + + while (done < p_len) { + if (d_off == PAGE_SIZE) { + d_off = 0; + page = (struct page *)page_private(page); + } + copy = min_t(u32, PAGE_SIZE - d_off, p_len - done); + memcpy(page_address(page) + d_off, + vaddr + p_off + done, copy); + done += copy; + d_off += copy; + } kunmap_atomic(vaddr); } - - set_page_private(page, (unsigned long)head); - head = page; } /* skb frags release userspace buffers */ @@ -972,11 +994,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) uarg->callback(uarg, false); /* skb frags point to kernel buffers */ - for (i = num_frags - 1; i >= 0; i--) { - __skb_fill_page_desc(skb, i, head, 0, - skb_shinfo(skb)->frags[i].size); + for (i = 0; i < new_frags - 1; i++) { + __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); head = (struct page *)page_private(head); } + __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); + skb_shinfo(skb)->nr_frags = new_frags; skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; return 0; -- cgit From 52267790ef52d7513879238ca9fac22c1733e0e3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:39 -0400 Subject: sock: add MSG_ZEROCOPY The kernel supports zerocopy sendmsg in virtio and tap. Expand the infrastructure to support other socket types. Introduce a completion notification channel over the socket error queue. Notifications are returned with ee_origin SO_EE_ORIGIN_ZEROCOPY. ee_errno is 0 to avoid blocking the send/recv path on receiving notifications. Add reference counting, to support the skb split, merge, resize and clone operations possible with SOCK_STREAM and other socket types. The patch does not yet modify any datapaths. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 60 +++++++++++++++++++ include/linux/socket.h | 1 + include/net/sock.h | 2 + include/uapi/linux/errqueue.h | 3 + net/core/datagram.c | 55 ++++++++++------- net/core/skbuff.c | 133 ++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 2 + 7 files changed, 235 insertions(+), 21 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2f64e2bbb592..59cff7aa494e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -429,6 +429,7 @@ enum { SKBTX_SCHED_TSTAMP = 1 << 6, }; +#define SKBTX_ZEROCOPY_FRAG (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG) #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) @@ -445,8 +446,28 @@ struct ubuf_info { void (*callback)(struct ubuf_info *, bool zerocopy_success); void *ctx; unsigned long desc; + u16 zerocopy:1; + atomic_t refcnt; }; +#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) + +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); + +static inline void sock_zerocopy_get(struct ubuf_info *uarg) +{ + atomic_inc(&uarg->refcnt); +} + +void sock_zerocopy_put(struct ubuf_info *uarg); +void sock_zerocopy_put_abort(struct ubuf_info *uarg); + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg); + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -1214,6 +1235,45 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) return &skb_shinfo(skb)->hwtstamps; } +static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) +{ + bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY; + + return is_zcopy ? skb_uarg(skb) : NULL; +} + +static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) +{ + if (skb && uarg && !skb_zcopy(skb)) { + sock_zerocopy_get(uarg); + skb_shinfo(skb)->destructor_arg = uarg; + skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; + } +} + +/* Release a reference on a zerocopy structure */ +static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) +{ + struct ubuf_info *uarg = skb_zcopy(skb); + + if (uarg) { + uarg->zerocopy = uarg->zerocopy && zerocopy; + sock_zerocopy_put(uarg); + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + } +} + +/* Abort a zerocopy operation and revert zckey on error in send syscall */ +static inline void skb_zcopy_abort(struct sk_buff *skb) +{ + struct ubuf_info *uarg = skb_zcopy(skb); + + if (uarg) { + sock_zerocopy_put_abort(uarg); + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + } +} + /** * skb_queue_empty - check if a queue is empty * @list: queue head diff --git a/include/linux/socket.h b/include/linux/socket.h index 8b13db5163cc..8ad963cdc88c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -287,6 +287,7 @@ struct ucred { #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN +#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through diff --git a/include/net/sock.h b/include/net/sock.h index 0f778d3c4300..fe1a0bc25cd3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -294,6 +294,7 @@ struct sock_common { * @sk_stamp: time stamp of last packet received * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_tskey: counter to disambiguate concurrent tstamp requests + * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data * @sk_frag: cached page frag @@ -462,6 +463,7 @@ struct sock { u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; + atomic_t sk_zckey; struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 07bdce1f444a..78fdf52d6b2f 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -18,10 +18,13 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP 2 #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 +#define SO_EE_ORIGIN_ZEROCOPY 5 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) +#define SO_EE_CODE_ZEROCOPY_COPIED 1 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/net/core/datagram.c b/net/core/datagram.c index ee5647bd91b3..2f3277945d35 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -573,27 +573,12 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -/** - * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter - * @skb: buffer to copy - * @from: the source to copy from - * - * The function will first copy up to headlen, and then pin the userspace - * pages and build frags through them. - * - * Returns 0, -EFAULT or -EMSGSIZE. - */ -int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length) { - int len = iov_iter_count(from); - int copy = min_t(int, skb_headlen(skb), len); - int frag = 0; + int frag = skb_shinfo(skb)->nr_frags; - /* copy up to skb headlen */ - if (skb_copy_datagram_from_iter(skb, 0, from, copy)) - return -EFAULT; - - while (iov_iter_count(from)) { + while (length && iov_iter_count(from)) { struct page *pages[MAX_SKB_FRAGS]; size_t start; ssize_t copied; @@ -603,18 +588,24 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; - copied = iov_iter_get_pages(from, pages, ~0U, + copied = iov_iter_get_pages(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; iov_iter_advance(from, copied); + length -= copied; truesize = PAGE_ALIGN(copied + start); skb->data_len += copied; skb->len += copied; skb->truesize += truesize; - refcount_add(truesize, &skb->sk->sk_wmem_alloc); + if (sk && sk->sk_type == SOCK_STREAM) { + sk->sk_wmem_queued += truesize; + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } while (copied) { int size = min_t(int, copied, PAGE_SIZE - start); skb_fill_page_desc(skb, frag++, pages[n], start, size); @@ -625,6 +616,28 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) } return 0; } +EXPORT_SYMBOL(__zerocopy_sg_from_iter); + +/** + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter + * @skb: buffer to copy + * @from: the source to copy from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. + */ +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +{ + int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); + + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) + return -EFAULT; + + return __zerocopy_sg_from_iter(NULL, skb, from, ~0U); +} EXPORT_SYMBOL(zerocopy_sg_from_iter); static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a95877a8ac8b..0603e44950da 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -915,6 +915,139 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) +{ + struct ubuf_info *uarg; + struct sk_buff *skb; + + WARN_ON_ONCE(!in_task()); + + skb = sock_omalloc(sk, 0, GFP_KERNEL); + if (!skb) + return NULL; + + BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); + uarg = (void *)skb->cb; + + uarg->callback = sock_zerocopy_callback; + uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1; + uarg->zerocopy = 1; + atomic_set(&uarg->refcnt, 0); + sock_hold(sk); + + return uarg; +} +EXPORT_SYMBOL_GPL(sock_zerocopy_alloc); + +static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) +{ + return container_of((void *)uarg, struct sk_buff, cb); +} + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +{ + struct sk_buff *skb = skb_from_uarg(uarg); + struct sock_exterr_skb *serr; + struct sock *sk = skb->sk; + u16 id = uarg->desc; + + if (sock_flag(sk, SOCK_DEAD)) + goto release; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; + serr->ee.ee_data = id; + if (!success) + serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; + + skb_queue_tail(&sk->sk_error_queue, skb); + skb = NULL; + + sk->sk_error_report(sk); + +release: + consume_skb(skb); + sock_put(sk); +} +EXPORT_SYMBOL_GPL(sock_zerocopy_callback); + +void sock_zerocopy_put(struct ubuf_info *uarg) +{ + if (uarg && atomic_dec_and_test(&uarg->refcnt)) { + if (uarg->callback) + uarg->callback(uarg, uarg->zerocopy); + else + consume_skb(skb_from_uarg(uarg)); + } +} +EXPORT_SYMBOL_GPL(sock_zerocopy_put); + +void sock_zerocopy_put_abort(struct ubuf_info *uarg) +{ + if (uarg) { + struct sock *sk = skb_from_uarg(uarg)->sk; + + atomic_dec(&sk->sk_zckey); + + /* sock_zerocopy_put expects a ref. Most sockets take one per + * skb, which is zero on abort. tcp_sendmsg holds one extra, to + * avoid an skb send inside the main loop triggering uarg free. + */ + if (sk->sk_type != SOCK_STREAM) + atomic_inc(&uarg->refcnt); + + sock_zerocopy_put(uarg); + } +} +EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); + +extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg) +{ + struct iov_iter orig_iter = msg->msg_iter; + int err, orig_len = skb->len; + + err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); + if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { + /* Streams do not free skb on error. Reset to prev state. */ + msg->msg_iter = orig_iter; + ___pskb_trim(skb, orig_len); + return err; + } + + skb_zcopy_set(skb, uarg); + return skb->len - orig_len; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); + +/* unused only until next patch in the series; will remove attribute */ +static int __attribute__((unused)) + skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, + gfp_t gfp_mask) +{ + if (skb_zcopy(orig)) { + if (skb_zcopy(nskb)) { + /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ + if (!gfp_mask) { + WARN_ON_ONCE(1); + return -ENOMEM; + } + if (skb_uarg(nskb) == skb_uarg(orig)) + return 0; + if (skb_copy_ubufs(nskb, GFP_ATOMIC)) + return -EIO; + } + skb_zcopy_set(nskb, skb_uarg(orig)); + } + return 0; +} + /** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify diff --git a/net/core/sock.c b/net/core/sock.c index 1261880bdcc8..e8b696858cad 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1670,6 +1670,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); @@ -2722,6 +2723,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; + atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; -- cgit From 76851d1212c11365362525e1e2c0a18c97478e6b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:40 -0400 Subject: sock: add SOCK_ZEROCOPY sockopt The send call ignores unknown flags. Legacy applications may already unwittingly pass MSG_ZEROCOPY. Continue to ignore this flag unless a socket opts in to zerocopy. Introduce socket option SO_ZEROCOPY to enable MSG_ZEROCOPY processing. Processes can also query this socket option to detect kernel support for the feature. Older kernels will return ENOPROTOOPT. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/uapi/asm-generic/socket.h | 2 ++ net/core/skbuff.c | 3 +++ net/core/sock.c | 18 ++++++++++++++++++ 13 files changed, 43 insertions(+) diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 7b285dd4fe05..c6133a045352 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -109,4 +109,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index f1e3b20dce9f..9abf02d6855a 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -102,5 +102,7 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 5dd5c5d0d642..002eb85a6941 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -111,4 +111,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index f8f7b47e247f..e268e51a38d1 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -102,4 +102,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 882823bec153..6c755bc07975 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -120,4 +120,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index c710db354ff2..ac82a3f26dbf 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -102,4 +102,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index a0d4dc9f4eb2..3b2bf7ae703b 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -101,4 +101,6 @@ #define SO_PEERGROUPS 0x4034 +#define SO_ZEROCOPY 0x4035 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 52a63f4175cb..a56916c83565 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -108,4 +108,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 186fd8199f54..b2f5c50d0947 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -98,6 +98,8 @@ #define SO_PEERGROUPS 0x003d +#define SO_ZEROCOPY 0x003e + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 3eed2761c149..220059999e74 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -113,4 +113,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 9861be8da65e..e47c9e436221 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -104,4 +104,6 @@ #define SO_PEERGROUPS 59 +#define SO_ZEROCOPY 60 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0603e44950da..29e34bc6a17c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -922,6 +922,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) WARN_ON_ONCE(!in_task()); + if (!sock_flag(sk, SOCK_ZEROCOPY)) + return NULL; + skb = sock_omalloc(sk, 0, GFP_KERNEL); if (!skb) return NULL; diff --git a/net/core/sock.c b/net/core/sock.c index e8b696858cad..9ea988d25b0a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1055,6 +1055,20 @@ set_rcvbuf: if (val == 1) dst_negative_advice(sk); break; + + case SO_ZEROCOPY: + if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + ret = -ENOTSUPP; + else if (sk->sk_protocol != IPPROTO_TCP) + ret = -ENOTSUPP; + else if (sk->sk_state != TCP_CLOSE) + ret = -EBUSY; + else if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + break; + default: ret = -ENOPROTOOPT; break; @@ -1383,6 +1397,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val64 = sock_gen_cookie(sk); break; + case SO_ZEROCOPY: + v.val = sock_flag(sk, SOCK_ZEROCOPY); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). -- cgit From 1f8b977ab32dc5d148f103326e80d9097f1cefb5 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:41 -0400 Subject: sock: enable MSG_ZEROCOPY Prepare the datapath for refcounted ubuf_info. Clone ubuf_info with skb_zerocopy_clone() wherever needed due to skb split, merge, resize or clone. Split skb_orphan_frags into two variants. The split, merge, .. paths support reference counted zerocopy buffers, so do not do a deep copy. Add skb_orphan_frags_rx for paths that may loop packets to receive sockets. That is not allowed, as it may cause unbounded latency. Deep copy all zerocopy copy buffers, ref-counted or not, in this path. The exact locations to modify were chosen by exhaustively searching through all code that might modify skb_frag references and/or the the SKBTX_DEV_ZEROCOPY tx_flags bit. The changes err on the safe side, in two ways. (1) legacy ubuf_info paths virtio and tap are not modified. They keep a 1:1 ubuf_info to sk_buff relationship. Calls to skb_orphan_frags still call skb_copy_ubufs and thus copy frags in this case. (2) not all copies deep in the stack are addressed yet. skb_shift, skb_split and skb_try_coalesce can be refined to avoid copying. These are not in the hot path and this patch is hairy enough as is, so that is left for future refinement. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 +- drivers/vhost/net.c | 1 + include/linux/skbuff.h | 14 +++++++++++++- net/core/dev.c | 4 ++-- net/core/skbuff.c | 48 +++++++++++++++++++----------------------------- 5 files changed, 36 insertions(+), 33 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 68add55f8460..d21510d47aa2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -892,7 +892,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) sk_filter(tfile->socket.sk, skb)) goto drop; - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; skb_tx_timestamp(skb); diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 06d044862e58..ba08b78ed630 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -533,6 +533,7 @@ static void handle_tx(struct vhost_net *net) ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; + atomic_set(&ubuf->refcnt, 1); msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); ubufs = nvq->ubufs; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59cff7aa494e..e5387932c266 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2512,7 +2512,17 @@ static inline void skb_orphan(struct sk_buff *skb) */ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) { - if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))) + if (likely(!skb_zcopy(skb))) + return 0; + if (skb_uarg(skb)->callback == sock_zerocopy_callback) + return 0; + return skb_copy_ubufs(skb, gfp_mask); +} + +/* Frags must be orphaned, even if refcounted, if skb might loop to rx path */ +static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask) +{ + if (likely(!skb_zcopy(skb))) return 0; return skb_copy_ubufs(skb, gfp_mask); } @@ -2944,6 +2954,8 @@ static inline int skb_add_data(struct sk_buff *skb, static inline bool skb_can_coalesce(struct sk_buff *skb, int i, const struct page *page, int off) { + if (skb_zcopy(skb)) + return false; if (i) { const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; diff --git a/net/core/dev.c b/net/core/dev.c index 8ea6b4b42611..1d75499add72 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1853,7 +1853,7 @@ static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); @@ -4412,7 +4412,7 @@ skip_classify: } if (pt_prev) { - if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 29e34bc6a17c..74d3c36f8419 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -567,21 +567,10 @@ static void skb_release_data(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i]); - /* - * If skb buf is from userspace, we need to notify the caller - * the lower device DMA has done; - */ - if (shinfo->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = shinfo->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, true); - } - if (shinfo->frag_list) kfree_skb_list(shinfo->frag_list); + skb_zcopy_clear(skb, true); skb_free_head(skb); } @@ -695,14 +684,7 @@ EXPORT_SYMBOL(kfree_skb_list); */ void skb_tx_error(struct sk_buff *skb) { - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - struct ubuf_info *uarg; - - uarg = skb_shinfo(skb)->destructor_arg; - if (uarg->callback) - uarg->callback(uarg, false); - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; - } + skb_zcopy_clear(skb, true); } EXPORT_SYMBOL(skb_tx_error); @@ -1029,9 +1011,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); -/* unused only until next patch in the series; will remove attribute */ -static int __attribute__((unused)) - skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, +static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, gfp_t gfp_mask) { if (skb_zcopy(orig)) { @@ -1068,7 +1048,6 @@ static int __attribute__((unused)) */ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) { - struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg; int num_frags = skb_shinfo(skb)->nr_frags; struct page *page, *head = NULL; int i, new_frags; @@ -1127,8 +1106,6 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); - uarg->callback(uarg, false); - /* skb frags point to kernel buffers */ for (i = 0; i < new_frags - 1; i++) { __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE); @@ -1137,7 +1114,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off); skb_shinfo(skb)->nr_frags = new_frags; - skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; + skb_zcopy_clear(skb, false); return 0; } EXPORT_SYMBOL_GPL(skb_copy_ubufs); @@ -1298,7 +1275,8 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_orphan_frags(skb, gfp_mask)) { + if (skb_orphan_frags(skb, gfp_mask) || + skb_zerocopy_clone(n, skb, gfp_mask)) { kfree_skb(n); n = NULL; goto out; @@ -1375,9 +1353,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, * be since all we did is relocate the values */ if (skb_cloned(skb)) { - /* copy this zero copy skb frags */ if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; + if (skb_zcopy(skb)) + atomic_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); @@ -1872,6 +1851,9 @@ end: skb->tail += delta; skb->data_len -= delta; + if (!skb->data_len) + skb_zcopy_clear(skb, false); + return skb_tail_pointer(skb); } EXPORT_SYMBOL(__pskb_pull_tail); @@ -2627,6 +2609,7 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) skb_tx_error(from); return -ENOMEM; } + skb_zerocopy_clone(to, from, GFP_ATOMIC); for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { if (!len) @@ -2924,6 +2907,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_zerocopy_clone(skb1, skb, 0); if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2967,6 +2951,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) if (skb_headlen(skb)) return 0; + if (skb_zcopy(tgt) || skb_zcopy(skb)) + return 0; todo = shiftlen; from = 0; @@ -3540,6 +3526,8 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; + if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC)) + goto err; while (pos < offset + len) { if (i >= nfrags) { @@ -4663,6 +4651,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (skb_has_frag_list(to) || skb_has_frag_list(from)) return false; + if (skb_zcopy(to) || skb_zcopy(from)) + return false; if (skb_headlen(from) != 0) { struct page *page; -- cgit From 4ab6c99d99bb1bf0fbba8ff4e52114c66109992f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:42 -0400 Subject: sock: MSG_ZEROCOPY notification coalescing In the simple case, each sendmsg() call generates data and eventually a zerocopy ready notification N, where N indicates the Nth successful invocation of sendmsg() with the MSG_ZEROCOPY flag on this socket. TCP and corked sockets can cause send() calls to append new data to an existing sk_buff and, thus, ubuf_info. In that case the notification must hold a range. odify ubuf_info to store a inclusive range [N..N+m] and add skb_zerocopy_realloc() to optionally extend an existing range. Also coalesce notifications in this common case: if a notification [1, 1] is about to be queued while [0, 0] is the queue tail, just modify the head of the queue to read [0, 1]. Coalescing is limited to a few TSO frames worth of data to bound notification latency. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 17 +++++++-- net/core/skbuff.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 106 insertions(+), 10 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e5387932c266..f5bdd93a87da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -444,15 +444,26 @@ enum { */ struct ubuf_info { void (*callback)(struct ubuf_info *, bool zerocopy_success); - void *ctx; - unsigned long desc; - u16 zerocopy:1; + union { + struct { + unsigned long desc; + void *ctx; + }; + struct { + u32 id; + u16 len; + u16 zerocopy:1; + u32 bytelen; + }; + }; atomic_t refcnt; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); +struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg); static inline void sock_zerocopy_get(struct ubuf_info *uarg) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 74d3c36f8419..dcee0f64f1fa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -915,7 +915,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->callback = sock_zerocopy_callback; - uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1; + uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; + uarg->len = 1; + uarg->bytelen = size; uarg->zerocopy = 1; atomic_set(&uarg->refcnt, 0); sock_hold(sk); @@ -929,26 +931,101 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) return container_of((void *)uarg, struct sk_buff, cb); } +struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, + struct ubuf_info *uarg) +{ + if (uarg) { + const u32 byte_limit = 1 << 19; /* limit to a few TSO */ + u32 bytelen, next; + + /* realloc only when socket is locked (TCP, UDP cork), + * so uarg->len and sk_zckey access is serialized + */ + if (!sock_owned_by_user(sk)) { + WARN_ON_ONCE(1); + return NULL; + } + + bytelen = uarg->bytelen + size; + if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { + /* TCP can create new skb to attach new uarg */ + if (sk->sk_type == SOCK_STREAM) + goto new_alloc; + return NULL; + } + + next = (u32)atomic_read(&sk->sk_zckey); + if ((u32)(uarg->id + uarg->len) == next) { + uarg->len++; + uarg->bytelen = bytelen; + atomic_set(&sk->sk_zckey, ++next); + return uarg; + } + } + +new_alloc: + return sock_zerocopy_alloc(sk, size); +} +EXPORT_SYMBOL_GPL(sock_zerocopy_realloc); + +static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + u32 old_lo, old_hi; + u64 sum_len; + + old_lo = serr->ee.ee_info; + old_hi = serr->ee.ee_data; + sum_len = old_hi - old_lo + 1ULL + len; + + if (sum_len >= (1ULL << 32)) + return false; + + if (lo != old_hi + 1) + return false; + + serr->ee.ee_data += len; + return true; +} + void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) { - struct sk_buff *skb = skb_from_uarg(uarg); + struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; struct sock *sk = skb->sk; - u16 id = uarg->desc; + struct sk_buff_head *q; + unsigned long flags; + u32 lo, hi; + u16 len; - if (sock_flag(sk, SOCK_DEAD)) + /* if !len, there was only 1 call, and it was aborted + * so do not queue a completion notification + */ + if (!uarg->len || sock_flag(sk, SOCK_DEAD)) goto release; + len = uarg->len; + lo = uarg->id; + hi = uarg->id + len - 1; + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = 0; serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; - serr->ee.ee_data = id; + serr->ee.ee_data = hi; + serr->ee.ee_info = lo; if (!success) serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; - skb_queue_tail(&sk->sk_error_queue, skb); - skb = NULL; + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || + !skb_zerocopy_notify_extend(tail, lo, len)) { + __skb_queue_tail(q, skb); + skb = NULL; + } + spin_unlock_irqrestore(&q->lock, flags); sk->sk_error_report(sk); @@ -975,6 +1052,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) struct sock *sk = skb_from_uarg(uarg)->sk; atomic_dec(&sk->sk_zckey); + uarg->len--; /* sock_zerocopy_put expects a ref. Most sockets take one per * skb, which is zero on abort. tcp_sendmsg holds one extra, to @@ -995,9 +1073,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) { + struct ubuf_info *orig_uarg = skb_zcopy(skb); struct iov_iter orig_iter = msg->msg_iter; int err, orig_len = skb->len; + /* An skb can only point to one uarg. This edge case happens when + * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { /* Streams do not free skb on error. Reset to prev state. */ -- cgit From a91dbff551a6f1865b68fa82b654591490b59901 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:43 -0400 Subject: sock: ulimit on MSG_ZEROCOPY pages Bound the number of pages that a user may pin. Follow the lead of perf tools to maintain a per-user bound on memory locked pages commit 789f90fcf6b0 ("perf_counter: per user mlock gift") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/sched/user.h | 3 ++- include/linux/skbuff.h | 5 +++++ net/core/skbuff.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 5d5415e129d4..3c07e4135127 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -36,7 +36,8 @@ struct user_struct { struct hlist_node uidhash_node; kuid_t uid; -#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ + defined(CONFIG_NET) atomic_long_t locked_vm; #endif }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f5bdd93a87da..8c0708d2e5e6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -457,6 +457,11 @@ struct ubuf_info { }; }; atomic_t refcnt; + + struct mmpin { + struct user_struct *user; + unsigned int num_pg; + } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dcee0f64f1fa..42b62c716a33 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -897,6 +897,44 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); +static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +{ + unsigned long max_pg, num_pg, new_pg, old_pg; + struct user_struct *user; + + if (capable(CAP_IPC_LOCK) || !size) + return 0; + + num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ + max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + user = mmp->user ? : current_user(); + + do { + old_pg = atomic_long_read(&user->locked_vm); + new_pg = old_pg + num_pg; + if (new_pg > max_pg) + return -ENOBUFS; + } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != + old_pg); + + if (!mmp->user) { + mmp->user = get_uid(user); + mmp->num_pg = num_pg; + } else { + mmp->num_pg += num_pg; + } + + return 0; +} + +static void mm_unaccount_pinned_pages(struct mmpin *mmp) +{ + if (mmp->user) { + atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); + free_uid(mmp->user); + } +} + struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) { struct ubuf_info *uarg; @@ -913,6 +951,12 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); uarg = (void *)skb->cb; + uarg->mmp.user = NULL; + + if (mm_account_pinned_pages(&uarg->mmp, size)) { + kfree_skb(skb); + return NULL; + } uarg->callback = sock_zerocopy_callback; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; @@ -956,6 +1000,8 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg->id + uarg->len) == next) { + if (mm_account_pinned_pages(&uarg->mmp, size)) + return NULL; uarg->len++; uarg->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); @@ -1038,6 +1084,8 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback); void sock_zerocopy_put(struct ubuf_info *uarg) { if (uarg && atomic_dec_and_test(&uarg->refcnt)) { + mm_unaccount_pinned_pages(&uarg->mmp); + if (uarg->callback) uarg->callback(uarg, uarg->zerocopy); else -- cgit From f214f915e7db99091f1312c48b30928c1e0c90b7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:44 -0400 Subject: tcp: enable MSG_ZEROCOPY Enable support for MSG_ZEROCOPY to the TCP stack. TSO and GSO are both supported. Only data sent to remote destinations is sent without copying. Packets looped onto a local destination have their payload copied to avoid unbounded latency. Tested: A 10x TCP_STREAM between two hosts showed a reduction in netserver process cycles by up to 70%, depending on packet size. Systemwide, savings are of course much less pronounced, at up to 20% best case. msg_zerocopy.sh 4 tcp: without zerocopy tx=121792 (7600 MB) txc=0 zc=n rx=60458 (7600 MB) with zerocopy tx=286257 (17863 MB) txc=286257 zc=y rx=140022 (17863 MB) This test opens a pair of sockets over veth, one one calls send with 64KB and optionally MSG_ZEROCOPY and on the other reads the initial bytes. The receiver truncates, so this is strictly an upper bound on what is achievable. It is more representative of sending data out of a physical NIC (when payload is not touched, either). Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9dd6f4dba9b1..71b25567e787 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1165,6 +1165,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); + struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; @@ -1174,6 +1175,26 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) long timeo; flags = msg->msg_flags; + + if (flags & MSG_ZEROCOPY && size) { + if (sk->sk_state != TCP_ESTABLISHED) { + err = -EINVAL; + goto out_err; + } + + skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; + uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); + if (!uarg) { + err = -ENOBUFS; + goto out_err; + } + + /* skb may be freed in main loop, keep extra ref on uarg */ + sock_zerocopy_get(uarg); + if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG)) + uarg->zerocopy = 0; + } + if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size); if (err == -EINPROGRESS && copied_syn > 0) @@ -1297,7 +1318,7 @@ new_segment: err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); if (err) goto do_fault; - } else { + } else if (!uarg || !uarg->zerocopy) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1335,6 +1356,13 @@ new_segment: page_ref_inc(pfrag->page); } pfrag->offset += copy; + } else { + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + if (err == -EMSGSIZE || err == -EEXIST) + goto new_segment; + if (err < 0) + goto do_error; + copy = err; } if (!copied) @@ -1381,6 +1409,7 @@ out: tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: + sock_zerocopy_put(uarg); return copied + copied_syn; do_fault: @@ -1397,6 +1426,7 @@ do_error: if (copied + copied_syn) goto out; out_err: + sock_zerocopy_put_abort(uarg); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && -- cgit From 07b65c5b31ce477c3ced6e3541fd2331338be214 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:45 -0400 Subject: test: add msg_zerocopy test Introduce regression test for msg_zerocopy feature. Send traffic from one process to another with and without zerocopy. Evaluate tcp, udp, raw and packet sockets, including variants - udp: corking and corking with mixed copy/zerocopy calls - raw: with and without hdrincl - packet: at both raw and dgram level Test on both ipv4 and ipv6, optionally with ethtool changes to disable scatter-gather, tx checksum or tso offload. All of these can affect zerocopy behavior. The regression test can be run on a single machine if over a veth pair. Then skb_orphan_frags_rx must be modified to be identical to skb_orphan_frags to allow forwarding zerocopy locally. The msg_zerocopy.sh script will setup the veth pair in network namespaces and run all tests. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/msg_zerocopy.c | 697 ++++++++++++++++++++++++++++ tools/testing/selftests/net/msg_zerocopy.sh | 112 +++++ 4 files changed, 811 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/net/msg_zerocopy.c create mode 100755 tools/testing/selftests/net/msg_zerocopy.sh diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index afe109e5508a..9801253e4802 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -1,3 +1,4 @@ +msg_zerocopy socket psock_fanout psock_tpacket diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index f6c9dbf478f8..6135a8448900 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -7,7 +7,7 @@ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh TEST_GEN_FILES = socket TEST_GEN_FILES += psock_fanout psock_tpacket TEST_GEN_FILES += reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa -TEST_GEN_FILES += reuseport_dualstack +TEST_GEN_FILES += reuseport_dualstack msg_zerocopy include ../lib.mk diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c new file mode 100644 index 000000000000..448c69a8af74 --- /dev/null +++ b/tools/testing/selftests/net/msg_zerocopy.c @@ -0,0 +1,697 @@ +/* Evaluate MSG_ZEROCOPY + * + * Send traffic between two processes over one of the supported + * protocols and modes: + * + * PF_INET/PF_INET6 + * - SOCK_STREAM + * - SOCK_DGRAM + * - SOCK_DGRAM with UDP_CORK + * - SOCK_RAW + * - SOCK_RAW with IP_HDRINCL + * + * PF_PACKET + * - SOCK_DGRAM + * - SOCK_RAW + * + * Start this program on two connected hosts, one in send mode and + * the other with option '-r' to put it in receiver mode. + * + * If zerocopy mode ('-z') is enabled, the sender will verify that + * the kernel queues completions on the error queue for all zerocopy + * transfers. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef SO_EE_ORIGIN_ZEROCOPY +#define SO_EE_ORIGIN_ZEROCOPY SO_EE_ORIGIN_UPAGE +#endif + +#ifndef SO_ZEROCOPY +#define SO_ZEROCOPY 59 +#endif + +#ifndef SO_EE_CODE_ZEROCOPY_COPIED +#define SO_EE_CODE_ZEROCOPY_COPIED 1 +#endif + +#ifndef MSG_ZEROCOPY +#define MSG_ZEROCOPY 0x4000000 +#endif + +static int cfg_cork; +static bool cfg_cork_mixed; +static int cfg_cpu = -1; /* default: pin to last cpu */ +static int cfg_family = PF_UNSPEC; +static int cfg_ifindex = 1; +static int cfg_payload_len; +static int cfg_port = 8000; +static bool cfg_rx; +static int cfg_runtime_ms = 4200; +static int cfg_verbose; +static int cfg_waittime_ms = 500; +static bool cfg_zerocopy; + +static socklen_t cfg_alen; +static struct sockaddr_storage cfg_dst_addr; +static struct sockaddr_storage cfg_src_addr; + +static char payload[IP_MAXPACKET]; +static long packets, bytes, completions, expected_completions; +static int zerocopied = -1; +static uint32_t next_completion; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static uint16_t get_ip_csum(const uint16_t *start, int num_words) +{ + unsigned long sum = 0; + int i; + + for (i = 0; i < num_words; i++) + sum += start[i]; + + while (sum >> 16) + sum = (sum & 0xFFFF) + (sum >> 16); + + return ~sum; +} + +static int do_setcpu(int cpu) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask)) + error(1, 0, "setaffinity %d", cpu); + + if (cfg_verbose) + fprintf(stderr, "cpu: %u\n", cpu); + + return 0; +} + +static void do_setsockopt(int fd, int level, int optname, int val) +{ + if (setsockopt(fd, level, optname, &val, sizeof(val))) + error(1, errno, "setsockopt %d.%d: %d", level, optname, val); +} + +static int do_poll(int fd, int events) +{ + struct pollfd pfd; + int ret; + + pfd.events = events; + pfd.revents = 0; + pfd.fd = fd; + + ret = poll(&pfd, 1, cfg_waittime_ms); + if (ret == -1) + error(1, errno, "poll"); + + return ret && (pfd.revents & events); +} + +static int do_accept(int fd) +{ + int fda = fd; + + fd = accept(fda, NULL, NULL); + if (fd == -1) + error(1, errno, "accept"); + if (close(fda)) + error(1, errno, "close listen sock"); + + return fd; +} + +static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy) +{ + int ret, len, i, flags; + + len = 0; + for (i = 0; i < msg->msg_iovlen; i++) + len += msg->msg_iov[i].iov_len; + + flags = MSG_DONTWAIT; + if (do_zerocopy) + flags |= MSG_ZEROCOPY; + + ret = sendmsg(fd, msg, flags); + if (ret == -1 && errno == EAGAIN) + return false; + if (ret == -1) + error(1, errno, "send"); + if (cfg_verbose && ret != len) + fprintf(stderr, "send: ret=%u != %u\n", ret, len); + + if (len) { + packets++; + bytes += ret; + if (do_zerocopy && ret) + expected_completions++; + } + + return true; +} + +static void do_sendmsg_corked(int fd, struct msghdr *msg) +{ + bool do_zerocopy = cfg_zerocopy; + int i, payload_len, extra_len; + + /* split up the packet. for non-multiple, make first buffer longer */ + payload_len = cfg_payload_len / cfg_cork; + extra_len = cfg_payload_len - (cfg_cork * payload_len); + + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1); + + for (i = 0; i < cfg_cork; i++) { + + /* in mixed-frags mode, alternate zerocopy and copy frags + * start with non-zerocopy, to ensure attach later works + */ + if (cfg_cork_mixed) + do_zerocopy = (i & 1); + + msg->msg_iov[0].iov_len = payload_len + extra_len; + extra_len = 0; + + do_sendmsg(fd, msg, do_zerocopy); + } + + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); +} + +static int setup_iph(struct iphdr *iph, uint16_t payload_len) +{ + struct sockaddr_in *daddr = (void *) &cfg_dst_addr; + struct sockaddr_in *saddr = (void *) &cfg_src_addr; + + memset(iph, 0, sizeof(*iph)); + + iph->version = 4; + iph->tos = 0; + iph->ihl = 5; + iph->ttl = 2; + iph->saddr = saddr->sin_addr.s_addr; + iph->daddr = daddr->sin_addr.s_addr; + iph->protocol = IPPROTO_EGP; + iph->tot_len = htons(sizeof(*iph) + payload_len); + iph->check = get_ip_csum((void *) iph, iph->ihl << 1); + + return sizeof(*iph); +} + +static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len) +{ + struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr; + struct sockaddr_in6 *saddr = (void *) &cfg_src_addr; + + memset(ip6h, 0, sizeof(*ip6h)); + + ip6h->version = 6; + ip6h->payload_len = htons(payload_len); + ip6h->nexthdr = IPPROTO_EGP; + ip6h->hop_limit = 2; + ip6h->saddr = saddr->sin6_addr; + ip6h->daddr = daddr->sin6_addr; + + return sizeof(*ip6h); +} + +static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr) +{ + struct sockaddr_in6 *addr6 = (void *) sockaddr; + struct sockaddr_in *addr4 = (void *) sockaddr; + + switch (domain) { + case PF_INET: + addr4->sin_family = AF_INET; + addr4->sin_port = htons(cfg_port); + if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) + error(1, 0, "ipv4 parse error: %s", str_addr); + break; + case PF_INET6: + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) + error(1, 0, "ipv6 parse error: %s", str_addr); + break; + default: + error(1, 0, "illegal domain"); + } +} + +static int do_setup_tx(int domain, int type, int protocol) +{ + int fd; + + fd = socket(domain, type, protocol); + if (fd == -1) + error(1, errno, "socket t"); + + do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21); + if (cfg_zerocopy) + do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1); + + if (domain != PF_PACKET) + if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) + error(1, errno, "connect"); + + return fd; +} + +static bool do_recv_completion(int fd) +{ + struct sock_extended_err *serr; + struct msghdr msg = {}; + struct cmsghdr *cm; + uint32_t hi, lo, range; + int ret, zerocopy; + char control[100]; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (ret == -1 && errno == EAGAIN) + return false; + if (ret == -1) + error(1, errno, "recvmsg notification"); + if (msg.msg_flags & MSG_CTRUNC) + error(1, errno, "recvmsg notification: truncated"); + + cm = CMSG_FIRSTHDR(&msg); + if (!cm) + error(1, 0, "cmsg: no cmsg"); + if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) || + (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) || + (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP))) + error(1, 0, "serr: wrong type: %d.%d", + cm->cmsg_level, cm->cmsg_type); + + serr = (void *) CMSG_DATA(cm); + if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) + error(1, 0, "serr: wrong origin: %u", serr->ee_origin); + if (serr->ee_errno != 0) + error(1, 0, "serr: wrong error code: %u", serr->ee_errno); + + hi = serr->ee_data; + lo = serr->ee_info; + range = hi - lo + 1; + + /* Detect notification gaps. These should not happen often, if at all. + * Gaps can occur due to drops, reordering and retransmissions. + */ + if (lo != next_completion) + fprintf(stderr, "gap: %u..%u does not append to %u\n", + lo, hi, next_completion); + next_completion = hi + 1; + + zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED); + if (zerocopied == -1) + zerocopied = zerocopy; + else if (zerocopied != zerocopy) { + fprintf(stderr, "serr: inconsistent\n"); + zerocopied = zerocopy; + } + + if (cfg_verbose >= 2) + fprintf(stderr, "completed: %u (h=%u l=%u)\n", + range, hi, lo); + + completions += range; + return true; +} + +/* Read all outstanding messages on the errqueue */ +static void do_recv_completions(int fd) +{ + while (do_recv_completion(fd)) {} +} + +/* Wait for all remaining completions on the errqueue */ +static void do_recv_remaining_completions(int fd) +{ + int64_t tstop = gettimeofday_ms() + cfg_waittime_ms; + + while (completions < expected_completions && + gettimeofday_ms() < tstop) { + if (do_poll(fd, POLLERR)) + do_recv_completions(fd); + } + + if (completions < expected_completions) + error(1, 0, "missing notifications: %lu < %lu\n", + completions, expected_completions); +} + +static void do_tx(int domain, int type, int protocol) +{ + struct iovec iov[3] = { {0} }; + struct sockaddr_ll laddr; + struct msghdr msg = {0}; + struct ethhdr eth; + union { + struct ipv6hdr ip6h; + struct iphdr iph; + } nh; + uint64_t tstop; + int fd; + + fd = do_setup_tx(domain, type, protocol); + + if (domain == PF_PACKET) { + uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6; + + /* sock_raw passes ll header as data */ + if (type == SOCK_RAW) { + memset(eth.h_dest, 0x06, ETH_ALEN); + memset(eth.h_source, 0x02, ETH_ALEN); + eth.h_proto = htons(proto); + iov[0].iov_base = ð + iov[0].iov_len = sizeof(eth); + msg.msg_iovlen++; + } + + /* both sock_raw and sock_dgram expect name */ + memset(&laddr, 0, sizeof(laddr)); + laddr.sll_family = AF_PACKET; + laddr.sll_ifindex = cfg_ifindex; + laddr.sll_protocol = htons(proto); + laddr.sll_halen = ETH_ALEN; + + memset(laddr.sll_addr, 0x06, ETH_ALEN); + + msg.msg_name = &laddr; + msg.msg_namelen = sizeof(laddr); + } + + /* packet and raw sockets with hdrincl must pass network header */ + if (domain == PF_PACKET || protocol == IPPROTO_RAW) { + if (cfg_family == PF_INET) + iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len); + else + iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len); + + iov[1].iov_base = (void *) &nh; + msg.msg_iovlen++; + } + + iov[2].iov_base = payload; + iov[2].iov_len = cfg_payload_len; + msg.msg_iovlen++; + msg.msg_iov = &iov[3 - msg.msg_iovlen]; + + tstop = gettimeofday_ms() + cfg_runtime_ms; + do { + if (cfg_cork) + do_sendmsg_corked(fd, &msg); + else + do_sendmsg(fd, &msg, cfg_zerocopy); + + while (!do_poll(fd, POLLOUT)) { + if (cfg_zerocopy) + do_recv_completions(fd); + } + + } while (gettimeofday_ms() < tstop); + + if (cfg_zerocopy) + do_recv_remaining_completions(fd); + + if (close(fd)) + error(1, errno, "close"); + + fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n", + packets, bytes >> 20, completions, + zerocopied == 1 ? 'y' : 'n'); +} + +static int do_setup_rx(int domain, int type, int protocol) +{ + int fd; + + /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW, + * to recv the only copy of the packet, not a clone + */ + if (domain == PF_PACKET) + error(1, 0, "Use PF_INET/SOCK_RAW to read"); + + if (type == SOCK_RAW && protocol == IPPROTO_RAW) + error(1, 0, "IPPROTO_RAW: not supported on Rx"); + + fd = socket(domain, type, protocol); + if (fd == -1) + error(1, errno, "socket r"); + + do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21); + do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16); + do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1); + + if (bind(fd, (void *) &cfg_dst_addr, cfg_alen)) + error(1, errno, "bind"); + + if (type == SOCK_STREAM) { + if (listen(fd, 1)) + error(1, errno, "listen"); + fd = do_accept(fd); + } + + return fd; +} + +/* Flush all outstanding bytes for the tcp receive queue */ +static void do_flush_tcp(int fd) +{ + int ret; + + /* MSG_TRUNC flushes up to len bytes */ + ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT); + if (ret == -1 && errno == EAGAIN) + return; + if (ret == -1) + error(1, errno, "flush"); + if (!ret) + return; + + packets++; + bytes += ret; +} + +/* Flush all outstanding datagrams. Verify first few bytes of each. */ +static void do_flush_datagram(int fd, int type) +{ + int ret, off = 0; + char buf[64]; + + /* MSG_TRUNC will return full datagram length */ + ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC); + if (ret == -1 && errno == EAGAIN) + return; + + /* raw ipv4 return with header, raw ipv6 without */ + if (cfg_family == PF_INET && type == SOCK_RAW) { + off += sizeof(struct iphdr); + ret -= sizeof(struct iphdr); + } + + if (ret == -1) + error(1, errno, "recv"); + if (ret != cfg_payload_len) + error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len); + if (ret > sizeof(buf) - off) + ret = sizeof(buf) - off; + if (memcmp(buf + off, payload, ret)) + error(1, 0, "recv: data mismatch"); + + packets++; + bytes += cfg_payload_len; +} + +static void do_rx(int domain, int type, int protocol) +{ + uint64_t tstop; + int fd; + + fd = do_setup_rx(domain, type, protocol); + + tstop = gettimeofday_ms() + cfg_runtime_ms; + do { + if (type == SOCK_STREAM) + do_flush_tcp(fd); + else + do_flush_datagram(fd, type); + + do_poll(fd, POLLIN); + + } while (gettimeofday_ms() < tstop); + + if (close(fd)) + error(1, errno, "close"); + + fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20); +} + +static void do_test(int domain, int type, int protocol) +{ + int i; + + if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM)) + error(1, 0, "can only cork udp sockets"); + + do_setcpu(cfg_cpu); + + for (i = 0; i < IP_MAXPACKET; i++) + payload[i] = 'a' + (i % 26); + + if (cfg_rx) + do_rx(domain, type, protocol); + else + do_tx(domain, type, protocol); +} + +static void usage(const char *filepath) +{ + error(1, 0, "Usage: %s [options] ", filepath); +} + +static void parse_opts(int argc, char **argv) +{ + const int max_payload_len = sizeof(payload) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) - + 40 /* max tcp options */; + int c; + + cfg_payload_len = max_payload_len; + + while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) { + switch (c) { + case '4': + if (cfg_family != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + cfg_family = PF_INET; + cfg_alen = sizeof(struct sockaddr_in); + break; + case '6': + if (cfg_family != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + cfg_family = PF_INET6; + cfg_alen = sizeof(struct sockaddr_in6); + break; + case 'c': + cfg_cork = strtol(optarg, NULL, 0); + break; + case 'C': + cfg_cpu = strtol(optarg, NULL, 0); + break; + case 'D': + setup_sockaddr(cfg_family, optarg, &cfg_dst_addr); + break; + case 'i': + cfg_ifindex = if_nametoindex(optarg); + if (cfg_ifindex == 0) + error(1, errno, "invalid iface: %s", optarg); + break; + case 'm': + cfg_cork_mixed = true; + break; + case 'p': + cfg_port = htons(strtoul(optarg, NULL, 0)); + break; + case 'r': + cfg_rx = true; + break; + case 's': + cfg_payload_len = strtoul(optarg, NULL, 0); + break; + case 'S': + setup_sockaddr(cfg_family, optarg, &cfg_src_addr); + break; + case 't': + cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000; + break; + case 'v': + cfg_verbose++; + break; + case 'z': + cfg_zerocopy = true; + break; + } + } + + if (cfg_payload_len > max_payload_len) + error(1, 0, "-s: payload exceeds max (%d)", max_payload_len); + if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork)) + error(1, 0, "-m: cork_mixed requires corking and zerocopy"); + + if (optind != argc - 1) + usage(argv[0]); +} + +int main(int argc, char **argv) +{ + const char *cfg_test; + + parse_opts(argc, argv); + + cfg_test = argv[argc - 1]; + + if (!strcmp(cfg_test, "packet")) + do_test(PF_PACKET, SOCK_RAW, 0); + else if (!strcmp(cfg_test, "packet_dgram")) + do_test(PF_PACKET, SOCK_DGRAM, 0); + else if (!strcmp(cfg_test, "raw")) + do_test(cfg_family, SOCK_RAW, IPPROTO_EGP); + else if (!strcmp(cfg_test, "raw_hdrincl")) + do_test(cfg_family, SOCK_RAW, IPPROTO_RAW); + else if (!strcmp(cfg_test, "tcp")) + do_test(cfg_family, SOCK_STREAM, 0); + else if (!strcmp(cfg_test, "udp")) + do_test(cfg_family, SOCK_DGRAM, 0); + else + error(1, 0, "unknown cfg_test %s", cfg_test); + + return 0; +} diff --git a/tools/testing/selftests/net/msg_zerocopy.sh b/tools/testing/selftests/net/msg_zerocopy.sh new file mode 100755 index 000000000000..d571d213418d --- /dev/null +++ b/tools/testing/selftests/net/msg_zerocopy.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# +# Send data between two processes across namespaces +# Run twice: once without and once with zerocopy + +set -e + +readonly DEV="veth0" +readonly DEV_MTU=65535 +readonly BIN="./msg_zerocopy" + +readonly RAND="$(mktemp -u XXXXXX)" +readonly NSPREFIX="ns-${RAND}" +readonly NS1="${NSPREFIX}1" +readonly NS2="${NSPREFIX}2" + +readonly SADDR4='192.168.1.1' +readonly DADDR4='192.168.1.2' +readonly SADDR6='fd::1' +readonly DADDR6='fd::2' + +readonly path_sysctl_mem="net.core.optmem_max" + +# Argument parsing +if [[ "$#" -lt "2" ]]; then + echo "Usage: $0 [4|6] [tcp|udp|raw|raw_hdrincl|packet|packet_dgram] " + exit 1 +fi + +readonly IP="$1" +shift +readonly TXMODE="$1" +shift +readonly EXTRA_ARGS="$@" + +# Argument parsing: configure addresses +if [[ "${IP}" == "4" ]]; then + readonly SADDR="${SADDR4}" + readonly DADDR="${DADDR4}" +elif [[ "${IP}" == "6" ]]; then + readonly SADDR="${SADDR6}" + readonly DADDR="${DADDR6}" +else + echo "Invalid IP version ${IP}" + exit 1 +fi + +# Argument parsing: select receive mode +# +# This differs from send mode for +# - packet: use raw recv, because packet receives skb clones +# - raw_hdrinc: use raw recv, because hdrincl is a tx-only option +case "${TXMODE}" in +'packet' | 'packet_dgram' | 'raw_hdrincl') + RXMODE='raw' + ;; +*) + RXMODE="${TXMODE}" + ;; +esac + +# Start of state changes: install cleanup handler +save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})" + +cleanup() { + ip netns del "${NS2}" + ip netns del "${NS1}" + sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}" +} + +trap cleanup EXIT + +# Configure system settings +sysctl -w -q "${path_sysctl_mem}=1000000" + +# Create virtual ethernet pair between network namespaces +ip netns add "${NS1}" +ip netns add "${NS2}" + +ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \ + peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}" + +# Bring the devices up +ip -netns "${NS1}" link set "${DEV}" up +ip -netns "${NS2}" link set "${DEV}" up + +# Set fixed MAC addresses on the devices +ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 +ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06 + +# Add fixed IP addresses to the devices +ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" +ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" +ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad +ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad + +# Optionally disable sg or csum offload to test edge cases +# ip netns exec "${NS1}" ethtool -K "${DEV}" sg off + +do_test() { + local readonly ARGS="$1" + + echo "ipv${IP} ${TXMODE} ${ARGS}" + ip netns exec "${NS2}" "${BIN}" "-${IP}" -i "${DEV}" -t 2 -C 2 -S "${SADDR}" -D "${DADDR}" ${ARGS} -r "${RXMODE}" & + sleep 0.2 + ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 -S "${SADDR}" -D "${DADDR}" ${ARGS} "${TXMODE}" + wait +} + +do_test "${EXTRA_ARGS}" +do_test "-z ${EXTRA_ARGS}" +echo ok -- cgit