Diffstat (limited to 'net/core')
34 files changed, 4377 insertions, 1769 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index f6761b6e3b29..79f9479e9658 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -28,3 +28,4 @@ obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o +obj-$(CONFIG_GRO_CELLS) += gro_cells.o diff --git a/net/core/datagram.c b/net/core/datagram.c index ea633342ab0d..ee5647bd91b3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -68,7 +68,7 @@ static inline int connection_based(struct sock *sk) return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; } -static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync, +static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { unsigned long bits = (unsigned long)key; @@ -161,10 +161,49 @@ done: return skb; } +struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb), + int *peeked, int *off, int *err, + struct sk_buff **last) +{ + struct sk_buff *skb; + int _off = *off; + + *last = queue->prev; + skb_queue_walk(queue, skb) { + if (flags & MSG_PEEK) { + if (_off >= skb->len && (skb->len || _off || + skb->peeked)) { + _off -= skb->len; + continue; + } + if (!skb->len) { + skb = skb_set_peeked(skb); + if (unlikely(IS_ERR(skb))) { + *err = PTR_ERR(skb); + return NULL; + } + } + *peeked = 1; + refcount_inc(&skb->users); + } else { + __skb_unlink(skb, queue); + if (destructor) + destructor(sk, skb); + } + *off = _off; + return skb; + } + return NULL; +} + /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket - * @flags: MSG_ flags + * @flags: MSG\_ flags * @destructor: invoked under the receive lock on successful dequeue * @peeked: returns non-zero if this packet has been seen before * @off: an offset in bytes to peek skb from. Returns an offset @@ -181,7 +220,7 @@ done: * * This function will lock the socket if a skb is returned, so * the caller needs to unlock the socket in that case (usually by - * calling skb_free_datagram). Returns NULL with *err set to + * calling skb_free_datagram). Returns NULL with @err set to * -EAGAIN if no data was available or to some other value if an * error was detected. * @@ -222,42 +261,20 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, * Look at current nfs client by the way... * However, this function was correct in any case. 
8) */ - int _off = *off; - - *last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); - skb_queue_walk(queue, skb) { - *last = skb; - if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { - _off -= skb->len; - continue; - } - if (!skb->len) { - skb = skb_set_peeked(skb); - if (IS_ERR(skb)) { - error = PTR_ERR(skb); - spin_unlock_irqrestore(&queue->lock, - cpu_flags); - goto no_packet; - } - } - *peeked = 1; - atomic_inc(&skb->users); - } else { - __skb_unlink(skb, queue); - if (destructor) - destructor(sk, skb); - } - spin_unlock_irqrestore(&queue->lock, cpu_flags); - *off = _off; + skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, + peeked, off, &error, last); + spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (error) + goto no_packet; + if (skb) return skb; - } - spin_unlock_irqrestore(&queue->lock, cpu_flags); - } while (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)); + if (!sk_can_busy_loop(sk)) + break; + + sk_busy_loop(sk, flags & MSG_DONTWAIT); + } while (!skb_queue_empty(&sk->sk_receive_queue)); error = -EAGAIN; @@ -313,9 +330,7 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) { bool slow; - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) { + if (!skb_unref(skb)) { sk_peek_offset_bwd(sk, len); return; } @@ -331,8 +346,8 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) } EXPORT_SYMBOL(__skb_free_datagram_locked); -int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, - unsigned int flags, +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, + struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)) { @@ -340,15 +355,15 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, if (flags & MSG_PEEK) { err = -ENOENT; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - atomic_dec(&skb->users); + spin_lock_bh(&sk_queue->lock); + if (skb == skb_peek(sk_queue)) { + __skb_unlink(skb, sk_queue); + refcount_dec(&skb->users); if (destructor) destructor(sk, skb); err = 0; } - spin_unlock_bh(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk_queue->lock); } atomic_inc(&sk->sk_drops); @@ -360,7 +375,7 @@ EXPORT_SYMBOL(__sk_queue_drop_skb); * skb_kill_datagram - Free a datagram skbuff forcibly * @sk: socket * @skb: datagram skbuff - * @flags: MSG_ flags + * @flags: MSG\_ flags * * This function frees a datagram skbuff that was received by * skb_recv_datagram. 
The flags argument must match the one @@ -379,7 +394,8 @@ EXPORT_SYMBOL(__sk_queue_drop_skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { - int err = __sk_queue_drop_skb(sk, skb, flags, NULL); + int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, + NULL); kfree_skb(skb); sk_mem_reclaim_partial(sk); @@ -398,7 +414,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, struct iov_iter *to, int len) { int start = skb_headlen(skb); - int i, copy = start - offset; + int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; trace_skb_copy_datagram_iovec(skb, len); @@ -407,11 +423,12 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (copy_to_iter(skb->data + offset, copy, to) != copy) + n = copy_to_iter(skb->data + offset, copy, to); + offset += n; + if (n != copy) goto short_copy; if ((len -= copy) == 0) return 0; - offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? */ @@ -425,13 +442,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (copy_page_to_iter(skb_frag_page(frag), + n = copy_page_to_iter(skb_frag_page(frag), frag->page_offset + offset - - start, copy, to) != copy) + start, copy, to); + offset += n; + if (n != copy) goto short_copy; if (!(len -= copy)) return 0; - offset += copy; } start = end; } @@ -463,6 +481,7 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, */ fault: + iov_iter_revert(to, offset - start_off); return -EFAULT; short_copy: @@ -595,7 +614,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) skb->data_len += copied; skb->len += copied; skb->truesize += truesize; - atomic_add(truesize, &skb->sk->sk_wmem_alloc); + refcount_add(truesize, &skb->sk->sk_wmem_alloc); while (copied) { int size = min_t(int, copied, PAGE_SIZE - start); skb_fill_page_desc(skb, frag++, pages[n], start, size); @@ -613,7 +632,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, __wsum *csump) { int start = skb_headlen(skb); - int i, copy = start - offset; + int i, copy = start - offset, start_off = offset; struct sk_buff *frag_iter; int pos = 0; int n; @@ -623,11 +642,11 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, if (copy > len) copy = len; n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); + offset += n; if (n != copy) goto fault; if ((len -= copy) == 0) return 0; - offset += copy; pos = copy; } @@ -649,12 +668,12 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, offset - start, copy, &csum2, to); kunmap(page); + offset += n; if (n != copy) goto fault; *csump = csum_block_add(*csump, csum2, pos); if (!(len -= copy)) return 0; - offset += copy; pos += copy; } start = end; @@ -687,6 +706,7 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, return 0; fault: + iov_iter_revert(to, offset - start_off); return -EFAULT; } @@ -756,7 +776,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) - goto csum_error; + return -EINVAL; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { @@ -764,14 +784,16 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; - if (csum_fold(csum)) - 
goto csum_error; + + if (csum_fold(csum)) { + iov_iter_revert(&msg->msg_iter, chunk); + return -EINVAL; + } + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) netdev_rx_csum_fault(skb->dev); } return 0; -csum_error: - return -EINVAL; fault: return -EFAULT; } @@ -787,7 +809,7 @@ EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); * sequenced packet sockets providing the socket receive queue * is only ever holding data ready to receive. * - * Note: when you _don't_ use this routine for this protocol, + * Note: when you *don't* use this routine for this protocol, * and you use a different write policy from sock_writeable() * then please supply your own write_space callback. */ diff --git a/net/core/dev.c b/net/core/dev.c index 29101c98399f..8515f8fe0460 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1,5 +1,5 @@ /* - * NET3 Protocol independent device support routines. + * NET3 Protocol independent device support routines. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -7,7 +7,7 @@ * 2 of the License, or (at your option) any later version. * * Derived from the non IP parts of dev.c 1.0.19 - * Authors: Ross Biro + * Authors: Ross Biro * Fred N. van Kempen, <[email protected]> * Mark Evans, <[email protected]> * @@ -21,9 +21,9 @@ * * Changes: * D.J. Barrow : Fixed bug where dev->refcnt gets set - * to 2 if register_netdev gets called - * before net_dev_init & also removed a - * few lines of code in the process. + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. * Alan Cox : device private ioctl copies fields back. * Alan Cox : Transmit queue code does relevant * stunts to keep the queue safe. @@ -36,7 +36,7 @@ * Alan Cox : 100 backlog just doesn't cut it when * you start doing multicast video 8) * Alan Cox : Rewrote net_bh and list manager. - * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Fix ETH_P_ALL echoback lengths. * Alan Cox : Took out transmit every packet pass * Saved a few bytes in the ioctl handler * Alan Cox : Network driver sets packet type before @@ -46,7 +46,7 @@ * Richard Kooijman: Timestamp fixes. * Alan Cox : Wrong field in SIOCGIFDSTADDR * Alan Cox : Device lock protection. - * Alan Cox : Fixed nasty side effect of device close + * Alan Cox : Fixed nasty side effect of device close * changes. 
* Rudi Cilibrasi : Pass the right thing to * set_mac_address() @@ -67,8 +67,8 @@ * Paul Rusty Russell : SIOCSIFNAME * Pekka Riikonen : Netdev boot-time settings code * Andrew Morton : Make unregister_netdevice wait - * indefinitely on dev->refcnt - * J Hadi Salim : - Backlog queue sampling + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling * - netif_rx() feedback */ @@ -81,6 +81,7 @@ #include <linux/hash.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/mutex.h> #include <linux/string.h> #include <linux/mm.h> @@ -95,6 +96,7 @@ #include <linux/notifier.h> #include <linux/skbuff.h> #include <linux/bpf.h> +#include <linux/bpf_trace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/busy_poll.h> @@ -103,6 +105,7 @@ #include <net/dst.h> #include <net/dst_metadata.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/highmem.h> @@ -140,6 +143,7 @@ #include <linux/hrtimer.h> #include <linux/netfilter_ingress.h> #include <linux/crash_dump.h> +#include <linux/sctp.h> #include "net-sysfs.h" @@ -159,6 +163,7 @@ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, struct netdev_notifier_info *info); +static struct napi_struct *napi_by_id(unsigned int napi_id); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -192,7 +197,8 @@ static seqcount_t devnet_rename_seq; static inline void dev_base_seq_inc(struct net *net) { - while (++net->dev_base_seq == 0); + while (++net->dev_base_seq == 0) + ; } static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) @@ -274,8 +280,8 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * register_netdevice() inits txq->_xmit_lock and sets lockdep class * according to dev->type */ -static const unsigned short netdev_lock_type[] = - {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, @@ -291,22 +297,22 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; -static const char *const netdev_lock_name[] = - {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", - "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", - "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", - "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", - "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", - "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", - "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", - "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", - "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", - "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", - "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", - "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", - "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", - "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", - "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", 
"_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; @@ -352,10 +358,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) #endif /******************************************************************************* + * + * Protocol management and registration routines + * + *******************************************************************************/ - Protocol management and registration routines - -*******************************************************************************/ /* * Add a protocol ID to the list. Now that the input handler is @@ -538,10 +545,10 @@ void dev_remove_offload(struct packet_offload *po) EXPORT_SYMBOL(dev_remove_offload); /****************************************************************************** - - Device Boot-time Settings Routines - -*******************************************************************************/ + * + * Device Boot-time Settings Routines + * + ******************************************************************************/ /* Boot time configuration table */ static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; @@ -574,13 +581,13 @@ static int netdev_boot_setup_add(char *name, struct ifmap *map) } /** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice + * netdev_boot_setup_check - check boot time settings + * @dev: the netdevice * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. + * Check boot time settings for the device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found, 1 if they are. 
*/ int netdev_boot_setup_check(struct net_device *dev) { @@ -590,10 +597,10 @@ int netdev_boot_setup_check(struct net_device *dev) for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && !strcmp(dev->name, s[i].name)) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; + dev->irq = s[i].map.irq; + dev->base_addr = s[i].map.base_addr; + dev->mem_start = s[i].map.mem_start; + dev->mem_end = s[i].map.mem_end; return 1; } } @@ -603,14 +610,14 @@ EXPORT_SYMBOL(netdev_boot_setup_check); /** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device + * netdev_boot_base - get address from boot time settings + * @prefix: prefix for network device + * @unit: id for network device * - * Check boot time settings for the base address of device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. + * Check boot time settings for the base address of device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found. */ unsigned long netdev_boot_base(const char *prefix, int unit) { @@ -663,10 +670,10 @@ int __init netdev_boot_setup(char *str) __setup("netdev=", netdev_boot_setup); /******************************************************************************* - - Device Interface Subroutines - -*******************************************************************************/ + * + * Device Interface Subroutines + * + *******************************************************************************/ /** * dev_get_iflink - get 'iflink' value of a interface @@ -737,15 +744,15 @@ struct net_device *__dev_get_by_name(struct net *net, const char *name) EXPORT_SYMBOL(__dev_get_by_name); /** - * dev_get_by_name_rcu - find a device by its name - * @net: the applicable net namespace - * @name: name to find + * dev_get_by_name_rcu - find a device by its name + * @net: the applicable net namespace + * @name: name to find * - * Find an interface by name. - * If the name is found a pointer to the device is returned. - * If the name is not found then %NULL is returned. - * The reference counters are not incremented so the caller must be - * careful with locks. The caller must hold RCU lock. + * Find an interface by name. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. */ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) @@ -861,6 +868,31 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) EXPORT_SYMBOL(dev_get_by_index); /** + * dev_get_by_napi_id - find a device by napi_id + * @napi_id: ID of the NAPI struct + * + * Search for an interface by NAPI ID. Returns %NULL if the device + * is not found or a pointer to the device. The device has not had + * its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_napi_id(unsigned int napi_id) +{ + struct napi_struct *napi; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (napi_id < MIN_NAPI_ID) + return NULL; + + napi = napi_by_id(napi_id); + + return napi ? 
napi->dev : NULL; +} +EXPORT_SYMBOL(dev_get_by_napi_id); + +/** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. @@ -1249,8 +1281,9 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) if (!new_ifalias) return -ENOMEM; dev->ifalias = new_ifalias; + memcpy(dev->ifalias, alias, len); + dev->ifalias[len] = 0; - strlcpy(dev->ifalias, alias, len+1); return len; } @@ -1289,8 +1322,8 @@ void netdev_state_change(struct net_device *dev) EXPORT_SYMBOL(netdev_state_change); /** - * netdev_notify_peers - notify network peers about existence of @dev - * @dev: network device + * netdev_notify_peers - notify network peers about existence of @dev + * @dev: network device * * Generate traffic such that interested network peers are aware of * @dev, such as by generating a gratuitous ARP. This may be used when @@ -1302,6 +1335,7 @@ void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); @@ -1518,17 +1552,17 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, static int dev_boot_phase = 1; /** - * register_netdevice_notifier - register a network notifier block - * @nb: notifier + * register_netdevice_notifier - register a network notifier block + * @nb: notifier * - * Register a notifier to be called when network device events occur. - * The notifier passed is linked into the kernel structures and must - * not be reused until it has been unregistered. A negative errno code - * is returned on a failure. + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. * - * When registered all registration and up events are replayed - * to the new notifier to allow device to have a race free - * view of the network device list. + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. */ int register_netdevice_notifier(struct notifier_block *nb) @@ -1585,17 +1619,17 @@ outroll: EXPORT_SYMBOL(register_netdevice_notifier); /** - * unregister_netdevice_notifier - unregister a network notifier block - * @nb: notifier + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier * - * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the - * kernel structures and may then be reused. A negative errno code - * is returned on a failure. + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. * - * After unregistering unregister and down device events are synthesized - * for all devices on the device list to the removed notifier to remove - * the need for special case cleanup code. + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. 
*/ int unregister_netdevice_notifier(struct notifier_block *nb) @@ -1696,27 +1730,54 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL static atomic_t netstamp_needed_deferred; +static atomic_t netstamp_wanted; static void netstamp_clear(struct work_struct *work) { int deferred = atomic_xchg(&netstamp_needed_deferred, 0); + int wanted; - while (deferred--) - static_key_slow_dec(&netstamp_needed); + wanted = atomic_add_return(deferred, &netstamp_wanted); + if (wanted > 0) + static_key_enable(&netstamp_needed); + else + static_key_disable(&netstamp_needed); } static DECLARE_WORK(netstamp_work, netstamp_clear); #endif void net_enable_timestamp(void) { +#ifdef HAVE_JUMP_LABEL + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 0) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) + return; + } + atomic_inc(&netstamp_needed_deferred); + schedule_work(&netstamp_work); +#else static_key_slow_inc(&netstamp_needed); +#endif } EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef HAVE_JUMP_LABEL - /* net_disable_timestamp() can be called from non process context */ - atomic_inc(&netstamp_needed_deferred); + int wanted; + + while (1) { + wanted = atomic_read(&netstamp_wanted); + if (wanted <= 1) + break; + if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) + return; + } + atomic_dec(&netstamp_needed_deferred); schedule_work(&netstamp_work); #else static_key_slow_dec(&netstamp_needed); @@ -1801,7 +1862,7 @@ static inline int deliver_skb(struct sk_buff *skb, { if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) return -ENOMEM; - atomic_inc(&skb->users); + refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } @@ -2403,28 +2464,6 @@ void netif_schedule_queue(struct netdev_queue *txq) } EXPORT_SYMBOL(netif_schedule_queue); -/** - * netif_wake_subqueue - allow sending packets on subqueue - * @dev: network device - * @queue_index: sub queue index - * - * Resume individual transmit queue of a device with multiple transmit queues. 
- */ -void netif_wake_subqueue(struct net_device *dev, u16 queue_index) -{ - struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); - - if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { - struct Qdisc *q; - - rcu_read_lock(); - q = rcu_dereference(txq->qdisc); - __netif_schedule(q); - rcu_read_unlock(); - } -} -EXPORT_SYMBOL(netif_wake_subqueue); - void netif_tx_wake_queue(struct netdev_queue *dev_queue) { if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { @@ -2442,10 +2481,13 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; - if (likely(atomic_read(&skb->users) == 1)) { + if (unlikely(!skb)) + return; + + if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); - atomic_set(&skb->users, 0); - } else if (likely(!atomic_dec_and_test(&skb->users))) { + refcount_set(&skb->users, 0); + } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; @@ -2518,6 +2560,7 @@ u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, if (dev->num_tc) { u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; qcount = dev->tc_to_txq[tc].count; } @@ -2597,6 +2640,47 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +int skb_crc32c_csum_help(struct sk_buff *skb) +{ + __le32 crc32c_csum; + int ret = 0, offset, start; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + goto out; + + if (unlikely(skb_is_gso(skb))) + goto out; + + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. + */ + if (unlikely(skb_has_shared_frag(skb))) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + start = skb_checksum_start_offset(skb); + offset = start + offsetof(struct sctphdr, checksum); + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + ret = -EINVAL; + goto out; + } + if (skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(__le32))) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, + skb->len - start, ~(__u32)0, + crc32c_csum_stub)); + *(__le32 *)(skb->data + offset) = crc32c_csum; + skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; +out: + return ret; +} + __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; @@ -2654,9 +2738,10 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL; - else - return skb->ip_summed == CHECKSUM_NONE; + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_NONE; + + return skb->ip_summed == CHECKSUM_NONE; } /** @@ -2675,11 +2760,12 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) { + struct sk_buff *segs; + if (unlikely(skb_needs_check(skb, tx_path))) { int err; - skb_warn_bad_offload(skb); - + /* We're going to init ->check field in TCP or UDP header */ err = skb_cow_head(skb, 0); if (err < 0) return ERR_PTR(err); @@ -2707,7 +2793,12 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_reset_mac_len(skb); - return skb_mac_gso_segment(skb, features); + segs = skb_mac_gso_segment(skb, features); + + if (unlikely(skb_needs_check(skb, tx_path))) + skb_warn_bad_offload(skb); + + return segs; } 
EXPORT_SYMBOL(__skb_gso_segment); @@ -2732,9 +2823,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; + if (!(dev->features & NETIF_F_HIGHDMA)) { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (PageHighMem(skb_frag_page(frag))) return 1; } @@ -2748,6 +2841,7 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; dma_addr_t addr = page_to_phys(skb_frag_page(frag)); + if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) return 1; } @@ -2929,6 +3023,17 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, return skb; } +int skb_csum_hwoffload_help(struct sk_buff *skb, + const netdev_features_t features) +{ + if (unlikely(skb->csum_not_inet)) + return !!(features & NETIF_F_SCTP_CRC) ? 0 : + skb_crc32c_csum_help(skb); + + return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); +} +EXPORT_SYMBOL(skb_csum_hwoffload_help); + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2953,6 +3058,9 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device __skb_linearize(skb)) goto out_kfree_skb; + if (validate_xmit_xfrm(skb, features)) + goto out_kfree_skb; + /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. @@ -2964,8 +3072,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_CSUM_MASK) && - skb_checksum_help(skb)) + if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } @@ -3148,12 +3255,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) if (!cl) return skb; - /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set - * earlier by the caller. - */ + /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. 
*/ qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3165,6 +3270,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *ret = NET_XMIT_SUCCESS; consume_skb(skb); return NULL; @@ -3225,6 +3331,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) if (queue_index < 0 || skb->ooo_okay || queue_index >= dev->real_num_tx_queues) { int new_index = get_xps_queue(dev, skb); + if (new_index < 0) new_index = skb_tx_hash(dev, skb); @@ -3254,6 +3361,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) queue_index = ops->ndo_select_queue(dev, skb, accel_priv, __netdev_pick_tx); @@ -3315,7 +3423,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); + skb->tc_at_ingress = 0; # ifdef CONFIG_NET_EGRESS if (static_key_false(&egress_needed)) { skb = sch_handle_egress(skb, &rc, dev); @@ -3342,16 +3450,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) } /* The device has no queue. Common case for software devices: - loopback, all the sorts of tunnels... + * loopback, all the sorts of tunnels... - Really, it is unlikely that netif_tx_lock protection is necessary - here. (f.e. loopback and IP tunnels are clean ignoring statistics - counters.) - However, it is possible, that they rely on protection - made by us here. + * Really, it is unlikely that netif_tx_lock protection is necessary + * here. (f.e. loopback and IP tunnels are clean ignoring statistics + * counters.) + * However, it is possible, that they rely on protection + * made by us here. - Check this and shot the lock. It is not prone from deadlocks. - Either shot noqueue qdisc, it is even simpler 8) + * Check this and shot the lock. It is not prone from deadlocks. 
+ *Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ @@ -3413,16 +3521,21 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) EXPORT_SYMBOL(dev_queue_xmit_accel); -/*======================================================================= - Receiver routines - =======================================================================*/ +/************************************************************************* + * Receiver routines + *************************************************************************/ int netdev_max_backlog __read_mostly = 1000; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; -int weight_p __read_mostly = 64; /* old backlog weight */ +unsigned int __read_mostly netdev_budget_usecs = 2000; +int weight_p __read_mostly = 64; /* old backlog weight */ +int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ +int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ +int dev_rx_weight __read_mostly = 64; +int dev_tx_weight __read_mostly = 64; /* Called with irq disabled */ static inline void ____napi_schedule(struct softnet_data *sd, @@ -3779,6 +3892,7 @@ static int netif_rx_internal(struct sk_buff *skb) #endif { unsigned int qtail; + ret = enqueue_to_backlog(skb, get_cpu(), &qtail); put_cpu(); } @@ -3838,9 +3952,10 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) while (clist) { struct sk_buff *skb = clist; + clist = clist->next; - WARN_ON(atomic_read(&skb->users)); + WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) trace_consume_skb(skb); else @@ -3911,10 +4026,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, } qdisc_skb_cb(skb)->pkt_len = skb->len; - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + skb->tc_at_ingress = 1; qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3925,6 +4040,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: consume_skb(skb); return NULL; case TC_ACT_REDIRECT: @@ -3976,9 +4092,7 @@ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { - ASSERT_RTNL(); - - if (dev->rx_handler) + if (netdev_is_rx_handler_busy(dev)) return -EBUSY; /* Note: rx_handler_data must be set before rx_handler */ @@ -4084,12 +4198,8 @@ another_round: goto out; } -#ifdef CONFIG_NET_CLS_ACT - if (skb->tc_verd & TC_NCLS) { - skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); - goto ncls; - } -#endif + if (skb_skip_tc_classify(skb)) + goto skip_classify; if (pfmemalloc) goto skip_taps; @@ -4117,10 +4227,8 @@ skip_taps: goto out; } #endif -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; -ncls: -#endif + skb_reset_tc(skb); +skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; @@ -4210,7 +4318,7 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should @@ -4221,15 +4329,133 @@ static int __netif_receive_skb(struct sk_buff *skb) * Use PF_MEMALLOC as 
this saves us from propagating the allocation * context down to all allocation sites. */ - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_core(skb, true); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_core(skb, false); return ret; } +static struct static_key generic_xdp_needed __read_mostly; + +static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +{ + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); + struct bpf_prog *new = xdp->prog; + int ret = 0; + + switch (xdp->command) { + case XDP_SETUP_PROG: + rcu_assign_pointer(dev->xdp_prog, new); + if (old) + bpf_prog_put(old); + + if (old && !new) { + static_key_slow_dec(&generic_xdp_needed); + } else if (new && !old) { + static_key_slow_inc(&generic_xdp_needed); + dev_disable_lro(dev); + } + break; + + case XDP_QUERY_PROG: + xdp->prog_attached = !!old; + xdp->prog_id = old ? old->aux->id : 0; + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. 
+ */ +static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } +} + static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -4241,6 +4467,21 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); + if (static_key_false(&generic_xdp_needed)) { + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + + if (act != XDP_PASS) { + rcu_read_unlock(); + if (act == XDP_TX) + generic_xdp_tx(skb, xdp_prog); + return NET_RX_DROP; + } + } + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4473,10 +4714,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff enum gro_result ret; int grow; - if (!(skb->dev->features & NETIF_F_GRO)) - goto normal; - - if (skb->csum_bad) + if (netif_elide_gro(skb->dev)) goto normal; gro_list_prepare(napi, skb); @@ -4521,6 +4759,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (&ptype->list == head) goto normal; + if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) { + ret = GRO_CONSUMED; + goto ok; + } + same_flow = NAPI_GRO_CB(skb)->same_flow; ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; @@ -4601,6 +4844,13 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); +static void napi_skb_free_stolen_head(struct sk_buff *skb) +{ + skb_dst_drop(skb); + secpath_reset(skb); + kmem_cache_free(skbuff_head_cache, skb); +} + static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) { switch (ret) { @@ -4614,16 +4864,15 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) break; case GRO_MERGED_FREE: - if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { - skb_dst_drop(skb); - kmem_cache_free(skbuff_head_cache, skb); - } else { + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else __kfree_skb(skb); - } break; case GRO_HELD: case GRO_MERGED: + case GRO_CONSUMED: break; } @@ -4656,6 +4905,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); + secpath_reset(skb); napi->skb = skb; } @@ -4689,11 +4939,18 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, break; case GRO_DROP: - case GRO_MERGED_FREE: napi_reuse_skb(napi, skb); break; + case GRO_MERGED_FREE: + if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) + napi_skb_free_stolen_head(skb); + else + napi_reuse_skb(napi, skb); + break; + case GRO_MERGED: + case GRO_CONSUMED: break; } @@ -4779,6 +5036,19 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) } EXPORT_SYMBOL(__skb_gro_checksum_complete); +static void net_rps_send_ipi(struct softnet_data *remsd) +{ +#ifdef CONFIG_RPS + while (remsd) { + struct softnet_data *next = remsd->rps_ipi_next; + + if (cpu_online(remsd->cpu)) + 
smp_call_function_single_async(remsd->cpu, &remsd->csd); + remsd = next; + } +#endif +} + /* * net_rps_action_and_irq_enable sends any pending IPI's for rps. * Note: called with local irq disabled, but exits with local irq enabled. @@ -4794,14 +5064,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) local_irq_enable(); /* Send pending IPI's to kick RPS processing on remote cpus. */ - while (remsd) { - struct softnet_data *next = remsd->rps_ipi_next; - - if (cpu_online(remsd->cpu)) - smp_call_function_single_async(remsd->cpu, - &remsd->csd); - remsd = next; - } + net_rps_send_ipi(remsd); } else #endif local_irq_enable(); @@ -4830,7 +5093,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = weight_p; + napi->weight = dev_rx_weight; while (again) { struct sk_buff *skb; @@ -4886,6 +5149,39 @@ void __napi_schedule(struct napi_struct *n) EXPORT_SYMBOL(__napi_schedule); /** + * napi_schedule_prep - check if napi can be scheduled + * @n: napi context + * + * Test if NAPI routine is already running, and if not mark + * it as running. This is used as a condition variable + * insure only one NAPI poll instance runs. We also make + * sure there is no pending NAPI disable. + */ +bool napi_schedule_prep(struct napi_struct *n) +{ + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + if (unlikely(val & NAPIF_STATE_DISABLE)) + return false; + new = val | NAPIF_STATE_SCHED; + + /* Sets STATE_MISSED bit if STATE_SCHED was already set + * This was suggested by Alexander Duyck, as compiler + * emits better code than : + * if (val & NAPIF_STATE_SCHED) + * new |= NAPIF_STATE_MISSED; + */ + new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * + NAPIF_STATE_MISSED; + } while (cmpxchg(&n->state, val, new) != val); + + return !(val & NAPIF_STATE_SCHED); +} +EXPORT_SYMBOL(napi_schedule_prep); + +/** * __napi_schedule_irqoff - schedule for receive * @n: entry to schedule * @@ -4897,26 +5193,9 @@ void __napi_schedule_irqoff(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule_irqoff); -bool __napi_complete(struct napi_struct *n) -{ - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - - /* Some drivers call us directly, instead of calling - * napi_complete_done(). - */ - if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state))) - return false; - - list_del_init(&n->poll_list); - smp_mb__before_atomic(); - clear_bit(NAPI_STATE_SCHED, &n->state); - return true; -} -EXPORT_SYMBOL(__napi_complete); - bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags; + unsigned long flags, val, new; /* * 1) Don't let napi dequeue from the cpu poll list @@ -4940,14 +5219,33 @@ bool napi_complete_done(struct napi_struct *n, int work_done) else napi_gro_flush(n, false); } - if (likely(list_empty(&n->poll_list))) { - WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); - } else { + if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ local_irq_save(flags); - __napi_complete(n); + list_del_init(&n->poll_list); local_irq_restore(flags); } + + do { + val = READ_ONCE(n->state); + + WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); + + new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED); + + /* If STATE_MISSED was set, leave STATE_SCHED set, + * because we will call napi->poll() one more time. + * This C code was suggested by Alexander Duyck to help gcc. 
+ */ + new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * + NAPIF_STATE_SCHED; + } while (cmpxchg(&n->state, val, new) != val); + + if (unlikely(val & NAPIF_STATE_MISSED)) { + __napi_schedule(n); + return false; + } + return true; } EXPORT_SYMBOL(napi_complete_done); @@ -4973,6 +5271,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) { int rc; + /* Busy polling means there is a high chance device driver hard irq + * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was + * set in napi_schedule_prep(). + * Since we are about to call napi->poll() once more, we can safely + * clear NAPI_STATE_MISSED. + * + * Note: x86 could use a single "lock and ..." instruction + * to perform these two clear_bit() + */ + clear_bit(NAPI_STATE_MISSED, &napi->state); clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state); local_bh_disable(); @@ -4985,40 +5293,31 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) if (rc == BUSY_POLL_BUDGET) __napi_schedule(napi); local_bh_enable(); - if (local_softirq_pending()) - do_softirq(); } -bool sk_busy_loop(struct sock *sk, int nonblock) +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) { - unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); - int (*busy_poll)(struct napi_struct *dev); void *have_poll_lock = NULL; struct napi_struct *napi; - int rc; restart: - rc = false; napi_poll = NULL; rcu_read_lock(); - napi = napi_by_id(sk->sk_napi_id); + napi = napi_by_id(napi_id); if (!napi) goto out; - /* Note: ndo_busy_poll method is optional in linux-4.5 */ - busy_poll = napi->dev->netdev_ops->ndo_busy_poll; - preempt_disable(); for (;;) { - rc = 0; + int work = 0; + local_bh_disable(); - if (busy_poll) { - rc = busy_poll(napi); - goto count; - } if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@ -5035,19 +5334,15 @@ restart: have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - rc = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + work = napi_poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, work, BUSY_POLL_BUDGET); count: - if (rc > 0) - __NET_ADD_STATS(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); + if (work > 0) + __NET_ADD_STATS(dev_net(napi->dev), + LINUX_MIB_BUSYPOLLRXPACKETS, work); local_bh_enable(); - if (rc == LL_FLUSH_FAILED) - break; /* permanent failure */ - - if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || - busy_loop_timeout(end_time)) + if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { @@ -5056,9 +5351,8 @@ count: preempt_enable(); rcu_read_unlock(); cond_resched(); - rc = !skb_queue_empty(&sk->sk_receive_queue); - if (rc || busy_loop_timeout(end_time)) - return rc; + if (loop_end(loop_end_arg, start_time)) + return; goto restart; } cpu_relax(); @@ -5066,12 +5360,10 @@ count: if (napi_poll) busy_poll_stop(napi, have_poll_lock); preempt_enable(); - rc = !skb_queue_empty(&sk->sk_receive_queue); out: rcu_read_unlock(); - return rc; } -EXPORT_SYMBOL(sk_busy_loop); +EXPORT_SYMBOL(napi_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ @@ -5083,10 +5375,10 @@ static void napi_hash_add(struct napi_struct *napi) spin_lock(&napi_hash_lock); - /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if 
(unlikely(++napi_gen_id < NR_CPUS + 1)) - napi_gen_id = NR_CPUS + 1; + if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); napi->napi_id = napi_gen_id; @@ -5119,8 +5411,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) struct napi_struct *napi; napi = container_of(timer, struct napi_struct, timer); - if (napi->gro_list) - napi_schedule(napi); + + /* Note : we use a relaxed variant of napi_schedule_prep() not setting + * NAPI_STATE_MISSED, since we do not react to a device IRQ. + */ + if (napi->gro_list && !napi_disable_pending(napi) && + !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) + __napi_schedule_irqoff(napi); return HRTIMER_NORESTART; } @@ -5245,7 +5542,8 @@ out_unlock: static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + 2; + unsigned long time_limit = jiffies + + usecs_to_jiffies(netdev_budget_usecs); int budget = netdev_budget; LIST_HEAD(list); LIST_HEAD(repoll); @@ -5706,6 +6004,7 @@ static int netdev_adjacent_sysfs_add(struct net_device *dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", adj_dev->name); return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), @@ -5716,6 +6015,7 @@ static void netdev_adjacent_sysfs_del(struct net_device *dev, struct list_head *dev_list) { char linkname[IFNAMSIZ+7]; + sprintf(linkname, dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s", name); sysfs_remove_link(&(dev->dev.kobj), linkname); @@ -5985,6 +6285,7 @@ void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { struct netdev_notifier_changeupper_info changeupper_info; + ASSERT_RTNL(); changeupper_info.upper_dev = upper_dev; @@ -6151,50 +6452,6 @@ void netdev_lower_state_changed(struct net_device *lower_dev, } EXPORT_SYMBOL(netdev_lower_state_changed); -int netdev_default_l2upper_neigh_construct(struct net_device *dev, - struct neighbour *n) -{ - struct net_device *lower_dev, *stop_dev; - struct list_head *iter; - int err; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - if (!lower_dev->netdev_ops->ndo_neigh_construct) - continue; - err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n); - if (err) { - stop_dev = lower_dev; - goto rollback; - } - } - return 0; - -rollback: - netdev_for_each_lower_dev(dev, lower_dev, iter) { - if (lower_dev == stop_dev) - break; - if (!lower_dev->netdev_ops->ndo_neigh_destroy) - continue; - lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); - } - return err; -} -EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct); - -void netdev_default_l2upper_neigh_destroy(struct net_device *dev, - struct neighbour *n) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(dev, lower_dev, iter) { - if (!lower_dev->netdev_ops->ndo_neigh_destroy) - continue; - lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); - } -} -EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy); - static void dev_change_rx_flags(struct net_device *dev, int flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -6447,8 +6704,8 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags) } /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI - is important. Some (broken) drivers set IFF_PROMISC, when - IFF_ALLMULTI is requested not asking us and not reporting. + * is important. 
Some (broken) drivers set IFF_PROMISC, when + * IFF_ALLMULTI is requested not asking us and not reporting. */ if ((flags ^ dev->gflags) & IFF_ALLMULTI) { int inc = (flags & IFF_ALLMULTI) ? 1 : -1; @@ -6508,7 +6765,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags) } EXPORT_SYMBOL(dev_change_flags); -static int __dev_set_mtu(struct net_device *dev, int new_mtu) +int __dev_set_mtu(struct net_device *dev, int new_mtu) { const struct net_device_ops *ops = dev->netdev_ops; @@ -6518,6 +6775,7 @@ static int __dev_set_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; return 0; } +EXPORT_SYMBOL(__dev_set_mtu); /** * dev_set_mtu - Change maximum transfer unit @@ -6687,53 +6945,84 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); +u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) +{ + struct netdev_xdp xdp; + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + + /* Query must always succeed. */ + WARN_ON(xdp_op(dev, &xdp) < 0); + if (prog_id) + *prog_id = xdp.prog_id; + + return xdp.prog_attached; +} + +static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, + struct netlink_ext_ack *extack, u32 flags, + struct bpf_prog *prog) +{ + struct netdev_xdp xdp; + + memset(&xdp, 0, sizeof(xdp)); + if (flags & XDP_FLAGS_HW_MODE) + xdp.command = XDP_SETUP_PROG_HW; + else + xdp.command = XDP_SETUP_PROG; + xdp.extack = extack; + xdp.flags = flags; + xdp.prog = prog; + + return xdp_op(dev, &xdp); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device + * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ -int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - struct netdev_xdp xdp; + xdp_op_t xdp_op, xdp_chk; int err; ASSERT_RTNL(); - if (!ops->ndo_xdp) + xdp_op = xdp_chk = ops->ndo_xdp; + if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; + if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) + xdp_op = generic_xdp_install; + if (xdp_op == xdp_chk) + xdp_chk = generic_xdp_install; + if (fd >= 0) { - if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; - - err = ops->ndo_xdp(dev, &xdp); - if (err < 0) - return err; - if (xdp.prog_attached) - return -EBUSY; - } + if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) + return -EEXIST; + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && + __dev_xdp_attached(dev, xdp_op, NULL)) + return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_SETUP_PROG; - xdp.prog = prog; - - err = ops->ndo_xdp(dev, &xdp); + err = dev_xdp_install(dev, xdp_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); return err; } -EXPORT_SYMBOL(dev_change_xdp_fd); /** * dev_new_index - allocate an ifindex @@ -6746,6 +7035,7 @@ EXPORT_SYMBOL(dev_change_xdp_fd); static int dev_new_index(struct net *net) { int ifindex = net->ifindex; + for (;;) { if (++ifindex <= 0) ifindex = 1; @@ -6812,13 +7102,13 @@ static void rollback_registered_many(struct list_head *head) /* Notify protocols, that we are about to destroy - this device. 
They should clean all the things. - */ + * this device. They should clean all the things. + */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL); /* @@ -6971,13 +7261,6 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~dev->gso_partial_features; } -#ifdef CONFIG_NET_RX_BUSY_POLL - if (dev->netdev_ops->ndo_busy_poll) - features |= NETIF_F_BUSY_POLL; - else -#endif - features &= ~NETIF_F_BUSY_POLL; - return features; } @@ -7085,13 +7368,10 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); - if (netif_carrier_ok(rootdev)) { - if (!netif_carrier_ok(dev)) - netif_carrier_on(dev); - } else { - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); - } + if (netif_carrier_ok(rootdev)) + netif_carrier_on(dev); + else + netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); @@ -7104,12 +7384,10 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!rx) { - rx = vzalloc(sz); - if (!rx) - return -ENOMEM; - } + rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!rx) + return -ENOMEM; + dev->_rx = rx; for (i = 0; i < count; i++) @@ -7146,12 +7424,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!tx) { - tx = vzalloc(sz); - if (!tx) - return -ENOMEM; - } + tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!tx) + return -ENOMEM; + dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -7166,6 +7442,7 @@ void netif_tx_stop_all_queues(struct net_device *dev) for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + netif_tx_stop_queue(txq); } } @@ -7324,6 +7601,8 @@ out: err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); + if (dev->priv_destructor) + dev->priv_destructor(dev); goto out; } EXPORT_SYMBOL(register_netdevice); @@ -7531,8 +7810,10 @@ void netdev_run_todo(void) WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); - if (dev->destructor) - dev->destructor(dev); + if (dev->priv_destructor) + dev->priv_destructor(dev); + if (dev->needs_free_netdev) + free_netdev(dev); /* Report a network device has been unregistered */ rtnl_lock(); @@ -7555,7 +7836,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, { #if BITS_PER_LONG == 64 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*stats64)); + memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + sizeof(*netdev_stats), 0, sizeof(*stats64) - sizeof(*netdev_stats)); @@ -7597,9 +7878,9 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } else { netdev_stats_to_stats64(storage, &dev->stats); } - storage->rx_dropped += atomic_long_read(&dev->rx_dropped); - storage->tx_dropped += atomic_long_read(&dev->tx_dropped); - storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); + storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped); + storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped); + storage->rx_nohandler += 
(unsigned long)atomic_long_read(&dev->rx_nohandler); return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -7640,17 +7921,17 @@ void netdev_freemem(struct net_device *dev) } /** - * alloc_netdev_mqs - allocate network device - * @sizeof_priv: size of private data to allocate space for - * @name: device name format string - * @name_assign_type: origin of device name - * @setup: callback to initialize device - * @txqs: the number of TX subqueues to allocate - * @rxqs: the number of RX subqueues to allocate - * - * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. Also allocates subqueue structs - * for each queue on the device. + * alloc_netdev_mqs - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @name_assign_type: origin of device name + * @setup: callback to initialize device + * @txqs: the number of TX subqueues to allocate + * @rxqs: the number of RX subqueues to allocate + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. Also allocates subqueue structs + * for each queue on the device. */ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, unsigned char name_assign_type, @@ -7684,9 +7965,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!p) - p = vzalloc(alloc_size); + p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!p) return NULL; @@ -7762,17 +8041,18 @@ free_dev: EXPORT_SYMBOL(alloc_netdev_mqs); /** - * free_netdev - free network device - * @dev: device + * free_netdev - free network device + * @dev: device * - * This function does the last stage of destroying an allocated device - * interface. The reference to the device object is released. - * If this is the last reference then it will be freed. - * Must be called in process context. + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. If this + * is the last reference then it will be freed.Must be called in process + * context. */ void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); @@ -7791,6 +8071,12 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; + prog = rcu_dereference_protected(dev->xdp_prog, 1); + if (prog) { + bpf_prog_put(prog); + static_key_slow_dec(&generic_xdp_needed); + } + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); @@ -7950,12 +8236,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_shutdown(dev); /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - - Note that dev->reg_state stays at NETREG_REGISTERED. - This is wanted because this way 8021q and macvlan know - the device is just moving and can keep their slaves up. - */ + * this device. They should clean all the things. + * + * Note that dev->reg_state stays at NETREG_REGISTERED. + * This is wanted because this way 8021q and macvlan know + * the device is just moving and can keep their slaves up. 
+ */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); @@ -8010,7 +8296,7 @@ static int dev_cpu_dead(unsigned int oldcpu) struct sk_buff **list_skb; struct sk_buff *skb; unsigned int cpu; - struct softnet_data *sd, *oldsd; + struct softnet_data *sd, *oldsd, *remsd = NULL; local_irq_disable(); cpu = smp_processor_id(); @@ -8051,6 +8337,13 @@ static int dev_cpu_dead(unsigned int oldcpu) raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_enable(); +#ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; + oldsd->rps_ipi_list = NULL; +#endif + /* send out pending IPI's on offline CPU */ + net_rps_send_ipi(remsd); + /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { netif_rx_ni(skb); @@ -8400,7 +8693,6 @@ static int __init net_dev_init(void) rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); - dst_subsys_init(); rc = 0; out: return rc; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index b94b1d293506..06b147d7d9e2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -28,6 +28,7 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; + ifr.ifr_name[IFNAMSIZ-1] = 0; error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); if (error) @@ -225,6 +226,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; } @@ -410,6 +412,24 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCGIFNAME) return dev_ifname(net, (struct ifreq __user *)arg); + /* + * Take care of Wireless Extensions. Unfortunately struct iwreq + * isn't a proper subset of struct ifreq (it's 8 byte shorter) + * so we need to treat it specially, otherwise applications may + * fault if the struct they're passing happens to land at the + * end of a mapped page. 
+ */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + struct iwreq iwr; + + if (copy_from_user(&iwr, arg, sizeof(iwr))) + return -EFAULT; + + iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; + + return wext_handle_ioctl(net, &iwr, cmd, arg); + } + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; @@ -559,9 +579,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) ret = -EFAULT; return ret; } - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) - return wext_handle_ioctl(net, &ifr, cmd, arg); return -ENOTTY; } } diff --git a/net/core/devlink.c b/net/core/devlink.c index 2b5bf9efa720..a0adfc31a3fe 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1392,15 +1392,15 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, return -EOPNOTSUPP; } -static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) +static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; + u8 inline_mode, encap_mode; void *hdr; int err = 0; u16 mode; - u8 inline_mode; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) @@ -1408,50 +1408,61 @@ static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, err = devlink_nl_put_handle(msg, devlink); if (err) - goto out; + goto nla_put_failure; - err = ops->eswitch_mode_get(devlink, &mode); - if (err) - goto out; - err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); - if (err) - goto out; + if (ops->eswitch_mode_get) { + err = ops->eswitch_mode_get(devlink, &mode); + if (err) + goto nla_put_failure; + err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode); + if (err) + goto nla_put_failure; + } if (ops->eswitch_inline_mode_get) { err = ops->eswitch_inline_mode_get(devlink, &inline_mode); if (err) - goto out; + goto nla_put_failure; err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE, inline_mode); if (err) - goto out; + goto nla_put_failure; + } + + if (ops->eswitch_encap_mode_get) { + err = ops->eswitch_encap_mode_get(devlink, &encap_mode); + if (err) + goto nla_put_failure; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); + if (err) + goto nla_put_failure; } genlmsg_end(msg, hdr); return 0; -out: +nla_put_failure: genlmsg_cancel(msg, hdr); return err; } -static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; struct sk_buff *msg; int err; - if (!ops || !ops->eswitch_mode_get) + if (!ops) return -EOPNOTSUPP; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; - err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET, - info->snd_portid, info->snd_seq, 0); + err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET, + info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); @@ -1461,14 +1472,14 @@ static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, - struct genl_info *info) +static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, + struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; 
const struct devlink_ops *ops = devlink->ops; - u16 mode; - u8 inline_mode; + u8 inline_mode, encap_mode; int err = 0; + u16 mode; if (!ops) return -EOPNOTSUPP; @@ -1492,9 +1503,701 @@ static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb, return err; } + if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { + if (!ops->eswitch_encap_mode_set) + return -EOPNOTSUPP; + encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); + err = ops->eswitch_encap_mode_set(devlink, encap_mode); + if (err) + return err; + } + + return 0; +} + +int devlink_dpipe_match_put(struct sk_buff *skb, + struct devlink_dpipe_match *match) +{ + struct devlink_dpipe_header *header = match->header; + struct devlink_dpipe_field *field = &header->fields[match->field_id]; + struct nlattr *match_attr; + + match_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_MATCH); + if (!match_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + nla_nest_end(skb, match_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, match_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_match_put); + +static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table, + struct sk_buff *skb) +{ + struct nlattr *matches_attr; + + matches_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_MATCHES); + if (!matches_attr) + return -EMSGSIZE; + + if (table->table_ops->matches_dump(table->priv, skb)) + goto nla_put_failure; + + nla_nest_end(skb, matches_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, matches_attr); + return -EMSGSIZE; +} + +int devlink_dpipe_action_put(struct sk_buff *skb, + struct devlink_dpipe_action *action) +{ + struct devlink_dpipe_header *header = action->header; + struct devlink_dpipe_field *field = &header->fields[action->field_id]; + struct nlattr *action_attr; + + action_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ACTION); + if (!action_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + nla_nest_end(skb, action_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, action_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_action_put); + +static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table, + struct sk_buff *skb) +{ + struct nlattr *actions_attr; + + actions_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_ACTIONS); + if (!actions_attr) + return -EMSGSIZE; + + if (table->table_ops->actions_dump(table->priv, skb)) + goto nla_put_failure; + + nla_nest_end(skb, actions_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, actions_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_table_put(struct sk_buff *skb, + struct devlink_dpipe_table *table) +{ + struct nlattr *table_attr; + + table_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE); + if (!table_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, 
DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table->size, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, + table->counters_enabled)) + goto nla_put_failure; + + if (devlink_dpipe_matches_put(table, skb)) + goto nla_put_failure; + + if (devlink_dpipe_actions_put(table, skb)) + goto nla_put_failure; + + nla_nest_end(skb, table_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, table_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb, + struct genl_info *info) +{ + int err; + + if (*pskb) { + err = genlmsg_reply(*pskb, info); + if (err) + return err; + } + *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!*pskb) + return -ENOMEM; + return 0; +} + +static int devlink_dpipe_tables_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct list_head *dpipe_tables, + const char *table_name) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_dpipe_table *table; + struct nlattr *tables_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + table = list_first_entry(dpipe_tables, + struct devlink_dpipe_table, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + tables_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLES); + if (!tables_attr) + goto nla_put_failure; + + i = 0; + incomplete = false; + list_for_each_entry_from(table, dpipe_tables, list) { + if (!table_name) { + err = devlink_dpipe_table_put(skb, table); + if (err) { + if (!i) + goto err_table_put; + incomplete = true; + break; + } + } else { + if (!strcmp(table->name, table_name)) { + err = devlink_dpipe_table_put(skb, table); + if (err) + break; + } + } + i++; + } + + nla_nest_end(skb, tables_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; + +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_table_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *table_name = NULL; + + if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + + return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0, + &devlink->dpipe_table_list, + table_name); +} + +static int devlink_dpipe_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE, + value->value_size, value->value)) + return -EMSGSIZE; + if (value->mask) + if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK, + value->value_size, value->mask)) + return -EMSGSIZE; + if (value->mapping_valid) + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING, + value->mapping_value)) + return -EMSGSIZE; + return 0; +} + +static int 
devlink_dpipe_action_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (!value->action) + return -EINVAL; + if (devlink_dpipe_action_put(skb, value->action)) + return -EMSGSIZE; + if (devlink_dpipe_value_put(skb, value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_action_values_put(struct sk_buff *skb, + struct devlink_dpipe_value *values, + unsigned int values_count) +{ + struct nlattr *action_attr; + int i; + int err; + + for (i = 0; i < values_count; i++) { + action_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ACTION_VALUE); + if (!action_attr) + return -EMSGSIZE; + err = devlink_dpipe_action_value_put(skb, &values[i]); + if (err) + goto err_action_value_put; + nla_nest_end(skb, action_attr); + } + return 0; + +err_action_value_put: + nla_nest_cancel(skb, action_attr); + return err; +} + +static int devlink_dpipe_match_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (!value->match) + return -EINVAL; + if (devlink_dpipe_match_put(skb, value->match)) + return -EMSGSIZE; + if (devlink_dpipe_value_put(skb, value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_match_values_put(struct sk_buff *skb, + struct devlink_dpipe_value *values, + unsigned int values_count) +{ + struct nlattr *match_attr; + int i; + int err; + + for (i = 0; i < values_count; i++) { + match_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_MATCH_VALUE); + if (!match_attr) + return -EMSGSIZE; + err = devlink_dpipe_match_value_put(skb, &values[i]); + if (err) + goto err_match_value_put; + nla_nest_end(skb, match_attr); + } + return 0; + +err_match_value_put: + nla_nest_cancel(skb, match_attr); + return err; +} + +static int devlink_dpipe_entry_put(struct sk_buff *skb, + struct devlink_dpipe_entry *entry) +{ + struct nlattr *entry_attr, *matches_attr, *actions_attr; + int err; + + entry_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (entry->counter_valid) + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, + entry->counter, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + matches_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES); + if (!matches_attr) + goto nla_put_failure; + + err = devlink_dpipe_match_values_put(skb, entry->match_values, + entry->match_values_count); + if (err) { + nla_nest_cancel(skb, matches_attr); + goto err_match_values_put; + } + nla_nest_end(skb, matches_attr); + + actions_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES); + if (!actions_attr) + goto nla_put_failure; + + err = devlink_dpipe_action_values_put(skb, entry->action_values, + entry->action_values_count); + if (err) { + nla_nest_cancel(skb, actions_attr); + goto err_action_values_put; + } + nla_nest_end(skb, actions_attr); + + nla_nest_end(skb, entry_attr); + return 0; + +nla_put_failure: + err = -EMSGSIZE; +err_match_values_put: +err_action_values_put: + nla_nest_cancel(skb, entry_attr); + return err; +} + +static struct devlink_dpipe_table * +devlink_dpipe_table_find(struct list_head *dpipe_tables, + const char *table_name) +{ + struct devlink_dpipe_table *table; + + list_for_each_entry_rcu(table, dpipe_tables, list) { + if (!strcmp(table->name, table_name)) + return table; + } + return NULL; +} + +int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct devlink *devlink; + int err; + + err = 
devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb, + dump_ctx->info); + if (err) + return err; + + dump_ctx->hdr = genlmsg_put(dump_ctx->skb, + dump_ctx->info->snd_portid, + dump_ctx->info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, + dump_ctx->cmd); + if (!dump_ctx->hdr) + goto nla_put_failure; + + devlink = dump_ctx->info->user_ptr[0]; + if (devlink_nl_put_handle(dump_ctx->skb, devlink)) + goto nla_put_failure; + dump_ctx->nest = nla_nest_start(dump_ctx->skb, + DEVLINK_ATTR_DPIPE_ENTRIES); + if (!dump_ctx->nest) + goto nla_put_failure; + return 0; + +nla_put_failure: + genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr); + nlmsg_free(dump_ctx->skb); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare); + +int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, + struct devlink_dpipe_entry *entry) +{ + return devlink_dpipe_entry_put(dump_ctx->skb, entry); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append); + +int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + nla_nest_end(dump_ctx->skb, dump_ctx->nest); + genlmsg_end(dump_ctx->skb, dump_ctx->hdr); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close); + +static int devlink_dpipe_entries_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_dpipe_table *table) +{ + struct devlink_dpipe_dump_ctx dump_ctx; + struct nlmsghdr *nlh; + int err; + + dump_ctx.skb = NULL; + dump_ctx.cmd = cmd; + dump_ctx.info = info; + + err = table->table_ops->entries_dump(table->priv, + table->counters_enabled, + &dump_ctx); + if (err) + goto err_entries_dump; + +send_done: + nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(dump_ctx.skb, info); + +err_entries_dump: +err_skb_send_alloc: + genlmsg_cancel(dump_ctx.skb, dump_ctx.hdr); + nlmsg_free(dump_ctx.skb); + return err; +} + +static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_dpipe_table *table; + const char *table_name; + + if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + return -EINVAL; + + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) + return -EINVAL; + + if (!table->table_ops->entries_dump) + return -EINVAL; + + return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET, + 0, table); +} + +static int devlink_dpipe_fields_put(struct sk_buff *skb, + const struct devlink_dpipe_header *header) +{ + struct devlink_dpipe_field *field; + struct nlattr *field_attr; + int i; + + for (i = 0; i < header->fields_count; i++) { + field = &header->fields[i]; + field_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_FIELD); + if (!field_attr) + return -EMSGSIZE; + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type)) + goto nla_put_failure; + nla_nest_end(skb, field_attr); + } + return 0; + +nla_put_failure: + nla_nest_cancel(skb, field_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_header_put(struct sk_buff *skb, + struct devlink_dpipe_header 
*header) +{ + struct nlattr *fields_attr, *header_attr; + int err; + + header_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER); + if (!header_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + fields_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER_FIELDS); + if (!fields_attr) + goto nla_put_failure; + + err = devlink_dpipe_fields_put(skb, header); + if (err) { + nla_nest_cancel(skb, fields_attr); + goto nla_put_failure; + } + nla_nest_end(skb, fields_attr); + nla_nest_end(skb, header_attr); + return 0; + +nla_put_failure: + err = -EMSGSIZE; + nla_nest_cancel(skb, header_attr); + return err; +} + +static int devlink_dpipe_headers_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_dpipe_headers * + dpipe_headers) +{ + struct devlink *devlink = info->user_ptr[0]; + struct nlattr *headers_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + void *hdr; + int i, j; + int err; + + i = 0; +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + headers_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADERS); + if (!headers_attr) + goto nla_put_failure; + + j = 0; + for (; i < dpipe_headers->headers_count; i++) { + err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]); + if (err) { + if (!j) + goto err_table_put; + break; + } + j++; + } + nla_nest_end(skb, headers_attr); + genlmsg_end(skb, hdr); + if (i != dpipe_headers->headers_count) + goto start_again; + +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_table_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (!devlink->dpipe_headers) + return -EOPNOTSUPP; + return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET, + 0, devlink->dpipe_headers); +} + +static int devlink_dpipe_table_counters_set(struct devlink *devlink, + const char *table_name, + bool enable) +{ + struct devlink_dpipe_table *table; + + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) + return -EINVAL; + + if (table->counter_control_extern) + return -EOPNOTSUPP; + + if (!(table->counters_enabled ^ enable)) + return 0; + + table->counters_enabled = enable; + if (table->table_ops->counters_set_update) + table->table_ops->counters_set_update(table->priv, enable); return 0; } +static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *table_name; + bool counters_enable; + + if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] || + !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]) + return -EINVAL; + + table_name = 
nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]); + + return devlink_dpipe_table_counters_set(devlink, table_name, + counters_enable); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -1510,6 +2213,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -1629,15 +2335,43 @@ static const struct genl_ops devlink_nl_ops[] = { DEVLINK_NL_FLAG_LOCK_PORTS, }, { - .cmd = DEVLINK_CMD_ESWITCH_MODE_GET, - .doit = devlink_nl_cmd_eswitch_mode_get_doit, + .cmd = DEVLINK_CMD_ESWITCH_GET, + .doit = devlink_nl_cmd_eswitch_get_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { - .cmd = DEVLINK_CMD_ESWITCH_MODE_SET, - .doit = devlink_nl_cmd_eswitch_mode_set_doit, + .cmd = DEVLINK_CMD_ESWITCH_SET, + .doit = devlink_nl_cmd_eswitch_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, + .doit = devlink_nl_cmd_dpipe_table_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, + .doit = devlink_nl_cmd_dpipe_entries_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, + .doit = devlink_nl_cmd_dpipe_headers_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, + .doit = devlink_nl_cmd_dpipe_table_counters_set, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, @@ -1678,6 +2412,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); + INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); return devlink; } EXPORT_SYMBOL_GPL(devlink_alloc); @@ -1878,6 +2613,133 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) } EXPORT_SYMBOL_GPL(devlink_sb_unregister); +/** + * devlink_dpipe_headers_register - register dpipe headers + * + * @devlink: devlink + * @dpipe_headers: dpipe header array + * + * Register the headers supported by hardware. + */ +int devlink_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers *dpipe_headers) +{ + mutex_lock(&devlink_mutex); + devlink->dpipe_headers = dpipe_headers; + mutex_unlock(&devlink_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); + +/** + * devlink_dpipe_headers_unregister - unregister dpipe headers + * + * @devlink: devlink + * + * Unregister the headers supported by hardware. 
+ */ +void devlink_dpipe_headers_unregister(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink->dpipe_headers = NULL; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister); + +/** + * devlink_dpipe_table_counter_enabled - check if counter allocation + * required + * @devlink: devlink + * @table_name: tables name + * + * Used by driver to check if counter allocation is required. + * After counter allocation is turned on the table entries + * are updated to include counter statistics. + * + * After that point on the driver must respect the counter + * state so that each entry added to the table is added + * with a counter. + */ +bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, + const char *table_name) +{ + struct devlink_dpipe_table *table; + bool enabled; + + rcu_read_lock(); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + enabled = false; + if (table) + enabled = table->counters_enabled; + rcu_read_unlock(); + return enabled; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled); + +/** + * devlink_dpipe_table_register - register dpipe table + * + * @devlink: devlink + * @table_name: table name + * @table_ops: table ops + * @priv: priv + * @size: size + * @counter_control_extern: external control for counters + */ +int devlink_dpipe_table_register(struct devlink *devlink, + const char *table_name, + struct devlink_dpipe_table_ops *table_ops, + void *priv, u64 size, + bool counter_control_extern) +{ + struct devlink_dpipe_table *table; + + if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) + return -EEXIST; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->name = table_name; + table->table_ops = table_ops; + table->priv = priv; + table->size = size; + table->counter_control_extern = counter_control_extern; + + mutex_lock(&devlink_mutex); + list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); + mutex_unlock(&devlink_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); + +/** + * devlink_dpipe_table_unregister - unregister dpipe table + * + * @devlink: devlink + * @table_name: table name + */ +void devlink_dpipe_table_unregister(struct devlink *devlink, + const char *table_name) +{ + struct devlink_dpipe_table *table; + + mutex_lock(&devlink_mutex); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) + goto unlock; + list_del_rcu(&table->list); + mutex_unlock(&devlink_mutex); + kfree_rcu(table, rcu); + return; +unlock: + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index fb55327dcfea..70ccda233bd1 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -412,9 +412,8 @@ static int __init init_net_drop_monitor(void) for_each_possible_cpu(cpu) { data = &per_cpu(dm_cpu_data, cpu); INIT_WORK(&data->dm_alert_work, send_dm_alert); - init_timer(&data->send_timer); - data->send_timer.data = (unsigned long)data; - data->send_timer.function = sched_send_work; + setup_timer(&data->send_timer, sched_send_work, + (unsigned long)data); spin_lock_init(&data->lock); reset_per_cpu_data(data); } diff --git a/net/core/dst.c b/net/core/dst.c index b5cbbe07f786..00aa972ad1a1 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -42,108 +42,6 @@ * to dirty as few cache lines 
as possible in __dst_free(). * As this is not a very strong hint, we dont force an alignment on SMP. */ -static struct { - spinlock_t lock; - struct dst_entry *list; - unsigned long timer_inc; - unsigned long timer_expires; -} dst_garbage = { - .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), - .timer_inc = DST_GC_MAX, -}; -static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry *dst); - -static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); - -static DEFINE_MUTEX(dst_gc_mutex); -/* - * long lived entries are maintained in this list, guarded by dst_gc_mutex - */ -static struct dst_entry *dst_busy_list; - -static void dst_gc_task(struct work_struct *work) -{ - int delayed = 0; - int work_performed = 0; - unsigned long expires = ~0L; - struct dst_entry *dst, *next, head; - struct dst_entry *last = &head; - - mutex_lock(&dst_gc_mutex); - next = dst_busy_list; - -loop: - while ((dst = next) != NULL) { - next = dst->next; - prefetch(&next->next); - cond_resched(); - if (likely(atomic_read(&dst->__refcnt))) { - last->next = dst; - last = dst; - delayed++; - continue; - } - work_performed++; - - dst = dst_destroy(dst); - if (dst) { - /* NOHASH and still referenced. Unless it is already - * on gc list, invalidate it and add to gc list. - * - * Note: this is temporary. Actually, NOHASH dst's - * must be obsoleted when parent is obsoleted. - * But we do not have state "obsoleted, but - * referenced by parent", so it is right. - */ - if (dst->obsolete > 0) - continue; - - ___dst_free(dst); - dst->next = next; - next = dst; - } - } - - spin_lock_bh(&dst_garbage.lock); - next = dst_garbage.list; - if (next) { - dst_garbage.list = NULL; - spin_unlock_bh(&dst_garbage.lock); - goto loop; - } - last->next = NULL; - dst_busy_list = head.next; - if (!dst_busy_list) - dst_garbage.timer_inc = DST_GC_MAX; - else { - /* - * if we freed less than 1/10 of delayed entries, - * we can sleep longer. - */ - if (work_performed <= delayed/10) { - dst_garbage.timer_expires += dst_garbage.timer_inc; - if (dst_garbage.timer_expires > DST_GC_MAX) - dst_garbage.timer_expires = DST_GC_MAX; - dst_garbage.timer_inc += DST_GC_INC; - } else { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - } - expires = dst_garbage.timer_expires; - /* - * if the next desired timer is more than 4 seconds in the - * future then round the timer to whole seconds - */ - if (expires > 4*HZ) - expires = round_jiffies_relative(expires); - schedule_delayed_work(&dst_gc_work, expires); - } - - spin_unlock_bh(&dst_garbage.lock); - mutex_unlock(&dst_gc_mutex); -} - int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); @@ -151,13 +49,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. 
*/ - [RTAX_MAX] = 0xdeadbeef, + .refcnt = ATOMIC_INIT(1), }; void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -169,7 +67,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, if (dev) dev_hold(dev); dst->ops = ops; - dst_init_metrics(dst, dst_default_metrics, true); + dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; dst->path = dst; dst->from = NULL; @@ -190,7 +88,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->pending_confirm = 0; dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); @@ -217,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, } EXPORT_SYMBOL(dst_alloc); -static void ___dst_free(struct dst_entry *dst) -{ - /* The first case (dev==NULL) is required, when - protocol module is unloaded. - */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { - dst->input = dst_discard; - dst->output = dst_discard_out; - } - dst->obsolete = DST_OBSOLETE_DEAD; -} - -void __dst_free(struct dst_entry *dst) -{ - spin_lock_bh(&dst_garbage.lock); - ___dst_free(dst); - dst->next = dst_garbage.list; - dst_garbage.list = dst; - if (dst_garbage.timer_inc > DST_GC_INC) { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, - dst_garbage.timer_expires); - } - spin_unlock_bh(&dst_garbage.lock); -} -EXPORT_SYMBOL(__dst_free); - struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child; smp_rmb(); -again: child = dst->child; if (!(dst->flags & DST_NOCOUNT)) @@ -270,20 +138,8 @@ again: kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; - if (dst) { - int nohash = dst->flags & DST_NOHASH; - - if (atomic_dec_and_test(&dst->__refcnt)) { - /* We were real parent of this dst, so kill child. */ - if (nohash) - goto again; - } else { - /* Child is still referenced, return it for freeing. */ - if (nohash) - return dst; - /* Child is still in his hash table */ - } - } + if (dst) + dst_release_immediate(dst); return NULL; } EXPORT_SYMBOL(dst_destroy); @@ -293,47 +149,88 @@ static void dst_destroy_rcu(struct rcu_head *head) struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst = dst_destroy(dst); - if (dst) - __dst_free(dst); } +/* Operations to mark dst as DEAD and clean up the net device referenced + * by dst: + * 1. put the dst under loopback interface and discard all tx/rx packets + * on this route. + * 2. release the net_device + * This function should be called when removing routes from the fib tree + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to + * make the next dst_ops->check() fail. 
+ */ +void dst_dev_put(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + + dst->obsolete = DST_OBSOLETE_DEAD; + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, true); + dst->input = dst_discard; + dst->output = dst_discard_out; + dst->dev = dev_net(dst->dev)->loopback_dev; + dev_hold(dst->dev); + dev_put(dev); +} +EXPORT_SYMBOL(dst_dev_put); + void dst_release(struct dst_entry *dst) { if (dst) { int newrefcnt; - unsigned short nocache = dst->flags & DST_NOCACHE; newrefcnt = atomic_dec_return(&dst->__refcnt); if (unlikely(newrefcnt < 0)) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); - if (!newrefcnt && unlikely(nocache)) + if (!newrefcnt) call_rcu(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); +void dst_release_immediate(struct dst_entry *dst) +{ + if (dst) { + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + if (unlikely(newrefcnt < 0)) + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", + __func__, dst, newrefcnt); + if (!newrefcnt) + dst_destroy(dst); + } +} +EXPORT_SYMBOL(dst_release_immediate); + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { - u32 *old_p = __DST_METRICS_PTR(old); + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + atomic_set(&p->refcnt, 1); + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); - p = __DST_METRICS_PTR(prev); + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; + } else if (prev & DST_METRICS_REFCOUNTED) { + if (atomic_dec_and_test(&old_p->refcnt)) + kfree(old_p); } } - return p; + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); + return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -342,7 +239,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); @@ -367,21 +264,25 @@ static int dst_md_discard(struct sk_buff *skb) return 0; } -static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) +static void __metadata_dst_init(struct metadata_dst *md_dst, + enum metadata_type type, u8 optslen) + { struct dst_entry *dst; dst = &md_dst->dst; dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, - DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + DST_METADATA | DST_NOCOUNT); dst->input = dst_md_discard; dst->output = dst_md_discard_out; memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); + md_dst->type = type; } -struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, + gfp_t flags) { struct metadata_dst *md_dst; @@ -389,7 +290,7 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) if (!md_dst) return NULL; - __metadata_dst_init(md_dst, optslen); + __metadata_dst_init(md_dst, type, optslen); return md_dst; } @@ -403,7 +304,8 @@ void metadata_dst_free(struct metadata_dst *md_dst) kfree(md_dst); } -struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) +struct 
metadata_dst __percpu * +metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) { int cpu; struct metadata_dst __percpu *md_dst; @@ -414,77 +316,8 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) return NULL; for_each_possible_cpu(cpu) - __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen); + __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); - -/* Dirty hack. We did it in 2.2 (in __dst_free), - * we have _very_ good reasons not to repeat - * this mistake in 2.3, but we have no choice - * now. _It_ _is_ _explicit_ _deliberate_ - * _race_ _condition_. - * - * Commented and originally written by Alexey. - */ -static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) -{ - if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, unregister); - - if (dev != dst->dev) - return; - - if (!unregister) { - dst->input = dst_discard; - dst->output = dst_discard_out; - } else { - dst->dev = dev_net(dst->dev)->loopback_dev; - dev_hold(dst->dev); - dev_put(dev); - } -} - -static int dst_dev_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dst_entry *dst, *last = NULL; - - switch (event) { - case NETDEV_UNREGISTER_FINAL: - case NETDEV_DOWN: - mutex_lock(&dst_gc_mutex); - for (dst = dst_busy_list; dst; dst = dst->next) { - last = dst; - dst_ifdown(dst, dev, event != NETDEV_DOWN); - } - - spin_lock_bh(&dst_garbage.lock); - dst = dst_garbage.list; - dst_garbage.list = NULL; - spin_unlock_bh(&dst_garbage.lock); - - if (last) - last->next = dst; - else - dst_busy_list = dst; - for (; dst; dst = dst->next) - dst_ifdown(dst, dev, event != NETDEV_DOWN); - mutex_unlock(&dst_gc_mutex); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block dst_dev_notifier = { - .notifier_call = dst_dev_event, - .priority = -10, /* must be called after other network notifiers */ -}; - -void __init dst_subsys_init(void) -{ - register_netdevice_notifier(&dst_dev_notifier); -} diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d92de0a1f0a4..674b6c9cec18 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -24,7 +24,7 @@ #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/rtnetlink.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/net.h> /* @@ -90,6 +90,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", + [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", @@ -102,14 +103,16 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", - [NETIF_F_BUSY_POLL_BIT] = "busy-poll", [NETIF_F_HW_TC_BIT] = "hw-tc-offload", + [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", + [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", }; static const char rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { [ETH_RSS_HASH_TOP_BIT] = "toeplitz", [ETH_RSS_HASH_XOR_BIT] = "xor", + [ETH_RSS_HASH_CRC32_BIT] = "crc32", }; static const char @@ -1820,11 +1823,13 @@ static int ethtool_get_strings(struct net_device 
*dev, void __user *useraddr) ret = __ethtool_get_sset_count(dev, gstrings.string_set); if (ret < 0) return ret; + if (ret > S32_MAX / ETH_GSTRING_LEN) + return -ENOMEM; + WARN_ON_ONCE(!ret); gstrings.len = ret; - - data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER); - if (!data) + data = vzalloc(gstrings.len * ETH_GSTRING_LEN); + if (gstrings.len && !data) return -ENOMEM; __ethtool_get_strings(dev, gstrings.string_set, data); @@ -1833,12 +1838,13 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) goto out; useraddr += sizeof(gstrings); - if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + if (gstrings.len && + copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) goto out; ret = 0; out: - kfree(data); + vfree(data); return ret; } @@ -1915,14 +1921,15 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; - WARN_ON(n_stats == 0); - + if (n_stats > S32_MAX / sizeof(u64)) + return -ENOMEM; + WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; - data = kmalloc(n_stats * sizeof(u64), GFP_USER); - if (!data) + data = vzalloc(n_stats * sizeof(u64)); + if (n_stats && !data) return -ENOMEM; ops->get_ethtool_stats(dev, &stats, data); @@ -1931,12 +1938,12 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) goto out; ret = 0; out: - kfree(data); + vfree(data); return ret; } @@ -1951,17 +1958,18 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) return -EOPNOTSUPP; n_stats = phy_get_sset_count(phydev); - if (n_stats < 0) return n_stats; - WARN_ON(n_stats == 0); + if (n_stats > S32_MAX / sizeof(u64)) + return -ENOMEM; + WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; - data = kmalloc_array(n_stats, sizeof(u64), GFP_USER); - if (!data) + data = vzalloc(n_stats * sizeof(u64)); + if (n_stats && !data) return -ENOMEM; mutex_lock(&phydev->lock); @@ -1972,12 +1980,12 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); - if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) goto out; ret = 0; out: - kfree(data); + vfree(data); return ret; } @@ -2314,16 +2322,12 @@ static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) ret = ethtool_tunable_valid(&tuna); if (ret) return ret; - data = kmalloc(tuna.len, GFP_USER); - if (!data) - return -ENOMEM; useraddr += sizeof(tuna); - ret = -EFAULT; - if (copy_from_user(data, useraddr, tuna.len)) - goto out; + data = memdup_user(useraddr, tuna.len); + if (IS_ERR(data)) + return PTR_ERR(data); ret = ops->set_tunable(dev, &tuna, data); -out: kfree(data); return ret; } @@ -2499,18 +2503,14 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr) ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; - data = kmalloc(tuna.len, GFP_USER); - if (!data) - return -ENOMEM; useraddr += sizeof(tuna); - ret = 
-EFAULT; - if (copy_from_user(data, useraddr, tuna.len)) - goto out; + data = memdup_user(useraddr, tuna.len); + if (IS_ERR(data)) + return PTR_ERR(data); mutex_lock(&phydev->lock); ret = phydev->drv->set_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); -out: kfree(data); return ret; } diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index b6791d94841d..fdcb1bcd2afa 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -23,6 +23,20 @@ static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(~0), }; +bool fib_rule_matchall(const struct fib_rule *rule) +{ + if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || + rule->flags) + return false; + if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) + return false; + if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || + !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) + return false; + return true; +} +EXPORT_SYMBOL_GPL(fib_rule_matchall); + int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { @@ -32,7 +46,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, if (r == NULL) return -ENOMEM; - atomic_set(&r->refcnt, 1); + refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; @@ -269,7 +283,7 @@ jumped: if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || - likely(atomic_inc_not_zero(&rule->refcnt))) { + likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } @@ -354,7 +368,8 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, return 0; } -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -372,7 +387,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; } - err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); if (err < 0) goto errout; @@ -385,6 +400,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) err = -ENOMEM; goto errout; } + refcount_set(&rule->refcnt, 1); rule->fr_net = net; rule->pref = tb[FRA_PRIORITY] ? 
nla_get_u32(tb[FRA_PRIORITY]) @@ -425,6 +441,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[FRA_TUN_ID]) rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + err = -EINVAL; if (tb[FRA_L3MDEV]) { #ifdef CONFIG_NET_L3_MASTER_DEV rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); @@ -446,7 +463,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) else rule->suppress_ifgroup = -1; - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) goto errout_free; @@ -502,8 +518,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) last = r; } - fib_rule_get(rule); - if (last) list_add_rcu(&rule->list, &last->list); else @@ -547,12 +561,13 @@ errout: } EXPORT_SYMBOL_GPL(fib_nl_newrule); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) +int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; - struct fib_rule *rule, *tmp; + struct fib_rule *rule, *r; struct nlattr *tb[FRA_MAX+1]; struct fib_kuid_range range; int err = -EINVAL; @@ -566,7 +581,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; } - err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); if (err < 0) goto errout; @@ -576,8 +591,10 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[FRA_UID_RANGE]) { range = nla_get_kuid_range(tb); - if (!uid_range_set(&range)) + if (!uid_range_set(&range)) { + err = -EINVAL; goto errout; + } } else { range = fib_kuid_range_unset; } @@ -650,16 +667,23 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) /* * Check if this rule is a target to any of them. If so, + * adjust to the next one with the same preference or * disable them. As this operation is eventually very - * expensive, it is only performed if goto rules have - * actually been added. + * expensive, it is only performed if goto rules, except + * current if it is goto rule, have actually been added. 
*/ if (ops->nr_goto_rules > 0) { - list_for_each_entry(tmp, &ops->rules_list, list) { - if (rtnl_dereference(tmp->ctarget) == rule) { - RCU_INIT_POINTER(tmp->ctarget, NULL); + struct fib_rule *n; + + n = list_next_entry(rule, list); + if (&n->list == &ops->rules_list || n->pref != rule->pref) + n = NULL; + list_for_each_entry(r, &ops->rules_list, list) { + if (rtnl_dereference(r->ctarget) != rule) + continue; + rcu_assign_pointer(r->ctarget, n); + if (!n) ops->unresolved_rules++; - } } } diff --git a/net/core/filter.c b/net/core/filter.c index 1969b3f118c1..f44fc22fd45a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -26,6 +26,7 @@ #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/socket.h> +#include <linux/sock_diag.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> @@ -52,6 +53,8 @@ #include <net/dst_metadata.h> #include <net/dst.h> #include <net/sock_reuseport.h> +#include <net/busy_poll.h> +#include <net/tcp.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -76,9 +79,10 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ - if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) + if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return -ENOMEM; - + } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) return err; @@ -90,7 +94,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); + struct sock *save_sk = skb->sk; + unsigned int pkt_len; + + skb->sk = sk; + pkt_len = bpf_prog_run_save_cb(filter->prog, skb); + skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); @@ -344,10 +353,11 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program - * @new_prog: buffer where converted program will be stored + * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program * - * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. + * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' + * style extended BPF (eBPF). 
* Conversion workflow: * * 1) First pass for calculating the new program length: @@ -355,14 +365,13 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len) { - int new_flen = 0, pass = 0, target, i; - struct bpf_insn *new_insn; + int new_flen = 0, pass = 0, target, i, stack_off; + struct bpf_insn *new_insn, *first_insn = NULL; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; @@ -374,6 +383,7 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, return -EINVAL; if (new_prog) { + first_insn = new_prog->insnsi; addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL | __GFP_NOWARN); if (!addrs) @@ -381,11 +391,11 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, } do_pass: - new_insn = new_prog; + new_insn = first_insn; fp = prog; /* Classic BPF related prologue emission. */ - if (new_insn) { + if (new_prog) { /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ @@ -406,7 +416,7 @@ do_pass: struct bpf_insn *insn = tmp_insns; if (addrs) - addrs[i] = new_insn - new_prog; + addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. */ @@ -552,17 +562,25 @@ do_pass: /* Store to stack. */ case BPF_ST: case BPF_STX: + stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, - -(BPF_MEMWORDS - fp->k) * 4); + -stack_off); + /* check_load_and_stores() verifies that classic BPF can + * load from stack only after write, so tracking + * stack_depth for ST|STX insns is enough + */ + if (new_prog && new_prog->aux->stack_depth < stack_off) + new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: + stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, - -(BPF_MEMWORDS - fp->k) * 4); + -stack_off); break; /* A = K or X = K */ @@ -610,13 +628,13 @@ do_pass: if (!new_prog) { /* Only calculating new length. 
*/ - *new_len = new_insn - new_prog; + *new_len = new_insn - first_insn; return 0; } pass++; - if (new_flen != new_insn - new_prog) { - new_flen = new_insn - new_prog; + if (new_flen != new_insn - first_insn) { + new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; @@ -927,7 +945,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) */ static void sk_filter_release(struct sk_filter *fp) { - if (atomic_dec_and_test(&fp->refcnt)) + if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } @@ -942,20 +960,27 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) /* try to charge the socket memory if there is space available * return true on success */ -bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { - atomic_inc(&fp->refcnt); atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + bool ret = __sk_filter_charge(sk, fp); + if (ret) + refcount_inc(&fp->refcnt); + return ret; +} + static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; @@ -1001,7 +1026,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. 
Note, @@ -1178,12 +1203,12 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; fp->prog = prog; - atomic_set(&fp->refcnt, 0); - if (!sk_filter_charge(sk, fp)) { + if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } + refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); @@ -1416,8 +1441,8 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1447,8 +1472,8 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_RAW_STACK, - .arg4_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -1522,10 +1547,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; + bool do_mforce = flags & BPF_F_MARK_ENFORCE; __sum16 *ptr; - if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | - BPF_F_HDR_FIELD_MASK))) + if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1533,7 +1559,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, return -EFAULT; ptr = (__sum16 *)(skb->data + offset); - if (is_mmzero && !*ptr) + if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { @@ -1601,10 +1627,10 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_STACK, - .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_STACK, - .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -1849,6 +1875,24 @@ static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) +{ + /* Set user specified hash as L4(+), so that it gets returned + * on skb_get_hash() call unless BPF prog later on triggers a + * skb_clear_hash(). 
+ */ + __skb_set_sw_hash(skb, hash, true); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_proto = { + .func = bpf_set_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { @@ -1968,7 +2012,7 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); - u32 off = skb->network_header - skb->mac_header; + u32 off = skb_mac_header_len(skb); int ret; ret = skb_cow(skb, len_diff); @@ -2004,7 +2048,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); - u32 off = skb->network_header - skb->mac_header; + u32 off = skb_mac_header_len(skb); int ret; ret = skb_unclone(skb, GFP_ATOMIC); @@ -2110,6 +2154,124 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 bpf_skb_net_base_len(const struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return sizeof(struct iphdr); + case htons(ETH_P_IPV6): + return sizeof(struct ipv6hdr); + default: + return ~0U; + } +} + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) +{ + u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); + int ret; + + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* Due to header grow, MSS needs to be downgraded. */ + skb_shinfo(skb)->gso_size -= len_diff; + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + return 0; +} + +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) +{ + u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); + int ret; + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* Due to header shrink, MSS can be upgraded. */ + skb_shinfo(skb)->gso_size += len_diff; + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + return 0; +} + +static u32 __bpf_skb_max_len(const struct sk_buff *skb) +{ + return skb->dev->mtu + skb->dev->hard_header_len; +} + +static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) +{ + bool trans_same = skb->transport_header == skb->network_header; + u32 len_cur, len_diff_abs = abs(len_diff); + u32 len_min = bpf_skb_net_base_len(skb); + u32 len_max = __bpf_skb_max_len(skb); + __be16 proto = skb->protocol; + bool shrink = len_diff < 0; + int ret; + + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + if (unlikely(proto != htons(ETH_P_IP) && + proto != htons(ETH_P_IPV6))) + return -ENOTSUPP; + + len_cur = skb->len - skb_network_offset(skb); + if (skb_transport_header_was_set(skb) && !trans_same) + len_cur = skb_network_header_len(skb); + if ((shrink && (len_diff_abs >= len_cur || + len_cur - len_diff_abs < len_min)) || + (!shrink && (skb->len + len_diff_abs > len_max && + !skb_is_gso(skb)))) + return -ENOTSUPP; + + ret = shrink ? 
bpf_skb_net_shrink(skb, len_diff_abs) : + bpf_skb_net_grow(skb, len_diff_abs); + + bpf_compute_data_end(skb); + return ret; +} + +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (likely(mode == BPF_ADJ_ROOM_NET)) + return bpf_skb_adjust_net(skb, len_diff); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_skb_adjust_room_proto = { + .func = bpf_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + static u32 __bpf_skb_min_len(const struct sk_buff *skb) { u32 min_len = skb_network_offset(skb); @@ -2122,11 +2284,6 @@ static u32 __bpf_skb_min_len(const struct sk_buff *skb) return min_len; } -static u32 __bpf_skb_max_len(const struct sk_buff *skb) -{ - return skb->dev->mtu + skb->dev->hard_header_len; -} - static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) { unsigned int old_len = skb->len; @@ -2263,7 +2420,9 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_proto || func == bpf_skb_change_head || func == bpf_skb_change_tail || + func == bpf_skb_adjust_room || func == bpf_skb_pull_data || + func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head) @@ -2306,8 +2465,8 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -2377,8 +2536,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_RAW_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2412,8 +2571,8 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_RAW_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; @@ -2483,8 +2642,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -2509,8 +2668,8 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, - .arg3_type = ARG_CONST_STACK_SIZE, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * @@ -2521,6 +2680,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) * that is holding verifier mutex. 
*/ md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, GFP_KERNEL); if (!md_dst) return NULL; @@ -2582,8 +2742,8 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) return -EFAULT; - return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, - bpf_xdp_copy); + return bpf_event_output(map, flags, meta, meta_size, xdp->data, + xdp_size, bpf_xdp_copy); } static const struct bpf_func_proto bpf_xdp_event_output_proto = { @@ -2593,12 +2753,146 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_STACK, - .arg5_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) +{ + return skb->sk ? sock_gen_cookie(skb->sk) : 0; +} + +static const struct bpf_func_proto bpf_get_socket_cookie_proto = { + .func = bpf_get_socket_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) +{ + struct sock *sk = sk_to_full_sk(skb->sk); + kuid_t kuid; + + if (!sk || !sk_fullsock(sk)) + return overflowuid; + kuid = sock_net_uid(sock_net(sk), sk); + return from_kuid_munged(sock_net(sk)->user_ns, kuid); +} + +static const struct bpf_func_proto bpf_get_socket_uid_proto = { + .func = bpf_get_socket_uid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = 0; + int val; + + if (!sk_fullsock(sk)) + return -EINVAL; + + if (level == SOL_SOCKET) { + if (optlen != sizeof(int)) + return -EINVAL; + val = *((int *)optval); + + /* Only some socketops are supported */ + switch (optname) { + case SO_RCVBUF: + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + break; + case SO_SNDBUF: + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + break; + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + case SO_PRIORITY: + sk->sk_priority = val; + break; + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? 
: 1; + break; + case SO_MARK: + sk->sk_mark = val; + break; + default: + ret = -EINVAL; + } +#ifdef CONFIG_INET + } else if (level == SOL_TCP && + sk->sk_prot->setsockopt == tcp_setsockopt) { + if (optname == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + + strncpy(name, optval, min_t(long, optlen, + TCP_CA_NAME_MAX-1)); + name[TCP_CA_NAME_MAX-1] = 0; + ret = tcp_set_congestion_control(sk, name, false); + if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + /* replacing an existing ca */ + tcp_reinit_congestion_control(sk, + inet_csk(sk)->icsk_ca_ops); + } else { + struct tcp_sock *tp = tcp_sk(sk); + + if (optlen != sizeof(int)) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > 0) + ret = -EINVAL; + else + tp->snd_cwnd = val; + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) { + ret = -EINVAL; + } else { + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + } + break; + default: + ret = -EINVAL; + } + } + ret = -EINVAL; +#endif + } else { + ret = -EINVAL; + } + return ret; +} + +static const struct bpf_func_proto bpf_setsockopt_proto = { + .func = bpf_setsockopt, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -2626,6 +2920,21 @@ sk_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * +sk_filter_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -2655,6 +2964,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_adjust_room: + return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_get_tunnel_key: @@ -2673,14 +2984,20 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; + case BPF_FUNC_set_hash: + return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; default: - return sk_filter_func_proto(func_id); + return bpf_base_func_proto(func_id); } } @@ -2695,18 +3012,7 @@ xdp_func_proto(enum bpf_func_id func_id) case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; default: - return sk_filter_func_proto(func_id); - } -} - -static const struct bpf_func_proto * -cg_skb_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; - default: - return 
sk_filter_func_proto(func_id); + return bpf_base_func_proto(func_id); } } @@ -2733,7 +3039,18 @@ lwt_inout_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return sk_filter_func_proto(func_id); + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * + sock_ops_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_setsockopt_proto; + default: + return bpf_base_func_proto(func_id); } } @@ -2772,58 +3089,80 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) } } -static bool __is_valid_access(int off, int size) +static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct __sk_buff)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + if (off + size > offsetofend(struct __sk_buff, cb[4])) + return false; + break; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (size != size_default) + return false; + break; + default: + /* Only narrow read access allowed for now. */ + if (type == BPF_WRITE) { + if (size != size_default) + return false; + } else { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } + } return true; } static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { switch (off) { - case offsetof(struct __sk_buff, tc_classid): - case offsetof(struct __sk_buff, data): - case offsetof(struct __sk_buff, data_end): + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): return false; } if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } - return __is_valid_access(off, size); + return bpf_skb_is_valid_access(off, size, type, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { switch (off) { - case offsetof(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, tc_classid): return false; } if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, mark): - case offsetof(struct __sk_buff, priority): - case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]): + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; @@ -2831,20 +3170,20 @@ static bool lwt_is_valid_access(int off, int size, } switch (off) { - case offsetof(struct __sk_buff, data): - *reg_type = PTR_TO_PACKET; + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct __sk_buff, data_end): - *reg_type = PTR_TO_PACKET_END; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; break; } - return __is_valid_access(off, size); + return bpf_skb_is_valid_access(off, size, type, info); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { @@ -2907,16 +3246,15 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, mark): - case offsetof(struct __sk_buff, tc_index): - case offsetof(struct __sk_buff, priority): - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): - case offsetof(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; @@ -2924,15 +3262,15 @@ static bool tc_cls_act_is_valid_access(int off, int size, } switch (off) { - case offsetof(struct __sk_buff, data): - *reg_type = PTR_TO_PACKET; + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct __sk_buff, data_end): - *reg_type = PTR_TO_PACKET_END; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; break; } - return __is_valid_access(off, size); + return bpf_skb_is_valid_access(off, size, type, info); } static bool __is_valid_xdp_access(int off, int size) @@ -2949,17 +3287,17 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; switch (off) { case offsetof(struct xdp_md, data): - *reg_type = PTR_TO_PACKET; + info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_end): - *reg_type = PTR_TO_PACKET_END; + info->reg_type = PTR_TO_PACKET_END; break; } @@ -2972,243 +3310,308 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, - struct bpf_insn *insn_buf, - struct bpf_prog *prog) +static bool __is_valid_sock_ops_access(int off, int size) +{ + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) + return false; + /* The verifier guarantees that size > 0. 
*/ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock_ops, op) ... + offsetof(struct bpf_sock_ops, replylong[3]): + break; + default: + return false; + } + } + + return __is_valid_sock_ops_access(off, size); +} + +static u32 bpf_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, len): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, len)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, len, 4, + target_size)); break; case offsetof(struct __sk_buff, protocol): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, - offsetof(struct sk_buff, protocol)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, protocol, 2, + target_size)); break; case offsetof(struct __sk_buff, vlan_proto): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); - - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, - offsetof(struct sk_buff, vlan_proto)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_proto, 2, + target_size)); break; case offsetof(struct __sk_buff, priority): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); - if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, priority)); + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, priority, 4, + target_size)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, priority)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, priority, 4, + target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); - - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, skb_iif)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, skb_iif, 4, + target_size)); break; case offsetof(struct __sk_buff, ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, - offsetof(struct net_device, ifindex)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, ifindex, 4, + target_size)); break; case offsetof(struct __sk_buff, hash): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, hash)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, hash, 4, + target_size)); break; case offsetof(struct __sk_buff, mark): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - if (type == BPF_WRITE) - 
*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, mark)); + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, mark, 4, + target_size)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, - offsetof(struct sk_buff, mark)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, mark, 4, + target_size)); break; case offsetof(struct __sk_buff, pkt_type): - return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); +#endif + break; case offsetof(struct __sk_buff, queue_mapping): - return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, queue_mapping, 2, + target_size)); + break; case offsetof(struct __sk_buff, vlan_present): - return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, - dst_reg, src_reg, insn); - case offsetof(struct __sk_buff, vlan_tci): - return convert_skb_access(SKF_AD_VLAN_TAG, - dst_reg, src_reg, insn); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_tci, 2, + target_size)); + if (si->off == offsetof(struct __sk_buff, vlan_tci)) { + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, + ~VLAN_TAG_PRESENT); + } else { + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + } + break; case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + BUILD_BUG_ON((offsetof(struct sk_buff, cb) + + offsetof(struct qdisc_skb_cb, data)) % + sizeof(__u64)); prog->cb_access = 1; - ctx_off -= offsetof(struct __sk_buff, cb[0]); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, data); + off = si->off; + off -= offsetof(struct __sk_buff, cb[0]); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): - ctx_off -= offsetof(struct __sk_buff, tc_classid); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); + + off = si->off; + off -= offsetof(struct __sk_buff, tc_classid); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, tc_classid); + *target_size = 2; if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, 
data_end): - ctx_off -= offsetof(struct __sk_buff, data_end); - ctx_off += offsetof(struct sk_buff, cb); - ctx_off += offsetof(struct bpf_skb_data_end, data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg, - ctx_off); + off = si->off; + off -= offsetof(struct __sk_buff, data_end); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); - if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, - offsetof(struct sk_buff, tc_index)); + *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); else - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, - offsetof(struct sk_buff, tc_index)); - break; + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); #else if (type == BPF_WRITE) - *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else - *insn++ = BPF_MOV64_IMM(dst_reg, 0); + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#endif break; + + case offsetof(struct __sk_buff, napi_id): +#if defined(CONFIG_NET_RX_BUSY_POLL) + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, napi_id, 4, + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif + break; } return insn - insn_buf; } static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, - int dst_reg, int src_reg, - int ctx_off, + const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); else - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sock, sk_family)); break; case offsetof(struct bpf_sock, type): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_TYPE_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_TYPE_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); break; case offsetof(struct bpf_sock, protocol): - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, SK_FL_PROTO_MASK); - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, SK_FL_PROTO_SHIFT); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = 
BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; } return insn - insn_buf; } -static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct __sk_buff, ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); - *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, - offsetof(struct net_device, ifindex)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct net_device, ifindex, 4, + target_size)); break; default: - return sk_filter_convert_ctx_access(type, dst_reg, src_reg, - ctx_off, insn_buf, prog); + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); } return insn - insn_buf; } -static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, - int src_reg, int ctx_off, +static u32 xdp_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - switch (ctx_off) { + switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), - dst_reg, src_reg, + si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; } @@ -3216,111 +3619,194 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg, return insn - insn_buf; } -static const struct bpf_verifier_ops sk_filter_ops = { +static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_ops, op) ... 
+ offsetof(struct bpf_sock_ops, replylong[3]): + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, op)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, reply)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong)); + off = si->off; + off -= offsetof(struct bpf_sock_ops, op); + off += offsetof(struct bpf_sock_ops_kern, op); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + break; + + case offsetof(struct bpf_sock_ops, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct bpf_sock_ops, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... + offsetof(struct bpf_sock_ops, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_ip6[0]) ... 
+ offsetof(struct bpf_sock_ops, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, local_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_dport)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_num)); + break; + } + return insn - insn_buf; +} + +const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_convert_ctx_access, }; -static const struct bpf_verifier_ops tc_cls_act_ops = { +const struct bpf_verifier_ops tc_cls_act_prog_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops xdp_ops = { +const struct bpf_verifier_ops xdp_prog_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .test_run = bpf_prog_test_run_xdp, }; -static const struct bpf_verifier_ops cg_skb_ops = { - .get_func_proto = cg_skb_func_proto, +const struct bpf_verifier_ops cg_skb_prog_ops = { + .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_convert_ctx_access, + .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops lwt_inout_ops = { +const struct bpf_verifier_ops lwt_inout_prog_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_convert_ctx_access, + .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops lwt_xmit_ops = { +const struct bpf_verifier_ops lwt_xmit_prog_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops cg_sock_ops = { - .get_func_proto = sk_filter_func_proto, +const struct bpf_verifier_ops cg_sock_prog_ops = { + .get_func_proto = bpf_base_func_proto, 
.is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -static struct bpf_prog_type_list sk_filter_type __read_mostly = { - .ops = &sk_filter_ops, - .type = BPF_PROG_TYPE_SOCKET_FILTER, +const struct bpf_verifier_ops sock_ops_prog_ops = { + .get_func_proto = sock_ops_func_proto, + .is_valid_access = sock_ops_is_valid_access, + .convert_ctx_access = sock_ops_convert_ctx_access, }; -static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_CLS, -}; - -static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_ACT, -}; - -static struct bpf_prog_type_list xdp_type __read_mostly = { - .ops = &xdp_ops, - .type = BPF_PROG_TYPE_XDP, -}; - -static struct bpf_prog_type_list cg_skb_type __read_mostly = { - .ops = &cg_skb_ops, - .type = BPF_PROG_TYPE_CGROUP_SKB, -}; - -static struct bpf_prog_type_list lwt_in_type __read_mostly = { - .ops = &lwt_inout_ops, - .type = BPF_PROG_TYPE_LWT_IN, -}; - -static struct bpf_prog_type_list lwt_out_type __read_mostly = { - .ops = &lwt_inout_ops, - .type = BPF_PROG_TYPE_LWT_OUT, -}; - -static struct bpf_prog_type_list lwt_xmit_type __read_mostly = { - .ops = &lwt_xmit_ops, - .type = BPF_PROG_TYPE_LWT_XMIT, -}; - -static struct bpf_prog_type_list cg_sock_type __read_mostly = { - .ops = &cg_sock_ops, - .type = BPF_PROG_TYPE_CGROUP_SOCK -}; - -static int __init register_sk_filter_ops(void) -{ - bpf_register_prog_type(&sk_filter_type); - bpf_register_prog_type(&sched_cls_type); - bpf_register_prog_type(&sched_act_type); - bpf_register_prog_type(&xdp_type); - bpf_register_prog_type(&cg_skb_type); - bpf_register_prog_type(&cg_sock_type); - bpf_register_prog_type(&lwt_in_type); - bpf_register_prog_type(&lwt_out_type); - bpf_register_prog_type(&lwt_xmit_type); - - return 0; -} -late_initcall(register_sk_filter_ops); - int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow.c b/net/core/flow.c index f765c11d8df5..f7f5d1932a27 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -47,7 +47,7 @@ struct flow_flush_info { static struct kmem_cache *flow_cachep __read_mostly; -#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define flow_cache_hash_size(cache) (1U << (cache)->hash_shift) #define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) static void flow_cache_new_hashrnd(unsigned long arg) @@ -99,7 +99,8 @@ static void flow_cache_gc_task(struct work_struct *work) } static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, - int deleted, struct list_head *gc_list, + unsigned int deleted, + struct list_head *gc_list, struct netns_xfrm *xfrm) { if (deleted) { @@ -114,17 +115,18 @@ static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, static void __flow_cache_shrink(struct flow_cache *fc, struct flow_cache_percpu *fcp, - int shrink_to) + unsigned int shrink_to) { struct flow_cache_entry *fle; struct hlist_node *tmp; LIST_HEAD(gc_list); - int i, deleted = 0; + unsigned int deleted = 0; struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, flow_cache_global); + unsigned int i; for (i = 0; i < flow_cache_hash_size(fc); i++) { - int saved = 0; + unsigned int saved = 0; hlist_for_each_entry_safe(fle, tmp, &fcp->hash_table[i], u.hlist) { @@ -145,7 +147,7 @@ static void __flow_cache_shrink(struct flow_cache *fc, static void flow_cache_shrink(struct flow_cache *fc, struct flow_cache_percpu *fcp) { - int shrink_to = fc->low_watermark / 
flow_cache_hash_size(fc); + unsigned int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); __flow_cache_shrink(fc, fcp, shrink_to); } @@ -161,7 +163,7 @@ static void flow_new_hash_rnd(struct flow_cache *fc, static u32 flow_hash_code(struct flow_cache *fc, struct flow_cache_percpu *fcp, const struct flowi *key, - size_t keysize) + unsigned int keysize) { const u32 *k = (const u32 *) key; const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32); @@ -174,7 +176,7 @@ static u32 flow_hash_code(struct flow_cache *fc, * important assumptions that we can here, such as alignment. */ static int flow_key_compare(const struct flowi *key1, const struct flowi *key2, - size_t keysize) + unsigned int keysize) { const flow_compare_t *k1, *k1_lim, *k2; @@ -199,7 +201,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, *tfle; struct flow_cache_object *flo; - size_t keysize; + unsigned int keysize; unsigned int hash; local_bh_disable(); @@ -295,9 +297,10 @@ static void flow_cache_flush_tasklet(unsigned long data) struct flow_cache_entry *fle; struct hlist_node *tmp; LIST_HEAD(gc_list); - int i, deleted = 0; + unsigned int deleted = 0; struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, flow_cache_global); + unsigned int i; fcp = this_cpu_ptr(fc->percpu); for (i = 0; i < flow_cache_hash_size(fc); i++) { @@ -327,7 +330,7 @@ static void flow_cache_flush_tasklet(unsigned long data) static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu) { struct flow_cache_percpu *fcp; - int i; + unsigned int i; fcp = per_cpu_ptr(fc->percpu, cpu); for (i = 0; i < flow_cache_hash_size(fc); i++) @@ -402,12 +405,12 @@ void flow_cache_flush_deferred(struct net *net) static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) { struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); + unsigned int sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); if (!fcp->hash_table) { fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); if (!fcp->hash_table) { - pr_err("NET: failed to allocate flow cache sz %zu\n", sz); + pr_err("NET: failed to allocate flow cache sz %u\n", sz); return -ENOMEM; } fcp->hash_rnd_recalc = 1; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1b7673aac59d..fc5fc4594c90 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -18,6 +18,7 @@ #include <linux/stddef.h> #include <linux/if_ether.h> #include <linux/mpls.h> +#include <linux/tcp.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> @@ -113,6 +114,293 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +enum flow_dissect_ret { + FLOW_DISSECT_RET_OUT_GOOD, + FLOW_DISSECT_RET_OUT_BAD, + FLOW_DISSECT_RET_OUT_PROTO_AGAIN, +}; + +static enum flow_dissect_ret +__skb_flow_dissect_mpls(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int nhoff, int hlen) +{ + struct flow_dissector_key_keyid *key_keyid; + struct mpls_label *hdr, _hdr[2]; + u32 entry, label; + + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && + !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) + return FLOW_DISSECT_RET_OUT_GOOD; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + entry = 
ntohl(hdr[0].entry); + label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { + struct flow_dissector_key_mpls *key_mpls; + + key_mpls = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS, + target_container); + key_mpls->mpls_label = label; + key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) + >> MPLS_LS_TTL_SHIFT; + key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) + >> MPLS_LS_TC_SHIFT; + key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) + >> MPLS_LS_S_SHIFT; + } + + if (label == MPLS_LABEL_ENTROPY) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK); + } + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret +__skb_flow_dissect_arp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int nhoff, int hlen) +{ + struct flow_dissector_key_arp *key_arp; + struct { + unsigned char ar_sha[ETH_ALEN]; + unsigned char ar_sip[4]; + unsigned char ar_tha[ETH_ALEN]; + unsigned char ar_tip[4]; + } *arp_eth, _arp_eth; + const struct arphdr *arp; + struct arphdr _arp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) + return FLOW_DISSECT_RET_OUT_GOOD; + + arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, + hlen, &_arp); + if (!arp) + return FLOW_DISSECT_RET_OUT_BAD; + + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_hln != ETH_ALEN || + arp->ar_pln != 4 || + (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST))) + return FLOW_DISSECT_RET_OUT_BAD; + + arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), + sizeof(_arp_eth), data, + hlen, &_arp_eth); + if (!arp_eth) + return FLOW_DISSECT_RET_OUT_BAD; + + key_arp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ARP, + target_container); + + memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); + memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); + + /* Only store the lower byte of the opcode; + * this covers ARPOP_REPLY and ARPOP_REQUEST. 
+ */ + key_arp->op = ntohs(arp->ar_op) & 0xff; + + ether_addr_copy(key_arp->sha, arp_eth->ar_sha); + ether_addr_copy(key_arp->tha, arp_eth->ar_tha); + + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret +__skb_flow_dissect_gre(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + struct flow_dissector *flow_dissector, + void *target_container, void *data, + __be16 *p_proto, int *p_nhoff, int *p_hlen, + unsigned int flags) +{ + struct flow_dissector_key_keyid *key_keyid; + struct gre_base_hdr *hdr, _hdr; + int offset = 0; + u16 gre_ver; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), + data, *p_hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) + return FLOW_DISSECT_RET_OUT_GOOD; + + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + return FLOW_DISSECT_RET_OUT_GOOD; + + *p_proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + return FLOW_DISSECT_RET_OUT_GOOD; + } + + offset += sizeof(struct gre_base_hdr); + + if (hdr->flags & GRE_CSUM) + offset += sizeof(((struct gre_full_hdr *) 0)->csum) + + sizeof(((struct gre_full_hdr *) 0)->reserved1); + + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_keyid), + data, *p_hlen, &_keyid); + if (!keyid) + return FLOW_DISSECT_RET_OUT_BAD; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; + } + offset += sizeof(((struct gre_full_hdr *) 0)->key); + } + + if (hdr->flags & GRE_SEQ) + offset += sizeof(((struct pptp_gre_header *) 0)->seq); + + if (gre_ver == 0) { + if (*p_proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_eth), + data, *p_hlen, &_eth); + if (!eth) + return FLOW_DISSECT_RET_OUT_BAD; + *p_proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. 
+ */ + if (NET_IP_ALIGN) + *p_hlen = *p_nhoff + offset; + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof(((struct pptp_gre_header *) 0)->ack); + + ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_ppp_hdr), + data, *p_hlen, _ppp_hdr); + if (!ppp_hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + *p_proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + *p_proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; + } + + *p_nhoff += offset; + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_OUT_PROTO_AGAIN; +} + +static void +__skb_flow_dissect_tcp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int thoff, int hlen) +{ + struct flow_dissector_key_tcp *key_tcp; + struct tcphdr *th, _th; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) + return; + + th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); + if (!th) + return; + + if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) + return; + + key_tcp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_TCP, + target_container); + key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); +} + +static void +__skb_flow_dissect_ipv4(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, const struct iphdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = iph->tos; + key_ip->ttl = iph->ttl; +} + +static void +__skb_flow_dissect_ipv6(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, const struct ipv6hdr *iph) +{ + struct flow_dissector_key_ip *key_ip; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) + return; + + key_ip = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IP, + target_container); + key_ip->tos = ipv6_get_dsfield(iph); + key_ip->ttl = iph->hop_limit; +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -142,7 +430,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; - struct flow_dissector_key_keyid *key_keyid; bool skip_vlan = false; u8 ip_proto = 0; bool ret; @@ -180,7 +467,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, memcpy(key_eth_addrs, ð->h_dest, sizeof(*key_eth_addrs)); } -again: +proto_again: switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; @@ -216,6 +503,9 @@ ip: } } + __skb_flow_dissect_ipv4(skb, flow_dissector, + target_container, data, iph); + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) goto out_good; @@ -261,6 +551,9 @@ ipv6: goto out_good; } + __skb_flow_dissect_ipv6(skb, flow_dissector, + target_container, data, iph); + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) goto out_good; @@ -283,7 +576,7 @@ ipv6: proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); if (skip_vlan) - goto again; + goto proto_again; 
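As a rough illustration of the keys filled in by the helpers above (not part of the patch): a flow-dissector consumer that wants the new IP tos/ttl and TCP flags values could declare them roughly as below. The example_* struct and function names are invented for illustration; only the flow_dissector calls and FLOW_DISSECTOR_KEY_* identifiers come from the code shown here.

#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <net/flow_dissector.h>

struct example_keys {
	struct flow_dissector_key_control control;
	struct flow_dissector_key_basic basic;
	struct flow_dissector_key_ip ip;	/* tos/dsfield and ttl/hop_limit */
	struct flow_dissector_key_tcp tcp;	/* low 12 bits of the TCP flags word */
};

static const struct flow_dissector_key example_dissector_keys[] = {
	{ .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct example_keys, control) },
	{ .key_id = FLOW_DISSECTOR_KEY_BASIC,   .offset = offsetof(struct example_keys, basic) },
	{ .key_id = FLOW_DISSECTOR_KEY_IP,      .offset = offsetof(struct example_keys, ip) },
	{ .key_id = FLOW_DISSECTOR_KEY_TCP,     .offset = offsetof(struct example_keys, tcp) },
};

static struct flow_dissector example_dissector;

/* Done once, e.g. at module init; CONTROL and BASIC keys are mandatory. */
static void example_dissector_setup(void)
{
	skb_flow_dissector_init(&example_dissector, example_dissector_keys,
				ARRAY_SIZE(example_dissector_keys));
}

static void example_dissect(const struct sk_buff *skb, struct example_keys *keys)
{
	memset(keys, 0, sizeof(*keys));
	skb_flow_dissect(skb, &example_dissector, keys, 0);
	/* For IPv4/IPv6 TCP packets, keys->ip and keys->tcp now hold the
	 * values written by __skb_flow_dissect_ipv4/ipv6/tcp() above.
	 */
}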
} skip_vlan = true; @@ -306,7 +599,7 @@ ipv6: } } - goto again; + goto proto_again; } case htons(ETH_P_PPP_SES): { struct { @@ -348,154 +641,52 @@ ipv6: } case htons(ETH_P_MPLS_UC): - case htons(ETH_P_MPLS_MC): { - struct mpls_label *hdr, _hdr[2]; + case htons(ETH_P_MPLS_MC): mpls: - hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, - hlen, &_hdr); - if (!hdr) - goto out_bad; - - if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> - MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { - key_keyid = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY, - target_container); - key_keyid->keyid = hdr[1].entry & - htonl(MPLS_LS_LABEL_MASK); - } - + switch (__skb_flow_dissect_mpls(skb, flow_dissector, + target_container, data, + nhoff, hlen)) { + case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; + case FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; } - - goto out_good; - } - case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) goto out_bad; nhoff += FCOE_HEADER_LEN; goto out_good; + + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + switch (__skb_flow_dissect_arp(skb, flow_dissector, + target_container, data, + nhoff, hlen)) { + case FLOW_DISSECT_RET_OUT_GOOD: + goto out_good; + case FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; + } default: goto out_bad; } ip_proto_again: switch (ip_proto) { - case IPPROTO_GRE: { - struct gre_base_hdr *hdr, _hdr; - u16 gre_ver; - int offset = 0; - - hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); - if (!hdr) + case IPPROTO_GRE: + switch (__skb_flow_dissect_gre(skb, key_control, flow_dissector, + target_container, data, + &proto, &nhoff, &hlen, flags)) { + case FLOW_DISSECT_RET_OUT_GOOD: + goto out_good; + case FLOW_DISSECT_RET_OUT_BAD: goto out_bad; - - /* Only look inside GRE without routing */ - if (hdr->flags & GRE_ROUTING) - break; - - /* Only look inside GRE for version 0 and 1 */ - gre_ver = ntohs(hdr->flags & GRE_VERSION); - if (gre_ver > 1) - break; - - proto = hdr->protocol; - if (gre_ver) { - /* Version1 must be PPTP, and check the flags */ - if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) - break; - } - - offset += sizeof(struct gre_base_hdr); - - if (hdr->flags & GRE_CSUM) - offset += sizeof(((struct gre_full_hdr *)0)->csum) + - sizeof(((struct gre_full_hdr *)0)->reserved1); - - if (hdr->flags & GRE_KEY) { - const __be32 *keyid; - __be32 _keyid; - - keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid), - data, hlen, &_keyid); - if (!keyid) - goto out_bad; - - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_GRE_KEYID)) { - key_keyid = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_GRE_KEYID, - target_container); - if (gre_ver == 0) - key_keyid->keyid = *keyid; - else - key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; - } - offset += sizeof(((struct gre_full_hdr *)0)->key); - } - - if (hdr->flags & GRE_SEQ) - offset += sizeof(((struct pptp_gre_header *)0)->seq); - - if (gre_ver == 0) { - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff + offset, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) - goto out_bad; - proto = eth->h_proto; - offset += sizeof(*eth); - - /* Cap headers that we access via pointers at the - * end of the Ethernet header as our maximum alignment - * at that point is only 2 bytes. 
- */ - if (NET_IP_ALIGN) - hlen = (nhoff + offset); - } - } else { /* version 1, must be PPTP */ - u8 _ppp_hdr[PPP_HDRLEN]; - u8 *ppp_hdr; - - if (hdr->flags & GRE_ACK) - offset += sizeof(((struct pptp_gre_header *)0)->ack); - - ppp_hdr = __skb_header_pointer(skb, nhoff + offset, - sizeof(_ppp_hdr), - data, hlen, _ppp_hdr); - if (!ppp_hdr) - goto out_bad; - - switch (PPP_PROTOCOL(ppp_hdr)) { - case PPP_IP: - proto = htons(ETH_P_IP); - break; - case PPP_IPV6: - proto = htons(ETH_P_IPV6); - break; - default: - /* Could probably catch some more like MPLS */ - break; - } - - offset += PPP_HDRLEN; + case FLOW_DISSECT_RET_OUT_PROTO_AGAIN: + goto proto_again; } - - nhoff += offset; - key_control->flags |= FLOW_DIS_ENCAPSULATION; - if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) - goto out_good; - - goto again; - } case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { @@ -557,6 +748,10 @@ ip_proto_again: case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); goto mpls; + case IPPROTO_TCP: + __skb_flow_dissect_tcp(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; } diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c new file mode 100644 index 000000000000..814e58a3ce8b --- /dev/null +++ b/net/core/gro_cells.c @@ -0,0 +1,92 @@ +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/netdevice.h> +#include <net/gro_cells.h> + +struct gro_cell { + struct sk_buff_head napi_skbs; + struct napi_struct napi; +}; + +int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct gro_cell *cell; + + if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) + return netif_rx(skb); + + cell = this_cpu_ptr(gcells->cells); + + if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + __skb_queue_tail(&cell->napi_skbs, skb); + if (skb_queue_len(&cell->napi_skbs) == 1) + napi_schedule(&cell->napi); + return NET_RX_SUCCESS; +} +EXPORT_SYMBOL(gro_cells_receive); + +/* called under BH context */ +static int gro_cell_poll(struct napi_struct *napi, int budget) +{ + struct gro_cell *cell = container_of(napi, struct gro_cell, napi); + struct sk_buff *skb; + int work_done = 0; + + while (work_done < budget) { + skb = __skb_dequeue(&cell->napi_skbs); + if (!skb) + break; + napi_gro_receive(napi, skb); + work_done++; + } + + if (work_done < budget) + napi_complete_done(napi, work_done); + return work_done; +} + +int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) +{ + int i; + + gcells->cells = alloc_percpu(struct gro_cell); + if (!gcells->cells) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + + __skb_queue_head_init(&cell->napi_skbs); + + set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); + + netif_napi_add(dev, &cell->napi, gro_cell_poll, + NAPI_POLL_WEIGHT); + napi_enable(&cell->napi); + } + return 0; +} +EXPORT_SYMBOL(gro_cells_init); + +void gro_cells_destroy(struct gro_cells *gcells) +{ + int i; + + if (!gcells->cells) + return; + for_each_possible_cpu(i) { + struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); + + netif_napi_del(&cell->napi); + __skb_queue_purge(&cell->napi_skbs); + } + free_percpu(gcells->cells); + gcells->cells = NULL; +} +EXPORT_SYMBOL(gro_cells_destroy); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index b3eef90b2df9..1307731ddfe4 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -209,7 +209,8 @@ static 
int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, int ret; u32 fd; - ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy, + NULL); if (ret < 0) return ret; @@ -237,9 +238,10 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, }; -static int bpf_build_state(struct net_device *dev, struct nlattr *nla, +static int bpf_build_state(struct nlattr *nla, unsigned int family, const void *cfg, - struct lwtunnel_state **ts) + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_BPF_MAX + 1]; struct lwtunnel_state *newts; @@ -249,7 +251,7 @@ static int bpf_build_state(struct net_device *dev, struct nlattr *nla, if (family != AF_INET && family != AF_INET6) return -EAFNOSUPPORT; - ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack); if (ret < 0) return ret; @@ -352,7 +354,7 @@ static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) 0; } -int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) +static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) { /* FIXME: * The LWT state is currently rebuilt for delete requests which diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index c23465005f2f..d9cb3532f1dd 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -101,39 +101,55 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, } EXPORT_SYMBOL(lwtunnel_encap_del_ops); -int lwtunnel_build_state(struct net_device *dev, u16 encap_type, +int lwtunnel_build_state(u16 encap_type, struct nlattr *encap, unsigned int family, - const void *cfg, struct lwtunnel_state **lws) + const void *cfg, struct lwtunnel_state **lws, + struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; + bool found = false; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || - encap_type > LWTUNNEL_ENCAP_MAX) + encap_type > LWTUNNEL_ENCAP_MAX) { + NL_SET_ERR_MSG_ATTR(extack, encap, + "Unknown LWT encapsulation type"); return ret; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state && try_module_get(ops->owner))) { - ret = ops->build_state(dev, encap, family, cfg, lws); + found = true; + ret = ops->build_state(encap, family, cfg, lws, extack); if (ret) module_put(ops->owner); } rcu_read_unlock(); + /* don't rely on -EOPNOTSUPP to detect match as build_state + * handlers could return it + */ + if (!found) { + NL_SET_ERR_MSG_ATTR(extack, encap, + "LWT encapsulation type not supported"); + } + return ret; } EXPORT_SYMBOL(lwtunnel_build_state); -int lwtunnel_valid_encap_type(u16 encap_type) +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; if (encap_type == LWTUNNEL_ENCAP_NONE || - encap_type > LWTUNNEL_ENCAP_MAX) + encap_type > LWTUNNEL_ENCAP_MAX) { + NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type"); return ret; + } rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); @@ -153,16 +169,20 @@ int lwtunnel_valid_encap_type(u16 encap_type) } } #endif - return ops ? 0 : -EOPNOTSUPP; + ret = ops ? 
0 : -EOPNOTSUPP; + if (ret < 0) + NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported"); + + return ret; } EXPORT_SYMBOL(lwtunnel_valid_encap_type); -int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) +int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, + struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; struct nlattr *attrs; - struct nlattr *nla; u16 encap_type; int attrlen; @@ -170,13 +190,13 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { attrs = rtnh_attrs(rtnh); - nla = nla_find(attrs, attrlen, RTA_ENCAP); nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla_entype) { encap_type = nla_get_u16(nla_entype); - if (lwtunnel_valid_encap_type(encap_type) != 0) + if (lwtunnel_valid_encap_type(encap_type, + extack) != 0) return -EOPNOTSUPP; } } @@ -205,7 +225,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; - int ret = -EINVAL; + int ret; if (!lwtstate) return 0; @@ -214,8 +234,11 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; - ret = -EOPNOTSUPP; nest = nla_nest_start(skb, RTA_ENCAP); + if (!nest) + return -EMSGSIZE; + + ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->fill_encap)) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7bb12e07ffef..d0713627deb6 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -52,8 +52,9 @@ do { \ #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(unsigned long arg); -static void __neigh_notify(struct neighbour *n, int type, int flags); -static void neigh_update_notify(struct neighbour *neigh); +static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid); +static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); #ifdef CONFIG_PROC_FS @@ -99,7 +100,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) if (neigh->parms->neigh_cleanup) neigh->parms->neigh_cleanup(neigh); - __neigh_notify(neigh, RTM_DELNEIGH, 0); + __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } @@ -117,6 +118,50 @@ unsigned long neigh_rand_reach_time(unsigned long base) EXPORT_SYMBOL(neigh_rand_reach_time); +static bool neigh_del(struct neighbour *n, __u8 state, + struct neighbour __rcu **np, struct neigh_table *tbl) +{ + bool retval = false; + + write_lock(&n->lock); + if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) { + struct neighbour *neigh; + + neigh = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + rcu_assign_pointer(*np, neigh); + n->dead = 1; + retval = true; + } + write_unlock(&n->lock); + if (retval) + neigh_cleanup_and_release(n); + return retval; +} + +bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) +{ + struct neigh_hash_table *nht; + void *pkey = ndel->primary_key; + u32 hash_val; + struct neighbour *n; + struct neighbour __rcu **np; + + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd); + hash_val = hash_val >> (32 - nht->hash_shift); + + np = &nht->hash_buckets[hash_val]; + while ((n = 
rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock)))) { + if (n == ndel) + return neigh_del(n, 0, np, tbl); + np = &n->next; + } + return false; +} + static int neigh_forced_gc(struct neigh_table *tbl) { int shrunk = 0; @@ -139,19 +184,10 @@ static int neigh_forced_gc(struct neigh_table *tbl) * - nobody refers to it. * - it is not permanent */ - write_lock(&n->lock); - if (atomic_read(&n->refcnt) == 1 && - !(n->nud_state & NUD_PERMANENT)) { - rcu_assign_pointer(*np, - rcu_dereference_protected(n->next, - lockdep_is_held(&tbl->lock))); - n->dead = 1; - shrunk = 1; - write_unlock(&n->lock); - neigh_cleanup_and_release(n); + if (neigh_del(n, NUD_PERMANENT, np, tbl)) { + shrunk = 1; continue; } - write_unlock(&n->lock); np = &n->next; } } @@ -218,7 +254,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) neigh_del_timer(n); n->dead = 1; - if (atomic_read(&n->refcnt) != 1) { + if (refcount_read(&n->refcnt) != 1) { /* The most unpleasant situation. We must destroy neighbour entry, but someone still uses it. @@ -299,7 +335,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device NEIGH_CACHE_STAT_INC(tbl, allocs); n->tbl = tbl; - atomic_set(&n->refcnt, 1); + refcount_set(&n->refcnt, 1); n->dead = 1; out: return n; @@ -311,8 +347,7 @@ out_entries: static void neigh_get_hash_rnd(u32 *x) { - get_random_bytes(x, sizeof(*x)); - *x |= 1; + *x = get_random_u32() | 1; } static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) @@ -408,7 +443,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, rcu_read_lock_bh(); n = __neigh_lookup_noref(tbl, pkey, dev); if (n) { - if (!atomic_inc_not_zero(&n->refcnt)) + if (!refcount_inc_not_zero(&n->refcnt)) n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); } @@ -437,7 +472,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, n = rcu_dereference_bh(n->next)) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { - if (!atomic_inc_not_zero(&n->refcnt)) + if (!refcount_inc_not_zero(&n->refcnt)) n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; @@ -673,7 +708,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms); static inline void neigh_parms_put(struct neigh_parms *parms) { - if (atomic_dec_and_test(&parms->refcnt)) + if (refcount_dec_and_test(&parms->refcnt)) neigh_parms_destroy(parms); } @@ -785,7 +820,7 @@ static void neigh_periodic_work(struct work_struct *work) if (time_before(n->used, n->confirmed)) n->used = n->confirmed; - if (atomic_read(&n->refcnt) == 1 && + if (refcount_read(&n->refcnt) == 1 && (state == NUD_FAILED || time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { *np = n->next; @@ -860,7 +895,8 @@ static void neigh_probe(struct neighbour *neigh) if (skb) skb = skb_clone(skb, GFP_ATOMIC); write_unlock(&neigh->lock); - neigh->ops->solicit(neigh, skb); + if (neigh->ops->solicit) + neigh->ops->solicit(neigh, skb); atomic_inc(&neigh->probes); kfree_skb(skb); } @@ -948,7 +984,7 @@ out: } if (notify) - neigh_update_notify(neigh); + neigh_update_notify(neigh, 0); neigh_release(neigh); } @@ -1072,7 +1108,7 @@ static void neigh_update_hhs(struct neighbour *neigh) */ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags) + u32 flags, u32 nlmsg_pid) { u8 old; int err; @@ -1130,10 +1166,6 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, lladdr = neigh->ha; } - if (new & NUD_CONNECTED) - neigh->confirmed = jiffies; - neigh->updated = jiffies; 
- /* If entry was valid and address is not changed, do not change entry state, if new one is STALE. */ @@ -1155,6 +1187,16 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } } + /* Update timestamps only once we know we will make a change to the + * neighbour entry. Otherwise we risk to move the locktime window with + * noop updates and ignore relevant ARP updates. + */ + if (new != old || lladdr != neigh->ha) { + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + } + if (new != old) { neigh_del_timer(neigh); if (new & NUD_PROBE) @@ -1229,7 +1271,7 @@ out: write_unlock_bh(&neigh->lock); if (notify) - neigh_update_notify(neigh); + neigh_update_notify(neigh, nlmsg_pid); return err; } @@ -1260,7 +1302,7 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, - NEIGH_UPDATE_F_OVERRIDE); + NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); @@ -1436,7 +1478,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL); if (p) { p->tbl = tbl; - atomic_set(&p->refcnt, 1); + refcount_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); dev_hold(dev); @@ -1499,7 +1541,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) INIT_LIST_HEAD(&tbl->parms_list); list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); - atomic_set(&tbl->parms.refcnt, 1); + refcount_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); @@ -1588,7 +1630,8 @@ static struct neigh_table *neigh_find_table(int family) return tbl; } -static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -1638,14 +1681,19 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); + NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid); + write_lock_bh(&tbl->lock); neigh_release(neigh); + neigh_remove_one(neigh, tbl); + write_unlock_bh(&tbl->lock); out: return err; } -static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; struct net *net = sock_net(skb->sk); @@ -1658,7 +1706,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) int err; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) goto out; @@ -1729,7 +1777,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid); neigh_release(neigh); out: @@ -1746,7 +1795,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) if ((parms->dev && nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || - nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) || + nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, 
NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || /* approximative value for deprecated QUEUE_LEN (in packets) */ @@ -1932,7 +1981,8 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_LOCKTIME] = { .type = NLA_U64 }, }; -static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct neigh_table *tbl; @@ -1942,7 +1992,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) int err, tidx; err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, - nl_neightbl_policy); + nl_neightbl_policy, extack); if (err < 0) goto errout; @@ -1980,7 +2030,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) int i, ifindex = 0; err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS], - nl_ntbl_parm_policy); + nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; @@ -2183,7 +2233,7 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); - ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1; + ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || @@ -2229,10 +2279,10 @@ nla_put_failure: return -EMSGSIZE; } -static void neigh_update_notify(struct neighbour *neigh) +static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) { call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); - __neigh_notify(neigh, RTM_NEWNEIGH, 0); + __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); } static bool neigh_master_filtered(struct net_device *dev, int master_idx) @@ -2271,7 +2321,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, unsigned int flags = NLM_F_MULTI; int err; - err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); if (!err) { if (tb[NDA_IFINDEX]) filter_idx = nla_get_u32(tb[NDA_IFINDEX]); @@ -2830,7 +2880,8 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(4); /* NDA_PROBES */ } -static void __neigh_notify(struct neighbour *n, int type, int flags) +static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid) { struct net *net = dev_net(n->dev); struct sk_buff *skb; @@ -2840,7 +2891,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) if (skb == NULL) goto errout; - err = neigh_fill_info(skb, n, 0, 0, type, flags); + err = neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2856,7 +2907,7 @@ errout: void neigh_app_ns(struct neighbour *n) { - __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); + __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); @@ -2923,7 +2974,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) return; set_bit(index, p->data_state); - call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); + if (index == NEIGH_VAR_DELAY_PROBE_TIME) + call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p); if (!dev) /* NULL dev means this is default value */ neigh_copy_dflt_parms(net, p, index); } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 14d09345f00d..4847964931df 100644 
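For context on the extack plumbing threaded through the handlers above (neigh_delete, neigh_add, neightbl_set and their nlmsg_parse/nla_parse_nested calls): a converted doit handler would, roughly, take the shape sketched below. The EXAMPLE_ATTR_* enum, example_policy and example_doit names are hypothetical; nlmsg_parse() with an extack argument and NL_SET_ERR_MSG() are the interfaces used in the hunks above.

#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>

enum {
	EXAMPLE_ATTR_UNSPEC,
	EXAMPLE_ATTR_ID,
	__EXAMPLE_ATTR_MAX,
};
#define EXAMPLE_ATTR_MAX (__EXAMPLE_ATTR_MAX - 1)

static const struct nla_policy example_policy[EXAMPLE_ATTR_MAX + 1] = {
	[EXAMPLE_ATTR_ID] = { .type = NLA_U32 },
};

/* Same shape as the rtnetlink doit handlers in this patch: the extra
 * netlink_ext_ack argument lets parse failures and handler errors carry
 * a message (and the offending attribute) back to user space.
 */
static int example_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
			struct netlink_ext_ack *extack)
{
	struct nlattr *tb[EXAMPLE_ATTR_MAX + 1];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, EXAMPLE_ATTR_MAX,
			  example_policy, extack);
	if (err < 0)
		return err;	/* nlmsg_parse() already filled extack */

	if (!tb[EXAMPLE_ATTR_ID]) {
		NL_SET_ERR_MSG(extack, "id attribute is missing");
		return -EINVAL;
	}

	return 0;
}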
--- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -363,15 +363,10 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) netif_addr_lock_bh(dev); netdev_for_each_mc_addr(ha, dev) { - int i; - - seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, - dev->name, ha->refcount, ha->global_use); - - for (i = 0; i < dev->addr_len; i++) - seq_printf(seq, "%02x", ha->addr[i]); - - seq_putc(seq, '\n'); + seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n", + dev->ifindex, dev->name, + ha->refcount, ha->global_use, + (int)dev->addr_len, ha->addr); } netif_addr_unlock_bh(dev); return 0; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b0c04cf4851d..b4f9922b6f23 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -15,6 +15,7 @@ #include <net/switchdev.h> #include <linux/if_arp.h> #include <linux/slab.h> +#include <linux/sched/signal.h> #include <linux/nsproxy.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -322,7 +323,11 @@ NETDEVICE_SHOW_RW(flags, fmt_hex); static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) { - int res, orig_len = dev->tx_queue_len; + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; if (new_len != orig_len) { dev->tx_queue_len = new_len; @@ -348,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev, return netdev_store(dev, attr, buf, len, change_tx_queue_len); } -NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong); +NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { @@ -621,7 +626,7 @@ static struct attribute *netstat_attrs[] = { }; -static struct attribute_group netstat_group = { +static const struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; @@ -631,7 +636,7 @@ static struct attribute *wireless_attrs[] = { NULL }; -static struct attribute_group wireless_group = { +static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; @@ -952,7 +957,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!list_empty(&dev_net(dev)->exit_list)) + if (!atomic_read(&dev_net(dev)->count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1199,7 +1204,7 @@ static struct attribute *dql_attrs[] = { NULL }; -static struct attribute_group dql_group = { +static const struct attribute_group dql_group = { .name = "byte_queue_limits", .attrs = dql_attrs, }; @@ -1370,7 +1375,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!list_empty(&dev_net(dev)->exit_list)) + if (!atomic_read(&dev_net(dev)->count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1443,7 +1448,7 @@ static void *net_grab_current_ns(void) struct net *ns = current->nsproxy->net_ns; #ifdef CONFIG_NET_NS if (ns) - atomic_inc(&ns->passive); + refcount_inc(&ns->passive); #endif return ns; } @@ -1557,7 +1562,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &(ndev->dev); - if (!list_empty(&dev_net(ndev)->exit_list)) + if (!atomic_read(&dev_net(ndev)->count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 
3c4bbec39713..8726d051f31d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -16,6 +16,8 @@ #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/net_namespace.h> +#include <linux/sched/task.h> + #include <net/sock.h> #include <net/netlink.h> #include <net/net_namespace.h> @@ -33,7 +35,8 @@ LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); struct net init_net = { - .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), + .count = ATOMIC_INIT(1), + .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), }; EXPORT_SYMBOL(init_net); @@ -281,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) LIST_HEAD(net_exit_list); atomic_set(&net->count, 1); - atomic_set(&net->passive, 1); + refcount_set(&net->passive, 1); net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); @@ -312,6 +315,25 @@ out_undo: goto out; } +static int __net_init net_defaults_init_net(struct net *net) +{ + net->core.sysctl_somaxconn = SOMAXCONN; + return 0; +} + +static struct pernet_operations net_defaults_ops = { + .init = net_defaults_init_net, +}; + +static __init int net_defaults_init(void) +{ + if (register_pernet_subsys(&net_defaults_ops)) + panic("Cannot initialize net default settings"); + + return 0; +} + +core_initcall(net_defaults_init); #ifdef CONFIG_NET_NS static struct ucounts *inc_net_namespaces(struct user_namespace *ns) @@ -358,7 +380,7 @@ static void net_free(struct net *net) void net_drop_ns(void *p) { struct net *ns = p; - if (ns && atomic_dec_and_test(&ns->passive)) + if (ns && refcount_dec_and_test(&ns->passive)) net_free(ns); } @@ -479,6 +501,23 @@ static void cleanup_net(struct work_struct *work) net_drop_ns(net); } } + +/** + * net_ns_barrier - wait until concurrent net_cleanup_work is done + * + * cleanup_net runs from work queue and will first remove namespaces + * from the global list, then run net exit functions. + * + * Call this in module exit path to make sure that all netns + * ->exit ops have been invoked before the function is removed. 
+ */ +void net_ns_barrier(void) +{ + mutex_lock(&net_mutex); + mutex_unlock(&net_mutex); +} +EXPORT_SYMBOL(net_ns_barrier); + static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) @@ -569,34 +608,48 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_FD] = { .type = NLA_U32 }, }; -static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct nlattr *nla; struct net *peer; int nsid, err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy); + rtnl_net_policy, extack); if (err < 0) return err; - if (!tb[NETNSA_NSID]) + if (!tb[NETNSA_NSID]) { + NL_SET_ERR_MSG(extack, "nsid is missing"); return -EINVAL; + } nsid = nla_get_s32(tb[NETNSA_NSID]); - if (tb[NETNSA_PID]) + if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); - else if (tb[NETNSA_FD]) + nla = tb[NETNSA_PID]; + } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); - else + nla = tb[NETNSA_FD]; + } else { + NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; - if (IS_ERR(peer)) + } + if (IS_ERR(peer)) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); + } spin_lock_bh(&net->nsid_lock); if (__peernet2id(net, peer) >= 0) { spin_unlock_bh(&net->nsid_lock); err = -EEXIST; + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, + "Peer netns already has a nsid assigned"); goto out; } @@ -605,6 +658,10 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; + } else if (err == -ENOSPC && nsid >= 0) { + err = -EEXIST; + NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]); + NL_SET_ERR_MSG(extack, "The specified nsid is already used"); } out: put_net(peer); @@ -642,27 +699,36 @@ nla_put_failure: return -EMSGSIZE; } -static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + struct nlattr *nla; struct sk_buff *msg; struct net *peer; int err, id; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy); + rtnl_net_policy, extack); if (err < 0) return err; - if (tb[NETNSA_PID]) + if (tb[NETNSA_PID]) { peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID])); - else if (tb[NETNSA_FD]) + nla = tb[NETNSA_PID]; + } else if (tb[NETNSA_FD]) { peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD])); - else + nla = tb[NETNSA_FD]; + } else { + NL_SET_ERR_MSG(extack, "Peer netns reference is missing"); return -EINVAL; + } - if (IS_ERR(peer)) + if (IS_ERR(peer)) { + NL_SET_BAD_ATTR(extack, nla); + NL_SET_ERR_MSG(extack, "Peer netns reference is invalid"); return PTR_ERR(peer); + } msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL); if (!msg) { diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 11fce17274f6..029a61ac6cdd 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -12,6 +12,8 @@ #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> +#include <linux/sched/task.h> + #include <net/cls_cgroup.h> #include <net/sock.h> @@ -69,27 +71,17 @@ static int update_classid_sock(const void *v, struct 
file *file, unsigned n) return 0; } -static void update_classid(struct cgroup_subsys_state *css, void *v) +static void cgrp_attach(struct cgroup_taskset *tset) { - struct css_task_iter it; + struct cgroup_subsys_state *css; struct task_struct *p; - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); - iterate_fd(p->files, 0, update_classid_sock, v); + iterate_fd(p->files, 0, update_classid_sock, + (void *)(unsigned long)css_cls_state(css)->classid); task_unlock(p); } - css_task_iter_end(&it); -} - -static void cgrp_attach(struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - - cgroup_taskset_first(tset, &css); - update_classid(css, - (void *)(unsigned long)css_cls_state(css)->classid); } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) @@ -101,12 +93,22 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { struct cgroup_cls_state *cs = css_cls_state(css); + struct css_task_iter it; + struct task_struct *p; cgroup_sk_alloc_disable(); cs->classid = (u32)value; - update_classid(css, (void *)(unsigned long)cs->classid); + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { + task_lock(p); + iterate_fd(p->files, 0, update_classid_sock, + (void *)(unsigned long)cs->classid); + task_unlock(p); + } + css_task_iter_end(&it); + return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 9424673009c1..8357f164c660 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -105,15 +105,21 @@ static void queue_process(struct work_struct *work) while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; struct netdev_queue *txq; + unsigned int q_index; if (!netif_device_present(dev) || !netif_running(dev)) { kfree_skb(skb); continue; } - txq = skb_get_tx_queue(dev, skb); - local_irq_save(flags); + /* check if skb->queue_mapping is still valid */ + q_index = skb_get_queue_mapping(skb); + if (unlikely(q_index >= dev->real_num_tx_queues)) { + q_index = q_index % dev->real_num_tx_queues; + skb_set_queue_mapping(skb, q_index); + } + txq = netdev_get_tx_queue(dev, q_index); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) { @@ -271,7 +277,7 @@ static void zap_completion_queue(void) struct sk_buff *skb = clist; clist = clist->next; if (!skb_irq_freeable(skb)) { - atomic_inc(&skb->users); + refcount_set(&skb->users, 1); dev_kfree_skb_any(skb); /* put this one back */ } else { __kfree_skb(skb); @@ -303,7 +309,7 @@ repeat: return NULL; } - atomic_set(&skb->users, 1); + refcount_set(&skb->users, 1); skb_reserve(skb, reserve); return skb; } @@ -435,7 +441,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) ip6h->saddr = np->local_ip.in6; ip6h->daddr = np->remote_ip.in6; - eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IPV6); } else { @@ -464,7 +470,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) put_unaligned(np->remote_ip.ip, &(iph->daddr)); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); - eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + eth = skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IP); } @@ -626,7 +632,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) 
skb_queue_head_init(&npinfo->txq); INIT_DELAYED_WORK(&npinfo->tx_work, queue_process); - atomic_set(&npinfo->refcnt, 1); + refcount_set(&npinfo->refcnt, 1); ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { @@ -636,7 +642,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) } } else { npinfo = rtnl_dereference(ndev->npinfo); - atomic_inc(&npinfo->refcnt); + refcount_inc(&npinfo->refcnt); } npinfo->netpoll = np; @@ -815,7 +821,7 @@ void __netpoll_cleanup(struct netpoll *np) synchronize_srcu(&netpoll_srcu); - if (atomic_dec_and_test(&npinfo->refcnt)) { + if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; ops = np->dev->netdev_ops; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 2ec86fc552df..1c4810919a0a 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -11,14 +11,18 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/atomic.h> +#include <linux/sched/task.h> + #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8e69ce472236..6e1e10ff433a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2675,7 +2675,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, goto err; } /* restore ll */ - eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); + eth = skb_push(skb, ETH_HLEN); memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN); eth->h_proto = protocol; @@ -2714,11 +2714,11 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, struct timeval timestamp; struct pktgen_hdr *pgh; - pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh)); + pgh = skb_put(skb, sizeof(*pgh)); datalen -= sizeof(*pgh); if (pkt_dev->nfrags <= 0) { - memset(skb_put(skb, datalen), 0, datalen); + skb_put_zero(skb, datalen); } else { int frags = pkt_dev->nfrags; int i, len; @@ -2729,7 +2729,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, frags = MAX_SKB_FRAGS; len = datalen - frags * PAGE_SIZE; if (len > 0) { - memset(skb_put(skb, len), 0, len); + skb_put_zero(skb, len); datalen = frags * PAGE_SIZE; } @@ -2844,34 +2844,35 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ - eth = (__u8 *) skb_push(skb, 14); - mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); + eth = skb_push(skb, 14); + mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32)); if (pkt_dev->nr_labels) mpls_push(mpls, pkt_dev); if (pkt_dev->vlan_id != 0xffff) { if (pkt_dev->svlan_id != 0xffff) { - svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + svlan_tci = skb_put(skb, sizeof(__be16)); *svlan_tci = build_tci(pkt_dev->svlan_id, pkt_dev->svlan_cfi, pkt_dev->svlan_p); - svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + svlan_encapsulated_proto = skb_put(skb, + sizeof(__be16)); *svlan_encapsulated_proto = htons(ETH_P_8021Q); } - vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + vlan_tci = skb_put(skb, sizeof(__be16)); *vlan_tci = build_tci(pkt_dev->vlan_id, pkt_dev->vlan_cfi, pkt_dev->vlan_p); - vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + vlan_encapsulated_proto = skb_put(skb, sizeof(__be16)); *vlan_encapsulated_proto = htons(ETH_P_IP); } 
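The pktgen conversions below depend on skb_put()/skb_push() returning void * and on the skb_put_zero() helper, so the explicit casts and the open-coded memset(skb_put(...), 0, len) disappear. A hypothetical driver-style sketch of the same pattern (example_build_headers() is invented; it assumes the caller reserved at least ETH_HLEN of headroom):

#include <linux/skbuff.h>
#include <linux/if_ether.h>
#include <linux/ip.h>

static void example_build_headers(struct sk_buff *skb, unsigned int pad_len)
{
	struct ethhdr *eth;
	struct iphdr *iph;

	eth = skb_push(skb, ETH_HLEN);		/* was: (struct ethhdr *)skb_push(...) */
	eth->h_proto = htons(ETH_P_IP);

	iph = skb_put(skb, sizeof(*iph));	/* was: (struct iphdr *)skb_put(...) */
	iph->version = 4;
	iph->ihl = 5;

	skb_put_zero(skb, pad_len);		/* was: memset(skb_put(skb, pad_len), 0, pad_len) */
}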
skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); - iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr)); + iph = skb_put(skb, sizeof(struct iphdr)); skb_set_transport_header(skb, skb->len); - udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); + udph = skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; @@ -2971,34 +2972,35 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb_reserve(skb, 16); /* Reserve for ethernet and IP header */ - eth = (__u8 *) skb_push(skb, 14); - mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32)); + eth = skb_push(skb, 14); + mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32)); if (pkt_dev->nr_labels) mpls_push(mpls, pkt_dev); if (pkt_dev->vlan_id != 0xffff) { if (pkt_dev->svlan_id != 0xffff) { - svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + svlan_tci = skb_put(skb, sizeof(__be16)); *svlan_tci = build_tci(pkt_dev->svlan_id, pkt_dev->svlan_cfi, pkt_dev->svlan_p); - svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + svlan_encapsulated_proto = skb_put(skb, + sizeof(__be16)); *svlan_encapsulated_proto = htons(ETH_P_8021Q); } - vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16)); + vlan_tci = skb_put(skb, sizeof(__be16)); *vlan_tci = build_tci(pkt_dev->vlan_id, pkt_dev->vlan_cfi, pkt_dev->vlan_p); - vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16)); + vlan_encapsulated_proto = skb_put(skb, sizeof(__be16)); *vlan_encapsulated_proto = htons(ETH_P_IPV6); } skb_reset_mac_header(skb); skb_set_network_header(skb, skb->len); - iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + iph = skb_put(skb, sizeof(struct ipv6hdr)); skb_set_transport_header(skb, skb->len); - udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr)); + udph = skb_put(skb, sizeof(struct udphdr)); skb_set_queue_mapping(skb, queue_map); skb->priority = pkt_dev->skb_priority; @@ -3361,7 +3363,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); - while (atomic_read(&(pkt_dev->skb->users)) != 1) { + while (refcount_read(&(pkt_dev->skb->users)) != 1) { if (signal_pending(current)) break; @@ -3418,7 +3420,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) { skb = pkt_dev->skb; skb->protocol = eth_type_trans(skb, skb->dev); - atomic_add(burst, &skb->users); + refcount_add(burst, &skb->users); local_bh_disable(); do { ret = netif_receive_skb(skb); @@ -3426,11 +3428,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->errors++; pkt_dev->sofar++; pkt_dev->seq_num++; - if (atomic_read(&skb->users) != burst) { + if (refcount_read(&skb->users) != burst) { /* skb was queued by rps/rfs or taps, * so cannot reuse this skb */ - atomic_sub(burst - 1, &skb->users); + WARN_ON(refcount_sub_and_test(burst - 1, &skb->users)); /* get out of the loop and wait * until skb is consumed */ @@ -3439,14 +3441,12 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* skb was 'freed' by stack, so clean few * bits and reuse it */ -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; /* reset reclass/redir ttl */ -#endif + skb_reset_tc(skb); } while (--burst > 0); goto out; /* Skips xmit_mode M_START_XMIT */ } else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) { local_bh_disable(); - atomic_inc(&pkt_dev->skb->users); + refcount_inc(&pkt_dev->skb->users); ret = dev_queue_xmit(pkt_dev->skb); switch (ret) { @@ -3487,7 +3487,7 @@ static void pktgen_xmit(struct 
pktgen_dev *pkt_dev) pkt_dev->last_ok = 0; goto unlock; } - atomic_add(burst, &pkt_dev->skb->users); + refcount_add(burst, &pkt_dev->skb->users); xmit_more: ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0); @@ -3513,11 +3513,11 @@ xmit_more: /* fallthru */ case NETDEV_TX_BUSY: /* Retry it next time */ - atomic_dec(&(pkt_dev->skb->users)); + refcount_dec(&(pkt_dev->skb->users)); pkt_dev->last_ok = 0; } if (unlikely(burst)) - atomic_sub(burst, &pkt_dev->skb->users); + WARN_ON(refcount_sub_and_test(burst, &pkt_dev->skb->users)); unlock: HARD_TX_UNLOCK(odev, txq); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 5d26056b6d8f..9b8727c67b58 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -34,8 +34,6 @@ * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ -int sysctl_max_syn_backlog = 256; -EXPORT_SYMBOL(sysctl_max_syn_backlog); void reqsk_queue_alloc(struct request_sock_queue *queue) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 75e3ea7bda08..9201e3621351 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -16,6 +16,7 @@ * Vitaly E. Lavrov RTA_OK arithmetics was wrong. */ +#include <linux/bitops.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/types.h> @@ -39,6 +40,7 @@ #include <linux/if_vlan.h> #include <linux/pci.h> #include <linux/etherdevice.h> +#include <linux/bpf.h> #include <linux/uaccess.h> @@ -647,7 +649,7 @@ int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int g NETLINK_CB(skb).dst_group = group; if (echo) - atomic_inc(&skb->users); + refcount_inc(&skb->users); netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); if (echo) err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); @@ -837,8 +839,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { - if (dev->dev.parent && dev_is_pci(dev->dev.parent) && - (ext_filter_mask & RTEXT_FILTER_VF)) { + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * @@ -877,8 +878,6 @@ static size_t rtnl_port_size(const struct net_device *dev, { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ - + nla_total_size(sizeof(struct ifla_port_vsi)) - /* PORT_VSI_TYPE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ @@ -899,15 +898,13 @@ static size_t rtnl_port_size(const struct net_device *dev, return port_self_size; } -static size_t rtnl_xdp_size(const struct net_device *dev) +static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ - nla_total_size(1); /* XDP_ATTACHED */ + nla_total_size(1) + /* XDP_ATTACHED */ + nla_total_size(4); /* XDP_PROG_ID */ - if (!dev->netdev_ops->ndo_xdp) - return 0; - else - return xdp_size; + return xdp_size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, @@ -937,6 +934,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 
4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ @@ -946,7 +944,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ - + rtnl_xdp_size(dev) /* IFLA_XDP */ + + rtnl_xdp_size() /* IFLA_XDP */ + + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1059,7 +1058,7 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) return err; } - if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; @@ -1130,6 +1129,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct ifla_vf_mac vf_mac; struct ifla_vf_info ivi; + memset(&ivi, 0, sizeof(ivi)); + /* Not all SR-IOV capable drivers support the * spoofcheck and "RSS query enable" query. Preset to * -1 so the user space tool can detect that the driver @@ -1138,7 +1139,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, ivi.spoofchk = -1; ivi.rss_query_en = -1; ivi.trusted = -1; - memset(ivi.mac, 0, sizeof(ivi.mac)); /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero */ @@ -1252,25 +1252,46 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } +static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) +{ + const struct net_device_ops *ops = dev->netdev_ops; + const struct bpf_prog *generic_xdp_prog; + + ASSERT_RTNL(); + + *prog_id = 0; + generic_xdp_prog = rtnl_dereference(dev->xdp_prog); + if (generic_xdp_prog) { + *prog_id = generic_xdp_prog->aux->id; + return XDP_ATTACHED_SKB; + } + if (!ops->ndo_xdp) + return XDP_ATTACHED_NONE; + + return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id); +} + static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { - struct netdev_xdp xdp_op = {}; struct nlattr *xdp; + u32 prog_id; int err; - if (!dev->netdev_ops->ndo_xdp) - return 0; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - xdp_op.command = XDP_QUERY_PROG; - err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); - if (err) - goto err_cancel; - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached); + + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, + rtnl_xdp_attached_mode(dev, &prog_id)); if (err) goto err_cancel; + if (prog_id) { + err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); + if (err) + goto err_cancel; + } + nla_nest_end(skb, xdp); return 0; @@ -1279,9 +1300,40 @@ err_cancel: return err; } +static u32 rtnl_get_event(unsigned long event) +{ + u32 rtnl_event_type = IFLA_EVENT_NONE; + + switch (event) { + case NETDEV_REBOOT: + rtnl_event_type = IFLA_EVENT_REBOOT; + break; + case NETDEV_FEAT_CHANGE: + rtnl_event_type = IFLA_EVENT_FEATURES; + break; + case NETDEV_BONDING_FAILOVER: + rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; + break; + case NETDEV_NOTIFY_PEERS: + rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; + break; + case NETDEV_RESEND_IGMP: + rtnl_event_type = IFLA_EVENT_IGMP_RESEND; + break; + case NETDEV_CHANGEINFODATA: + rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; + break; + default: + break; + } + + return rtnl_event_type; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask) + unsigned int flags, u32 
ext_filter_mask, + u32 event) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1330,6 +1382,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; + if (event != IFLA_EVENT_NONE) { + if (nla_put_u32(skb, IFLA_EVENT, event)) + goto nla_put_failure; + } + if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; @@ -1464,6 +1521,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, + [IFLA_EVENT] = { .type = NLA_U32 }, + [IFLA_GROUP] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1492,20 +1551,26 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { [IFLA_PORT_VF] = { .type = NLA_U32 }, [IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX }, - [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, - .len = sizeof(struct ifla_port_vsi)}, [IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX }, [IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX }, [IFLA_PORT_REQUEST] = { .type = NLA_U8, }, [IFLA_PORT_RESPONSE] = { .type = NLA_U16, }, + + /* Unused, but we need to keep it here since user space could + * fill it. It's also broken with regard to NLA_BINARY use in + * combination with structs. + */ + [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, + .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, + [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) @@ -1513,7 +1578,8 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; - if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0) + if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, + ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { @@ -1590,8 +1656,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? 
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { - + if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, + ifla_policy, NULL) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -1617,14 +1683,14 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask); - /* If we ran out of room on the first message, - * we're in trouble - */ - WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + ext_filter_mask, 0); - if (err < 0) - goto out; + if (err < 0) { + if (likely(skb->len)) + goto out; + + goto out_err; + } nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: @@ -1632,15 +1698,18 @@ cont: } } out: + err = skb->len; +out_err: cb->args[1] = idx; cb->args[0] = h; - return skb->len; + return err; } -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len) +int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, + struct netlink_ext_ack *exterr) { - return nla_parse(tb, IFLA_MAX, head, len, ifla_policy); + return nla_parse(tb, IFLA_MAX, head, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifla); @@ -1905,6 +1974,7 @@ static int do_set_master(struct net_device *dev, int ifindex) #define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, struct nlattr **tb, char *ifname, int status) { const struct net_device_ops *ops = dev->netdev_ops; @@ -1961,7 +2031,8 @@ static int do_setlink(const struct sk_buff *skb, struct sockaddr *sa; int len; - len = sizeof(sa_family_t) + dev->addr_len; + len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, + sizeof(*sa)); sa = kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; @@ -2035,8 +2106,8 @@ static int do_setlink(const struct sk_buff *skb, } if (tb[IFLA_TXQLEN]) { - unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]); - unsigned long orig_len = dev->tx_queue_len; + unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); + unsigned int orig_len = dev->tx_queue_len; if (dev->tx_queue_len ^ value) { dev->tx_queue_len = value; @@ -2076,7 +2147,7 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } err = nla_parse_nested(vfinfo, IFLA_VF_MAX, attr, - ifla_vf_policy); + ifla_vf_policy, NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); @@ -2104,7 +2175,7 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } err = nla_parse_nested(port, IFLA_PORT_MAX, attr, - ifla_port_policy); + ifla_port_policy, NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { @@ -2124,7 +2195,8 @@ static int do_setlink(const struct sk_buff *skb, struct nlattr *port[IFLA_PORT_MAX+1]; err = nla_parse_nested(port, IFLA_PORT_MAX, - tb[IFLA_PORT_SELF], ifla_port_policy); + tb[IFLA_PORT_SELF], ifla_port_policy, + NULL); if (err < 0) goto errout; @@ -2168,11 +2240,11 @@ static int do_setlink(const struct sk_buff *skb, u32 xdp_flags = 0; err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], - ifla_xdp_policy); + ifla_xdp_policy, NULL); if (err < 0) goto errout; - if (xdp[IFLA_XDP_ATTACHED]) { + if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { err = -EINVAL; goto errout; } @@ -2183,10 +2255,14 @@ static int do_setlink(const struct sk_buff *skb, err = -EINVAL; goto errout; } + if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { + err = -EINVAL; + goto errout; + } } if (xdp[IFLA_XDP_FD]) { - err = 
dev_change_xdp_fd(dev, + err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), xdp_flags); if (err) @@ -2208,7 +2284,8 @@ errout: return err; } -static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -2217,7 +2294,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[IFLA_MAX+1]; char ifname[IFNAMSIZ]; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, + extack); if (err < 0) goto errout; @@ -2244,7 +2322,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) goto errout; - err = do_setlink(skb, dev, ifm, tb, ifname, 0); + err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0); errout: return err; } @@ -2301,7 +2379,8 @@ int rtnl_delete_link(struct net_device *dev) } EXPORT_SYMBOL_GPL(rtnl_delete_link); -static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *dev; @@ -2310,7 +2389,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[IFLA_MAX+1]; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; @@ -2356,7 +2435,6 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[]) { - int err; struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; @@ -2371,11 +2449,10 @@ struct net_device *rtnl_create_link(struct net *net, else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); - err = -ENOMEM; dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); if (!dev) - goto err; + return ERR_PTR(-ENOMEM); dev_net_set(dev, net); dev->rtnl_link_ops = ops; @@ -2401,15 +2478,13 @@ struct net_device *rtnl_create_link(struct net *net, dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); return dev; - -err: - return ERR_PTR(err); } EXPORT_SYMBOL(rtnl_create_link); static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; @@ -2417,7 +2492,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = do_setlink(skb, dev, ifm, tb, NULL, 0); + err = do_setlink(skb, dev, ifm, extack, tb, NULL, 0); if (err < 0) return err; } @@ -2426,7 +2501,8 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; @@ -2444,7 +2520,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) #ifdef CONFIG_MODULES replay: #endif - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; @@ 
-2475,7 +2551,8 @@ replay: if (tb[IFLA_LINKINFO]) { err = nla_parse_nested(linkinfo, IFLA_INFO_MAX, - tb[IFLA_LINKINFO], ifla_info_policy); + tb[IFLA_LINKINFO], ifla_info_policy, + NULL); if (err < 0) return err; } else @@ -2500,13 +2577,13 @@ replay: if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested(attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], - ops->policy); + ops->policy, NULL); if (err < 0) return err; data = attr; } if (ops->validate) { - err = ops->validate(tb, data); + err = ops->validate(tb, data, extack); if (err < 0) return err; } @@ -2518,13 +2595,15 @@ replay: err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], - m_ops->slave_policy); + m_ops->slave_policy, + NULL); if (err < 0) return err; slave_data = slave_attr; } if (m_ops->slave_validate) { - err = m_ops->slave_validate(tb, slave_data); + err = m_ops->slave_validate(tb, slave_data, + extack); if (err < 0) return err; } @@ -2543,7 +2622,7 @@ replay: !ops->changelink) return -EOPNOTSUPP; - err = ops->changelink(dev, tb, data); + err = ops->changelink(dev, tb, data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; @@ -2554,24 +2633,26 @@ replay: return -EOPNOTSUPP; err = m_ops->slave_changelink(master_dev, dev, - tb, slave_data); + tb, slave_data, + extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, ifm, tb, ifname, status); + return do_setlink(skb, dev, ifm, extack, tb, ifname, + status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), - ifm, tb); + ifm, extack, tb); return -ENODEV; } - if (tb[IFLA_MAP] || tb[IFLA_MASTER] || tb[IFLA_PROTINFO]) + if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; if (!ops) { @@ -2627,7 +2708,8 @@ replay: dev->ifindex = ifm->ifi_index; if (ops->newlink) { - err = ops->newlink(link_net ? : net, dev, tb, data); + err = ops->newlink(link_net ? : net, dev, tb, data, + extack); /* Drivers should call free_netdev() in ->destructor * and unregister it on failure after registration * so that device could be finally freed in rtnl_unlock. @@ -2653,6 +2735,11 @@ replay: if (err < 0) goto out_unregister; } + if (tb[IFLA_MASTER]) { + err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER])); + if (err) + goto out_unregister; + } out: if (link_net) put_net(link_net); @@ -2671,7 +2758,8 @@ out_unregister: } } -static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -2682,7 +2770,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) int err; u32 ext_filter_mask = 0; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; @@ -2708,7 +2796,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2732,7 +2820,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? 
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { + if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); } @@ -2780,7 +2868,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned int change, gfp_t flags) + unsigned int change, + u32 event, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2791,7 +2880,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2812,18 +2901,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +static void rtmsg_ifinfo_event(int type, struct net_device *dev, + unsigned int change, u32 event, + gfp_t flags) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags); +} EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -2953,7 +3049,8 @@ static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid) return 0; } -static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -2963,7 +3060,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) u16 vid; int err; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; @@ -3053,7 +3150,8 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_del); -static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -3066,7 +3164,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; @@ -3201,8 +3299,11 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, - ifla_policy) == 0) { + err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL); + if (err < 0) { + return -EINVAL; + } else if (err == 0) { if (tb[IFLA_MASTER]) br_idx = nla_get_u32(tb[IFLA_MASTER]); } @@ -3425,8 +3526,12 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) err = 
br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } @@ -3437,16 +3542,22 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) seq, dev, filter_mask, NLM_F_MULTI); - if (err < 0 && err != -EOPNOTSUPP) - break; + if (err < 0 && err != -EOPNOTSUPP) { + if (likely(skb->len)) + break; + + goto out_err; + } } idx++; } } + err = skb->len; +out_err: rcu_read_unlock(); cb->args[0] = idx; - return skb->len; + return err; } static inline size_t bridge_nlmsg_size(void) @@ -3496,7 +3607,8 @@ errout: return err; } -static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -3570,7 +3682,8 @@ out: return err; } -static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -3829,6 +3942,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, *idxattr = 0; } + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { + struct rtnl_af_ops *af_ops; + + *idxattr = IFLA_STATS_AF_SPEC; + attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC); + if (!attr) + goto nla_put_failure; + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->fill_stats_af) { + struct nlattr *af; + int err; + + af = nla_nest_start(skb, af_ops->family); + if (!af) + goto nla_put_failure; + + err = af_ops->fill_stats_af(skb, dev); + + if (err == -ENODATA) + nla_nest_cancel(skb, af); + else if (err < 0) + goto nla_put_failure; + + nla_nest_end(skb, af); + } + } + + nla_nest_end(skb, attr); + + *idxattr = 0; + } + nlmsg_end(skb, nlh); return 0; @@ -3885,10 +4031,28 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) size += rtnl_get_offload_stats_size(dev); + if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { + struct rtnl_af_ops *af_ops; + + /* for IFLA_STATS_AF_SPEC */ + size += nla_total_size(0); + + list_for_each_entry(af_ops, &rtnl_af_ops, list) { + if (af_ops->get_stats_af_size) { + size += nla_total_size( + af_ops->get_stats_af_size(dev)); + + /* for AF_* */ + size += nla_total_size(0); + } + } + } + return size; } -static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; @@ -3994,7 +4158,8 @@ out: /* Process one rtnetlink message. 
*/ -static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); rtnl_doit_func doit; @@ -4049,7 +4214,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (doit == NULL) return -EOPNOTSUPP; - return doit(skb, nlh); + return doit(skb, nlh, extack); } static void rtnetlink_rcv(struct sk_buff *skb) @@ -4059,27 +4224,35 @@ static void rtnetlink_rcv(struct sk_buff *skb) rtnl_unlock(); } +static int rtnetlink_bind(struct net *net, int group) +{ + switch (group) { + case RTNLGRP_IPV4_MROUTE_R: + case RTNLGRP_IPV6_MROUTE_R: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + break; + } + return 0; +} + static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_PRE_UP: - case NETDEV_POST_INIT: - case NETDEV_REGISTER: - case NETDEV_CHANGE: - case NETDEV_PRE_TYPE_CHANGE: - case NETDEV_GOING_DOWN: - case NETDEV_UNREGISTER: - case NETDEV_UNREGISTER_FINAL: - case NETDEV_RELEASE: - case NETDEV_JOIN: - case NETDEV_BONDING_INFO: + case NETDEV_REBOOT: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_NOTIFY_PEERS: + case NETDEV_RESEND_IGMP: + case NETDEV_CHANGEINFODATA: + rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), + GFP_KERNEL); break; default: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); break; } return NOTIFY_DONE; @@ -4098,6 +4271,7 @@ static int __net_init rtnetlink_net_init(struct net *net) .input = rtnetlink_rcv, .cb_mutex = &rtnl_mutex, .flags = NL_CFG_F_NONROOT_RECV, + .bind = rtnetlink_bind, }; sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg); @@ -4133,6 +4307,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL); rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL); + rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, NULL); rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL); diff --git a/net/core/scm.c b/net/core/scm.c index d8820438ba37..b1ff8a441748 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -14,6 +14,7 @@ #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/sched/user.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/stat.h> @@ -71,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) struct file **fpp; int i, num; - num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + num = (cmsg->cmsg_len - sizeof(struct cmsghdr))/sizeof(int); if (num <= 0) return 0; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 88a8e429fc3e..7232274de334 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -1,3 +1,7 @@ +/* + * Copyright (C) 2016 Jason A. Donenfeld <[email protected]>. All Rights Reserved. 
+ */ + #include <linux/kernel.h> #include <linux/init.h> #include <linux/cryptohash.h> @@ -8,18 +12,24 @@ #include <linux/ktime.h> #include <linux/string.h> #include <linux/net.h> - +#include <linux/siphash.h> #include <net/secure_seq.h> #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) +#include <linux/in6.h> #include <net/tcp.h> -#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4) -static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned; +static siphash_key_t net_secret __read_mostly; +static siphash_key_t ts_secret __read_mostly; static __always_inline void net_secret_init(void) { - net_get_random_once(net_secret, sizeof(net_secret)); + net_get_random_once(&net_secret, sizeof(net_secret)); +} + +static __always_inline void ts_secret_init(void) +{ + net_get_random_once(&ts_secret, sizeof(ts_secret)); } #endif @@ -41,83 +51,101 @@ static u32 seq_scale(u32 seq) #endif #if IS_ENABLED(CONFIG_IPV6) -u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport, u32 *tsoff) +u32 secure_tcpv6_ts_off(const struct net *net, + const __be32 *saddr, const __be32 *daddr) +{ + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + }; + + if (net->ipv4.sysctl_tcp_timestamps != 1) + return 0; + + ts_secret_init(); + return siphash(&combined, offsetofend(typeof(combined), daddr), + &ts_secret); +} +EXPORT_SYMBOL(secure_tcpv6_ts_off); + +u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u32 i; + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; + u32 hash; net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32)daddr[i]; - secret[4] = net_secret[4] + - (((__force u16)sport << 16) + (__force u16)dport); - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - *tsoff = sysctl_tcp_timestamps == 1 ? 
hash[1] : 0; - return seq_scale(hash[0]); + hash = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); + return seq_scale(hash); } -EXPORT_SYMBOL(secure_tcpv6_sequence_number); +EXPORT_SYMBOL(secure_tcpv6_seq); u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; - u32 i; - + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .dport = dport + }; net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32) daddr[i]; - secret[4] = net_secret[4] + (__force u32)dport; - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - return hash[0]; + return siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); } EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET - -u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport, u32 *tsoff) +u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) { - u32 hash[MD5_DIGEST_WORDS]; + if (net->ipv4.sysctl_tcp_timestamps != 1) + return 0; - net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = ((__force u16)sport << 16) + (__force u16)dport; - hash[3] = net_secret[15]; + ts_secret_init(); + return siphash_2u32((__force u32)saddr, (__force u32)daddr, + &ts_secret); +} - md5_transform(hash, net_secret); +/* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), + * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, + * it would be easy enough to have the former function use siphash_4u32, passing + * the arguments as separate u32. + */ +u32 secure_tcp_seq(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + u32 hash; - *tsoff = sysctl_tcp_timestamps == 1 ? 
hash[1] : 0; - return seq_scale(hash[0]); + net_secret_init(); + hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); + return seq_scale(hash); } u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { - u32 hash[MD5_DIGEST_WORDS]; - net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = (__force u32)dport ^ net_secret[14]; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - return hash[0]; + return siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u16)dport, &net_secret); } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif @@ -126,21 +154,13 @@ EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) { - u32 hash[MD5_DIGEST_WORDS]; u64 seq; - net_secret_init(); - hash[0] = (__force u32)saddr; - hash[1] = (__force u32)daddr; - hash[2] = ((__force u16)sport << 16) + (__force u16)dport; - hash[3] = net_secret[15]; - - md5_transform(hash, net_secret); - - seq = hash[0] | (((u64)hash[1]) << 32); + seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, + (__force u32)sport << 16 | (__force u32)dport, + &net_secret); seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; - return seq; } EXPORT_SYMBOL(secure_dccp_sequence_number); @@ -149,26 +169,23 @@ EXPORT_SYMBOL(secure_dccp_sequence_number); u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, __be16 sport, __be16 dport) { - u32 secret[MD5_MESSAGE_BYTES / 4]; - u32 hash[MD5_DIGEST_WORDS]; + const struct { + struct in6_addr saddr; + struct in6_addr daddr; + __be16 sport; + __be16 dport; + } __aligned(SIPHASH_ALIGNMENT) combined = { + .saddr = *(struct in6_addr *)saddr, + .daddr = *(struct in6_addr *)daddr, + .sport = sport, + .dport = dport + }; u64 seq; - u32 i; - net_secret_init(); - memcpy(hash, saddr, 16); - for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + (__force u32)daddr[i]; - secret[4] = net_secret[4] + - (((__force u16)sport << 16) + (__force u16)dport); - for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) - secret[i] = net_secret[i]; - - md5_transform(hash, secret); - - seq = hash[0] | (((u64)hash[1]) << 32); + seq = siphash(&combined, offsetofend(typeof(combined), dport), + &net_secret); seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; - return seq; } EXPORT_SYMBOL(secure_dccpv6_sequence_number); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 734c71468b01..f990eb8b30a9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -176,7 +176,7 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->head = NULL; skb->truesize = sizeof(struct sk_buff); - atomic_set(&skb->users, 1); + refcount_set(&skb->users, 1); skb->mac_header = (typeof(skb->mac_header))~0U; out: @@ -247,7 +247,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, /* Account for allocated memory : skb + skb->head */ skb->truesize = SKB_TRUESIZE(size); skb->pfmemalloc = pfmemalloc; - atomic_set(&skb->users, 1); + refcount_set(&skb->users, 1); skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); @@ -268,10 +268,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; - atomic_set(&fclones->fclone_ref, 1); + refcount_set(&fclones->fclone_ref, 1); fclones->skb2.fclone = SKB_FCLONE_CLONE; - fclones->skb2.pfmemalloc = pfmemalloc; } 
out: return skb; @@ -315,7 +314,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) memset(skb, 0, offsetof(struct sk_buff, tail)); skb->truesize = SKB_TRUESIZE(size); - atomic_set(&skb->users, 1); + refcount_set(&skb->users, 1); skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); @@ -630,7 +629,7 @@ static void kfree_skbmem(struct sk_buff *skb) * This test would have no chance to be true for the clone, * while here, branch prediction will be good. */ - if (atomic_read(&fclones->fclone_ref) == 1) + if (refcount_read(&fclones->fclone_ref) == 1) goto fastpath; break; @@ -638,24 +637,22 @@ static void kfree_skbmem(struct sk_buff *skb) fclones = container_of(skb, struct sk_buff_fclones, skb2); break; } - if (!atomic_dec_and_test(&fclones->fclone_ref)) + if (!refcount_dec_and_test(&fclones->fclone_ref)) return; fastpath: kmem_cache_free(skbuff_fclone_cache, fclones); } -static void skb_release_head_state(struct sk_buff *skb) +void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); -#ifdef CONFIG_XFRM - secpath_put(skb->sp); -#endif + secpath_reset(skb); if (skb->destructor) { WARN_ON(in_irq()); skb->destructor(skb); } #if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb->nfct); + nf_conntrack_put(skb_nfct(skb)); #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nf_bridge_put(skb->nf_bridge); @@ -695,12 +692,9 @@ EXPORT_SYMBOL(__kfree_skb); */ void kfree_skb(struct sk_buff *skb) { - if (unlikely(!skb)) - return; - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) + if (!skb_unref(skb)) return; + trace_kfree_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); } @@ -747,17 +741,32 @@ EXPORT_SYMBOL(skb_tx_error); */ void consume_skb(struct sk_buff *skb) { - if (unlikely(!skb)) - return; - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) + if (!skb_unref(skb)) return; + trace_consume_skb(skb); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); +/** + * consume_stateless_skb - free an skbuff, assuming it is stateless + * @skb: buffer to free + * + * Works like consume_skb(), but this variant assumes that all the head + * states have been already dropped. 
+ */ +void consume_stateless_skb(struct sk_buff *skb) +{ + if (!skb_unref(skb)) + return; + + trace_consume_skb(skb); + if (likely(skb->head)) + skb_release_data(skb); + kfree_skbmem(skb); +} + void __kfree_skb_flush(void) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -808,10 +817,9 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) + if (!skb_unref(skb)) return; + /* if reaching here SKB is ready to free */ trace_consume_skb(skb); @@ -878,9 +886,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); -#ifdef CONFIG_NET_CLS_ACT - CHECK_SKB_FIELD(tc_verd); -#endif #endif } @@ -910,7 +915,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) C(head_frag); C(data); C(truesize); - atomic_set(&n->users, 1); + refcount_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); skb->cloned = 1; @@ -1022,9 +1027,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL; if (skb->fclone == SKB_FCLONE_ORIG && - atomic_read(&fclones->fclone_ref) == 1) { + refcount_read(&fclones->fclone_ref) == 1) { n = &fclones->skb2; - atomic_set(&fclones->fclone_ref, 2); + refcount_set(&fclones->fclone_ref, 2); } else { if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; @@ -1195,10 +1200,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone); int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { - int i; - u8 *data; - int size = nhead + skb_end_offset(skb) + ntail; + int i, osize = skb_end_offset(skb); + int size = osize + nhead + ntail; long off; + u8 *data; BUG_ON(nhead < 0); @@ -1260,6 +1265,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->hdr_len = 0; skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); + + /* It is not generally safe to change skb->truesize. + * For the moment, we really care of rx path, or + * when skb is orphaned (not attached to a socket). + */ + if (!skb->sk || skb->destructor == sock_edemux) + skb->truesize += size - osize; + return 0; nofrags: @@ -1408,7 +1421,7 @@ EXPORT_SYMBOL(skb_pad); * returned. */ -unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) { if (tail != skb) { skb->data_len += len; @@ -1427,9 +1440,9 @@ EXPORT_SYMBOL_GPL(pskb_put); * exceed the total buffer size the kernel will panic. A pointer to the * first byte of the extra data is returned. */ -unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +void *skb_put(struct sk_buff *skb, unsigned int len) { - unsigned char *tmp = skb_tail_pointer(skb); + void *tmp = skb_tail_pointer(skb); SKB_LINEAR_ASSERT(skb); skb->tail += len; skb->len += len; @@ -1448,7 +1461,7 @@ EXPORT_SYMBOL(skb_put); * start. If this would exceed the total buffer headroom the kernel will * panic. A pointer to the first byte of the extra data is returned. */ -unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +void *skb_push(struct sk_buff *skb, unsigned int len) { skb->data -= len; skb->len += len; @@ -1468,7 +1481,7 @@ EXPORT_SYMBOL(skb_push); * is returned. Once the data has been pulled future pushes will overwrite * the old data. 
*/ -unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) +void *skb_pull(struct sk_buff *skb, unsigned int len) { return skb_pull_inline(skb, len); } @@ -1572,6 +1585,8 @@ done: skb_set_tail_pointer(skb, len); } + if (!skb->sk || skb->destructor == sock_edemux) + skb_condense(skb); return 0; } EXPORT_SYMBOL(___pskb_trim); @@ -1601,7 +1616,7 @@ EXPORT_SYMBOL(___pskb_trim); * * It is pretty complicated. Luckily, it is called only in exceptional cases. */ -unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +void *__pskb_pull_tail(struct sk_buff *skb, int delta) { /* If skb has not enough free space at tail, get new one * plus 128 bytes for future expansions. If we have enough @@ -1976,7 +1991,6 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, .pages = pages, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, - .flags = flags, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; @@ -2238,6 +2252,32 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) +{ + net_warn_ratelimited( + "%s: attempt to compute crc32c without libcrc32c.ko\n", + __func__); + return 0; +} + +static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, + int offset, int len) +{ + net_warn_ratelimited( + "%s: attempt to compute crc32c without libcrc32c.ko\n", + __func__); + return 0; +} + +static const struct skb_checksum_ops default_crc32c_ops = { + .update = warn_crc32c_csum_update, + .combine = warn_crc32c_csum_combine, +}; + +const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = + &default_crc32c_ops; +EXPORT_SYMBOL(crc32c_csum_stub); + /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer @@ -2615,7 +2655,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) { int pos = skb_headlen(skb); - skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG; + skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags & + SKBTX_SHARED_FRAG; if (len < pos) /* Split line is inside header. */ skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ @@ -2983,7 +3024,7 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, get_page(pfrag->page); skb->truesize += copy; - atomic_add(copy, &sk->sk_wmem_alloc); + refcount_add(copy, &sk->sk_wmem_alloc); skb->len += copy; skb->data_len += copy; offset += copy; @@ -3024,7 +3065,7 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags); * that the checksum difference is zero (e.g., a valid IP header) * or you are setting ip_summed to CHECKSUM_NONE. */ -unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) +void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { unsigned char *data = skb->data; @@ -3078,22 +3119,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (sg && csum && (mss != GSO_BY_FRAGS)) { if (!(features & NETIF_F_GSO_PARTIAL)) { struct sk_buff *iter; + unsigned int frag_len; if (!list_skb || !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) goto normal; - /* Split the buffer at the frag_list pointer. - * This is based on the assumption that all - * buffers in the chain excluding the last - * containing the same amount of data. + /* If we get here then all the required + * GSO features except frag_list are supported. + * Try to split the SKB to multiple GSO SKBs + * with no frag_list. 
+ * Currently we can do that only when the buffers don't + * have a linear part and all the buffers except + * the last are of the same length. */ + frag_len = list_skb->len; skb_walk_frags(head_skb, iter) { - if (skb_headlen(iter)) + if (frag_len != iter->len && iter->next) + goto normal; + if (skb_headlen(iter) && !iter->head_frag) goto normal; len -= iter->len; } + + if (len != frag_len) + goto normal; } /* GSO partial only requires that we trim off any excess that @@ -3220,8 +3271,8 @@ normal: skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags & - SKBTX_SHARED_FRAG; + skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & + SKBTX_SHARED_FRAG; while (pos < offset + len) { if (i >= nfrags) { @@ -3467,24 +3518,18 @@ void __init skb_init(void) NULL); } -/** - * skb_to_sgvec - Fill a scatter-gather list from a socket buffer - * @skb: Socket buffer containing the buffers to be mapped - * @sg: The scatter-gather list to map into - * @offset: The offset into the buffer's contents to start mapping - * @len: Length of buffer space to be mapped - * - * Fill the specified scatter-gather list with mappings/pointers into a - * region of the buffer space attached to a socket buffer. - */ static int -__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, + unsigned int recursion_level) { int start = skb_headlen(skb); int i, copy = start - offset; struct sk_buff *frag_iter; int elt = 0; + if (unlikely(recursion_level >= 24)) + return -EMSGSIZE; + if (copy > 0) { if (copy > len) copy = len; @@ -3503,6 +3548,8 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]); if ((copy = end - offset) > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + if (unlikely(elt && sg_is_last(&sg[elt - 1]))) + return -EMSGSIZE; if (copy > len) copy = len; @@ -3517,16 +3564,22 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) } skb_walk_frags(skb, frag_iter) { - int end; + int end, ret; WARN_ON(start > offset + len); end = start + frag_iter->len; if ((copy = end - offset) > 0) { + if (unlikely(elt && sg_is_last(&sg[elt - 1]))) + return -EMSGSIZE; + if (copy > len) copy = len; - elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, - copy); + ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start, + copy, recursion_level + 1); + if (unlikely(ret < 0)) + return ret; + elt += ret; if ((len -= copy) == 0) return elt; offset += copy; @@ -3537,6 +3590,31 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) return elt; } +/** + * skb_to_sgvec - Fill a scatter-gather list from a socket buffer + * @skb: Socket buffer containing the buffers to be mapped + * @sg: The scatter-gather list to map into + * @offset: The offset into the buffer's contents to start mapping + * @len: Length of buffer space to be mapped + * + * Fill the specified scatter-gather list with mappings/pointers into a + * region of the buffer space attached to a socket buffer. Returns either + * the number of scatterlist items used, or -EMSGSIZE if the contents + * could not fit. 
+ */ +int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int nsg = __skb_to_sgvec(skb, sg, offset, len, 0); + + if (nsg <= 0) + return nsg; + + sg_mark_end(&sg[nsg - 1]); + + return nsg; +} +EXPORT_SYMBOL_GPL(skb_to_sgvec); + /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given * sglist without mark the sg which contain last skb data as the end. * So the caller can mannipulate sg list as will when padding new data after @@ -3559,19 +3637,11 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { - return __skb_to_sgvec(skb, sg, offset, len); + return __skb_to_sgvec(skb, sg, offset, len, 0); } EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); -int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) -{ - int nsg = __skb_to_sgvec(skb, sg, offset, len); - - sg_mark_end(&sg[nsg - 1]); - return nsg; -} -EXPORT_SYMBOL_GPL(skb_to_sgvec); /** * skb_cow_data - Check that a socket buffer's data buffers are writable @@ -3690,6 +3760,15 @@ static void sock_rmem_free(struct sk_buff *skb) atomic_sub(skb->truesize, &sk->sk_rmem_alloc); } +static void skb_set_err_queue(struct sk_buff *skb) +{ + /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. + * So, it is safe to (mis)use it to mark skbs on the error queue. + */ + skb->pkt_type = PACKET_OUTGOING; + BUILD_BUG_ON(PACKET_OUTGOING == 0); +} + /* * Note: We dont mem charge error packets (no sk_forward_alloc changes) */ @@ -3703,6 +3782,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_rmem_free; atomic_add(skb->truesize, &sk->sk_rmem_alloc); + skb_set_err_queue(skb); /* before exiting rcu section, make sure dst is refcounted */ skb_dst_force(skb); @@ -3729,8 +3809,11 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk) spin_lock_irqsave(&q->lock, flags); skb = __skb_dequeue(q); - if (skb && (skb_next = skb_peek(q))) + if (skb && (skb_next = skb_peek(q))) { icmp_next = is_icmp_err_skb(skb_next); + if (icmp_next) + sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin; + } spin_unlock_irqrestore(&q->lock, flags); if (is_icmp_err_skb(skb) && !icmp_next) @@ -3761,7 +3844,7 @@ struct sk_buff *skb_clone_sk(struct sk_buff *skb) struct sock *sk = skb->sk; struct sk_buff *clone; - if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt)) + if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) return NULL; clone = skb_clone(skb, GFP_ATOMIC); @@ -3779,16 +3862,21 @@ EXPORT_SYMBOL(skb_clone_sk); static void __skb_complete_tx_timestamp(struct sk_buff *skb, struct sock *sk, - int tstype) + int tstype, + bool opt_stats) { struct sock_exterr_skb *serr; int err; + BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); + serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; + serr->opt_stats = opt_stats; + serr->header.h4.iif = skb->dev ? 
skb->dev->ifindex : 0; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk->sk_protocol == IPPROTO_TCP && @@ -3824,13 +3912,14 @@ void skb_complete_tx_timestamp(struct sk_buff *skb, if (!skb_may_tx_timestamp(sk, false)) return; - /* take a reference to prevent skb_orphan() from freeing the socket */ - sock_hold(sk); - - *skb_hwtstamps(skb) = *hwtstamps; - __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); - - sock_put(sk); + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. + */ + if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { + *skb_hwtstamps(skb) = *hwtstamps; + __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false); + sock_put(sk); + } } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); @@ -3839,11 +3928,15 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; - bool tsonly; + bool tsonly, opt_stats = false; if (!sk) return; + if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) + return; + tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; @@ -3852,9 +3945,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, #ifdef CONFIG_INET if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) + sk->sk_type == SOCK_STREAM) { skb = tcp_get_timestamping_opt_stats(sk); - else + opt_stats = true; + } else #endif skb = alloc_skb(0, GFP_ATOMIC); } else { @@ -3864,7 +3958,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, return; if (tsonly) { - skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags; + skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & + SKBTX_ANY_TSTAMP; skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; } @@ -3873,7 +3968,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, else skb->tstamp = ktime_get_real(); - __skb_complete_tx_timestamp(skb, sk, tstype); + __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); @@ -3889,7 +3984,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) { struct sock *sk = skb->sk; struct sock_exterr_skb *serr; - int err; + int err = 1; skb->wifi_acked_valid = 1; skb->wifi_acked = acked; @@ -3899,14 +3994,15 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; - /* take a reference to prevent skb_orphan() from freeing the socket */ - sock_hold(sk); - - err = sock_queue_err_skb(sk, skb); + /* Take a reference to prevent skb_orphan() from freeing the socket, + * but only if the socket refcount is not zero. 
+ */ + if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { + err = sock_queue_err_skb(sk, skb); + sock_put(sk); + } if (err) kfree_skb(skb); - - sock_put(sk); } EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); @@ -4651,7 +4747,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, gfp_head = gfp_mask; if (gfp_head & __GFP_DIRECT_RECLAIM) - gfp_head |= __GFP_REPEAT; + gfp_head |= __GFP_RETRY_MAYFAIL; *errcode = -ENOBUFS; skb = alloc_skb(header_len, gfp_head); diff --git a/net/core/sock.c b/net/core/sock.c index 4eca27dc5c94..ac2a404c73eb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -102,6 +102,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> @@ -138,10 +139,7 @@ #include <trace/events/sock.h> -#ifdef CONFIG_INET #include <net/tcp.h> -#endif - #include <net/busy_poll.h> static DEFINE_MUTEX(proto_list_mutex); @@ -197,73 +195,117 @@ EXPORT_SYMBOL(sk_net_capable); /* * Each address family might have different locking rules, so we have - * one slock key per address family: + * one slock key per address family and separate keys for internal and + * userspace sockets. */ static struct lock_class_key af_family_keys[AF_MAX]; +static struct lock_class_key af_family_kern_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; +static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket * locks is fast): */ + +#define _sock_locks(x) \ + x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ + x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ + x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ + x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ + x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ + x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ + x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ + x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ + x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ + x "27" , x "28" , x "AF_CAN" , \ + x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ + x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ + x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ + x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ + x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" + static const char *const af_family_key_strings[AF_MAX+1] = { - "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" , - "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK", - "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" , - "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" , - "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , - "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , - "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , - "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , - "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , - "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , - "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , - "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , - "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" , - "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" , - "sk_lock-AF_QIPCRTR", "sk_lock-AF_MAX" + _sock_locks("sk_lock-") }; static const char *const af_family_slock_key_strings[AF_MAX+1] = { - "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" , - "slock-AF_AX25" , "slock-AF_IPX" , 
"slock-AF_APPLETALK", - "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" , - "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" , - "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , - "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , - "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , - "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" , - "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , - "slock-27" , "slock-28" , "slock-AF_CAN" , - "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , - "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , - "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" , - "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" , - "slock-AF_QIPCRTR", "slock-AF_MAX" + _sock_locks("slock-") }; static const char *const af_family_clock_key_strings[AF_MAX+1] = { - "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" , - "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK", - "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" , - "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" , - "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , - "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , - "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , - "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" , - "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , - "clock-27" , "clock-28" , "clock-AF_CAN" , - "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , - "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , - "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , - "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" , - "clock-AF_QIPCRTR", "clock-AF_MAX" + _sock_locks("clock-") +}; + +static const char *const af_family_kern_key_strings[AF_MAX+1] = { + _sock_locks("k-sk_lock-") +}; +static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { + _sock_locks("k-slock-") +}; +static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { + _sock_locks("k-clock-") +}; +static const char *const af_family_rlock_key_strings[AF_MAX+1] = { + "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , + "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK", + "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" , + "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" , + "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" , + "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" , + "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" , + "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" , + "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" , + "rlock-27" , "rlock-28" , "rlock-AF_CAN" , + "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" , + "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , + "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , + "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" +}; +static const char *const af_family_wlock_key_strings[AF_MAX+1] = { + "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , + "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", + "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , + "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , + "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , + "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , + "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" , + "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , + "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , 
+ "wlock-27" , "wlock-28" , "wlock-AF_CAN" , + "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , + "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , + "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , + "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" +}; +static const char *const af_family_elock_key_strings[AF_MAX+1] = { + "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , + "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", + "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , + "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , + "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , + "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , + "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , + "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" , + "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , + "elock-27" , "elock-28" , "elock-AF_CAN" , + "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , + "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , + "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , + "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" }; /* - * sk_callback_lock locking rules are per-address-family, + * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; +static struct lock_class_key af_rlock_keys[AF_MAX]; +static struct lock_class_key af_wlock_keys[AF_MAX]; +static struct lock_class_key af_elock_keys[AF_MAX]; +static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across @@ -328,14 +370,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = sk->sk_backlog_rcv(sk, skb); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); return ret; } @@ -367,7 +409,7 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) if (tv.tv_sec == 0 && tv.tv_usec == 0) return 0; if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) - *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); + *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ); return 0; } @@ -502,6 +544,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; @@ -762,11 +805,8 @@ set_rcvbuf: goto set_rcvbuf; case SO_KEEPALIVE: -#ifdef CONFIG_INET - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_type == SOCK_STREAM) - tcp_set_keepalive(sk, valbool); -#endif + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; @@ -998,6 +1038,10 @@ set_rcvbuf: #endif case SO_MAX_PACING_RATE: + if (val != ~0U) + cmpxchg(&sk->sk_pacing_status, + SK_PACING_NONE, + SK_PACING_NEEDED); sk->sk_max_pacing_rate = val; sk->sk_pacing_rate = 
min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); @@ -1034,6 +1078,18 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred, } } +static int groups_to_user(gid_t __user *dst, const struct group_info *src) +{ + struct user_namespace *user_ns = current_user_ns(); + int i; + + for (i = 0; i < src->ngroups; i++) + if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) + return -EFAULT; + + return 0; +} + int sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { @@ -1041,6 +1097,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, union { int val; + u64 val64; struct linger ling; struct timeval tm; } v; @@ -1148,7 +1205,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.tm.tv_usec = 0; } else { v.tm.tv_sec = sk->sk_rcvtimeo / HZ; - v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; + v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; } break; @@ -1159,7 +1216,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.tm.tv_usec = 0; } else { v.tm.tv_sec = sk->sk_sndtimeo / HZ; - v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; } break; @@ -1186,6 +1243,27 @@ int sock_getsockopt(struct socket *sock, int level, int optname, goto lenout; } + case SO_PEERGROUPS: + { + int ret, n; + + if (!sk->sk_peer_cred) + return -ENODATA; + + n = sk->sk_peer_cred->group_info->ngroups; + if (len < n * sizeof(gid_t)) { + len = n * sizeof(gid_t); + return put_user(len, optlen) ? -EFAULT : -ERANGE; + } + len = n * sizeof(gid_t); + + ret = groups_to_user((gid_t __user *)optval, + sk->sk_peer_cred->group_info); + if (ret) + return ret; + goto lenout; + } + case SO_PEERNAME: { char address[128]; @@ -1271,6 +1349,40 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_incoming_cpu; break; + case SO_MEMINFO: + { + u32 meminfo[SK_MEMINFO_VARS]; + + if (get_user(len, optlen)) + return -EFAULT; + + sk_get_meminfo(sk, meminfo); + + len = min_t(unsigned int, len, sizeof(meminfo)); + if (copy_to_user(optval, &meminfo, len)) + return -EFAULT; + + goto lenout; + } + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_INCOMING_NAPI_ID: + v.val = READ_ONCE(sk->sk_napi_id); + + /* aggregate non-NAPI IDs down to 0 */ + if (v.val < MIN_NAPI_ID) + v.val = 0; + + break; +#endif + + case SO_COOKIE: + lv = sizeof(u64); + if (len < lv) + return -EINVAL; + v.val64 = sock_gen_cookie(sk); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -1295,7 +1407,16 @@ lenout: */ static inline void sock_lock_init(struct sock *sk) { - sock_lock_init_class_and_name(sk, + if (sk->sk_kern_sock) + sock_lock_init_class_and_name( + sk, + af_family_kern_slock_key_strings[sk->sk_family], + af_family_kern_slock_keys + sk->sk_family, + af_family_kern_key_strings[sk->sk_family], + af_family_kern_keys + sk->sk_family); + else + sock_lock_init_class_and_name( + sk, af_family_slock_key_strings[sk->sk_family], af_family_slock_keys + sk->sk_family, af_family_key_strings[sk->sk_family], @@ -1401,12 +1522,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 
0 : 1; if (likely(sk->sk_net_refcnt)) get_net(net); sock_net_set(sk, net); - atomic_set(&sk->sk_wmem_alloc, 1); + refcount_set(&sk->sk_wmem_alloc, 1); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); @@ -1430,7 +1552,7 @@ static void __sk_destruct(struct rcu_head *head) sk->sk_destruct(sk); filter = rcu_dereference_check(sk->sk_filter, - atomic_read(&sk->sk_wmem_alloc) == 0); + refcount_read(&sk->sk_wmem_alloc) == 0); if (filter) { sk_filter_uncharge(sk, filter); RCU_INIT_POINTER(sk->sk_filter, NULL); @@ -1444,6 +1566,11 @@ static void __sk_destruct(struct rcu_head *head) pr_debug("%s: optmem leakage (%d bytes) detected\n", __func__, atomic_read(&sk->sk_omem_alloc)); + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + } + if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); @@ -1475,11 +1602,32 @@ void sk_free(struct sock *sk) * some packets are still in some tx queue. * If not null, sock_wfree() will call __sk_free(sk) later */ - if (atomic_dec_and_test(&sk->sk_wmem_alloc)) + if (refcount_dec_and_test(&sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sk_free); +static void sk_init_common(struct sock *sk) +{ + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); + + rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_receive_queue.lock, + af_rlock_keys + sk->sk_family, + af_family_rlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_write_queue.lock, + af_wlock_keys + sk->sk_family, + af_family_wlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_error_queue.lock, + af_elock_keys + sk->sk_family, + af_family_elock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); +} + /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1511,17 +1659,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ - atomic_set(&newsk->sk_wmem_alloc, 1); + refcount_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - skb_queue_head_init(&newsk->sk_write_queue); - - rwlock_init(&newsk->sk_callback_lock); - lockdep_set_class_and_name(&newsk->sk_callback_lock, - af_callback_keys + newsk->sk_family, - af_family_clock_key_strings[newsk->sk_family]); + sk_init_common(newsk); newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; atomic_set(&newsk->sk_drops, 0); @@ -1529,7 +1672,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; sock_reset_flag(newsk, SOCK_DONE); - skb_queue_head_init(&newsk->sk_error_queue); filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) @@ -1540,11 +1682,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) is_charged = sk_filter_charge(newsk, filter); if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); - sk_free(newsk); + /* We need to make sure that we don't uncharge the new + * socket if we couldn't charge it in the first place + * as 
otherwise we uncharge the parent's filter. + */ + if (!is_charged) + RCU_INIT_POINTER(newsk->sk_filter, NULL); + sk_free_unlock_clone(newsk); newsk = NULL; goto out; } @@ -1564,7 +1708,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) * (Documentation/RCU/rculist_nulls.txt for details) */ smp_wmb(); - atomic_set(&newsk->sk_refcnt, 2); + refcount_set(&newsk->sk_refcnt, 2); /* * Increment the counter in the same struct proto as the master @@ -1593,6 +1737,16 @@ out: } EXPORT_SYMBOL_GPL(sk_clone_lock); +void sk_free_unlock_clone(struct sock *sk) +{ + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + sk->sk_destruct = NULL; + bh_unlock_sock(sk); + sk_free(sk); +} +EXPORT_SYMBOL_GPL(sk_free_unlock_clone); + void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; @@ -1633,7 +1787,7 @@ void sock_wfree(struct sk_buff *skb) * Keep a reference on sk_wmem_alloc, this will be released * after sk_write_space() call */ - atomic_sub(len - 1, &sk->sk_wmem_alloc); + WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); sk->sk_write_space(sk); len = 1; } @@ -1641,7 +1795,7 @@ void sock_wfree(struct sk_buff *skb) * if sk_wmem_alloc reaches 0, we must finish what sk_free() * could not do because of in-flight packets */ - if (atomic_sub_and_test(len, &sk->sk_wmem_alloc)) + if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); @@ -1653,7 +1807,7 @@ void __sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; - if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) + if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) __sk_free(sk); } @@ -1675,7 +1829,7 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) * is enough to guarantee sk_free() wont free this sock until * all in-flight packets are completed */ - atomic_add(skb->truesize, &sk->sk_wmem_alloc); + refcount_add(skb->truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(skb_set_owner_w); @@ -1683,28 +1837,24 @@ EXPORT_SYMBOL(skb_set_owner_w); * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. * But we also want to keep skb->sk set because some packet schedulers - * rely on it (sch_fq for example). So we set skb->truesize to a small - * amount (1) and decrease sk_wmem_alloc accordingly. + * rely on it (sch_fq for example). */ void skb_orphan_partial(struct sk_buff *skb) { - /* If this skb is a TCP pure ACK or already went here, - * we have nothing to do. 2 is already a very small truesize. - */ - if (skb->truesize <= 2) + if (skb_is_tcp_pure_ack(skb)) return; - /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, - * so we do not completely orphan skb, but transfert all - * accounted bytes but one, to avoid unexpected reorders. 
- */ if (skb->destructor == sock_wfree #ifdef CONFIG_INET || skb->destructor == tcp_wfree #endif ) { - atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc); - skb->truesize = 1; + struct sock *sk = skb->sk; + + if (refcount_inc_not_zero(&sk->sk_refcnt)) { + WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); + skb->destructor = sock_efree; + } } else { skb_orphan(skb); } @@ -1762,7 +1912,7 @@ EXPORT_SYMBOL(sock_i_ino); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { - if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); @@ -1837,7 +1987,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo) break; set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) break; if (sk->sk_shutdown & SEND_SHUTDOWN) break; @@ -1959,6 +2109,26 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, } EXPORT_SYMBOL(sock_cmsg_send); +static void sk_enter_memory_pressure(struct sock *sk) +{ + if (!sk->sk_prot->enter_memory_pressure) + return; + + sk->sk_prot->enter_memory_pressure(sk); +} + +static void sk_leave_memory_pressure(struct sock *sk) +{ + if (sk->sk_prot->leave_memory_pressure) { + sk->sk_prot->leave_memory_pressure(sk); + } else { + unsigned long *memory_pressure = sk->sk_prot->memory_pressure; + + if (memory_pressure && *memory_pressure) + *memory_pressure = 0; + } +} + /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) @@ -2140,7 +2310,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) if (sk->sk_type == SOCK_STREAM) { if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) return 1; - } else if (atomic_read(&sk->sk_wmem_alloc) < + } else if (refcount_read(&sk->sk_wmem_alloc) < prot->sysctl_wmem[0]) return 1; } @@ -2272,7 +2442,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2) } EXPORT_SYMBOL(sock_no_socketpair); -int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern) { return -EOPNOTSUPP; } @@ -2406,7 +2577,7 @@ static void sock_def_write_space(struct sock *sk) /* Do not wake up a writer until he can make "significant" * progress. 
--DaveM */ - if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLOUT | @@ -2449,10 +2620,7 @@ EXPORT_SYMBOL(sk_stop_timer); void sock_init_data(struct socket *sock, struct sock *sk) { - skb_queue_head_init(&sk->sk_receive_queue); - skb_queue_head_init(&sk->sk_write_queue); - skb_queue_head_init(&sk->sk_error_queue); - + sk_init_common(sk); sk->sk_send_head = NULL; init_timer(&sk->sk_timer); @@ -2476,7 +2644,14 @@ void sock_init_data(struct socket *sock, struct sock *sk) } rwlock_init(&sk->sk_callback_lock); - lockdep_set_class_and_name(&sk->sk_callback_lock, + if (sk->sk_kern_sock) + lockdep_set_class_and_name( + &sk->sk_callback_lock, + af_kern_callback_keys + sk->sk_family, + af_family_kern_clock_key_strings[sk->sk_family]); + else + lockdep_set_class_and_name( + &sk->sk_callback_lock, af_callback_keys + sk->sk_family, af_family_clock_key_strings[sk->sk_family]); @@ -2497,7 +2672,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - sk->sk_stamp = ktime_set(-1L, 0); + sk->sk_stamp = SK_DEFAULT_STAMP; #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; @@ -2512,7 +2687,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) * (Documentation/RCU/rculist_nulls.txt for details) */ smp_wmb(); - atomic_set(&sk->sk_refcnt, 1); + refcount_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data); @@ -2557,9 +2732,12 @@ EXPORT_SYMBOL(release_sock); * @sk: socket * * This version should be used for very small section, where process wont block - * return false if fast path is taken + * return false if fast path is taken: + * * sk_lock.slock locked, owned = 0, BH disabled - * return true if slow path is taken + * + * return true if slow path is taken: + * * sk_lock.slock unlocked, owned = 1, BH enabled */ bool lock_sock_fast(struct sock *sk) @@ -2774,15 +2952,25 @@ void sk_common_release(struct sock *sk) sk_refcnt_debug_release(sk); - if (sk->sk_frag.page) { - put_page(sk->sk_frag.page); - sk->sk_frag.page = NULL; - } - sock_put(sk); } EXPORT_SYMBOL(sk_common_release); +void sk_get_meminfo(const struct sock *sk, u32 *mem) +{ + memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); + + mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); +} + #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { @@ -3123,3 +3311,14 @@ static int __init proto_init(void) subsys_initcall(proto_init); #endif /* PROC_FS */ + +#ifdef CONFIG_NET_RX_BUSY_POLL +bool sk_busy_loop_end(void *p, unsigned long start_time) +{ + struct sock *sk = p; + + return !skb_queue_empty(&sk->sk_receive_queue) || + sk_busy_loop_timeout(sk, start_time); +} +EXPORT_SYMBOL(sk_busy_loop_end); +#endif /* CONFIG_NET_RX_BUSY_POLL */ diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 6b10573cc9fa..217f4e3b82f6 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -19,7 +19,7 @@ 
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; -static u64 sock_gen_cookie(struct sock *sk) +u64 sock_gen_cookie(struct sock *sk) { while (1) { u64 res = atomic64_read(&sk->sk_cookie); @@ -59,15 +59,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) { u32 mem[SK_MEMINFO_VARS]; - mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); - mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; - mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); - mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; - mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; - mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + sk_get_meminfo(sk, mem); return nla_put(skb, attrtype, sizeof(mem), &mem); } @@ -246,7 +238,8 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } -static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { int ret; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 9a1a352fd1eb..eed1ebf7f29d 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -13,9 +13,9 @@ static DEFINE_SPINLOCK(reuseport_lock); -static struct sock_reuseport *__reuseport_alloc(u16 max_socks) +static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { - size_t size = sizeof(struct sock_reuseport) + + unsigned int size = sizeof(struct sock_reuseport) + sizeof(struct sock *) * max_socks; struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); diff --git a/net/core/stream.c b/net/core/stream.c index f575bcf64af2..20231dbb1da0 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -13,6 +13,7 @@ */ #include <linux/module.h> +#include <linux/sched/signal.h> #include <linux/net.h> #include <linux/signal.h> #include <linux/tcp.h> diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 2a46e4009f62..b7cd9aafe99e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -222,6 +222,21 @@ static int set_default_qdisc(struct ctl_table *table, int write, } #endif +static int proc_do_dev_weight(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret != 0) + return ret; + + dev_rx_weight = weight_p * dev_weight_rx_bias; + dev_tx_weight = weight_p * dev_weight_tx_bias; + + return ret; +} + static int proc_do_rss_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -273,7 +288,21 @@ static struct ctl_table net_core_table[] = { .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "dev_weight_rx_bias", + .data = &dev_weight_rx_bias, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, + }, + { + .procname = "dev_weight_tx_bias", + .data = &dev_weight_tx_bias, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_do_dev_weight, }, { .procname = "netdev_max_backlog", @@ -305,6 +334,13 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dointvec, }, + { + .procname = "bpf_jit_kallsyms", + .data = 
&bpf_jit_kallsyms, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, # endif #endif { @@ -372,14 +408,16 @@ static struct ctl_table net_core_table[] = { .data = &sysctl_net_busy_poll, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "busy_read", .data = &sysctl_net_busy_read, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, #endif #ifdef CONFIG_NET_SCHED @@ -414,6 +452,14 @@ static struct ctl_table net_core_table[] = { .extra1 = &one, .extra2 = &max_skb_frags, }, + { + .procname = "netdev_budget_usecs", + .data = &netdev_budget_usecs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, { } }; @@ -433,8 +479,6 @@ static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl; - net->core.sysctl_somaxconn = SOMAXCONN; - tbl = netns_core_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); diff --git a/net/core/utils.c b/net/core/utils.c index 6592d7bbed39..93066bd0305a 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -26,9 +26,11 @@ #include <linux/percpu.h> #include <linux/init.h> #include <linux/ratelimit.h> +#include <linux/socket.h> #include <net/sock.h> #include <net/net_ratelimit.h> +#include <net/ipv6.h> #include <asm/byteorder.h> #include <linux/uaccess.h> @@ -51,7 +53,7 @@ EXPORT_SYMBOL(net_ratelimit); __be32 in_aton(const char *str) { - unsigned long l; + unsigned int l; unsigned int val; int i; @@ -300,6 +302,107 @@ out: } EXPORT_SYMBOL(in6_pton); +static int inet4_pton(const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + int srclen = strlen(src); + + if (srclen > INET_ADDRSTRLEN) + return -EINVAL; + + if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr, + '\n', NULL) == 0) + return -EINVAL; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(port_num); + + return 0; +} + +static int inet6_pton(struct net *net, const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + const char *scope_delim; + int srclen = strlen(src); + + if (srclen > INET6_ADDRSTRLEN) + return -EINVAL; + + if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr, + '%', &scope_delim) == 0) + return -EINVAL; + + if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL && + src + srclen != scope_delim && *scope_delim == '%') { + struct net_device *dev; + char scope_id[16]; + size_t scope_len = min_t(size_t, sizeof(scope_id) - 1, + src + srclen - scope_delim - 1); + + memcpy(scope_id, scope_delim + 1, scope_len); + scope_id[scope_len] = '\0'; + + dev = dev_get_by_name(net, scope_id); + if (dev) { + addr6->sin6_scope_id = dev->ifindex; + dev_put(dev); + } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) { + return -EINVAL; + } + } + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(port_num); + + return 0; +} + +/** + * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address + * @net: net namespace (used for scope handling) + * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either + * @src: the start of the address string + * @port: the start of the port string (or NULL for none) + * @addr: output socket address + * + * Return zero on success, return errno when any 
error occurs.
+ */
+int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
+		const char *src, const char *port, struct sockaddr_storage *addr)
+{
+	u16 port_num;
+	int ret = -EINVAL;
+
+	if (port) {
+		if (kstrtou16(port, 0, &port_num))
+			return -EINVAL;
+	} else {
+		port_num = 0;
+	}
+
+	switch (af) {
+	case AF_INET:
+		ret = inet4_pton(src, port_num, addr);
+		break;
+	case AF_INET6:
+		ret = inet6_pton(net, src, port_num, addr);
+		break;
+	case AF_UNSPEC:
+		ret = inet4_pton(src, port_num, addr);
+		if (ret)
+			ret = inet6_pton(net, src, port_num, addr);
+		break;
+	default:
+		pr_err("unexpected address family %d\n", af);
+	};
+
+	return ret;
+}
+EXPORT_SYMBOL(inet_pton_with_scope);
+
 void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
			       __be32 from, __be32 to, bool pseudohdr)
 {
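/*
 * Illustrative sketch, not from this patch: how a caller might use the
 * new inet_pton_with_scope() helper added above.  The address/port
 * strings and the use of init_net are assumptions for the example, and
 * the helper's declaration (added elsewhere in this series) is assumed
 * to be in scope.
 */
#include <linux/socket.h>
#include <net/net_namespace.h>

static int example_parse_endpoint(struct sockaddr_storage *ss)
{
	/* AF_UNSPEC tries IPv4 first, then IPv6; the IPv6 form may carry
	 * a %scope suffix resolved against the given namespace. */
	return inet_pton_with_scope(&init_net, AF_UNSPEC,
				    "192.168.0.1", "4420", ss);
}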
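/*
 * Userspace sketch, not from this patch: the probe-then-retry pattern
 * implied by the SO_PEERGROUPS -ERANGE handling above, for a connected
 * AF_UNIX socket.  Assumes uapi/libc headers that already define
 * SO_PEERGROUPS; peer_groups() is a made-up helper name.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <stdlib.h>

static gid_t *peer_groups(int fd, socklen_t *ngroups)
{
	socklen_t len = 0;
	gid_t *gids;

	/* Zero-length probe: when groups are present the kernel reports
	 * the required size through optlen and fails with ERANGE. */
	if (getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, NULL, &len) == 0) {
		*ngroups = 0;		/* peer has no supplementary groups */
		return NULL;
	}
	if (errno != ERANGE)		/* e.g. ENODATA: no peer credentials */
		return NULL;

	gids = malloc(len);
	if (!gids)
		return NULL;
	if (getsockopt(fd, SOL_SOCKET, SO_PEERGROUPS, gids, &len) < 0) {
		free(gids);
		return NULL;
	}
	*ngroups = len / sizeof(gid_t);
	return gids;
}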
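/*
 * Userspace sketch, not from this patch: reading the new SO_MEMINFO
 * option, which returns the same SK_MEMINFO_* counters that
 * sk_get_meminfo() fills in for sock_diag.  Assumes headers that
 * define SO_MEMINFO and linux/sock_diag.h for the index names.
 */
#include <sys/socket.h>
#include <linux/sock_diag.h>
#include <stdint.h>
#include <stdio.h>

static void print_sk_meminfo(int fd)
{
	uint32_t mem[SK_MEMINFO_VARS] = { 0 };
	socklen_t len = sizeof(mem);

	/* The kernel copies at most min(len, sizeof(mem)) bytes and
	 * writes the copied length back through optlen. */
	if (getsockopt(fd, SOL_SOCKET, SO_MEMINFO, mem, &len) == 0)
		printf("rmem_alloc=%u rcvbuf=%u wmem_alloc=%u sndbuf=%u\n",
		       mem[SK_MEMINFO_RMEM_ALLOC], mem[SK_MEMINFO_RCVBUF],
		       mem[SK_MEMINFO_WMEM_ALLOC], mem[SK_MEMINFO_SNDBUF]);
}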
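/*
 * Illustrative sketch, not from this patch: the refcount_t get/put
 * pattern behind the atomic_t -> refcount_t conversions of
 * sk_wmem_alloc and sk_refcnt above.  refcount_t saturates and WARNs
 * on overflow and on incrementing from zero, which plain atomic_t does
 * not; struct foo and its helpers are made-up names.
 */
#include <linux/refcount.h>
#include <linux/slab.h>

struct foo {
	refcount_t refs;
};

static struct foo *foo_alloc(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (f)
		refcount_set(&f->refs, 1);	/* like sk_wmem_alloc's initial 1 */
	return f;
}

static struct foo *foo_get(struct foo *f)
{
	refcount_inc(&f->refs);			/* WARNs if refs was already 0 */
	return f;
}

static void foo_put(struct foo *f)
{
	if (refcount_dec_and_test(&f->refs))	/* true once refs hits 0 */
		kfree(f);
}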