Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/.gitignore           |   1
-rw-r--r-- | net/core/Makefile             |  22
-rw-r--r-- | net/core/bpf_sk_storage.c     |   5
-rw-r--r-- | net/core/datagram.c           |   2
-rw-r--r-- | net/core/dev.c                |  49
-rw-r--r-- | net/core/devlink.c            | 339
-rw-r--r-- | net/core/drop_monitor.c       |   3
-rw-r--r-- | net/core/filter.c             | 753
-rw-r--r-- | net/core/flow_dissector.c     |  55
-rw-r--r-- | net/core/flow_offload.c       |   7
-rw-r--r-- | net/core/gen_stats.c          |   2
-rw-r--r-- | net/core/gro.c                |  18
-rw-r--r-- | net/core/gro_cells.c          |   5
-rw-r--r-- | net/core/lwtunnel.c           |   3
-rw-r--r-- | net/core/neighbour.c          |  79
-rw-r--r-- | net/core/net-sysfs.c          |  58
-rw-r--r-- | net/core/net_namespace.c      |  14
-rw-r--r-- | net/core/netclassid_cgroup.c  |   2
-rw-r--r-- | net/core/netpoll.c            |   4
-rw-r--r-- | net/core/pktgen.c             |  47
-rw-r--r-- | net/core/rtnetlink.c          |  24
-rw-r--r-- | net/core/skbuff.c             | 202
-rw-r--r-- | net/core/skmsg.c              |  36
-rw-r--r-- | net/core/sock.c               | 158
-rw-r--r-- | net/core/sock_map.c           |  19
-rw-r--r-- | net/core/sock_reuseport.c     |  16
-rw-r--r-- | net/core/stream.c             |   5
-rw-r--r-- | net/core/sysctl_net_core.c    |  16
-rw-r--r-- | net/core/xdp.c                |  10
29 files changed, 1125 insertions(+), 829 deletions(-)
diff --git a/net/core/.gitignore b/net/core/.gitignore deleted file mode 100644 index df1e74372cce..000000000000 --- a/net/core/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dropreason_str.c diff --git a/net/core/Makefile b/net/core/Makefile index e8ce3bd283a6..5857cec87b83 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -5,7 +5,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ - flow_dissector.o dropreason_str.o + flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o @@ -40,23 +40,3 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o - -clean-files := dropreason_str.c - -quiet_cmd_dropreason_str = GEN $@ -cmd_dropreason_str = awk -F ',' 'BEGIN{ print "\#include <net/dropreason.h>\n"; \ - print "const char * const drop_reasons[] = {" }\ - /^enum skb_drop/ { dr=1; }\ - /^\};/ { dr=0; }\ - /^\tSKB_DROP_REASON_/ {\ - if (dr) {\ - sub(/\tSKB_DROP_REASON_/, "", $$1);\ - printf "\t[SKB_DROP_REASON_%s] = \"%s\",\n", $$1, $$1;\ - }\ - }\ - END{ print "};" }' $< > $@ - -$(obj)/dropreason_str.c: $(srctree)/include/net/dropreason.h - $(call cmd,dropreason_str) - -$(obj)/dropreason_str.o: $(obj)/dropreason_str.c diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 1b7f385643b4..94374d529ea4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -310,11 +310,12 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap, void *owner, u32 size) { + int optmem_max = READ_ONCE(sysctl_optmem_max); struct sock *sk = (struct sock *)owner; /* same check as in sock_kmalloc() */ - if (size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + if (size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { atomic_add(size, &sk->sk_omem_alloc); return 0; } diff --git a/net/core/datagram.c b/net/core/datagram.c index 7255531f63ae..e4ff2db40c98 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -677,7 +677,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, page_ref_sub(last_head, refs); refs = 0; } - skb_fill_page_desc(skb, frag++, head, start, size); + skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); diff --git a/net/core/dev.c b/net/core/dev.c index 716df64fcfa5..3be256051e99 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1100,7 +1100,7 @@ static int dev_alloc_name_ns(struct net *net, BUG_ON(!net); ret = __dev_alloc_name(net, name, buf); if (ret >= 0) - strlcpy(dev->name, buf, IFNAMSIZ); + strscpy(dev->name, buf, IFNAMSIZ); return ret; } @@ -1137,7 +1137,7 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev, else if (netdev_name_in_use(net, name)) return -EEXIST; else if (dev->name != name) - strlcpy(dev->name, name, IFNAMSIZ); + strscpy(dev->name, name, IFNAMSIZ); return 0; } @@ -4624,7 +4624,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) struct softnet_data *sd; unsigned int old_flow, new_flow; - if (qlen < (netdev_max_backlog >> 1)) + if (qlen < (READ_ONCE(netdev_max_backlog) >> 1)) return false; sd = this_cpu_ptr(&softnet_data); @@ -4672,7 +4672,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, if (!netif_running(skb->dev)) goto drop; qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= 
netdev_max_backlog && !skb_flow_limit(skb, qlen)) { + if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) { if (qlen) { enqueue: __skb_queue_tail(&sd->input_pkt_queue, skb); @@ -4928,7 +4928,7 @@ static int netif_rx_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_rx(skb); @@ -5136,11 +5136,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); + *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_REDIRECT: /* skb_mac_header check was done by cls/act_bpf, so @@ -5153,8 +5155,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, *another = true; break; } + *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; default: break; @@ -5281,7 +5285,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, int ret = NET_RX_DROP; __be16 type; - net_timestamp_check(!netdev_tstamp_prequeue, skb); + net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb); trace_netif_receive_skb(skb); @@ -5664,7 +5668,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; @@ -5694,7 +5698,7 @@ void netif_receive_skb_list_internal(struct list_head *head) INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { - net_timestamp_check(netdev_tstamp_prequeue, skb); + net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb); skb_list_del_init(skb); if (!skb_defer_rx_timestamp(skb)) list_add_tail(&skb->list, &sublist); @@ -5918,7 +5922,7 @@ static int process_backlog(struct napi_struct *napi, int quota) net_rps_action_and_irq_enable(sd); } - napi->weight = dev_rx_weight; + napi->weight = READ_ONCE(dev_rx_weight); while (again) { struct sk_buff *skb; @@ -6358,23 +6362,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. 
- */ -static void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -6665,8 +6652,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + - usecs_to_jiffies(netdev_budget_usecs); - int budget = netdev_budget; + usecs_to_jiffies(READ_ONCE(netdev_budget_usecs)); + int budget = READ_ONCE(netdev_budget); LIST_HEAD(list); LIST_HEAD(repoll); @@ -10284,7 +10271,7 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list) return dev; if (time_after(jiffies, warning_time + - netdev_unregister_timeout_secs * HZ)) { + READ_ONCE(netdev_unregister_timeout_secs) * HZ)) { list_for_each_entry(dev, list, todo_list) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, netdev_refcnt_read(dev)); @@ -10370,9 +10357,7 @@ void netdev_run_todo(void) BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); -#if IS_ENABLED(CONFIG_DECNET) - WARN_ON(dev->dn_ptr); -#endif + if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) diff --git a/net/core/devlink.c b/net/core/devlink.c index b50bcc18b8d9..89baa7c0938b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -371,6 +371,13 @@ static struct devlink *devlink_get_from_attrs(struct net *net, return ERR_PTR(-ENODEV); } +#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ + WARN_ON_ONCE(!(devlink_port)->registered) +#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ + WARN_ON_ONCE((devlink_port)->registered) +#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ + WARN_ON_ONCE(!(devlink_port)->initialized) + static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { @@ -1710,7 +1717,7 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb, struct devlink *devlink = info->user_ptr[0]; u32 count; - if (!info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT)) return -EINVAL; if (!devlink->ops->port_split) return -EOPNOTSUPP; @@ -1838,7 +1845,7 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb, if (!devlink->ops->port_del) return -EOPNOTSUPP; - if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) { + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_INDEX)) { NL_SET_ERR_MSG_MOD(extack, "Port index is not specified"); return -EINVAL; } @@ -2690,7 +2697,7 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, if (err) return err; - if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE)) return -EINVAL; size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); @@ -2900,7 +2907,7 @@ static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, if (err) return err; - if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) return -EINVAL; threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); @@ -3156,7 +3163,7 @@ static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, if (err) return err; - if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD]) + if 
(GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD)) return -EINVAL; threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); @@ -3845,7 +3852,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, struct devlink_dpipe_table *table; const char *table_name; - if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME)) return -EINVAL; table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); @@ -4029,8 +4036,9 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, const char *table_name; bool counters_enable; - if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] || - !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) || + GENL_REQ_ATTR_CHECK(info, + DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED)) return -EINVAL; table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); @@ -4119,8 +4127,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb, u64 size; int err; - if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] || - !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) || + GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE)) return -EINVAL; resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); @@ -4742,10 +4750,76 @@ void devlink_flash_update_timeout_notify(struct devlink *devlink, } EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify); +struct devlink_info_req { + struct sk_buff *msg; + void (*version_cb)(const char *version_name, + enum devlink_info_version_type version_type, + void *version_cb_priv); + void *version_cb_priv; +}; + +struct devlink_flash_component_lookup_ctx { + const char *lookup_name; + bool lookup_name_found; +}; + +static void +devlink_flash_component_lookup_cb(const char *version_name, + enum devlink_info_version_type version_type, + void *version_cb_priv) +{ + struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv; + + if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT || + lookup_ctx->lookup_name_found) + return; + + lookup_ctx->lookup_name_found = + !strcmp(lookup_ctx->lookup_name, version_name); +} + +static int devlink_flash_component_get(struct devlink *devlink, + struct nlattr *nla_component, + const char **p_component, + struct netlink_ext_ack *extack) +{ + struct devlink_flash_component_lookup_ctx lookup_ctx = {}; + struct devlink_info_req req = {}; + const char *component; + int ret; + + if (!nla_component) + return 0; + + component = nla_data(nla_component); + + if (!devlink->ops->info_get) { + NL_SET_ERR_MSG_ATTR(extack, nla_component, + "component update is not supported by this device"); + return -EOPNOTSUPP; + } + + lookup_ctx.lookup_name = component; + req.version_cb = devlink_flash_component_lookup_cb; + req.version_cb_priv = &lookup_ctx; + + ret = devlink->ops->info_get(devlink, &req, NULL); + if (ret) + return ret; + + if (!lookup_ctx.lookup_name_found) { + NL_SET_ERR_MSG_ATTR(extack, nla_component, + "selected component is not supported by this device"); + return -EINVAL; + } + *p_component = component; + return 0; +} + static int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *nla_component, *nla_overwrite_mask, *nla_file_name; + struct nlattr *nla_overwrite_mask, *nla_file_name; struct devlink_flash_update_params params = {}; struct devlink *devlink = info->user_ptr[0]; const char *file_name; @@ -4755,20 +4829,16 @@ static int 
devlink_nl_cmd_flash_update(struct sk_buff *skb, if (!devlink->ops->flash_update) return -EOPNOTSUPP; - if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME)) return -EINVAL; - supported_params = devlink->ops->supported_flash_update_params; + ret = devlink_flash_component_get(devlink, + info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT], + ¶ms.component, info->extack); + if (ret) + return ret; - nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; - if (nla_component) { - if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT)) { - NL_SET_ERR_MSG_ATTR(info->extack, nla_component, - "component update is not supported by this device"); - return -EOPNOTSUPP; - } - params.component = nla_data(nla_component); - } + supported_params = devlink->ops->supported_flash_update_params; nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK]; if (nla_overwrite_mask) { @@ -4936,10 +5006,8 @@ static int devlink_nl_cmd_selftests_run(struct sk_buff *skb, if (!devlink->ops->selftest_run || !devlink->ops->selftest_check) return -EOPNOTSUPP; - if (!info->attrs[DEVLINK_ATTR_SELFTESTS]) { - NL_SET_ERR_MSG_MOD(info->extack, "selftest required"); + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS)) return -EINVAL; - } attrs = info->attrs[DEVLINK_ATTR_SELFTESTS]; @@ -5393,7 +5461,7 @@ static int devlink_param_type_get_from_info(struct genl_info *info, enum devlink_param_type *param_type) { - if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) return -EINVAL; switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { @@ -5470,7 +5538,7 @@ devlink_param_get_from_info(struct list_head *param_list, { char *param_name; - if (!info->attrs[DEVLINK_ATTR_PARAM_NAME]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME)) return NULL; param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); @@ -5536,7 +5604,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return err; } - if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) return -EINVAL; cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); if (!devlink_param_cmode_is_supported(param, cmode)) @@ -5574,89 +5642,22 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { - struct devlink_param_item *param_item; - struct devlink_port *devlink_port; - struct devlink *devlink; - int start = cb->args[0]; - unsigned long index; - int idx = 0; - int err = 0; - - devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - devl_lock(devlink); - list_for_each_entry(devlink_port, &devlink->port_list, list) { - list_for_each_entry(param_item, - &devlink_port->param_list, list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_param_fill(msg, - devlink_port->devlink, - devlink_port->index, param_item, - DEVLINK_CMD_PORT_PARAM_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err == -EOPNOTSUPP) { - err = 0; - } else if (err) { - devl_unlock(devlink); - devlink_put(devlink); - goto out; - } - idx++; - } - } - devl_unlock(devlink); - devlink_put(devlink); - } -out: - if (err != -EMSGSIZE) - return err; - - cb->args[0] = idx; + NL_SET_ERR_MSG_MOD(cb->extack, "Port params are not supported"); return msg->len; } static int devlink_nl_cmd_port_param_get_doit(struct 
sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink_param_item *param_item; - struct sk_buff *msg; - int err; - - param_item = devlink_param_get_from_info(&devlink_port->param_list, - info); - if (!param_item) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_param_fill(msg, devlink_port->devlink, - devlink_port->index, param_item, - DEVLINK_CMD_PORT_PARAM_GET, - info->snd_portid, info->snd_seq, 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); + NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported"); + return -EINVAL; } static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[1]; - - return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, - devlink_port->index, - &devlink_port->param_list, info, - DEVLINK_CMD_PORT_PARAM_NEW); + NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported"); + return -EINVAL; } static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, @@ -6056,7 +6057,7 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb, unsigned int index; int err; - if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) return -EINVAL; if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { @@ -6189,8 +6190,8 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb, unsigned int index; u32 snapshot_id; - if (!info->attrs[DEVLINK_ATTR_REGION_NAME] || - !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) || + GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID)) return -EINVAL; region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); @@ -6238,7 +6239,7 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) u8 *data; int err; - if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) { + if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) { NL_SET_ERR_MSG_MOD(info->extack, "No region name provided"); return -EINVAL; } @@ -6553,18 +6554,18 @@ out_unlock: return err; } -struct devlink_info_req { - struct sk_buff *msg; -}; - int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) { + if (!req->msg) + return 0; return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); } EXPORT_SYMBOL_GPL(devlink_info_driver_name_put); int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) { + if (!req->msg) + return 0; return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); @@ -6572,6 +6573,8 @@ EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); int devlink_info_board_serial_number_put(struct devlink_info_req *req, const char *bsn) { + if (!req->msg) + return 0; return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, bsn); } @@ -6579,11 +6582,19 @@ EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put); static int devlink_info_version_put(struct devlink_info_req *req, int attr, const char *version_name, - const char *version_value) + const char *version_value, + enum devlink_info_version_type version_type) { struct nlattr *nest; int err; + if (req->version_cb) + req->version_cb(version_name, version_type, + req->version_cb_priv); + + if (!req->msg) + return 0; + nest = nla_nest_start_noflag(req->msg, attr); if (!nest) return -EMSGSIZE; @@ -6612,7 +6623,8 
@@ int devlink_info_version_fixed_put(struct devlink_info_req *req, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, - version_name, version_value); + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); @@ -6621,25 +6633,49 @@ int devlink_info_version_stored_put(struct devlink_info_req *req, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, - version_name, version_value); + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); +int devlink_info_version_stored_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, + version_name, version_value, + version_type); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext); + int devlink_info_version_running_put(struct devlink_info_req *req, const char *version_name, const char *version_value) { return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, - version_name, version_value); + version_name, version_value, + DEVLINK_INFO_VERSION_TYPE_NONE); } EXPORT_SYMBOL_GPL(devlink_info_version_running_put); +int devlink_info_version_running_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, + version_name, version_value, + version_type); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); + static int devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { - struct devlink_info_req req; + struct devlink_info_req req = {}; void *hdr; int err; @@ -9513,6 +9549,7 @@ static struct genl_family devlink_nl_family __ro_after_init = { .module = THIS_MODULE, .small_ops = devlink_nl_ops, .n_small_ops = ARRAY_SIZE(devlink_nl_ops), + .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1, .mcgrps = devlink_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps), }; @@ -9818,6 +9855,44 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) } /** + * devlink_port_init() - Init devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * + * Initialize essencial stuff that is needed for functions + * that may be called before devlink port registration. + * Call to this function is optional and not needed + * in case the driver does not use such functions. + */ +void devlink_port_init(struct devlink *devlink, + struct devlink_port *devlink_port) +{ + if (devlink_port->initialized) + return; + devlink_port->devlink = devlink; + INIT_LIST_HEAD(&devlink_port->region_list); + devlink_port->initialized = true; +} +EXPORT_SYMBOL_GPL(devlink_port_init); + +/** + * devlink_port_fini() - Deinitialize devlink port + * + * @devlink_port: devlink port + * + * Deinitialize essencial stuff that is in use for functions + * that may be called after devlink port unregistration. + * Call to this function is optional and not needed + * in case the driver does not use such functions. 
+ */ +void devlink_port_fini(struct devlink_port *devlink_port) +{ + WARN_ON(!list_empty(&devlink_port->region_list)); +} +EXPORT_SYMBOL_GPL(devlink_port_fini); + +/** * devl_port_register() - Register devlink port * * @devlink: devlink @@ -9839,15 +9914,15 @@ int devl_port_register(struct devlink *devlink, if (devlink_port_index_exists(devlink, port_index)) return -EEXIST; - WARN_ON(devlink_port->devlink); - devlink_port->devlink = devlink; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + devlink_port_init(devlink, devlink_port); + devlink_port->registered = true; devlink_port->index = port_index; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); mutex_init(&devlink_port->reporters_lock); list_add_tail(&devlink_port->list, &devlink->port_list); - INIT_LIST_HEAD(&devlink_port->param_list); - INIT_LIST_HEAD(&devlink_port->region_list); INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); @@ -9897,8 +9972,8 @@ void devl_port_unregister(struct devlink_port *devlink_port) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); list_del(&devlink_port->list); WARN_ON(!list_empty(&devlink_port->reporter_list)); - WARN_ON(!list_empty(&devlink_port->region_list)); mutex_destroy(&devlink_port->reporters_lock); + devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); @@ -9923,8 +9998,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { - if (WARN_ON(!devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); + devlink_port_type_warn_cancel(devlink_port); spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; @@ -10043,8 +10118,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) @@ -10067,8 +10142,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) @@ -10094,8 +10169,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) @@ -10122,8 +10197,8 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) @@ -10238,8 +10313,8 @@ EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); void devlink_port_linecard_set(struct devlink_port *devlink_port, struct devlink_linecard *linecard) { - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + devlink_port->linecard = linecard; } EXPORT_SYMBOL_GPL(devlink_port_linecard_set); @@ -11310,6 +11385,8 @@ devlink_port_region_create(struct 
devlink_port *port, struct devlink_region *region; int err = 0; + ASSERT_DEVLINK_PORT_INITIALIZED(port); + if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); @@ -12306,8 +12383,8 @@ EXPORT_SYMBOL_GPL(devl_trap_policers_unregister); static void __devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { + struct devlink_info_req req = {}; const struct nlattr *nlattr; - struct devlink_info_req req; struct sk_buff *msg; int rem, err; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 75501e1bdd25..f084a4a6b7ab 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -464,7 +464,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, goto out; hw_entry = &hw_entries->entries[hw_entries->num_entries]; - strlcpy(hw_entry->trap_name, metadata->trap_name, + strscpy(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1); hw_entry->count = 1; hw_entries->num_entries++; @@ -1645,6 +1645,7 @@ static struct genl_family net_drop_monitor_family __ro_after_init = { .module = THIS_MODULE, .small_ops = dropmon_ops, .n_small_ops = ARRAY_SIZE(dropmon_ops), + .resv_start_op = NET_DM_CMD_STATS_GET + 1, .mcgrps = dropmon_mcgrps, .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), }; diff --git a/net/core/filter.c b/net/core/filter.c index e8508aaafd27..bb0136e7a8e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,6 +18,7 @@ */ #include <linux/atomic.h> +#include <linux/bpf_verifier.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> @@ -1214,10 +1215,11 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); + int optmem_max = READ_ONCE(sysctl_optmem_max); /* same check as in sock_kmalloc() */ - if (filter_size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { + if (filter_size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } @@ -1548,7 +1550,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - if (bpf_prog_size(prog->len) > sysctl_optmem_max) + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); @@ -1615,7 +1617,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ - if (bpf_prog_size(prog->len) > sysctl_optmem_max) { + if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) { err = -ENOMEM; goto err_prog_put; } @@ -3009,7 +3011,7 @@ BPF_CALL_0(bpf_get_cgroup_classid_curr) return __task_get_classid(current); } -static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { +const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, @@ -4488,7 +4490,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key void *to_orig = to; int err; - if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | + BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } @@ -4520,7 +4523,10 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - to->tunnel_ext = 0; + if (flags & 
BPF_F_TUNINFO_FLAGS) + to->tunnel_flags = info->key.tun_flags; + else + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, @@ -5013,359 +5019,303 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static int __bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_socket_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + switch (optname) { + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_RCVBUF: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_REUSEPORT: + case SO_RCVLOWAT: + case SO_MARK: + case SO_MAX_PACING_RATE: + case SO_BINDTOIFINDEX: + case SO_TXREHASH: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case SO_BINDTODEVICE: + break; + default: + return -EINVAL; + } + + if (getopt) { + if (optname == SO_BINDTODEVICE) + return -EINVAL; + return sk_getsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + return sk_setsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) { - char devname[IFNAMSIZ]; - int val, valbool; - struct net *net; - int ifindex; - int ret = 0; + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + int val; - if (!sk_fullsock(sk)) + if (optlen != sizeof(int)) return -EINVAL; - if (level == SOL_SOCKET) { - if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) + val = *(int *)optval; + + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > tp->syn_data) return -EINVAL; - val = *((int *)optval); - valbool = val ? 1 : 0; - - /* Only some socketops are supported */ - switch (optname) { - case SO_RCVBUF: - val = min_t(u32, val, sysctl_rmem_max); - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); - break; - case SO_SNDBUF: - val = min_t(u32, val, sysctl_wmem_max); - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - WRITE_ONCE(sk->sk_sndbuf, - max_t(int, val * 2, SOCK_MIN_SNDBUF)); - break; - case SO_MAX_PACING_RATE: /* 32bit version */ - if (val != ~0U) - cmpxchg(&sk->sk_pacing_status, - SK_PACING_NONE, - SK_PACING_NEEDED); - sk->sk_max_pacing_rate = (val == ~0U) ? - ~0UL : (unsigned int)val; - sk->sk_pacing_rate = min(sk->sk_pacing_rate, - sk->sk_max_pacing_rate); - break; - case SO_PRIORITY: - sk->sk_priority = val; - break; - case SO_RCVLOWAT: - if (val < 0) - val = INT_MAX; - if (sk->sk_socket && sk->sk_socket->ops->set_rcvlowat) - ret = sk->sk_socket->ops->set_rcvlowat(sk, val); - else - WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); - break; - case SO_MARK: - if (sk->sk_mark != val) { - sk->sk_mark = val; - sk_dst_reset(sk); - } - break; - case SO_BINDTODEVICE: - optlen = min_t(long, optlen, IFNAMSIZ - 1); - strncpy(devname, optval, optlen); - devname[optlen] = 0; + tcp_snd_cwnd_set(tp, val); + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) + return -EINVAL; + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; + default: + return -EINVAL; + } - ifindex = 0; - if (devname[0] != '\0') { - struct net_device *dev; + return 0; +} - ret = -ENODEV; +static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, + int *optlen, bool getopt) +{ + struct tcp_sock *tp; + int ret; - net = sock_net(sk); - dev = dev_get_by_name(net, devname); - if (!dev) - break; - ifindex = dev->ifindex; - dev_put(dev); - } - fallthrough; - case SO_BINDTOIFINDEX: - if (optname == SO_BINDTOIFINDEX) - ifindex = val; - ret = sock_bindtoindex(sk, ifindex, false); - break; - case SO_KEEPALIVE: - if (sk->sk_prot->keepalive) - sk->sk_prot->keepalive(sk, valbool); - sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); - break; - case SO_REUSEPORT: - sk->sk_reuseport = valbool; - break; - case SO_TXREHASH: - if (val < -1 || val > 1) { - ret = -EINVAL; - break; - } - sk->sk_txrehash = (u8)val; - break; - default: - ret = -EINVAL; - } -#ifdef CONFIG_INET - } else if (level == SOL_IP) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET) + if (*optlen < 2) + return -EINVAL; + + if (getopt) { + if (!inet_csk(sk)->icsk_ca_ops) return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; + return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct inet_sock *inet = inet_sk(sk); + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg. + */ + if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; - if (val == -1) - val = 0; - inet->tos = val; - } - break; - default: - ret = -EINVAL; - } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - return -EINVAL; + /* It stops this looping + * + * .init => bpf_setsockopt(tcp_cc) => .init => + * bpf_setsockopt(tcp_cc)" => .init => .... + * + * The second bpf_setsockopt(tcp_cc) is not allowed + * in order to break the loop when both .init + * are the same bpf prog. + * + * This applies even the second bpf_setsockopt(tcp_cc) + * does not cause a loop. This limits only the first + * '.init' can call bpf_setsockopt(TCP_CONGESTION) to + * pick a fallback cc (eg. peer does not support ECN) + * and the second '.init' cannot fallback to + * another. 
+ */ + tp = tcp_sk(sk); + if (tp->bpf_chg_cc_inprogress) + return -EBUSY; + + tp->bpf_chg_cc_inprogress = 1; + ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), *optlen); + tp->bpf_chg_cc_inprogress = 0; + return ret; +} - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct ipv6_pinfo *np = inet6_sk(sk); +static int sol_tcp_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_prot->setsockopt != tcp_setsockopt) + return -EINVAL; - if (val == -1) - val = 0; - np->tclass = val; - } - break; - default: - ret = -EINVAL; - } -#endif - } else if (level == SOL_TCP && - sk->sk_prot->setsockopt == tcp_setsockopt) { - if (optname == TCP_CONGESTION) { - char name[TCP_CA_NAME_MAX]; - - strncpy(name, optval, min_t(long, optlen, - TCP_CA_NAME_MAX-1)); - name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, true); - } else { - struct inet_connection_sock *icsk = inet_csk(sk); + switch (optname) { + case TCP_NODELAY: + case TCP_MAXSEG: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_USER_TIMEOUT: + case TCP_NOTSENT_LOWAT: + case TCP_SAVE_SYN: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case TCP_CONGESTION: + return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); + case TCP_SAVED_SYN: + if (*optlen < 1) + return -EINVAL; + break; + default: + if (getopt) + return -EINVAL; + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); + } + + if (getopt) { + if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); - unsigned long timeout; - if (optlen != sizeof(int)) + if (!tp->saved_syn || + *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; - - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case TCP_BPF_IW: - if (val <= 0 || tp->data_segs_out > tp->syn_data) - ret = -EINVAL; - else - tcp_snd_cwnd_set(tp, val); - break; - case TCP_BPF_SNDCWND_CLAMP: - if (val <= 0) { - ret = -EINVAL; - } else { - tp->snd_cwnd_clamp = val; - tp->snd_ssthresh = val; - } - break; - case TCP_BPF_DELACK_MAX: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_DELACK_MAX || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_delack_max = timeout; - break; - case TCP_BPF_RTO_MIN: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_RTO_MIN || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_rto_min = timeout; - break; - case TCP_SAVE_SYN: - if (val < 0 || val > 1) - ret = -EINVAL; - else - tp->save_syn = val; - break; - case TCP_KEEPIDLE: - ret = tcp_sock_set_keepidle_locked(sk, val); - break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) - ret = -EINVAL; - else - tp->keepalive_intvl = val * HZ; - break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - ret = -EINVAL; - else - tp->keepalive_probes = val; - break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - ret = -EINVAL; - else - icsk->icsk_syn_retries = val; - break; - case TCP_USER_TIMEOUT: - if (val < 0) - ret = -EINVAL; - else - icsk->icsk_user_timeout = val; - break; - case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; - sk->sk_write_space(sk); - break; - case TCP_WINDOW_CLAMP: - ret = tcp_set_window_clamp(sk, val); - break; - default: - ret = -EINVAL; - } + memcpy(optval, 
tp->saved_syn->data, *optlen); + /* It cannot free tp->saved_syn here because it + * does not know if the user space still needs it. + */ + return 0; } -#endif - } else { - ret = -EINVAL; + + return do_tcp_getsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); } - return ret; + + return do_tcp_setsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), *optlen); } -static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_ip_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { - if (sk_fullsock(sk)) - sock_owned_by_me(sk); - return __bpf_setsockopt(sk, level, optname, optval, optlen); + if (sk->sk_family != AF_INET) + return -EINVAL; + + switch (optname) { + case IP_TOS: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (getopt) + return do_ip_getsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + + return do_ip_setsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), *optlen); } -static int __bpf_getsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_ipv6_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { - if (!sk_fullsock(sk)) - goto err_clear; + if (sk->sk_family != AF_INET6) + return -EINVAL; - if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) - goto err_clear; + switch (optname) { + case IPV6_TCLASS: + case IPV6_AUTOFLOWLABEL: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } - switch (optname) { - case SO_RCVBUF: - *((int *)optval) = sk->sk_rcvbuf; - break; - case SO_SNDBUF: - *((int *)optval) = sk->sk_sndbuf; - break; - case SO_MARK: - *((int *)optval) = sk->sk_mark; - break; - case SO_PRIORITY: - *((int *)optval) = sk->sk_priority; - break; - case SO_BINDTOIFINDEX: - *((int *)optval) = sk->sk_bound_dev_if; - break; - case SO_REUSEPORT: - *((int *)optval) = sk->sk_reuseport; - break; - case SO_TXREHASH: - *((int *)optval) = sk->sk_txrehash; - break; - default: - goto err_clear; - } -#ifdef CONFIG_INET - } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { - struct inet_connection_sock *icsk; - struct tcp_sock *tp; + if (getopt) + return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); - switch (optname) { - case TCP_CONGESTION: - icsk = inet_csk(sk); + return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), *optlen); +} - if (!icsk->icsk_ca_ops || optlen <= 1) - goto err_clear; - strncpy(optval, icsk->icsk_ca_ops->name, optlen); - optval[optlen - 1] = 0; - break; - case TCP_SAVED_SYN: - tp = tcp_sk(sk); +static int __bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (!sk_fullsock(sk)) + return -EINVAL; - if (optlen <= 0 || !tp->saved_syn || - optlen > tcp_saved_syn_len(tp->saved_syn)) - goto err_clear; - memcpy(optval, tp->saved_syn->data, optlen); - break; - default: - goto err_clear; - } - } else if (level == SOL_IP) { - struct inet_sock *inet = inet_sk(sk); + if (level == SOL_SOCKET) + return sol_socket_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + return sol_ip_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && 
level == SOL_TCP) + return sol_tcp_sockopt(sk, optname, optval, &optlen, false); - if (optlen != sizeof(int) || sk->sk_family != AF_INET) - goto err_clear; + return -EINVAL; +} - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - *((int *)optval) = (int)inet->tos; - break; - default: - goto err_clear; - } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { - struct ipv6_pinfo *np = inet6_sk(sk); +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - goto err_clear; +static int __bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + int err, saved_optlen = optlen; - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - *((int *)optval) = (int)np->tclass; - break; - default: - goto err_clear; - } -#endif -#endif - } else { - goto err_clear; + if (!sk_fullsock(sk)) { + err = -EINVAL; + goto done; } - return 0; -err_clear: - memset(optval, 0, optlen); - return -EINVAL; + + if (level == SOL_SOCKET) + err = sol_socket_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + err = sol_ip_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); + else + err = -EINVAL; + +done: + if (err) + optlen = 0; + if (optlen < saved_optlen) + memset(optval + optlen, 0, saved_optlen - optlen); + return err; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, @@ -5379,12 +5329,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { - if (level == SOL_TCP && optname == TCP_CONGESTION) { - if (optlen >= sizeof("cdg") - 1 && - !strncmp("cdg", optval, optlen)) - return -ENOTSUPP; - } - return _bpf_setsockopt(sk, level, optname, optval, optlen); } @@ -6468,6 +6412,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { + struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6476,7 +6421,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, + sk = __inet_lookup(net, hinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6490,7 +6435,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, + sk = __inet6_lookup(net, hinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); @@ -7666,34 +7611,23 @@ const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + 
func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: @@ -7706,12 +7640,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: @@ -7724,24 +7663,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; @@ -7822,9 +7745,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -8064,6 +7991,12 @@ const struct bpf_func_proto 
bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; @@ -8077,8 +8010,6 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: @@ -8713,6 +8644,36 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +DEFINE_MUTEX(nf_conn_btf_access_lock); +EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); + +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); +EXPORT_SYMBOL_GPL(nfct_btf_struct_access); + +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -8772,6 +8733,27 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static int xdp_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -10666,6 +10648,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10677,6 +10660,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, + .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { @@ -10811,14 +10795,13 @@ int sk_detach_filter(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_detach_filter); -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, - unsigned int len) +int 
sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; - lock_sock(sk); + sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) @@ -10843,7 +10826,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; ret = -EFAULT; - if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) + if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number @@ -10851,7 +10834,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, */ ret = fprog->len; out: - release_sock(sk); + sockopt_release_sock(sk); return ret; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 764c4cb3fe8f..007730412947 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -204,6 +204,30 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } +static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_l2tpv3 *key_l2tpv3; + struct { + __be32 session_id; + } *hdr, _hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_l2tpv3 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_L2TPV3, + target_container); + + key_l2tpv3->session_id = hdr->session_id; +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -272,7 +296,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb, key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - key->ct_mark = ct->mark; + key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); @@ -866,8 +890,8 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags) +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -892,7 +916,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); - return result == BPF_OK; + return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) @@ -1008,6 +1032,7 @@ bool __skb_flow_dissect(const struct net *net, }; __be16 n_proto = proto; struct bpf_prog *prog; + u32 result; if (skb) { ctx.skb = skb; @@ -1019,13 +1044,16 @@ bool __skb_flow_dissect(const struct net *net, } prog = READ_ONCE(run_array->items[0].prog); - ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, - hlen, flags); + result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, + hlen, flags); + if (result == BPF_FLOW_DISSECTOR_CONTINUE) + goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); - return ret; + return result == BPF_OK; } +dissect_continue: rcu_read_unlock(); } @@ -1173,8 +1201,8 @@ proto_again: nhoff += sizeof(*vlan); } - if (dissector_uses_key(flow_dissector, - 
FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && + !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { struct flow_dissector_key_num_of_vlans *key_nvs; key_nvs = skb_flow_dissector_target(flow_dissector, @@ -1497,6 +1525,10 @@ ip_proto_again: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; + case IPPROTO_L2TP: + __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; @@ -1611,9 +1643,8 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: - addr_diff = (__force u32)keys->addrs.v4addrs.dst - - (__force u32)keys->addrs.v4addrs.src; - if (addr_diff < 0) + if ((__force u32)keys->addrs.v4addrs.dst < + (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 8cfb63528d18..abe423fd5736 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -237,6 +237,13 @@ void flow_rule_match_pppoe(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_pppoe); +void flow_rule_match_l2tpv3(const struct flow_rule *rule, + struct flow_match_l2tpv3 *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out); +} +EXPORT_SYMBOL(flow_rule_match_l2tpv3); + struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)) diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index a10335b4ba2d..c8d137ef5980 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -345,7 +345,7 @@ static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); - qstats->qlen += qcpu->backlog; + qstats->qlen += qcpu->qlen; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; diff --git a/net/core/gro.c b/net/core/gro.c index b4190eb08467..bc9451743307 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int gro_max_size; unsigned int new_truesize; struct sk_buff *lp; + int segs; /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ gro_max_size = READ_ONCE(p->dev->gro_max_size); @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) return -E2BIG; } + segs = NAPI_GRO_CB(skb)->count; lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); @@ -265,7 +267,7 @@ merge: lp = p; done: - NAPI_GRO_CB(p)->count++; + NAPI_GRO_CB(p)->count += segs; p->data_len += len; p->truesize += delta_truesize; p->len += len; @@ -496,8 +498,15 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->count = 1; + if (unlikely(skb_is_gso(skb))) { + NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; + /* Only support TCP at the moment. 
*/ + if (!skb_is_gso_tcp(skb)) + NAPI_GRO_CB(skb)->flush = 1; + } /* Setup for GRO checksum validation */ switch (skb->ip_summed) { @@ -545,10 +554,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff else gro_list->count++; - NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; - skb_shinfo(skb)->gso_size = skb_gro_len(skb); + if (!skb_is_gso(skb)) + skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, &gro_list->list); ret = GRO_HELD; @@ -660,6 +669,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; + skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 541c7a72a28a..ed5ec5de47f6 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -26,7 +26,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) cell = this_cpu_ptr(gcells->cells); - if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { + if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) { drop: dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); @@ -81,8 +81,7 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); - netif_napi_add(dev, &cell->napi, gro_cell_poll, - NAPI_POLL_WEIGHT); + netif_napi_add(dev, &cell->napi, gro_cell_poll); napi_enable(&cell->napi); } return 0; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 9ccd64e8a666..711cd3b4347a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -48,6 +48,9 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "RPL"; case LWTUNNEL_ENCAP_IOAM6: return "IOAM6"; + case LWTUNNEL_ENCAP_XFRM: + /* module autoload not supported for encap type */ + return NULL; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6a8c2596ebab..952a54763358 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -111,7 +111,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? (prandom_u32() % base) + (base >> 1) : 0; + return base ? 
prandom_u32_max(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -307,11 +307,53 @@ static int neigh_del_timer(struct neighbour *n) return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list) +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_parms_qlen_dec(struct net_device *dev, int family) +{ + struct neigh_parms *p; + + rcu_read_lock(); + p = neigh_get_dev_parms_rcu(dev, family); + if (p) + p->qlen--; + rcu_read_unlock(); +} + +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, + int family) { + struct sk_buff_head tmp; + unsigned long flags; struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) { + skb_queue_head_init(&tmp); + spin_lock_irqsave(&list->lock, flags); + skb = skb_peek(list); + while (skb != NULL) { + struct sk_buff *skb_next = skb_peek_next(skb, list); + struct net_device *dev = skb->dev; + + if (net == NULL || net_eq(dev_net(dev), net)) { + neigh_parms_qlen_dec(dev, family); + __skb_unlink(skb, list); + __skb_queue_tail(&tmp, skb); + } + skb = skb_next; + } + spin_unlock_irqrestore(&list->lock, flags); + + while ((skb = __skb_dequeue(&tmp))) { dev_put(skb->dev); kfree_skb(skb); } @@ -385,9 +427,10 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - - del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue); + pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, + tbl->family); + if (skb_queue_empty_lockless(&tbl->proxy_queue)) + del_timer_sync(&tbl->proxy_timer); return 0; } @@ -1598,7 +1641,9 @@ static void neigh_proxy_process(struct timer_list *t) if (tdif <= 0) { struct net_device *dev = skb->dev; + neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) { rcu_read_lock(); tbl->proxy_redo(skb); @@ -1623,7 +1668,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, unsigned long sched_next = jiffies + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); - if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { + if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); return; } @@ -1639,6 +1684,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); + p->qlen++; mod_timer(&tbl->proxy_timer, sched_next); spin_unlock(&tbl->proxy_queue.lock); } @@ -1671,6 +1717,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, refcount_set(&p->refcnt, 1); p->reachable_time = neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; write_pnet(&p->net, net); @@ -1736,6 +1783,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) refcount_set(&tbl->parms.refcnt, 1); tbl->parms.reachable_time = neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); + tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); if (!tbl->stats) @@ -1787,7 +1835,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); - 
pneigh_queue_purge(&tbl->proxy_queue); + pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); @@ -1819,9 +1867,6 @@ static struct neigh_table *neigh_find_table(int family) case AF_INET6: tbl = neigh_tables[NEIGH_ND_TABLE]; break; - case AF_DECnet: - tbl = neigh_tables[NEIGH_DN_TABLE]; - break; } return tbl; @@ -3508,18 +3553,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, return ret; } -static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, - int family) -{ - switch (family) { - case AF_INET: - return __in_dev_arp_parms_get_rcu(dev); - case AF_INET6: - return __in6_dev_nd_parms_get_rcu(dev); - } - return NULL; -} - static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index d61afd21aab5..8409d41405df 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -59,7 +59,7 @@ static ssize_t netdev_show(const struct device *dev, #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sprintf(buf, format_string, dev->field); \ + return sysfs_emit(buf, format_string, dev->field); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -118,13 +118,13 @@ static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, { struct net_device *ndev = to_net_dev(dev); - return sprintf(buf, fmt_dec, dev_get_iflink(ndev)); + return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev)); } static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sprintf(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, dev->name_assign_type); } static ssize_t name_assign_type_show(struct device *dev, @@ -194,7 +194,7 @@ static ssize_t carrier_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); return -EINVAL; } @@ -219,7 +219,7 @@ static ssize_t speed_show(struct device *dev, struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, cmd.base.speed); + ret = sysfs_emit(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; @@ -258,7 +258,7 @@ static ssize_t duplex_show(struct device *dev, duplex = "unknown"; break; } - ret = sprintf(buf, "%s\n", duplex); + ret = sysfs_emit(buf, "%s\n", duplex); } } rtnl_unlock(); @@ -272,7 +272,7 @@ static ssize_t testing_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev)); return -EINVAL; } @@ -284,7 +284,7 @@ static ssize_t dormant_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev)); return -EINVAL; } @@ -315,7 +315,7 @@ static ssize_t operstate_show(struct device *dev, if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ - return sprintf(buf, "%s\n", operstates[operstate]); + return sysfs_emit(buf, "%s\n", operstates[operstate]); } static DEVICE_ATTR_RO(operstate); @@ -325,9 +325,9 @@ 
static ssize_t carrier_changes_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, - atomic_read(&netdev->carrier_up_count) + - atomic_read(&netdev->carrier_down_count)); + return sysfs_emit(buf, fmt_dec, + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); @@ -337,7 +337,7 @@ static ssize_t carrier_up_count_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); } static DEVICE_ATTR_RO(carrier_up_count); @@ -347,7 +347,7 @@ static ssize_t carrier_down_count_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_down_count); @@ -462,7 +462,7 @@ static ssize_t ifalias_show(struct device *dev, ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) - ret = sprintf(buf, "%s\n", tmp); + ret = sysfs_emit(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -514,7 +514,7 @@ static ssize_t phys_port_id_show(struct device *dev, ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); @@ -543,7 +543,7 @@ static ssize_t phys_port_name_show(struct device *dev, ret = dev_get_phys_port_name(netdev, name, sizeof(name)); if (!ret) - ret = sprintf(buf, "%s\n", name); + ret = sysfs_emit(buf, "%s\n", name); } rtnl_unlock(); @@ -573,7 +573,7 @@ static ssize_t phys_switch_id_show(struct device *dev, ret = dev_get_port_parent_id(netdev, &ppid, false); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); @@ -591,7 +591,7 @@ static ssize_t threaded_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) - ret = sprintf(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, netdev->threaded); rtnl_unlock(); return ret; @@ -673,7 +673,7 @@ static ssize_t netstat_show(const struct device *d, struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); + ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } read_unlock(&dev_base_lock); return ret; @@ -824,7 +824,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) for (i = 0; i < map->len; i++) cpumask_set_cpu(map->cpus[i], mask); - len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); + len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask)); rcu_read_unlock(); free_cpumask_var(mask); @@ -910,7 +910,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, val = (unsigned long)flow_table->mask + 1; rcu_read_unlock(); - return sprintf(buf, "%lu\n", val); + return sysfs_emit(buf, "%lu\n", val); } static void rps_dev_flow_table_release(struct rcu_head *rcu) @@ -1208,7 +1208,7 @@ static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); - return sprintf(buf, fmt_ulong, trans_timeout); + return sysfs_emit(buf, fmt_ulong, trans_timeout); } static unsigned int 
get_netdev_queue_index(struct netdev_queue *queue) @@ -1255,15 +1255,15 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, * belongs to the root device it will be reported with just the * traffic class, so just "0" for TC 0 for example. */ - return num_tc < 0 ? sprintf(buf, "%d%d\n", tc, num_tc) : - sprintf(buf, "%d\n", tc); + return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) : + sysfs_emit(buf, "%d\n", tc); } #ifdef CONFIG_XPS static ssize_t tx_maxrate_show(struct netdev_queue *queue, char *buf) { - return sprintf(buf, "%lu\n", queue->tx_maxrate); + return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } static ssize_t tx_maxrate_store(struct netdev_queue *queue, @@ -1317,7 +1317,7 @@ static struct netdev_queue_attribute queue_traffic_class __ro_after_init */ static ssize_t bql_show(char *buf, unsigned int value) { - return sprintf(buf, "%u\n", value); + return sysfs_emit(buf, "%u\n", value); } static ssize_t bql_set(const char *buf, const size_t count, @@ -1346,7 +1346,7 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue, { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } static ssize_t bql_set_hold_time(struct netdev_queue *queue, @@ -1374,7 +1374,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue, { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); + return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed); } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6b9f19122ec1..f64654df71a2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -18,7 +18,6 @@ #include <linux/user_namespace.h> #include <linux/net_namespace.h> #include <linux/sched/task.h> -#include <linux/sched/mm.h> #include <linux/uidgid.h> #include <linux/cookie.h> @@ -118,6 +117,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) static int ops_init(const struct pernet_operations *ops, struct net *net) { + struct net_generic *ng; int err = -ENOMEM; void *data = NULL; @@ -136,7 +136,13 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) if (!err) return 0; + if (ops->id && ops->size) { cleanup: + ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&pernet_ops_rwsem)); + ng->ptr[*ops->id] = NULL; + } + kfree(data); out: @@ -1144,13 +1150,7 @@ static int __register_pernet_operations(struct list_head *list, * setup_net() and cleanup_net() are not possible. 
*/ for_each_net(net) { - struct mem_cgroup *old, *memcg; - - memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net)); - old = set_active_memcg(memcg); error = ops_init(ops, net); - set_active_memcg(old); - mem_cgroup_put(memcg); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 1a6a86693b74..d6a70aeaa503 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -66,7 +66,7 @@ struct update_classid_context { #define UPDATE_CLASSID_BATCH 1000 -static int update_classid_sock(const void *v, struct file *file, unsigned n) +static int update_classid_sock(const void *v, struct file *file, unsigned int n) { struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5d27067b72d5..9be762e1d042 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -556,7 +556,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) if ((delim = strchr(cur, ',')) == NULL) goto parse_failed; *delim = 0; - strlcpy(np->dev_name, cur, sizeof(np->dev_name)); + strscpy(np->dev_name, cur, sizeof(np->dev_name)); cur = delim; } cur++; @@ -610,7 +610,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) int err; np->dev = ndev; - strlcpy(np->dev_name, ndev->name, IFNAMSIZ); + strscpy(np->dev_name, ndev->name, IFNAMSIZ); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 88906ba6d9a7..c3763056c554 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2324,7 +2324,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = prandom_u32() % pkt_dev->cflows; + flow = prandom_u32_max(pkt_dev->cflows); pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2380,10 +2380,9 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = prandom_u32() % - (pkt_dev->queue_map_max - - pkt_dev->queue_map_min + 1) - + pkt_dev->queue_map_min; + t = prandom_u32_max(pkt_dev->queue_map_max - + pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; } else { t = pkt_dev->cur_queue_map + 1; if (t > pkt_dev->queue_map_max) @@ -2412,7 +2411,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = prandom_u32() % pkt_dev->src_mac_count; + mc = prandom_u32_max(pkt_dev->src_mac_count); else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2438,7 +2437,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = prandom_u32() % pkt_dev->dst_mac_count; + mc = prandom_u32_max(pkt_dev->dst_mac_count); else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2465,23 +2464,23 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)prandom_u32() & + ((__force __be32)get_random_u32() & htonl(0x000fffff)); } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = prandom_u32() & (4096 - 1); + pkt_dev->vlan_id = prandom_u32_max(4096); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = 
prandom_u32() & (4096 - 1); + pkt_dev->svlan_id = prandom_u32_max(4096); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if (pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = prandom_u32() % - (pkt_dev->udp_src_max - pkt_dev->udp_src_min) - + pkt_dev->udp_src_min; + pkt_dev->cur_udp_src = prandom_u32_max( + pkt_dev->udp_src_max - pkt_dev->udp_src_min) + + pkt_dev->udp_src_min; else { pkt_dev->cur_udp_src++; @@ -2492,9 +2491,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = prandom_u32() % - (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) - + pkt_dev->udp_dst_min; + pkt_dev->cur_udp_dst = prandom_u32_max( + pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + + pkt_dev->udp_dst_min; } else { pkt_dev->cur_udp_dst++; if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) @@ -2509,7 +2508,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = prandom_u32() % (imx - imn) + imn; + t = prandom_u32_max(imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2531,8 +2530,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_IPDST_RND) { do { - t = prandom_u32() % - (imx - imn) + imn; + t = prandom_u32_max(imx - imn) + + imn; s = htonl(t); } while (ipv4_is_loopback(s) || ipv4_is_multicast(s) || @@ -2569,7 +2568,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)prandom_u32() | + (((__force __be32)get_random_u32() | pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } @@ -2579,9 +2578,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = prandom_u32() % - (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) - + pkt_dev->min_pkt_size; + t = prandom_u32_max(pkt_dev->max_pkt_size - + pkt_dev->min_pkt_size) + + pkt_dev->min_pkt_size; } else { t = pkt_dev->cur_pkt_size + 1; if (t > pkt_dev->max_pkt_size) @@ -2590,7 +2589,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->cur_pkt_size = t; } else if (pkt_dev->n_imix_entries > 0) { struct imix_pkt *entry; - __u32 t = prandom_u32() % IMIX_PRECISION; + __u32 t = prandom_u32_max(IMIX_PRECISION); __u8 entry_index = pkt_dev->imix_distribution[t]; entry = &pkt_dev->imix_entries[entry_index]; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ac45328607f7..74864dc46a7e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -866,14 +866,12 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; case IF_OPER_TESTING: - if (operstate == IF_OPER_UP || - operstate == IF_OPER_UNKNOWN) + if (netif_oper_up(dev)) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: - if (operstate == IF_OPER_UP || - operstate == IF_OPER_UNKNOWN) + if (netif_oper_up(dev)) operstate = IF_OPER_DORMANT; break; } @@ -1059,6 +1057,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ @@ -1767,6 +1766,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, 
nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || + nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || @@ -1928,6 +1928,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, + [IFLA_ALLMULTI] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2776,13 +2777,6 @@ static int do_setlink(const struct sk_buff *skb, call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } - if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), - extack); - if (err < 0) - goto errout; - } - if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) @@ -2790,6 +2784,13 @@ static int do_setlink(const struct sk_buff *skb, status |= DO_SETLINK_MODIFIED; } + if (ifm->ifi_flags || ifm->ifi_change) { + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); + if (err < 0) + goto errout; + } + if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) @@ -6070,6 +6071,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) && !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) { NL_SET_ERR_MSG(extack, "Bulk delete is not supported"); + module_put(owner); goto err_unlock; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 974bbbbe7138..88fa40571d0c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -91,7 +91,11 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); -/* The array 'drop_reasons' is auto-generated in dropreason_str.c */ +#undef FN +#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, +const char * const drop_reasons[] = { + DEFINE_DROP_REASON(FN, FN) +}; EXPORT_SYMBOL(drop_reasons); /** @@ -130,8 +134,66 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) +#if PAGE_SIZE == SZ_4K + +#define NAPI_HAS_SMALL_PAGE_FRAG 1 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) + +/* specialized page frag allocator using a single order 0 page + * and slicing it into 1K sized fragment. 
Constrained to systems + * with a very limited amount of 1K fragments fitting a single + * page - to avoid excessive truesize underestimation + */ + +struct page_frag_1k { + void *va; + u16 offset; + bool pfmemalloc; +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) +{ + struct page *page; + int offset; + + offset = nc->offset - SZ_1K; + if (likely(offset >= 0)) + goto use_frag; + + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + if (!page) + return NULL; + + nc->va = page_address(page); + nc->pfmemalloc = page_is_pfmemalloc(page); + offset = PAGE_SIZE - SZ_1K; + page_ref_add(page, offset / SZ_1K); + +use_frag: + nc->offset = offset; + return nc->va + offset; +} +#else + +/* the small page is actually unused in this build; add dummy helpers + * to please the compiler and avoid later preprocessor's conditionals + */ +#define NAPI_HAS_SMALL_PAGE_FRAG 0 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false + +struct page_frag_1k { +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) +{ + return NULL; +} + +#endif + struct napi_alloc_cache { struct page_frag_cache page; + struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -139,6 +201,23 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -557,6 +636,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, { struct napi_alloc_cache *nc; struct sk_buff *skb; + bool pfmemalloc; void *data; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); @@ -564,8 +644,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. 
+ * When the small frag allocator is available, prefer it over kmalloc + * for small fragments */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -576,13 +658,33 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } nc = this_cpu_ptr(&napi_alloc_cache); - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = page_frag_alloc(&nc->page, len, gfp_mask); + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { + /* we are artificially inflating the allocation size, but + * that is not as bad as it may look like, as: + * - 'len' less than GRO_MAX_HEAD makes little sense + * - On most systems, larger 'len' values lead to fragment + * size above 512 bytes + * - kmalloc would use the kmalloc-1k slab for such values + * - Builds with smaller GRO_MAX_HEAD will very likely do + * little networking, as that implies no WiFi and no + * tunnels support, and 32 bits arches. + */ + len = SZ_1K; + + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); + pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); + } else { + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); + + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = nc->page.pfmemalloc; + } + if (unlikely(!data)) return NULL; @@ -592,7 +694,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, return NULL; } - if (nc->page.pfmemalloc) + if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -777,9 +879,10 @@ EXPORT_SYMBOL(__kfree_skb); * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' * tracepoint. 
*/ -void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) { - if (!skb_unref(skb)) + if (unlikely(!skb_unref(skb))) return; DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); @@ -1183,7 +1286,7 @@ EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { - struct ubuf_info *uarg; + struct ubuf_info_msgzc *uarg; struct sk_buff *skb; WARN_ON_ONCE(!in_task()); @@ -1201,19 +1304,19 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->callback = msg_zerocopy_callback; + uarg->ubuf.callback = msg_zerocopy_callback; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; - uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; - refcount_set(&uarg->refcnt, 1); + uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + refcount_set(&uarg->ubuf.refcnt, 1); sock_hold(sk); - return uarg; + return &uarg->ubuf; } -static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) +static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } @@ -1222,6 +1325,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg) { if (uarg) { + struct ubuf_info_msgzc *uarg_zc; const u32 byte_limit = 1 << 19; /* limit to a few TSO */ u32 bytelen, next; @@ -1237,8 +1341,9 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, return NULL; } - bytelen = uarg->bytelen + size; - if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { + uarg_zc = uarg_to_msgzc(uarg); + bytelen = uarg_zc->bytelen + size; + if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { /* TCP can create new skb to attach new uarg */ if (sk->sk_type == SOCK_STREAM) goto new_alloc; @@ -1246,11 +1351,11 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } next = (u32)atomic_read(&sk->sk_zckey); - if ((u32)(uarg->id + uarg->len) == next) { - if (mm_account_pinned_pages(&uarg->mmp, size)) + if ((u32)(uarg_zc->id + uarg_zc->len) == next) { + if (mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; - uarg->len++; - uarg->bytelen = bytelen; + uarg_zc->len++; + uarg_zc->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); /* no extra ref when appending to datagram (MSG_MORE) */ @@ -1286,7 +1391,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) return true; } -static void __msg_zerocopy_callback(struct ubuf_info *uarg) +static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; @@ -1339,19 +1444,21 @@ release: void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { - uarg->zerocopy = uarg->zerocopy & success; + struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); + + uarg_zc->zerocopy = uarg_zc->zerocopy & success; if (refcount_dec_and_test(&uarg->refcnt)) - __msg_zerocopy_callback(uarg); + __msg_zerocopy_callback(uarg_zc); } EXPORT_SYMBOL_GPL(msg_zerocopy_callback); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { - struct sock *sk = skb_from_uarg(uarg)->sk; + struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; atomic_dec(&sk->sk_zckey); - uarg->len--; + uarg_to_msgzc(uarg)->len--; if (have_uref) msg_zerocopy_callback(NULL, 
uarg, true); @@ -3864,7 +3971,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page, } else if (i < MAX_SKB_FRAGS) { skb_zcopy_downgrade_managed(skb); get_page(page); - skb_fill_page_desc(skb, i, page, offset, size); + skb_fill_page_desc_noacc(skb, i, page, offset, size); } else { return -EMSGSIZE; } @@ -4027,23 +4134,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int i = 0; int pos; - if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) && - (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) { - /* gso_size is untrusted, and we have a frag_list with a linear - * non head_frag head. - * - * (we assume checking the first list_skb member suffices; - * i.e if either of the list_skb members have non head_frag - * head, then the first one has too). - * - * If head_skb's headlen does not fit requested gso_size, it - * means that the frag_list members do NOT terminate on exact - * gso_size boundaries. Hence we cannot perform skb_frag_t page - * sharing. Therefore we must fallback to copying the frag_list - * skbs; we do so by disabling SG. - */ - if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) - features &= ~NETIF_F_SG; + if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && + mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { + struct sk_buff *check_skb; + + for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { + if (skb_headlen(check_skb) && !check_skb->head_frag) { + /* gso_size is untrusted, and we have a frag_list with + * a linear non head_frag item. + * + * If head_skb's headlen does not fit requested gso_size, + * it means that the frag_list members do NOT terminate + * on exact gso_size boundaries. Hence we cannot perform + * skb_frag_t page sharing. Therefore we must fallback to + * copying the frag_list skbs; we do so by disabling SG. + */ + features &= ~NETIF_F_SG; + break; + } + } } __skb_push(head_skb, doffset); @@ -4205,9 +4314,8 @@ normal: SKB_GSO_CB(nskb)->csum_start = skb_headroom(nskb) + doffset; } else { - skb_copy_bits(head_skb, offset, - skb_put(nskb, len), - len); + if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) + goto err; } continue; } @@ -4798,7 +4906,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) { bool ret; - if (likely(sysctl_tstamp_allow_data || tsonly)) + if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) return true; read_lock_bh(&sk->sk_callback_lock); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index f47338d89d5d..e6b9ced3eda8 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -434,8 +434,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (copied + copy > len) copy = len - copied; copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; + } copied += copy; if (likely(!peek)) { @@ -455,13 +457,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, * didn't copy the entire length lets just break. 
*/ if (copy != sge->length) - return copied; + goto out; sk_msg_iter_var_next(i); } if (copied == len) break; - } while (!sg_is_last(sge)); + } while ((i != msg_rx->sg.end) && !sg_is_last(sge)); if (unlikely(peek)) { msg_rx = sk_psock_next_msg(psock, msg_rx); @@ -471,13 +473,15 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } msg_rx->sg.start = i; - if (!sge->length && sg_is_last(sge)) { + if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) { msg_rx = sk_psock_dequeue_msg(psock); kfree_sk_msg(msg_rx); } msg_rx = sk_psock_peek_msg(psock); } - +out: + if (psock->work_state.skb && copied > 0) + schedule_work(&psock->work); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); @@ -496,11 +500,11 @@ bool sk_msg_is_readable(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_msg_is_readable); -static struct sk_msg *alloc_sk_msg(void) +static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); + msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); if (unlikely(!msg)) return NULL; sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); @@ -516,7 +520,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - return alloc_sk_msg(); + return alloc_sk_msg(GFP_KERNEL); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, @@ -593,7 +597,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { - struct sk_msg *msg = alloc_sk_msg(); + struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; int err; @@ -799,16 +803,13 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } -void sk_psock_stop(struct sk_psock *psock, bool wait) +void sk_psock_stop(struct sk_psock *psock) { spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); __sk_psock_zap_ingress(psock); spin_unlock_bh(&psock->ingress_lock); - - if (wait) - cancel_work_sync(&psock->work); } static void sk_psock_done_strp(struct sk_psock *psock); @@ -846,7 +847,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_stop(psock, false); + sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); queue_rcu_work(system_wq, &psock->rwork); @@ -1194,8 +1195,9 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } - if (sk_psock_verdict_apply(psock, skb, ret) < 0) - len = 0; + ret = sk_psock_verdict_apply(psock, skb, ret); + if (ret < 0) + len = ret; out: rcu_read_unlock(); return len; diff --git a/net/core/sock.c b/net/core/sock.c index 4cb957d934a2..a3ba0358c77c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -703,15 +703,17 @@ static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) goto out; } - return sock_bindtoindex(sk, index, true); + sockopt_lock_sock(sk); + ret = sock_bindtoindex_locked(sk, index); + sockopt_release_sock(sk); out: #endif return ret; } -static int sock_getbindtodevice(struct sock *sk, char __user *optval, - int __user *optlen, int len) +static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -735,12 +737,12 @@ 
static int sock_getbindtodevice(struct sock *sk, char __user *optval, len = strlen(devname) + 1; ret = -EFAULT; - if (copy_to_user(optval, devname, len)) + if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; @@ -1036,17 +1038,51 @@ static int sock_reserve_memory(struct sock *sk, int bytes) return 0; } +void sockopt_lock_sock(struct sock *sk) +{ + /* When current->bpf_ctx is set, the setsockopt is called from + * a bpf prog. bpf has ensured the sk lock has been + * acquired before calling setsockopt(). + */ + if (has_current_bpf_ctx()) + return; + + lock_sock(sk); +} +EXPORT_SYMBOL(sockopt_lock_sock); + +void sockopt_release_sock(struct sock *sk) +{ + if (has_current_bpf_ctx()) + return; + + release_sock(sk); +} +EXPORT_SYMBOL(sockopt_release_sock); + +bool sockopt_ns_capable(struct user_namespace *ns, int cap) +{ + return has_current_bpf_ctx() || ns_capable(ns, cap); +} +EXPORT_SYMBOL(sockopt_ns_capable); + +bool sockopt_capable(int cap) +{ + return has_current_bpf_ctx() || capable(cap); +} +EXPORT_SYMBOL(sockopt_capable); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ -int sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; + struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; - struct sock *sk = sock->sk; int val; int valbool; struct linger ling; @@ -1067,11 +1103,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname, valbool = val ? 1 : 0; - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: - if (val && !capable(CAP_NET_ADMIN)) + if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); @@ -1101,7 +1137,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_wmem_max); + val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); set_sndbuf: /* Ensure val * 2 fits into an int, to prevent max_t() * from treating it as a negative value. @@ -1115,7 +1151,7 @@ set_sndbuf: break; case SO_SNDBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1133,11 +1169,11 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); + __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); break; case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1164,8 +1200,8 @@ set_sndbuf: case SO_PRIORITY: if ((val >= 0 && val <= 6) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) sk->sk_priority = val; else ret = -EPERM; @@ -1228,7 +1264,7 @@ set_sndbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - if (sock->ops->set_rcvlowat) + if (sock && sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); @@ -1310,8 +1346,8 @@ set_sndbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1319,8 +1355,8 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1354,7 +1390,7 @@ set_sndbuf: #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: /* allow unprivileged users to decrease the value */ - if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else { if (val < 0) @@ -1364,13 +1400,13 @@ set_sndbuf: } break; case SO_PREFER_BUSY_POLL: - if (valbool && !capable(CAP_NET_ADMIN)) + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; case SO_BUSY_POLL_BUDGET: - if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; } else { if (val < 0 || val > U16_MAX) @@ -1441,7 +1477,7 @@ set_sndbuf: * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1496,9 +1532,16 @@ set_sndbuf: ret = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); return ret; } + +int sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return sk_setsockopt(sock->sk, level, optname, + optval, optlen); +} EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) @@ -1525,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred, } } -static int groups_to_user(gid_t __user *dst, const struct group_info *src) +static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; - for (i = 0; i < src->ngroups; i++) - if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) + for (i = 0; i < src->ngroups; i++) { + gid_t gid = from_kgid_munged(user_ns, src->gid[i]); + + if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; + } return 0; } -int sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { - struct sock *sk = sock->sk; + struct socket *sock = sk->sk_socket; union { int val; @@ -1557,7 +1603,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, int lv = sizeof(int); int len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; @@ -1692,7 +1738,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); - if (copy_to_user(optval, &peercred, len)) + if (copy_to_sockptr(optval, &peercred, len)) 
return -EFAULT; goto lenout; } @@ -1710,11 +1756,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); - return put_user(len, optlen) ? -EFAULT : -ERANGE; + return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, cred->group_info); + ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; @@ -1730,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_user(optval, address, len)) + if (copy_to_sockptr(optval, address, len)) return -EFAULT; goto lenout; } @@ -1747,7 +1793,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval, optlen, len); + return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len); case SO_MARK: v.val = sk->sk_mark; @@ -1779,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: - len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); + len = sk_get_filter(sk, optval, len); if (len < 0) return len; @@ -1827,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); - if (copy_to_user(optval, &meminfo, len)) + if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; @@ -1896,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len > lv) len = lv; - if (copy_to_user(optval, &v, len)) + if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return sk_getsockopt(sock->sk, level, optname, + USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); +} + /* * Initialize an sk_lock. * @@ -2536,7 +2590,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > - sysctl_optmem_max) + READ_ONCE(sysctl_optmem_max)) return NULL; skb = alloc_skb(size, priority); @@ -2554,8 +2608,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, */ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) { - if ((unsigned int)size <= sysctl_optmem_max && - atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + int optmem_max = READ_ONCE(sysctl_optmem_max); + + if ((unsigned int)size <= optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { void *mem; /* First do the add, to avoid the race if kmalloc * might sleep. 
@@ -3309,8 +3365,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) timer_setup(&sk->sk_timer, NULL, 0); sk->sk_allocation = GFP_KERNEL; - sk->sk_rcvbuf = sysctl_rmem_default; - sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); + sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; sk_set_socket(sk, sock); @@ -3365,7 +3421,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_busy_read; + sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); #endif sk->sk_max_pacing_rate = ~0UL; @@ -3554,7 +3610,8 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); @@ -3580,7 +3637,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 9a9fb9487d63..81beb16ab1eb 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -41,7 +41,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) attr->map_flags & ~SOCK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); - stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT); + stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE); if (!stab) return ERR_PTR(-ENOMEM); @@ -52,7 +52,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) sizeof(struct sock *), stab->map.numa_node); if (!stab->sks) { - kfree(stab); + bpf_map_area_free(stab); return ERR_PTR(-ENOMEM); } @@ -361,7 +361,7 @@ static void sock_map_free(struct bpf_map *map) synchronize_rcu(); bpf_map_area_free(stab->sks); - kfree(stab); + bpf_map_area_free(stab); } static void sock_map_release_progs(struct bpf_map *map) @@ -1085,7 +1085,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) if (attr->key_size > MAX_BPF_STACK) return ERR_PTR(-E2BIG); - htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); + htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); if (!htab) return ERR_PTR(-ENOMEM); @@ -1115,7 +1115,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr) return &htab->map; free_htab: - kfree(htab); + bpf_map_area_free(htab); return ERR_PTR(err); } @@ -1168,7 +1168,7 @@ static void sock_hash_free(struct bpf_map *map) synchronize_rcu(); bpf_map_area_free(htab->buckets); - kfree(htab); + bpf_map_area_free(htab); } static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) @@ -1596,7 +1596,7 @@ void sock_map_destroy(struct sock *sk) saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); - sk_psock_stop(psock, false); + sk_psock_stop(psock); sk_psock_put(sk, psock); saved_destroy(sk); } @@ -1619,9 +1619,10 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); - sk_psock_stop(psock, true); - sk_psock_put(sk, psock); + sk_psock_stop(psock); release_sock(sk); + cancel_work_sync(&psock->work); + sk_psock_put(sk, psock); saved_close(sk, 
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5daa1fa54249..fb90e1e00773 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -21,6 +21,22 @@ static DEFINE_IDA(reuseport_ida);
 static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
 			       struct sock_reuseport *reuse, bool bind_inany);
 
+void reuseport_has_conns_set(struct sock *sk)
+{
+	struct sock_reuseport *reuse;
+
+	if (!rcu_access_pointer(sk->sk_reuseport_cb))
+		return;
+
+	spin_lock_bh(&reuseport_lock);
+	reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+					  lockdep_is_held(&reuseport_lock));
+	if (likely(reuse))
+		reuse->has_conns = 1;
+	spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_has_conns_set);
+
 static int reuseport_sock_index(struct sock *sk,
 				const struct sock_reuseport *reuse,
 				bool closed)
diff --git a/net/core/stream.c b/net/core/stream.c
index ccc083cdef23..75fded8495f5 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -123,7 +123,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
 	if (sk_stream_memory_free(sk))
-		current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+		current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2;
 
 	add_wait_queue(sk_sleep(sk), &wait);
 
@@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 		*timeo_p = current_timeo;
 	}
 out:
-	remove_wait_queue(sk_sleep(sk), &wait);
+	if (!sock_flag(sk, SOCK_DEAD))
+		remove_wait_queue(sk_sleep(sk), &wait);
 	return err;
 
 do_error:
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 71a13596ea2b..5b1ce656baa1 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -29,7 +29,6 @@ static int int_3600 = 3600;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
-static long long_max __maybe_unused = LONG_MAX;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -234,14 +233,17 @@ static int set_default_qdisc(struct ctl_table *table, int write,
 static int proc_do_dev_weight(struct ctl_table *table, int write,
 			   void *buffer, size_t *lenp, loff_t *ppos)
 {
-	int ret;
+	static DEFINE_MUTEX(dev_weight_mutex);
+	int ret, weight;
 
+	mutex_lock(&dev_weight_mutex);
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-	if (ret != 0)
-		return ret;
-
-	dev_rx_weight = weight_p * dev_weight_rx_bias;
-	dev_tx_weight = weight_p * dev_weight_tx_bias;
+	if (!ret && write) {
+		weight = READ_ONCE(weight_p);
+		WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias);
+		WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias);
+	}
+	mutex_unlock(&dev_weight_mutex);
 
 	return ret;
 }
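
proc_do_dev_weight() now serializes writers with a function-local mutex and publishes the derived weights with WRITE_ONCE(), pairing with the READ_ONCE() readers introduced elsewhere in this series. The same pattern, reduced to a sketch with invented names (demo_base, demo_scaled, demo_handler); proc_dointvec(), DEFINE_MUTEX(), READ_ONCE() and WRITE_ONCE() are stock kernel interfaces.

/* Sketch only: a proc handler that derives a value from the sysctl. */
#include <linux/sysctl.h>
#include <linux/mutex.h>

static int demo_base = 64;      /* written via the sysctl */
static int demo_scaled = 128;   /* derived value, read locklessly elsewhere */

static int demo_handler(struct ctl_table *table, int write,
                        void *buffer, size_t *lenp, loff_t *ppos)
{
    static DEFINE_MUTEX(demo_mutex);
    int ret;

    /* the mutex keeps concurrent writers from interleaving the
     * proc_dointvec() update with the derived-value computation */
    mutex_lock(&demo_mutex);
    ret = proc_dointvec(table, write, buffer, lenp, ppos);
    if (!ret && write)
        WRITE_ONCE(demo_scaled, READ_ONCE(demo_base) * 2);
    mutex_unlock(&demo_mutex);
    return ret;
}
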
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 24420209bf0e..844c9d99dc0e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -375,19 +375,17 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		  struct xdp_buff *xdp)
 {
-	struct xdp_mem_allocator *xa;
 	struct page *page;
 
 	switch (mem->type) {
 	case MEM_TYPE_PAGE_POOL:
-		rcu_read_lock();
-		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
-		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
 		if (napi_direct && xdp_return_frame_no_direct())
 			napi_direct = false;
-		page_pool_put_full_page(xa->page_pool, page, napi_direct);
-		rcu_read_unlock();
+		/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
+		 * as mem->type knows this a page_pool page
+		 */
+		page_pool_put_full_page(page->pp, page, napi_direct);
 		break;
 	case MEM_TYPE_PAGE_SHARED:
 		page_frag_free(data);
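
The __xdp_return() hunk drops the per-free rhashtable lookup: a page_pool page already carries a back-pointer to its pool in page->pp, so the pool can be reached directly when the buffer is returned. The general shape of that trade, as a standalone C illustration rather than kernel code: struct demo_pool, struct demo_buf and the functions below are all invented; only the idea of storing the owner on the buffer mirrors page->pp.

/* Sketch only: keep a back-pointer to the pool instead of looking it up. */
#include <stdlib.h>

struct demo_buf;

struct demo_pool {
    struct demo_buf *free_list;     /* simple freelist */
};

struct demo_buf {
    struct demo_pool *pp;           /* back-pointer, set at alloc time */
    struct demo_buf *next;
    char data[2048];
};

static struct demo_buf *demo_alloc(struct demo_pool *pool)
{
    struct demo_buf *buf = pool->free_list;

    if (buf)
        pool->free_list = buf->next;
    else
        buf = malloc(sizeof(*buf));
    if (buf)
        buf->pp = pool;             /* remember the owner */
    return buf;
}

static void demo_return(struct demo_buf *buf)
{
    /* no registry lookup needed: the owner is on the buffer itself */
    struct demo_pool *pool = buf->pp;

    buf->next = pool->free_list;
    pool->free_list = buf;
}

int main(void)
{
    struct demo_pool pool = { 0 };
    struct demo_buf *b = demo_alloc(&pool);

    if (b)
        demo_return(b);
    return 0;
}
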