Diffstat (limited to 'net/core')
-rw-r--r--  net/core/.gitignore            1
-rw-r--r--  net/core/Makefile             22
-rw-r--r--  net/core/bpf_sk_storage.c      5
-rw-r--r--  net/core/datagram.c            2
-rw-r--r--  net/core/dev.c                49
-rw-r--r--  net/core/devlink.c           339
-rw-r--r--  net/core/drop_monitor.c        3
-rw-r--r--  net/core/filter.c            753
-rw-r--r--  net/core/flow_dissector.c     55
-rw-r--r--  net/core/flow_offload.c        7
-rw-r--r--  net/core/gen_stats.c           2
-rw-r--r--  net/core/gro.c                18
-rw-r--r--  net/core/gro_cells.c           5
-rw-r--r--  net/core/lwtunnel.c            3
-rw-r--r--  net/core/neighbour.c          79
-rw-r--r--  net/core/net-sysfs.c          58
-rw-r--r--  net/core/net_namespace.c      14
-rw-r--r--  net/core/netclassid_cgroup.c   2
-rw-r--r--  net/core/netpoll.c             4
-rw-r--r--  net/core/pktgen.c             47
-rw-r--r--  net/core/rtnetlink.c          24
-rw-r--r--  net/core/skbuff.c            202
-rw-r--r--  net/core/skmsg.c              36
-rw-r--r--  net/core/sock.c              158
-rw-r--r--  net/core/sock_map.c           19
-rw-r--r--  net/core/sock_reuseport.c     16
-rw-r--r--  net/core/stream.c              5
-rw-r--r--  net/core/sysctl_net_core.c    16
-rw-r--r--  net/core/xdp.c                10
29 files changed, 1125 insertions(+), 829 deletions(-)
diff --git a/net/core/.gitignore b/net/core/.gitignore
deleted file mode 100644
index df1e74372cce..000000000000
--- a/net/core/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-dropreason_str.c
diff --git a/net/core/Makefile b/net/core/Makefile
index e8ce3bd283a6..5857cec87b83 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -5,7 +5,7 @@
obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
- flow_dissector.o dropreason_str.o
+ flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
@@ -40,23 +40,3 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
obj-$(CONFIG_OF) += of_net.o
-
-clean-files := dropreason_str.c
-
-quiet_cmd_dropreason_str = GEN $@
-cmd_dropreason_str = awk -F ',' 'BEGIN{ print "\#include <net/dropreason.h>\n"; \
- print "const char * const drop_reasons[] = {" }\
- /^enum skb_drop/ { dr=1; }\
- /^\};/ { dr=0; }\
- /^\tSKB_DROP_REASON_/ {\
- if (dr) {\
- sub(/\tSKB_DROP_REASON_/, "", $$1);\
- printf "\t[SKB_DROP_REASON_%s] = \"%s\",\n", $$1, $$1;\
- }\
- }\
- END{ print "};" }' $< > $@
-
-$(obj)/dropreason_str.c: $(srctree)/include/net/dropreason.h
- $(call cmd,dropreason_str)
-
-$(obj)/dropreason_str.o: $(obj)/dropreason_str.c
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 1b7f385643b4..94374d529ea4 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -310,11 +310,12 @@ BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap,
void *owner, u32 size)
{
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
struct sock *sk = (struct sock *)owner;
/* same check as in sock_kmalloc() */
- if (size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ if (size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
atomic_add(size, &sk->sk_omem_alloc);
return 0;
}
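
Illustrative sketch (not part of the patch): the hunk above reads sysctl_optmem_max once with READ_ONCE() so both comparisons use the same snapshot while the sysctl is updated locklessly. The names demo_limit and charge_would_fit below are made up for illustration only.

```c
/* Illustrative only: snapshot a locklessly-updated limit once, then use
 * the same value for every comparison.  "demo_limit" and
 * "charge_would_fit" are hypothetical names, not kernel APIs.
 */
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/types.h>

static int demo_limit = 20480;			/* written without locks elsewhere */

static bool charge_would_fit(atomic_t *allocated, int size)
{
	int limit = READ_ONCE(demo_limit);	/* single snapshot */

	return size <= limit &&
	       atomic_read(allocated) + size < limit;
}
```
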
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 7255531f63ae..e4ff2db40c98 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -677,7 +677,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
page_ref_sub(last_head, refs);
refs = 0;
}
- skb_fill_page_desc(skb, frag++, head, start, size);
+ skb_fill_page_desc_noacc(skb, frag++, head, start, size);
}
if (refs)
page_ref_sub(last_head, refs);
diff --git a/net/core/dev.c b/net/core/dev.c
index 716df64fcfa5..3be256051e99 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1100,7 +1100,7 @@ static int dev_alloc_name_ns(struct net *net,
BUG_ON(!net);
ret = __dev_alloc_name(net, name, buf);
if (ret >= 0)
- strlcpy(dev->name, buf, IFNAMSIZ);
+ strscpy(dev->name, buf, IFNAMSIZ);
return ret;
}
@@ -1137,7 +1137,7 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
else if (netdev_name_in_use(net, name))
return -EEXIST;
else if (dev->name != name)
- strlcpy(dev->name, name, IFNAMSIZ);
+ strscpy(dev->name, name, IFNAMSIZ);
return 0;
}
@@ -4624,7 +4624,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
struct softnet_data *sd;
unsigned int old_flow, new_flow;
- if (qlen < (netdev_max_backlog >> 1))
+ if (qlen < (READ_ONCE(netdev_max_backlog) >> 1))
return false;
sd = this_cpu_ptr(&softnet_data);
@@ -4672,7 +4672,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
if (!netif_running(skb->dev))
goto drop;
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
+ if (qlen <= READ_ONCE(netdev_max_backlog) && !skb_flow_limit(skb, qlen)) {
if (qlen) {
enqueue:
__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -4928,7 +4928,7 @@ static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_rx(skb);
@@ -5136,11 +5136,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
case TC_ACT_SHOT:
mini_qdisc_qstats_cpu_drop(miniq);
kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
+ *ret = NET_RX_DROP;
return NULL;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
consume_skb(skb);
+ *ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_REDIRECT:
/* skb_mac_header check was done by cls/act_bpf, so
@@ -5153,8 +5155,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
*another = true;
break;
}
+ *ret = NET_RX_SUCCESS;
return NULL;
case TC_ACT_CONSUMED:
+ *ret = NET_RX_SUCCESS;
return NULL;
default:
break;
@@ -5281,7 +5285,7 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
int ret = NET_RX_DROP;
__be16 type;
- net_timestamp_check(!netdev_tstamp_prequeue, skb);
+ net_timestamp_check(!READ_ONCE(netdev_tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
@@ -5664,7 +5668,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
@@ -5694,7 +5698,7 @@ void netif_receive_skb_list_internal(struct list_head *head)
INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(netdev_tstamp_prequeue), skb);
skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
@@ -5918,7 +5922,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = dev_rx_weight;
+ napi->weight = READ_ONCE(dev_rx_weight);
while (again) {
struct sk_buff *skb;
@@ -6358,23 +6362,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
}
EXPORT_SYMBOL(dev_set_threaded);
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-static void napi_get_frags_check(struct napi_struct *napi)
-{
- struct sk_buff *skb;
-
- local_bh_disable();
- skb = napi_get_frags(napi);
- WARN_ON_ONCE(skb && skb->head_frag);
- napi_free_frags(napi);
- local_bh_enable();
-}
-
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6665,8 +6652,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
- usecs_to_jiffies(netdev_budget_usecs);
- int budget = netdev_budget;
+ usecs_to_jiffies(READ_ONCE(netdev_budget_usecs));
+ int budget = READ_ONCE(netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
@@ -10284,7 +10271,7 @@ static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
return dev;
if (time_after(jiffies, warning_time +
- netdev_unregister_timeout_secs * HZ)) {
+ READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
list_for_each_entry(dev, list, todo_list) {
pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
dev->name, netdev_refcnt_read(dev));
@@ -10370,9 +10357,7 @@ void netdev_run_todo(void)
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
-#if IS_ENABLED(CONFIG_DECNET)
- WARN_ON(dev->dn_ptr);
-#endif
+
if (dev->priv_destructor)
dev->priv_destructor(dev);
if (dev->needs_free_netdev)
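
Illustrative sketch (not part of the patch): the dev.c hunks above replace strlcpy() with strscpy(), which returns -E2BIG on truncation instead of the source length, making overflow trivial to detect. copy_ifname() is a made-up helper used only to show the return-value check.

```c
/* Illustrative only: strscpy() reports truncation via -E2BIG, while the
 * old strlcpy() returned strlen(src).  "copy_ifname" is a made-up helper.
 */
#include <linux/errno.h>
#include <linux/netdevice.h>	/* IFNAMSIZ */
#include <linux/string.h>
#include <linux/types.h>

static int copy_ifname(char *dst, const char *src)
{
	ssize_t n = strscpy(dst, src, IFNAMSIZ);

	return n < 0 ? (int)n : 0;	/* n == -E2BIG means src did not fit */
}
```
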
diff --git a/net/core/devlink.c b/net/core/devlink.c
index b50bcc18b8d9..89baa7c0938b 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -371,6 +371,13 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
return ERR_PTR(-ENODEV);
}
+#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \
+ WARN_ON_ONCE(!(devlink_port)->registered)
+#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \
+ WARN_ON_ONCE((devlink_port)->registered)
+#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \
+ WARN_ON_ONCE(!(devlink_port)->initialized)
+
static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
unsigned int port_index)
{
@@ -1710,7 +1717,7 @@ static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
struct devlink *devlink = info->user_ptr[0];
u32 count;
- if (!info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT))
return -EINVAL;
if (!devlink->ops->port_split)
return -EOPNOTSUPP;
@@ -1838,7 +1845,7 @@ static int devlink_nl_cmd_port_del_doit(struct sk_buff *skb,
if (!devlink->ops->port_del)
return -EOPNOTSUPP;
- if (!info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_INDEX)) {
NL_SET_ERR_MSG_MOD(extack, "Port index is not specified");
return -EINVAL;
}
@@ -2690,7 +2697,7 @@ static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
if (err)
return err;
- if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE))
return -EINVAL;
size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
@@ -2900,7 +2907,7 @@ static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb,
if (err)
return err;
- if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
return -EINVAL;
threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
@@ -3156,7 +3163,7 @@ static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
if (err)
return err;
- if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
return -EINVAL;
threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
@@ -3845,7 +3852,7 @@ static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
struct devlink_dpipe_table *table;
const char *table_name;
- if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME))
return -EINVAL;
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
@@ -4029,8 +4036,9 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
const char *table_name;
bool counters_enable;
- if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] ||
- !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) ||
+ GENL_REQ_ATTR_CHECK(info,
+ DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED))
return -EINVAL;
table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
@@ -4119,8 +4127,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
u64 size;
int err;
- if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
- !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) ||
+ GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE))
return -EINVAL;
resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
@@ -4742,10 +4750,76 @@ void devlink_flash_update_timeout_notify(struct devlink *devlink,
}
EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
+struct devlink_info_req {
+ struct sk_buff *msg;
+ void (*version_cb)(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv);
+ void *version_cb_priv;
+};
+
+struct devlink_flash_component_lookup_ctx {
+ const char *lookup_name;
+ bool lookup_name_found;
+};
+
+static void
+devlink_flash_component_lookup_cb(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv)
+{
+ struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv;
+
+ if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT ||
+ lookup_ctx->lookup_name_found)
+ return;
+
+ lookup_ctx->lookup_name_found =
+ !strcmp(lookup_ctx->lookup_name, version_name);
+}
+
+static int devlink_flash_component_get(struct devlink *devlink,
+ struct nlattr *nla_component,
+ const char **p_component,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_flash_component_lookup_ctx lookup_ctx = {};
+ struct devlink_info_req req = {};
+ const char *component;
+ int ret;
+
+ if (!nla_component)
+ return 0;
+
+ component = nla_data(nla_component);
+
+ if (!devlink->ops->info_get) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_component,
+ "component update is not supported by this device");
+ return -EOPNOTSUPP;
+ }
+
+ lookup_ctx.lookup_name = component;
+ req.version_cb = devlink_flash_component_lookup_cb;
+ req.version_cb_priv = &lookup_ctx;
+
+ ret = devlink->ops->info_get(devlink, &req, NULL);
+ if (ret)
+ return ret;
+
+ if (!lookup_ctx.lookup_name_found) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_component,
+ "selected component is not supported by this device");
+ return -EINVAL;
+ }
+ *p_component = component;
+ return 0;
+}
+
static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
struct genl_info *info)
{
- struct nlattr *nla_component, *nla_overwrite_mask, *nla_file_name;
+ struct nlattr *nla_overwrite_mask, *nla_file_name;
struct devlink_flash_update_params params = {};
struct devlink *devlink = info->user_ptr[0];
const char *file_name;
@@ -4755,20 +4829,16 @@ static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
if (!devlink->ops->flash_update)
return -EOPNOTSUPP;
- if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME))
return -EINVAL;
- supported_params = devlink->ops->supported_flash_update_params;
+ ret = devlink_flash_component_get(devlink,
+ info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT],
+ &params.component, info->extack);
+ if (ret)
+ return ret;
- nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT];
- if (nla_component) {
- if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT)) {
- NL_SET_ERR_MSG_ATTR(info->extack, nla_component,
- "component update is not supported by this device");
- return -EOPNOTSUPP;
- }
- params.component = nla_data(nla_component);
- }
+ supported_params = devlink->ops->supported_flash_update_params;
nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK];
if (nla_overwrite_mask) {
@@ -4936,10 +5006,8 @@ static int devlink_nl_cmd_selftests_run(struct sk_buff *skb,
if (!devlink->ops->selftest_run || !devlink->ops->selftest_check)
return -EOPNOTSUPP;
- if (!info->attrs[DEVLINK_ATTR_SELFTESTS]) {
- NL_SET_ERR_MSG_MOD(info->extack, "selftest required");
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS))
return -EINVAL;
- }
attrs = info->attrs[DEVLINK_ATTR_SELFTESTS];
@@ -5393,7 +5461,7 @@ static int
devlink_param_type_get_from_info(struct genl_info *info,
enum devlink_param_type *param_type)
{
- if (!info->attrs[DEVLINK_ATTR_PARAM_TYPE])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE))
return -EINVAL;
switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) {
@@ -5470,7 +5538,7 @@ devlink_param_get_from_info(struct list_head *param_list,
{
char *param_name;
- if (!info->attrs[DEVLINK_ATTR_PARAM_NAME])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME))
return NULL;
param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
@@ -5536,7 +5604,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
return err;
}
- if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE))
return -EINVAL;
cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
if (!devlink_param_cmode_is_supported(param, cmode))
@@ -5574,89 +5642,22 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg,
struct netlink_callback *cb)
{
- struct devlink_param_item *param_item;
- struct devlink_port *devlink_port;
- struct devlink *devlink;
- int start = cb->args[0];
- unsigned long index;
- int idx = 0;
- int err = 0;
-
- devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) {
- devl_lock(devlink);
- list_for_each_entry(devlink_port, &devlink->port_list, list) {
- list_for_each_entry(param_item,
- &devlink_port->param_list, list) {
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_param_fill(msg,
- devlink_port->devlink,
- devlink_port->index, param_item,
- DEVLINK_CMD_PORT_PARAM_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
- if (err == -EOPNOTSUPP) {
- err = 0;
- } else if (err) {
- devl_unlock(devlink);
- devlink_put(devlink);
- goto out;
- }
- idx++;
- }
- }
- devl_unlock(devlink);
- devlink_put(devlink);
- }
-out:
- if (err != -EMSGSIZE)
- return err;
-
- cb->args[0] = idx;
+ NL_SET_ERR_MSG_MOD(cb->extack, "Port params are not supported");
return msg->len;
}
static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[1];
- struct devlink_param_item *param_item;
- struct sk_buff *msg;
- int err;
-
- param_item = devlink_param_get_from_info(&devlink_port->param_list,
- info);
- if (!param_item)
- return -EINVAL;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_param_fill(msg, devlink_port->devlink,
- devlink_port->index, param_item,
- DEVLINK_CMD_PORT_PARAM_GET,
- info->snd_portid, info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
+ NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported");
+ return -EINVAL;
}
static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb,
struct genl_info *info)
{
- struct devlink_port *devlink_port = info->user_ptr[1];
-
- return __devlink_nl_cmd_param_set_doit(devlink_port->devlink,
- devlink_port->index,
- &devlink_port->param_list, info,
- DEVLINK_CMD_PORT_PARAM_NEW);
+ NL_SET_ERR_MSG_MOD(info->extack, "Port params are not supported");
+ return -EINVAL;
}
static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
@@ -6056,7 +6057,7 @@ static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
unsigned int index;
int err;
- if (!info->attrs[DEVLINK_ATTR_REGION_NAME])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME))
return -EINVAL;
if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
@@ -6189,8 +6190,8 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
unsigned int index;
u32 snapshot_id;
- if (!info->attrs[DEVLINK_ATTR_REGION_NAME] ||
- !info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) ||
+ GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID))
return -EINVAL;
region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
@@ -6238,7 +6239,7 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
u8 *data;
int err;
- if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) {
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) {
NL_SET_ERR_MSG_MOD(info->extack, "No region name provided");
return -EINVAL;
}
@@ -6553,18 +6554,18 @@ out_unlock:
return err;
}
-struct devlink_info_req {
- struct sk_buff *msg;
-};
-
int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name)
{
+ if (!req->msg)
+ return 0;
return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name);
}
EXPORT_SYMBOL_GPL(devlink_info_driver_name_put);
int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
{
+ if (!req->msg)
+ return 0;
return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn);
}
EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
@@ -6572,6 +6573,8 @@ EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
int devlink_info_board_serial_number_put(struct devlink_info_req *req,
const char *bsn)
{
+ if (!req->msg)
+ return 0;
return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER,
bsn);
}
@@ -6579,11 +6582,19 @@ EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put);
static int devlink_info_version_put(struct devlink_info_req *req, int attr,
const char *version_name,
- const char *version_value)
+ const char *version_value,
+ enum devlink_info_version_type version_type)
{
struct nlattr *nest;
int err;
+ if (req->version_cb)
+ req->version_cb(version_name, version_type,
+ req->version_cb_priv);
+
+ if (!req->msg)
+ return 0;
+
nest = nla_nest_start_noflag(req->msg, attr);
if (!nest)
return -EMSGSIZE;
@@ -6612,7 +6623,8 @@ int devlink_info_version_fixed_put(struct devlink_info_req *req,
const char *version_value)
{
return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED,
- version_name, version_value);
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
}
EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put);
@@ -6621,25 +6633,49 @@ int devlink_info_version_stored_put(struct devlink_info_req *req,
const char *version_value)
{
return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
- version_name, version_value);
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
}
EXPORT_SYMBOL_GPL(devlink_info_version_stored_put);
+int devlink_info_version_stored_put_ext(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
+ version_name, version_value,
+ version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext);
+
int devlink_info_version_running_put(struct devlink_info_req *req,
const char *version_name,
const char *version_value)
{
return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
- version_name, version_value);
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
}
EXPORT_SYMBOL_GPL(devlink_info_version_running_put);
+int devlink_info_version_running_put_ext(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+ version_name, version_value,
+ version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext);
+
static int
devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink,
enum devlink_command cmd, u32 portid,
u32 seq, int flags, struct netlink_ext_ack *extack)
{
- struct devlink_info_req req;
+ struct devlink_info_req req = {};
void *hdr;
int err;
@@ -9513,6 +9549,7 @@ static struct genl_family devlink_nl_family __ro_after_init = {
.module = THIS_MODULE,
.small_ops = devlink_nl_ops,
.n_small_ops = ARRAY_SIZE(devlink_nl_ops),
+ .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1,
.mcgrps = devlink_nl_mcgrps,
.n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
};
@@ -9818,6 +9855,44 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
}
/**
+ * devlink_port_init() - Init devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ *
+ * Initialize essential stuff that is needed for functions

+ * that may be called before devlink port registration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_init(struct devlink *devlink,
+ struct devlink_port *devlink_port)
+{
+ if (devlink_port->initialized)
+ return;
+ devlink_port->devlink = devlink;
+ INIT_LIST_HEAD(&devlink_port->region_list);
+ devlink_port->initialized = true;
+}
+EXPORT_SYMBOL_GPL(devlink_port_init);
+
+/**
+ * devlink_port_fini() - Deinitialize devlink port
+ *
+ * @devlink_port: devlink port
+ *
+ * Deinitialize essential stuff that is in use for functions
+ * that may be called after devlink port unregistration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_fini(struct devlink_port *devlink_port)
+{
+ WARN_ON(!list_empty(&devlink_port->region_list));
+}
+EXPORT_SYMBOL_GPL(devlink_port_fini);
+
+/**
* devl_port_register() - Register devlink port
*
* @devlink: devlink
@@ -9839,15 +9914,15 @@ int devl_port_register(struct devlink *devlink,
if (devlink_port_index_exists(devlink, port_index))
return -EEXIST;
- WARN_ON(devlink_port->devlink);
- devlink_port->devlink = devlink;
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ devlink_port_init(devlink, devlink_port);
+ devlink_port->registered = true;
devlink_port->index = port_index;
spin_lock_init(&devlink_port->type_lock);
INIT_LIST_HEAD(&devlink_port->reporter_list);
mutex_init(&devlink_port->reporters_lock);
list_add_tail(&devlink_port->list, &devlink->port_list);
- INIT_LIST_HEAD(&devlink_port->param_list);
- INIT_LIST_HEAD(&devlink_port->region_list);
INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
devlink_port_type_warn_schedule(devlink_port);
@@ -9897,8 +9972,8 @@ void devl_port_unregister(struct devlink_port *devlink_port)
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
list_del(&devlink_port->list);
WARN_ON(!list_empty(&devlink_port->reporter_list));
- WARN_ON(!list_empty(&devlink_port->region_list));
mutex_destroy(&devlink_port->reporters_lock);
+ devlink_port->registered = false;
}
EXPORT_SYMBOL_GPL(devl_port_unregister);
@@ -9923,8 +9998,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
enum devlink_port_type type,
void *type_dev)
{
- if (WARN_ON(!devlink_port->devlink))
- return;
+ ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
devlink_port_type_warn_cancel(devlink_port);
spin_lock_bh(&devlink_port->type_lock);
devlink_port->type = type;
@@ -10043,8 +10118,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
{
int ret;
- if (WARN_ON(devlink_port->devlink))
- return;
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
devlink_port->attrs = *attrs;
ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
if (ret)
@@ -10067,8 +10142,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
- if (WARN_ON(devlink_port->devlink))
- return;
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_PF);
if (ret)
@@ -10094,8 +10169,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
- if (WARN_ON(devlink_port->devlink))
- return;
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_VF);
if (ret)
@@ -10122,8 +10197,8 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
struct devlink_port_attrs *attrs = &devlink_port->attrs;
int ret;
- if (WARN_ON(devlink_port->devlink))
- return;
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
ret = __devlink_port_attrs_set(devlink_port,
DEVLINK_PORT_FLAVOUR_PCI_SF);
if (ret)
@@ -10238,8 +10313,8 @@ EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
void devlink_port_linecard_set(struct devlink_port *devlink_port,
struct devlink_linecard *linecard)
{
- if (WARN_ON(devlink_port->devlink))
- return;
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
devlink_port->linecard = linecard;
}
EXPORT_SYMBOL_GPL(devlink_port_linecard_set);
@@ -11310,6 +11385,8 @@ devlink_port_region_create(struct devlink_port *port,
struct devlink_region *region;
int err = 0;
+ ASSERT_DEVLINK_PORT_INITIALIZED(port);
+
if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
return ERR_PTR(-EINVAL);
@@ -12306,8 +12383,8 @@ EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
static void __devlink_compat_running_version(struct devlink *devlink,
char *buf, size_t len)
{
+ struct devlink_info_req req = {};
const struct nlattr *nlattr;
- struct devlink_info_req req;
struct sk_buff *msg;
int rem, err;
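
Hypothetical driver sketch (not part of the patch): the new devlink_port_init()/devlink_port_fini() pair initializes region_list early, so a driver may create a port region before devl_port_register(). Only the devlink_* calls below are real APIs; my_port_region_ops and my_port_setup() are made-up names.

```c
/* Hypothetical driver code; only the devlink_* calls are real APIs. */
#include <linux/err.h>
#include <net/devlink.h>

static const struct devlink_port_region_ops my_port_region_ops = {
	/* .name, .destructor, .snapshot would be filled in by a real driver */
};

static int my_port_setup(struct devlink *devlink,
			 struct devlink_port *devlink_port,
			 unsigned int port_index)
{
	struct devlink_region *region;
	int err;

	devlink_port_init(devlink, devlink_port);

	/* Allowed before registration now that region_list is set up early. */
	region = devlink_port_region_create(devlink_port, &my_port_region_ops,
					    1, 0x1000);
	if (IS_ERR(region))
		return PTR_ERR(region);

	err = devl_port_register(devlink, devlink_port, port_index);
	if (err) {
		devlink_region_destroy(region);
		devlink_port_fini(devlink_port);
	}
	return err;
}
```
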
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 75501e1bdd25..f084a4a6b7ab 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -464,7 +464,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
goto out;
hw_entry = &hw_entries->entries[hw_entries->num_entries];
- strlcpy(hw_entry->trap_name, metadata->trap_name,
+ strscpy(hw_entry->trap_name, metadata->trap_name,
NET_DM_MAX_HW_TRAP_NAME_LEN - 1);
hw_entry->count = 1;
hw_entries->num_entries++;
@@ -1645,6 +1645,7 @@ static struct genl_family net_drop_monitor_family __ro_after_init = {
.module = THIS_MODULE,
.small_ops = dropmon_ops,
.n_small_ops = ARRAY_SIZE(dropmon_ops),
+ .resv_start_op = NET_DM_CMD_STATS_GET + 1,
.mcgrps = dropmon_mcgrps,
.n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
};
diff --git a/net/core/filter.c b/net/core/filter.c
index e8508aaafd27..bb0136e7a8e4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -18,6 +18,7 @@
*/
#include <linux/atomic.h>
+#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
@@ -1214,10 +1215,11 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
u32 filter_size = bpf_prog_size(fp->prog->len);
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
/* same check as in sock_kmalloc() */
- if (filter_size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
+ if (filter_size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) {
atomic_add(filter_size, &sk->sk_omem_alloc);
return true;
}
@@ -1548,7 +1550,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max))
err = -ENOMEM;
else
err = reuseport_attach_prog(sk, prog);
@@ -1615,7 +1617,7 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
}
} else {
/* BPF_PROG_TYPE_SOCKET_FILTER */
- if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
+ if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) {
err = -ENOMEM;
goto err_prog_put;
}
@@ -3009,7 +3011,7 @@ BPF_CALL_0(bpf_get_cgroup_classid_curr)
return __task_get_classid(current);
}
-static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
.func = bpf_get_cgroup_classid_curr,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -4488,7 +4490,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key
void *to_orig = to;
int err;
- if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
+ BPF_F_TUNINFO_FLAGS)))) {
err = -EINVAL;
goto err_clear;
}
@@ -4520,7 +4523,10 @@ set_compat:
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
- to->tunnel_ext = 0;
+ if (flags & BPF_F_TUNINFO_FLAGS)
+ to->tunnel_flags = info->key.tun_flags;
+ else
+ to->tunnel_ext = 0;
if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
@@ -5013,359 +5019,303 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-static int __bpf_setsockopt(struct sock *sk, int level, int optname,
- char *optval, int optlen)
+static int sol_socket_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ switch (optname) {
+ case SO_REUSEADDR:
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_KEEPALIVE:
+ case SO_PRIORITY:
+ case SO_REUSEPORT:
+ case SO_RCVLOWAT:
+ case SO_MARK:
+ case SO_MAX_PACING_RATE:
+ case SO_BINDTOIFINDEX:
+ case SO_TXREHASH:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case SO_BINDTODEVICE:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getopt) {
+ if (optname == SO_BINDTODEVICE)
+ return -EINVAL;
+ return sk_getsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
+
+ return sk_setsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
{
- char devname[IFNAMSIZ];
- int val, valbool;
- struct net *net;
- int ifindex;
- int ret = 0;
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
+ int val;
- if (!sk_fullsock(sk))
+ if (optlen != sizeof(int))
return -EINVAL;
- if (level == SOL_SOCKET) {
- if (optlen != sizeof(int) && optname != SO_BINDTODEVICE)
+ val = *(int *)optval;
+
+ /* Only some options are supported */
+ switch (optname) {
+ case TCP_BPF_IW:
+ if (val <= 0 || tp->data_segs_out > tp->syn_data)
return -EINVAL;
- val = *((int *)optval);
- valbool = val ? 1 : 0;
-
- /* Only some socketops are supported */
- switch (optname) {
- case SO_RCVBUF:
- val = min_t(u32, val, sysctl_rmem_max);
- val = min_t(int, val, INT_MAX / 2);
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- WRITE_ONCE(sk->sk_rcvbuf,
- max_t(int, val * 2, SOCK_MIN_RCVBUF));
- break;
- case SO_SNDBUF:
- val = min_t(u32, val, sysctl_wmem_max);
- val = min_t(int, val, INT_MAX / 2);
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- WRITE_ONCE(sk->sk_sndbuf,
- max_t(int, val * 2, SOCK_MIN_SNDBUF));
- break;
- case SO_MAX_PACING_RATE: /* 32bit version */
- if (val != ~0U)
- cmpxchg(&sk->sk_pacing_status,
- SK_PACING_NONE,
- SK_PACING_NEEDED);
- sk->sk_max_pacing_rate = (val == ~0U) ?
- ~0UL : (unsigned int)val;
- sk->sk_pacing_rate = min(sk->sk_pacing_rate,
- sk->sk_max_pacing_rate);
- break;
- case SO_PRIORITY:
- sk->sk_priority = val;
- break;
- case SO_RCVLOWAT:
- if (val < 0)
- val = INT_MAX;
- if (sk->sk_socket && sk->sk_socket->ops->set_rcvlowat)
- ret = sk->sk_socket->ops->set_rcvlowat(sk, val);
- else
- WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
- break;
- case SO_MARK:
- if (sk->sk_mark != val) {
- sk->sk_mark = val;
- sk_dst_reset(sk);
- }
- break;
- case SO_BINDTODEVICE:
- optlen = min_t(long, optlen, IFNAMSIZ - 1);
- strncpy(devname, optval, optlen);
- devname[optlen] = 0;
+ tcp_snd_cwnd_set(tp, val);
+ break;
+ case TCP_BPF_SNDCWND_CLAMP:
+ if (val <= 0)
+ return -EINVAL;
+ tp->snd_cwnd_clamp = val;
+ tp->snd_ssthresh = val;
+ break;
+ case TCP_BPF_DELACK_MAX:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_DELACK_MAX ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_delack_max = timeout;
+ break;
+ case TCP_BPF_RTO_MIN:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_RTO_MIN ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_rto_min = timeout;
+ break;
+ default:
+ return -EINVAL;
+ }
- ifindex = 0;
- if (devname[0] != '\0') {
- struct net_device *dev;
+ return 0;
+}
- ret = -ENODEV;
+static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
+ int *optlen, bool getopt)
+{
+ struct tcp_sock *tp;
+ int ret;
- net = sock_net(sk);
- dev = dev_get_by_name(net, devname);
- if (!dev)
- break;
- ifindex = dev->ifindex;
- dev_put(dev);
- }
- fallthrough;
- case SO_BINDTOIFINDEX:
- if (optname == SO_BINDTOIFINDEX)
- ifindex = val;
- ret = sock_bindtoindex(sk, ifindex, false);
- break;
- case SO_KEEPALIVE:
- if (sk->sk_prot->keepalive)
- sk->sk_prot->keepalive(sk, valbool);
- sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
- break;
- case SO_REUSEPORT:
- sk->sk_reuseport = valbool;
- break;
- case SO_TXREHASH:
- if (val < -1 || val > 1) {
- ret = -EINVAL;
- break;
- }
- sk->sk_txrehash = (u8)val;
- break;
- default:
- ret = -EINVAL;
- }
-#ifdef CONFIG_INET
- } else if (level == SOL_IP) {
- if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+ if (*optlen < 2)
+ return -EINVAL;
+
+ if (getopt) {
+ if (!inet_csk(sk)->icsk_ca_ops)
return -EINVAL;
+ /* BPF expects NULL-terminated tcp-cc string */
+ optval[--(*optlen)] = '\0';
+ return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case IP_TOS:
- if (val < -1 || val > 0xff) {
- ret = -EINVAL;
- } else {
- struct inet_sock *inet = inet_sk(sk);
+ /* "cdg" is the only cc that alloc a ptr
+ * in inet_csk_ca area. The bpf-tcp-cc may
+ * overwrite this ptr after switching to cdg.
+ */
+ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
+ return -ENOTSUPP;
- if (val == -1)
- val = 0;
- inet->tos = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (level == SOL_IPV6) {
- if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
- return -EINVAL;
+ /* It stops this looping
+ *
+ * .init => bpf_setsockopt(tcp_cc) => .init =>
+ * bpf_setsockopt(tcp_cc)" => .init => ....
+ *
+ * The second bpf_setsockopt(tcp_cc) is not allowed
+ * in order to break the loop when both .init
+ * are the same bpf prog.
+ *
+ * This applies even the second bpf_setsockopt(tcp_cc)
+ * does not cause a loop. This limits only the first
+ * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
+ * pick a fallback cc (eg. peer does not support ECN)
+ * and the second '.init' cannot fallback to
+ * another.
+ */
+ tp = tcp_sk(sk);
+ if (tp->bpf_chg_cc_inprogress)
+ return -EBUSY;
+
+ tp->bpf_chg_cc_inprogress = 1;
+ ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+ KERNEL_SOCKPTR(optval), *optlen);
+ tp->bpf_chg_cc_inprogress = 0;
+ return ret;
+}
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case IPV6_TCLASS:
- if (val < -1 || val > 0xff) {
- ret = -EINVAL;
- } else {
- struct ipv6_pinfo *np = inet6_sk(sk);
+static int sol_tcp_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_prot->setsockopt != tcp_setsockopt)
+ return -EINVAL;
- if (val == -1)
- val = 0;
- np->tclass = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
-#endif
- } else if (level == SOL_TCP &&
- sk->sk_prot->setsockopt == tcp_setsockopt) {
- if (optname == TCP_CONGESTION) {
- char name[TCP_CA_NAME_MAX];
-
- strncpy(name, optval, min_t(long, optlen,
- TCP_CA_NAME_MAX-1));
- name[TCP_CA_NAME_MAX-1] = 0;
- ret = tcp_set_congestion_control(sk, name, false, true);
- } else {
- struct inet_connection_sock *icsk = inet_csk(sk);
+ switch (optname) {
+ case TCP_NODELAY:
+ case TCP_MAXSEG:
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPCNT:
+ case TCP_SYNCNT:
+ case TCP_WINDOW_CLAMP:
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ case TCP_USER_TIMEOUT:
+ case TCP_NOTSENT_LOWAT:
+ case TCP_SAVE_SYN:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case TCP_CONGESTION:
+ return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
+ case TCP_SAVED_SYN:
+ if (*optlen < 1)
+ return -EINVAL;
+ break;
+ default:
+ if (getopt)
+ return -EINVAL;
+ return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
+ }
+
+ if (getopt) {
+ if (optname == TCP_SAVED_SYN) {
struct tcp_sock *tp = tcp_sk(sk);
- unsigned long timeout;
- if (optlen != sizeof(int))
+ if (!tp->saved_syn ||
+ *optlen > tcp_saved_syn_len(tp->saved_syn))
return -EINVAL;
-
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case TCP_BPF_IW:
- if (val <= 0 || tp->data_segs_out > tp->syn_data)
- ret = -EINVAL;
- else
- tcp_snd_cwnd_set(tp, val);
- break;
- case TCP_BPF_SNDCWND_CLAMP:
- if (val <= 0) {
- ret = -EINVAL;
- } else {
- tp->snd_cwnd_clamp = val;
- tp->snd_ssthresh = val;
- }
- break;
- case TCP_BPF_DELACK_MAX:
- timeout = usecs_to_jiffies(val);
- if (timeout > TCP_DELACK_MAX ||
- timeout < TCP_TIMEOUT_MIN)
- return -EINVAL;
- inet_csk(sk)->icsk_delack_max = timeout;
- break;
- case TCP_BPF_RTO_MIN:
- timeout = usecs_to_jiffies(val);
- if (timeout > TCP_RTO_MIN ||
- timeout < TCP_TIMEOUT_MIN)
- return -EINVAL;
- inet_csk(sk)->icsk_rto_min = timeout;
- break;
- case TCP_SAVE_SYN:
- if (val < 0 || val > 1)
- ret = -EINVAL;
- else
- tp->save_syn = val;
- break;
- case TCP_KEEPIDLE:
- ret = tcp_sock_set_keepidle_locked(sk, val);
- break;
- case TCP_KEEPINTVL:
- if (val < 1 || val > MAX_TCP_KEEPINTVL)
- ret = -EINVAL;
- else
- tp->keepalive_intvl = val * HZ;
- break;
- case TCP_KEEPCNT:
- if (val < 1 || val > MAX_TCP_KEEPCNT)
- ret = -EINVAL;
- else
- tp->keepalive_probes = val;
- break;
- case TCP_SYNCNT:
- if (val < 1 || val > MAX_TCP_SYNCNT)
- ret = -EINVAL;
- else
- icsk->icsk_syn_retries = val;
- break;
- case TCP_USER_TIMEOUT:
- if (val < 0)
- ret = -EINVAL;
- else
- icsk->icsk_user_timeout = val;
- break;
- case TCP_NOTSENT_LOWAT:
- tp->notsent_lowat = val;
- sk->sk_write_space(sk);
- break;
- case TCP_WINDOW_CLAMP:
- ret = tcp_set_window_clamp(sk, val);
- break;
- default:
- ret = -EINVAL;
- }
+ memcpy(optval, tp->saved_syn->data, *optlen);
+ /* It cannot free tp->saved_syn here because it
+ * does not know if the user space still needs it.
+ */
+ return 0;
}
-#endif
- } else {
- ret = -EINVAL;
+
+ return do_tcp_getsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
}
- return ret;
+
+ return do_tcp_setsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
}
-static int _bpf_setsockopt(struct sock *sk, int level, int optname,
- char *optval, int optlen)
+static int sol_ip_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
{
- if (sk_fullsock(sk))
- sock_owned_by_me(sk);
- return __bpf_setsockopt(sk, level, optname, optval, optlen);
+ if (sk->sk_family != AF_INET)
+ return -EINVAL;
+
+ switch (optname) {
+ case IP_TOS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getopt)
+ return do_ip_getsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+
+ return do_ip_setsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
}
-static int __bpf_getsockopt(struct sock *sk, int level, int optname,
- char *optval, int optlen)
+static int sol_ipv6_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
{
- if (!sk_fullsock(sk))
- goto err_clear;
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
- if (level == SOL_SOCKET) {
- if (optlen != sizeof(int))
- goto err_clear;
+ switch (optname) {
+ case IPV6_TCLASS:
+ case IPV6_AUTOFLOWLABEL:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
- switch (optname) {
- case SO_RCVBUF:
- *((int *)optval) = sk->sk_rcvbuf;
- break;
- case SO_SNDBUF:
- *((int *)optval) = sk->sk_sndbuf;
- break;
- case SO_MARK:
- *((int *)optval) = sk->sk_mark;
- break;
- case SO_PRIORITY:
- *((int *)optval) = sk->sk_priority;
- break;
- case SO_BINDTOIFINDEX:
- *((int *)optval) = sk->sk_bound_dev_if;
- break;
- case SO_REUSEPORT:
- *((int *)optval) = sk->sk_reuseport;
- break;
- case SO_TXREHASH:
- *((int *)optval) = sk->sk_txrehash;
- break;
- default:
- goto err_clear;
- }
-#ifdef CONFIG_INET
- } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
- struct inet_connection_sock *icsk;
- struct tcp_sock *tp;
+ if (getopt)
+ return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
- switch (optname) {
- case TCP_CONGESTION:
- icsk = inet_csk(sk);
+ return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
- if (!icsk->icsk_ca_ops || optlen <= 1)
- goto err_clear;
- strncpy(optval, icsk->icsk_ca_ops->name, optlen);
- optval[optlen - 1] = 0;
- break;
- case TCP_SAVED_SYN:
- tp = tcp_sk(sk);
+static int __bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (!sk_fullsock(sk))
+ return -EINVAL;
- if (optlen <= 0 || !tp->saved_syn ||
- optlen > tcp_saved_syn_len(tp->saved_syn))
- goto err_clear;
- memcpy(optval, tp->saved_syn->data, optlen);
- break;
- default:
- goto err_clear;
- }
- } else if (level == SOL_IP) {
- struct inet_sock *inet = inet_sk(sk);
+ if (level == SOL_SOCKET)
+ return sol_socket_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ return sol_ip_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ return sol_tcp_sockopt(sk, optname, optval, &optlen, false);
- if (optlen != sizeof(int) || sk->sk_family != AF_INET)
- goto err_clear;
+ return -EINVAL;
+}
- /* Only some options are supported */
- switch (optname) {
- case IP_TOS:
- *((int *)optval) = (int)inet->tos;
- break;
- default:
- goto err_clear;
- }
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (level == SOL_IPV6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
+static int _bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
- if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
- goto err_clear;
+static int __bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ int err, saved_optlen = optlen;
- /* Only some options are supported */
- switch (optname) {
- case IPV6_TCLASS:
- *((int *)optval) = (int)np->tclass;
- break;
- default:
- goto err_clear;
- }
-#endif
-#endif
- } else {
- goto err_clear;
+ if (!sk_fullsock(sk)) {
+ err = -EINVAL;
+ goto done;
}
- return 0;
-err_clear:
- memset(optval, 0, optlen);
- return -EINVAL;
+
+ if (level == SOL_SOCKET)
+ err = sol_socket_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ err = sol_tcp_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ err = sol_ip_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true);
+ else
+ err = -EINVAL;
+
+done:
+ if (err)
+ optlen = 0;
+ if (optlen < saved_optlen)
+ memset(optval + optlen, 0, saved_optlen - optlen);
+ return err;
}
static int _bpf_getsockopt(struct sock *sk, int level, int optname,
@@ -5379,12 +5329,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
int, optname, char *, optval, int, optlen)
{
- if (level == SOL_TCP && optname == TCP_CONGESTION) {
- if (optlen >= sizeof("cdg") - 1 &&
- !strncmp("cdg", optval, optlen))
- return -ENOTSUPP;
- }
-
return _bpf_setsockopt(sk, level, optname, optval, optlen);
}
@@ -6468,6 +6412,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
{
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
bool refcounted = false;
struct sock *sk = NULL;
@@ -6476,7 +6421,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
__be32 dst4 = tuple->ipv4.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0,
+ sk = __inet_lookup(net, hinfo, NULL, 0,
src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &refcounted);
@@ -6490,7 +6435,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0,
+ sk = __inet6_lookup(net, hinfo, NULL, 0,
src6, tuple->ipv6.sport,
dst6, ntohs(tuple->ipv6.dport),
dif, sdif, &refcounted);
@@ -7666,34 +7611,23 @@ const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- /* inet and inet6 sockets are created in a process
- * context so there is always a valid uid/gid
- */
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_proto;
case BPF_FUNC_get_netns_cookie:
return &bpf_get_netns_cookie_sock_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
-#ifdef CONFIG_CGROUPS
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
-#endif
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_cg_sock_proto;
case BPF_FUNC_ktime_get_coarse_ns:
@@ -7706,12 +7640,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ func_proto = cgroup_current_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- /* inet and inet6 sockets are created in a process
- * context so there is always a valid uid/gid
- */
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_bind:
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_CONNECT:
@@ -7724,24 +7663,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_cookie_sock_addr_proto;
case BPF_FUNC_get_netns_cookie:
return &bpf_get_netns_cookie_sock_addr_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_current_pid_tgid:
- return &bpf_get_current_pid_tgid_proto;
- case BPF_FUNC_get_current_comm:
- return &bpf_get_current_comm_proto;
-#ifdef CONFIG_CGROUPS
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
-#endif
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sock_addr_sk_lookup_tcp_proto;
@@ -7822,9 +7745,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
case BPF_FUNC_sk_storage_get:
@@ -8064,6 +7991,12 @@ const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
case BPF_FUNC_setsockopt:
return &bpf_sock_ops_setsockopt_proto;
@@ -8077,8 +8010,6 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_hash_update_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_ops_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
case BPF_FUNC_sk_storage_get:
@@ -8713,6 +8644,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
+DEFINE_MUTEX(nf_conn_btf_access_lock);
+EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
+
+int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int off, int size,
+ enum bpf_access_type atype, u32 *next_btf_id,
+ enum bpf_type_flag *flag);
+EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
+
+static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
+{
+ int ret = -EACCES;
+
+ if (atype == BPF_READ)
+ return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+ flag);
+
+ mutex_lock(&nf_conn_btf_access_lock);
+ if (nfct_btf_struct_access)
+ ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag);
+ mutex_unlock(&nf_conn_btf_access_lock);
+
+ return ret;
+}
+
static bool __is_valid_xdp_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct xdp_md))
@@ -8772,6 +8733,27 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog,
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int off,
+ int size, enum bpf_access_type atype,
+ u32 *next_btf_id,
+ enum bpf_type_flag *flag)
+{
+ int ret = -EACCES;
+
+ if (atype == BPF_READ)
+ return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+ flag);
+
+ mutex_lock(&nf_conn_btf_access_lock);
+ if (nfct_btf_struct_access)
+ ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag);
+ mutex_unlock(&nf_conn_btf_access_lock);
+
+ return ret;
+}
+
static bool sock_addr_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -10666,6 +10648,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
.gen_ld_abs = bpf_gen_ld_abs,
+ .btf_struct_access = tc_cls_act_btf_struct_access,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -10677,6 +10660,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
.is_valid_access = xdp_is_valid_access,
.convert_ctx_access = xdp_convert_ctx_access,
.gen_prologue = bpf_noop_prologue,
+ .btf_struct_access = xdp_btf_struct_access,
};
const struct bpf_prog_ops xdp_prog_ops = {
@@ -10811,14 +10795,13 @@ int sk_detach_filter(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
-int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
- unsigned int len)
+int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
struct sock_fprog_kern *fprog;
struct sk_filter *filter;
int ret = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
filter = rcu_dereference_protected(sk->sk_filter,
lockdep_sock_is_held(sk));
if (!filter)
@@ -10843,7 +10826,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
goto out;
ret = -EFAULT;
- if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
+ if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
goto out;
/* Instead of bytes, the API requests to return the number
@@ -10851,7 +10834,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
*/
ret = fprog->len;
out:
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
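
Illustrative sketch (not part of the patch): the filter.c rework above routes bpf_setsockopt()/bpf_getsockopt() through the regular sk_setsockopt()/do_tcp_setsockopt() paths via the sol_*_sockopt() helpers, and moves the "cdg" rejection into sol_tcp_sockopt_congestion(). From the BPF program side the helper is used as before; a minimal sockops example follows, with illustrative program and section names.

```c
/* Minimal illustrative BPF sockops program; not part of the patch. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SOL_TCP		6
#define TCP_CONGESTION	13

SEC("sockops")
int set_cc(struct bpf_sock_ops *skops)
{
	char cc[] = "cubic";

	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
	return 1;
}

char _license[] SEC("license") = "GPL";
```
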
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 764c4cb3fe8f..007730412947 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -204,6 +204,30 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen);
}
+static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_l2tpv3 *key_l2tpv3;
+ struct {
+ __be32 session_id;
+ } *hdr, _hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_l2tpv3 = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_L2TPV3,
+ target_container);
+
+ key_l2tpv3->session_id = hdr->session_id;
+}
+
void skb_flow_dissect_meta(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container)
@@ -272,7 +296,7 @@ skb_flow_dissect_ct(const struct sk_buff *skb,
key->ct_zone = ct->zone.id;
#endif
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
- key->ct_mark = ct->mark;
+ key->ct_mark = READ_ONCE(ct->mark);
#endif
cl = nf_ct_labels_find(ct);
@@ -866,8 +890,8 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
}
}
-bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
- __be16 proto, int nhoff, int hlen, unsigned int flags)
+u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct bpf_flow_keys *flow_keys = ctx->flow_keys;
u32 result;
@@ -892,7 +916,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
flow_keys->nhoff, hlen);
- return result == BPF_OK;
+ return result;
}
static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr)
@@ -1008,6 +1032,7 @@ bool __skb_flow_dissect(const struct net *net,
};
__be16 n_proto = proto;
struct bpf_prog *prog;
+ u32 result;
if (skb) {
ctx.skb = skb;
@@ -1019,13 +1044,16 @@ bool __skb_flow_dissect(const struct net *net,
}
prog = READ_ONCE(run_array->items[0].prog);
- ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
- hlen, flags);
+ result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
+ hlen, flags);
+ if (result == BPF_FLOW_DISSECTOR_CONTINUE)
+ goto dissect_continue;
__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
target_container);
rcu_read_unlock();
- return ret;
+ return result == BPF_OK;
}
+dissect_continue:
rcu_read_unlock();
}
@@ -1173,8 +1201,8 @@ proto_again:
nhoff += sizeof(*vlan);
}
- if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) {
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) &&
+ !(key_control->flags & FLOW_DIS_ENCAPSULATION)) {
struct flow_dissector_key_num_of_vlans *key_nvs;
key_nvs = skb_flow_dissector_target(flow_dissector,
@@ -1497,6 +1525,10 @@ ip_proto_again:
__skb_flow_dissect_icmp(skb, flow_dissector, target_container,
data, nhoff, hlen);
break;
+ case IPPROTO_L2TP:
+ __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
default:
break;
@@ -1611,9 +1643,8 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
switch (keys->control.addr_type) {
case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
- addr_diff = (__force u32)keys->addrs.v4addrs.dst -
- (__force u32)keys->addrs.v4addrs.src;
- if (addr_diff < 0)
+ if ((__force u32)keys->addrs.v4addrs.dst <
+ (__force u32)keys->addrs.v4addrs.src)
swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
if ((__force u16)keys->ports.dst <
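
The __flow_hash_consistentify hunk above drops the subtract-and-test-sign comparison of the two IPv4 addresses in favour of a plain less-than. The u32 subtraction wraps, so once the addresses are more than 2^31 apart the sign of the stored int points the wrong way and src/dst are not swapped consistently. A small standalone illustration (the example addresses are arbitrary):

/* Hedged sketch, not kernel code: why the direct comparison is used. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t src = 0xF0000000u;	/* e.g. 240.0.0.0 */
	uint32_t dst = 0x00000001u;	/* e.g. 0.0.0.1   */

	int addr_diff = (int)(dst - src);	/* wraps to +0x10000001 */

	printf("old check swaps: %s\n", addr_diff < 0 ? "yes" : "no");	/* no  (wrong)   */
	printf("new check swaps: %s\n", dst < src ? "yes" : "no");	/* yes (correct) */
	return 0;
}
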
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 8cfb63528d18..abe423fd5736 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -237,6 +237,13 @@ void flow_rule_match_pppoe(const struct flow_rule *rule,
}
EXPORT_SYMBOL(flow_rule_match_pppoe);
+void flow_rule_match_l2tpv3(const struct flow_rule *rule,
+ struct flow_match_l2tpv3 *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out);
+}
+EXPORT_SYMBOL(flow_rule_match_l2tpv3);
+
struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb,
void *cb_ident, void *cb_priv,
void (*release)(void *cb_priv))
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index a10335b4ba2d..c8d137ef5980 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -345,7 +345,7 @@ static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats,
for_each_possible_cpu(i) {
const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
- qstats->qlen += qcpu->backlog;
+ qstats->qlen += qcpu->qlen;
qstats->backlog += qcpu->backlog;
qstats->drops += qcpu->drops;
qstats->requeues += qcpu->requeues;
diff --git a/net/core/gro.c b/net/core/gro.c
index b4190eb08467..bc9451743307 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
unsigned int gro_max_size;
unsigned int new_truesize;
struct sk_buff *lp;
+ int segs;
/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
gro_max_size = READ_ONCE(p->dev->gro_max_size);
@@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
return -E2BIG;
}
+ segs = NAPI_GRO_CB(skb)->count;
lp = NAPI_GRO_CB(p)->last;
pinfo = skb_shinfo(lp);
@@ -265,7 +267,7 @@ merge:
lp = p;
done:
- NAPI_GRO_CB(p)->count++;
+ NAPI_GRO_CB(p)->count += segs;
p->data_len += len;
p->truesize += delta_truesize;
p->len += len;
@@ -496,8 +498,15 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
sizeof(u32))); /* Avoid slow unaligned acc */
*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
- NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
+ NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
NAPI_GRO_CB(skb)->is_atomic = 1;
+ NAPI_GRO_CB(skb)->count = 1;
+ if (unlikely(skb_is_gso(skb))) {
+ NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
+ /* Only support TCP at the moment. */
+ if (!skb_is_gso_tcp(skb))
+ NAPI_GRO_CB(skb)->flush = 1;
+ }
/* Setup for GRO checksum validation */
switch (skb->ip_summed) {
@@ -545,10 +554,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
else
gro_list->count++;
- NAPI_GRO_CB(skb)->count = 1;
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
- skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+ if (!skb_is_gso(skb))
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
list_add(&skb->list, &gro_list->list);
ret = GRO_HELD;
@@ -660,6 +669,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->encapsulation = 0;
skb_shinfo(skb)->gso_type = 0;
+ skb_shinfo(skb)->gso_size = 0;
if (unlikely(skb->slow_gro)) {
skb_orphan(skb);
skb_ext_reset(skb);
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index 541c7a72a28a..ed5ec5de47f6 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -26,7 +26,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
cell = this_cpu_ptr(gcells->cells);
- if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
+ if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(netdev_max_backlog)) {
drop:
dev_core_stats_rx_dropped_inc(dev);
kfree_skb(skb);
@@ -81,8 +81,7 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
- netif_napi_add(dev, &cell->napi, gro_cell_poll,
- NAPI_POLL_WEIGHT);
+ netif_napi_add(dev, &cell->napi, gro_cell_poll);
napi_enable(&cell->napi);
}
return 0;
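
gro_cells_receive() now reads netdev_max_backlog through READ_ONCE(): the sysctl can be rewritten concurrently from process context, and the annotation asks for one untorn load instead of letting the compiler refetch the variable mid-check. A rough userspace sketch of the idea; the macro and knob names below are illustrative, not the kernel's definitions:

/* Hedged sketch, not kernel code: snapshot a concurrently-updated limit once. */
#include <stdio.h>

#define READ_ONCE_INT(x) (*(const volatile int *)&(x))

static int netdev_max_backlog_demo = 1000;	/* illustrative knob */

static int should_drop(int queue_len)
{
	int limit = READ_ONCE_INT(netdev_max_backlog_demo);	/* single load */

	return queue_len > limit;
}

int main(void)
{
	printf("drop at 500?  %d\n", should_drop(500));
	printf("drop at 2000? %d\n", should_drop(2000));
	return 0;
}
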
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 9ccd64e8a666..711cd3b4347a 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -48,6 +48,9 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "RPL";
case LWTUNNEL_ENCAP_IOAM6:
return "IOAM6";
+ case LWTUNNEL_ENCAP_XFRM:
+ /* module autoload not supported for encap type */
+ return NULL;
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6a8c2596ebab..952a54763358 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -111,7 +111,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
unsigned long neigh_rand_reach_time(unsigned long base)
{
- return base ? (prandom_u32() % base) + (base >> 1) : 0;
+ return base ? prandom_u32_max(base) + (base >> 1) : 0;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
@@ -307,11 +307,53 @@ static int neigh_del_timer(struct neighbour *n)
return 0;
}
-static void pneigh_queue_purge(struct sk_buff_head *list)
+static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
+ int family)
+{
+ switch (family) {
+ case AF_INET:
+ return __in_dev_arp_parms_get_rcu(dev);
+ case AF_INET6:
+ return __in6_dev_nd_parms_get_rcu(dev);
+ }
+ return NULL;
+}
+
+static void neigh_parms_qlen_dec(struct net_device *dev, int family)
+{
+ struct neigh_parms *p;
+
+ rcu_read_lock();
+ p = neigh_get_dev_parms_rcu(dev, family);
+ if (p)
+ p->qlen--;
+ rcu_read_unlock();
+}
+
+static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
+ int family)
{
+ struct sk_buff_head tmp;
+ unsigned long flags;
struct sk_buff *skb;
- while ((skb = skb_dequeue(list)) != NULL) {
+ skb_queue_head_init(&tmp);
+ spin_lock_irqsave(&list->lock, flags);
+ skb = skb_peek(list);
+ while (skb != NULL) {
+ struct sk_buff *skb_next = skb_peek_next(skb, list);
+ struct net_device *dev = skb->dev;
+
+ if (net == NULL || net_eq(dev_net(dev), net)) {
+ neigh_parms_qlen_dec(dev, family);
+ __skb_unlink(skb, list);
+ __skb_queue_tail(&tmp, skb);
+ }
+ skb = skb_next;
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ while ((skb = __skb_dequeue(&tmp))) {
dev_put(skb->dev);
kfree_skb(skb);
}
@@ -385,9 +427,10 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev, skip_perm);
pneigh_ifdown_and_unlock(tbl, dev);
-
- del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
+ tbl->family);
+ if (skb_queue_empty_lockless(&tbl->proxy_queue))
+ del_timer_sync(&tbl->proxy_timer);
return 0;
}
@@ -1598,7 +1641,9 @@ static void neigh_proxy_process(struct timer_list *t)
if (tdif <= 0) {
struct net_device *dev = skb->dev;
+ neigh_parms_qlen_dec(dev, tbl->family);
__skb_unlink(skb, &tbl->proxy_queue);
+
if (tbl->proxy_redo && netif_running(dev)) {
rcu_read_lock();
tbl->proxy_redo(skb);
@@ -1623,7 +1668,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
unsigned long sched_next = jiffies +
prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY));
- if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
+ if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) {
kfree_skb(skb);
return;
}
@@ -1639,6 +1684,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
skb_dst_drop(skb);
dev_hold(skb->dev);
__skb_queue_tail(&tbl->proxy_queue, skb);
+ p->qlen++;
mod_timer(&tbl->proxy_timer, sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
@@ -1671,6 +1717,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
refcount_set(&p->refcnt, 1);
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ p->qlen = 0;
netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
p->dev = dev;
write_pnet(&p->net, net);
@@ -1736,6 +1783,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
refcount_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
+ tbl->parms.qlen = 0;
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
@@ -1787,7 +1835,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl)
cancel_delayed_work_sync(&tbl->managed_work);
cancel_delayed_work_sync(&tbl->gc_work);
del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
@@ -1819,9 +1867,6 @@ static struct neigh_table *neigh_find_table(int family)
case AF_INET6:
tbl = neigh_tables[NEIGH_ND_TABLE];
break;
- case AF_DECnet:
- tbl = neigh_tables[NEIGH_DN_TABLE];
- break;
}
return tbl;
@@ -3508,18 +3553,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write,
return ret;
}
-static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
- int family)
-{
- switch (family) {
- case AF_INET:
- return __in_dev_arp_parms_get_rcu(dev);
- case AF_INET6:
- return __in6_dev_nd_parms_get_rcu(dev);
- }
- return NULL;
-}
-
static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
int index)
{
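
The reworked pneigh_queue_purge() above walks the proxy queue under its spinlock, unlinks only the entries whose device belongs to the dying netns onto a private list, then drops the lock before freeing them (and decrementing the per-device parms qlen). A plain-C sketch of that drain-under-lock, free-outside-lock shape, with a mutex standing in for the irq-safe spinlock; all names are illustrative:

/* Hedged sketch, not kernel code. */
#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

struct entry { int owner; struct entry *next; };

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *queue_head;

static void purge_owner(int owner)
{
	struct entry **pp, *e, *tmp = NULL;

	pthread_mutex_lock(&q_lock);
	for (pp = &queue_head; (e = *pp) != NULL; ) {
		if (e->owner == owner) {	/* unlink onto private list */
			*pp = e->next;
			e->next = tmp;
			tmp = e;
		} else {
			pp = &e->next;
		}
	}
	pthread_mutex_unlock(&q_lock);

	while (tmp) {				/* free outside the lock */
		e = tmp;
		tmp = e->next;
		printf("freeing entry of owner %d\n", e->owner);
		free(e);
	}
}

int main(void)
{
	for (int i = 0; i < 6; i++) {
		struct entry *e = malloc(sizeof(*e));

		e->owner = i % 2;
		e->next = queue_head;
		queue_head = e;
	}
	purge_owner(1);		/* drops only the owner-1 entries */
	return 0;
}

Doing the kfree_skb()/dev_put() work outside the queue lock keeps the critical section short, which is the same reason the kernel patch splices matching skbs onto a temporary list first.
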
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index d61afd21aab5..8409d41405df 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -59,7 +59,7 @@ static ssize_t netdev_show(const struct device *dev,
#define NETDEVICE_SHOW(field, format_string) \
static ssize_t format_##field(const struct net_device *dev, char *buf) \
{ \
- return sprintf(buf, format_string, dev->field); \
+ return sysfs_emit(buf, format_string, dev->field); \
} \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -118,13 +118,13 @@ static ssize_t iflink_show(struct device *dev, struct device_attribute *attr,
{
struct net_device *ndev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, dev_get_iflink(ndev));
+ return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev));
}
static DEVICE_ATTR_RO(iflink);
static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
{
- return sprintf(buf, fmt_dec, dev->name_assign_type);
+ return sysfs_emit(buf, fmt_dec, dev->name_assign_type);
}
static ssize_t name_assign_type_show(struct device *dev,
@@ -194,7 +194,7 @@ static ssize_t carrier_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
if (netif_running(netdev))
- return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
return -EINVAL;
}
@@ -219,7 +219,7 @@ static ssize_t speed_show(struct device *dev,
struct ethtool_link_ksettings cmd;
if (!__ethtool_get_link_ksettings(netdev, &cmd))
- ret = sprintf(buf, fmt_dec, cmd.base.speed);
+ ret = sysfs_emit(buf, fmt_dec, cmd.base.speed);
}
rtnl_unlock();
return ret;
@@ -258,7 +258,7 @@ static ssize_t duplex_show(struct device *dev,
duplex = "unknown";
break;
}
- ret = sprintf(buf, "%s\n", duplex);
+ ret = sysfs_emit(buf, "%s\n", duplex);
}
}
rtnl_unlock();
@@ -272,7 +272,7 @@ static ssize_t testing_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
if (netif_running(netdev))
- return sprintf(buf, fmt_dec, !!netif_testing(netdev));
+ return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev));
return -EINVAL;
}
@@ -284,7 +284,7 @@ static ssize_t dormant_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
if (netif_running(netdev))
- return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+ return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev));
return -EINVAL;
}
@@ -315,7 +315,7 @@ static ssize_t operstate_show(struct device *dev,
if (operstate >= ARRAY_SIZE(operstates))
return -EINVAL; /* should not happen */
- return sprintf(buf, "%s\n", operstates[operstate]);
+ return sysfs_emit(buf, "%s\n", operstates[operstate]);
}
static DEVICE_ATTR_RO(operstate);
@@ -325,9 +325,9 @@ static ssize_t carrier_changes_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec,
- atomic_read(&netdev->carrier_up_count) +
- atomic_read(&netdev->carrier_down_count));
+ return sysfs_emit(buf, fmt_dec,
+ atomic_read(&netdev->carrier_up_count) +
+ atomic_read(&netdev->carrier_down_count));
}
static DEVICE_ATTR_RO(carrier_changes);
@@ -337,7 +337,7 @@ static ssize_t carrier_up_count_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
+ return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
}
static DEVICE_ATTR_RO(carrier_up_count);
@@ -347,7 +347,7 @@ static ssize_t carrier_down_count_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
+ return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
}
static DEVICE_ATTR_RO(carrier_down_count);
@@ -462,7 +462,7 @@ static ssize_t ifalias_show(struct device *dev,
ret = dev_get_alias(netdev, tmp, sizeof(tmp));
if (ret > 0)
- ret = sprintf(buf, "%s\n", tmp);
+ ret = sysfs_emit(buf, "%s\n", tmp);
return ret;
}
static DEVICE_ATTR_RW(ifalias);
@@ -514,7 +514,7 @@ static ssize_t phys_port_id_show(struct device *dev,
ret = dev_get_phys_port_id(netdev, &ppid);
if (!ret)
- ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
}
rtnl_unlock();
@@ -543,7 +543,7 @@ static ssize_t phys_port_name_show(struct device *dev,
ret = dev_get_phys_port_name(netdev, name, sizeof(name));
if (!ret)
- ret = sprintf(buf, "%s\n", name);
+ ret = sysfs_emit(buf, "%s\n", name);
}
rtnl_unlock();
@@ -573,7 +573,7 @@ static ssize_t phys_switch_id_show(struct device *dev,
ret = dev_get_port_parent_id(netdev, &ppid, false);
if (!ret)
- ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
}
rtnl_unlock();
@@ -591,7 +591,7 @@ static ssize_t threaded_show(struct device *dev,
return restart_syscall();
if (dev_isalive(netdev))
- ret = sprintf(buf, fmt_dec, netdev->threaded);
+ ret = sysfs_emit(buf, fmt_dec, netdev->threaded);
rtnl_unlock();
return ret;
@@ -673,7 +673,7 @@ static ssize_t netstat_show(const struct device *d,
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
- ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
+ ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
}
read_unlock(&dev_base_lock);
return ret;
@@ -824,7 +824,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
for (i = 0; i < map->len; i++)
cpumask_set_cpu(map->cpus[i], mask);
- len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
+ len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask));
rcu_read_unlock();
free_cpumask_var(mask);
@@ -910,7 +910,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
val = (unsigned long)flow_table->mask + 1;
rcu_read_unlock();
- return sprintf(buf, "%lu\n", val);
+ return sysfs_emit(buf, "%lu\n", val);
}
static void rps_dev_flow_table_release(struct rcu_head *rcu)
@@ -1208,7 +1208,7 @@ static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
{
unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);
- return sprintf(buf, fmt_ulong, trans_timeout);
+ return sysfs_emit(buf, fmt_ulong, trans_timeout);
}
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
@@ -1255,15 +1255,15 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
* belongs to the root device it will be reported with just the
* traffic class, so just "0" for TC 0 for example.
*/
- return num_tc < 0 ? sprintf(buf, "%d%d\n", tc, num_tc) :
- sprintf(buf, "%d\n", tc);
+ return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) :
+ sysfs_emit(buf, "%d\n", tc);
}
#ifdef CONFIG_XPS
static ssize_t tx_maxrate_show(struct netdev_queue *queue,
char *buf)
{
- return sprintf(buf, "%lu\n", queue->tx_maxrate);
+ return sysfs_emit(buf, "%lu\n", queue->tx_maxrate);
}
static ssize_t tx_maxrate_store(struct netdev_queue *queue,
@@ -1317,7 +1317,7 @@ static struct netdev_queue_attribute queue_traffic_class __ro_after_init
*/
static ssize_t bql_show(char *buf, unsigned int value)
{
- return sprintf(buf, "%u\n", value);
+ return sysfs_emit(buf, "%u\n", value);
}
static ssize_t bql_set(const char *buf, const size_t count,
@@ -1346,7 +1346,7 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue,
{
struct dql *dql = &queue->dql;
- return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
}
static ssize_t bql_set_hold_time(struct netdev_queue *queue,
@@ -1374,7 +1374,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue,
{
struct dql *dql = &queue->dql;
- return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+ return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed);
}
static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
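
The net-sysfs changes above convert every show() handler from sprintf() to sysfs_emit(), which formats into the sysfs buffer with an explicit PAGE_SIZE bound instead of trusting the format string to stay short. Roughly, it behaves like the bounded snprintf() wrapper sketched below (the real helper also sanity-checks that it was handed the start of the sysfs page); this is an approximation, not the kernel implementation:

/* Hedged sketch, not kernel code. */
#include <stdarg.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096

static int demo_sysfs_emit(char *buf, const char *fmt, ...)
{
	va_list args;
	int len;

	va_start(args, fmt);
	len = vsnprintf(buf, DEMO_PAGE_SIZE, fmt, args);	/* never overruns the page */
	va_end(args);
	return len < DEMO_PAGE_SIZE ? len : DEMO_PAGE_SIZE - 1;
}

int main(void)
{
	char page[DEMO_PAGE_SIZE];
	int n = demo_sysfs_emit(page, "%u\n", 42u);

	printf("emitted %d bytes: %s", n, page);
	return 0;
}
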
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 6b9f19122ec1..f64654df71a2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -18,7 +18,6 @@
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
-#include <linux/sched/mm.h>
#include <linux/uidgid.h>
#include <linux/cookie.h>
@@ -118,6 +117,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
+ struct net_generic *ng;
int err = -ENOMEM;
void *data = NULL;
@@ -136,7 +136,13 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
if (!err)
return 0;
+ if (ops->id && ops->size) {
cleanup:
+ ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&pernet_ops_rwsem));
+ ng->ptr[*ops->id] = NULL;
+ }
+
kfree(data);
out:
@@ -1144,13 +1150,7 @@ static int __register_pernet_operations(struct list_head *list,
* setup_net() and cleanup_net() are not possible.
*/
for_each_net(net) {
- struct mem_cgroup *old, *memcg;
-
- memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net));
- old = set_active_memcg(memcg);
error = ops_init(ops, net);
- set_active_memcg(old);
- mem_cgroup_put(memcg);
if (error)
goto out_undo;
list_add_tail(&net->exit_list, &net_exit_list);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 1a6a86693b74..d6a70aeaa503 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -66,7 +66,7 @@ struct update_classid_context {
#define UPDATE_CLASSID_BATCH 1000
-static int update_classid_sock(const void *v, struct file *file, unsigned n)
+static int update_classid_sock(const void *v, struct file *file, unsigned int n)
{
struct update_classid_context *ctx = (void *)v;
struct socket *sock = sock_from_file(file);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 5d27067b72d5..9be762e1d042 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -556,7 +556,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
if ((delim = strchr(cur, ',')) == NULL)
goto parse_failed;
*delim = 0;
- strlcpy(np->dev_name, cur, sizeof(np->dev_name));
+ strscpy(np->dev_name, cur, sizeof(np->dev_name));
cur = delim;
}
cur++;
@@ -610,7 +610,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
int err;
np->dev = ndev;
- strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
+ strscpy(np->dev_name, ndev->name, IFNAMSIZ);
if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
np_err(np, "%s doesn't support polling, aborting\n",
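
netpoll now uses strscpy() instead of strlcpy(). strlcpy() returns strlen(src), so noticing truncation takes a second comparison, and it always walks the source to its NUL even past the destination size; strscpy() stops at the destination size and reports truncation directly as -E2BIG. A hedged userspace approximation of those semantics (not the kernel implementation):

/* Hedged sketch, not kernel code. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>

static ssize_t demo_strscpy(char *dst, const char *src, size_t size)
{
	size_t i;

	if (size == 0)
		return -E2BIG;
	for (i = 0; i < size - 1 && src[i]; i++)
		dst[i] = src[i];
	dst[i] = '\0';					/* always NUL-terminated */
	return src[i] ? -E2BIG : (ssize_t)i;		/* truncated vs. copied length */
}

int main(void)
{
	char name[8];

	printf("%zd\n", demo_strscpy(name, "eth0", sizeof(name)));			/* 4 */
	printf("%zd\n", demo_strscpy(name, "very-long-ifname", sizeof(name)));	/* -E2BIG */
	return 0;
}
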
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 88906ba6d9a7..c3763056c554 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2324,7 +2324,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
pkt_dev->curfl = 0; /*reset */
}
} else {
- flow = prandom_u32() % pkt_dev->cflows;
+ flow = prandom_u32_max(pkt_dev->cflows);
pkt_dev->curfl = flow;
if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
@@ -2380,10 +2380,9 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
__u16 t;
if (pkt_dev->flags & F_QUEUE_MAP_RND) {
- t = prandom_u32() %
- (pkt_dev->queue_map_max -
- pkt_dev->queue_map_min + 1)
- + pkt_dev->queue_map_min;
+ t = prandom_u32_max(pkt_dev->queue_map_max -
+ pkt_dev->queue_map_min + 1) +
+ pkt_dev->queue_map_min;
} else {
t = pkt_dev->cur_queue_map + 1;
if (t > pkt_dev->queue_map_max)
@@ -2412,7 +2411,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACSRC_RND)
- mc = prandom_u32() % pkt_dev->src_mac_count;
+ mc = prandom_u32_max(pkt_dev->src_mac_count);
else {
mc = pkt_dev->cur_src_mac_offset++;
if (pkt_dev->cur_src_mac_offset >=
@@ -2438,7 +2437,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACDST_RND)
- mc = prandom_u32() % pkt_dev->dst_mac_count;
+ mc = prandom_u32_max(pkt_dev->dst_mac_count);
else {
mc = pkt_dev->cur_dst_mac_offset++;
@@ -2465,23 +2464,23 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
- ((__force __be32)prandom_u32() &
+ ((__force __be32)get_random_u32() &
htonl(0x000fffff));
}
if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
- pkt_dev->vlan_id = prandom_u32() & (4096 - 1);
+ pkt_dev->vlan_id = prandom_u32_max(4096);
}
if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
- pkt_dev->svlan_id = prandom_u32() & (4096 - 1);
+ pkt_dev->svlan_id = prandom_u32_max(4096);
}
if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
if (pkt_dev->flags & F_UDPSRC_RND)
- pkt_dev->cur_udp_src = prandom_u32() %
- (pkt_dev->udp_src_max - pkt_dev->udp_src_min)
- + pkt_dev->udp_src_min;
+ pkt_dev->cur_udp_src = prandom_u32_max(
+ pkt_dev->udp_src_max - pkt_dev->udp_src_min) +
+ pkt_dev->udp_src_min;
else {
pkt_dev->cur_udp_src++;
@@ -2492,9 +2491,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
if (pkt_dev->flags & F_UDPDST_RND) {
- pkt_dev->cur_udp_dst = prandom_u32() %
- (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
- + pkt_dev->udp_dst_min;
+ pkt_dev->cur_udp_dst = prandom_u32_max(
+ pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) +
+ pkt_dev->udp_dst_min;
} else {
pkt_dev->cur_udp_dst++;
if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
@@ -2509,7 +2508,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
if (pkt_dev->flags & F_IPSRC_RND)
- t = prandom_u32() % (imx - imn) + imn;
+ t = prandom_u32_max(imx - imn) + imn;
else {
t = ntohl(pkt_dev->cur_saddr);
t++;
@@ -2531,8 +2530,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->flags & F_IPDST_RND) {
do {
- t = prandom_u32() %
- (imx - imn) + imn;
+ t = prandom_u32_max(imx - imn) +
+ imn;
s = htonl(t);
} while (ipv4_is_loopback(s) ||
ipv4_is_multicast(s) ||
@@ -2569,7 +2568,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
for (i = 0; i < 4; i++) {
pkt_dev->cur_in6_daddr.s6_addr32[i] =
- (((__force __be32)prandom_u32() |
+ (((__force __be32)get_random_u32() |
pkt_dev->min_in6_daddr.s6_addr32[i]) &
pkt_dev->max_in6_daddr.s6_addr32[i]);
}
@@ -2579,9 +2578,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
if (pkt_dev->flags & F_TXSIZE_RND) {
- t = prandom_u32() %
- (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
- + pkt_dev->min_pkt_size;
+ t = prandom_u32_max(pkt_dev->max_pkt_size -
+ pkt_dev->min_pkt_size) +
+ pkt_dev->min_pkt_size;
} else {
t = pkt_dev->cur_pkt_size + 1;
if (t > pkt_dev->max_pkt_size)
@@ -2590,7 +2589,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
pkt_dev->cur_pkt_size = t;
} else if (pkt_dev->n_imix_entries > 0) {
struct imix_pkt *entry;
- __u32 t = prandom_u32() % IMIX_PRECISION;
+ __u32 t = prandom_u32_max(IMIX_PRECISION);
__u8 entry_index = pkt_dev->imix_distribution[t];
entry = &pkt_dev->imix_entries[entry_index];
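
The pktgen conversions above replace the `prandom_u32() % n` idiom with prandom_u32_max(n), which maps a 32-bit random value into [0, n) using a widening multiply and shift rather than a division. A standalone approximation of that scaling, with rand() standing in for prandom_u32(); this mirrors the idea, not the exact kernel helper:

/* Hedged sketch, not kernel code. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t demo_u32_max(uint32_t r, uint32_t n)
{
	/* scale r from [0, 2^32) down to [0, n) without a modulo */
	return (uint32_t)(((uint64_t)r * n) >> 32);
}

int main(void)
{
	uint32_t n = 4096;

	for (int i = 0; i < 4; i++) {
		uint32_t r = (uint32_t)rand();		/* stand-in for prandom_u32() */

		printf("%u -> %u (always < %u)\n", r, demo_u32_max(r, n), n);
	}
	return 0;
}
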
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ac45328607f7..74864dc46a7e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -866,14 +866,12 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
break;
case IF_OPER_TESTING:
- if (operstate == IF_OPER_UP ||
- operstate == IF_OPER_UNKNOWN)
+ if (netif_oper_up(dev))
operstate = IF_OPER_TESTING;
break;
case IF_OPER_DORMANT:
- if (operstate == IF_OPER_UP ||
- operstate == IF_OPER_UNKNOWN)
+ if (netif_oper_up(dev))
operstate = IF_OPER_DORMANT;
break;
}
@@ -1059,6 +1057,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_MASTER */
+ nla_total_size(1) /* IFLA_CARRIER */
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ + nla_total_size(4) /* IFLA_ALLMULTI */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
@@ -1767,6 +1766,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
nla_put_u32(skb, IFLA_GROUP, dev->group) ||
nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
+ nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) ||
nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
@@ -1928,6 +1928,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 },
[IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT },
[IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT },
+ [IFLA_ALLMULTI] = { .type = NLA_REJECT },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2776,13 +2777,6 @@ static int do_setlink(const struct sk_buff *skb,
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
}
- if (ifm->ifi_flags || ifm->ifi_change) {
- err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
- extack);
- if (err < 0)
- goto errout;
- }
-
if (tb[IFLA_MASTER]) {
err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
if (err)
@@ -2790,6 +2784,13 @@ static int do_setlink(const struct sk_buff *skb,
status |= DO_SETLINK_MODIFIED;
}
+ if (ifm->ifi_flags || ifm->ifi_change) {
+ err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ extack);
+ if (err < 0)
+ goto errout;
+ }
+
if (tb[IFLA_CARRIER]) {
err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
if (err)
@@ -6070,6 +6071,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) &&
!(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) {
NL_SET_ERR_MSG(extack, "Bulk delete is not supported");
+ module_put(owner);
goto err_unlock;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 974bbbbe7138..88fa40571d0c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -91,7 +91,11 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
-/* The array 'drop_reasons' is auto-generated in dropreason_str.c */
+#undef FN
+#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
+const char * const drop_reasons[] = {
+ DEFINE_DROP_REASON(FN, FN)
+};
EXPORT_SYMBOL(drop_reasons);
/**
@@ -130,8 +134,66 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
#define NAPI_SKB_CACHE_BULK 16
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
+#if PAGE_SIZE == SZ_4K
+
+#define NAPI_HAS_SMALL_PAGE_FRAG 1

+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
+
+/* specialized page frag allocator using a single order 0 page
+ * and slicing it into 1K sized fragment. Constrained to systems
+ * with a very limited amount of 1K fragments fitting a single
+ * page - to avoid excessive truesize underestimation
+ */
+
+struct page_frag_1k {
+ void *va;
+ u16 offset;
+ bool pfmemalloc;
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
+{
+ struct page *page;
+ int offset;
+
+ offset = nc->offset - SZ_1K;
+ if (likely(offset >= 0))
+ goto use_frag;
+
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+ if (!page)
+ return NULL;
+
+ nc->va = page_address(page);
+ nc->pfmemalloc = page_is_pfmemalloc(page);
+ offset = PAGE_SIZE - SZ_1K;
+ page_ref_add(page, offset / SZ_1K);
+
+use_frag:
+ nc->offset = offset;
+ return nc->va + offset;
+}
+#else
+
+/* the small page is actually unused in this build; add dummy helpers
+ * to please the compiler and avoid later preprocessor conditionals
+ */
+#define NAPI_HAS_SMALL_PAGE_FRAG 0
+#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
+
+struct page_frag_1k {
+};
+
+static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
+{
+ return NULL;
+}
+
+#endif
+
struct napi_alloc_cache {
struct page_frag_cache page;
+ struct page_frag_1k page_small;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -139,6 +201,23 @@ struct napi_alloc_cache {
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure the bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -557,6 +636,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
{
struct napi_alloc_cache *nc;
struct sk_buff *skb;
+ bool pfmemalloc;
void *data;
DEBUG_NET_WARN_ON_ONCE(!in_softirq());
@@ -564,8 +644,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
+ * When the small frag allocator is available, prefer it over kmalloc
+ * for small fragments
*/
- if (len <= SKB_WITH_OVERHEAD(1024) ||
+ if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -576,13 +658,33 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
}
nc = this_cpu_ptr(&napi_alloc_cache);
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = page_frag_alloc(&nc->page, len, gfp_mask);
+ if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
+ /* we are artificially inflating the allocation size, but
+	 * that is not as bad as it may look, because:
+ * - 'len' less than GRO_MAX_HEAD makes little sense
+ * - On most systems, larger 'len' values lead to fragment
+ * size above 512 bytes
+ * - kmalloc would use the kmalloc-1k slab for such values
+ * - Builds with smaller GRO_MAX_HEAD will very likely do
+	 *   little networking, as that implies no WiFi and no
+	 *   tunnel support, and 32-bit arches.
+ */
+ len = SZ_1K;
+
+ data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
+ pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
+ } else {
+ len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ len = SKB_DATA_ALIGN(len);
+
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = nc->page.pfmemalloc;
+ }
+
if (unlikely(!data))
return NULL;
@@ -592,7 +694,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
return NULL;
}
- if (nc->page.pfmemalloc)
+ if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -777,9 +879,10 @@ EXPORT_SYMBOL(__kfree_skb);
* hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
* tracepoint.
*/
-void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+void __fix_address
+kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (!skb_unref(skb))
+ if (unlikely(!skb_unref(skb)))
return;
DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
@@ -1183,7 +1286,7 @@ EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
- struct ubuf_info *uarg;
+ struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
WARN_ON_ONCE(!in_task());
@@ -1201,19 +1304,19 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
return NULL;
}
- uarg->callback = msg_zerocopy_callback;
+ uarg->ubuf.callback = msg_zerocopy_callback;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
- refcount_set(&uarg->refcnt, 1);
+ uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ refcount_set(&uarg->ubuf.refcnt, 1);
sock_hold(sk);
- return uarg;
+ return &uarg->ubuf;
}
-static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
return container_of((void *)uarg, struct sk_buff, cb);
}
@@ -1222,6 +1325,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
struct ubuf_info *uarg)
{
if (uarg) {
+ struct ubuf_info_msgzc *uarg_zc;
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
@@ -1237,8 +1341,9 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
return NULL;
}
- bytelen = uarg->bytelen + size;
- if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ uarg_zc = uarg_to_msgzc(uarg);
+ bytelen = uarg_zc->bytelen + size;
+ if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
/* TCP can create new skb to attach new uarg */
if (sk->sk_type == SOCK_STREAM)
goto new_alloc;
@@ -1246,11 +1351,11 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
next = (u32)atomic_read(&sk->sk_zckey);
- if ((u32)(uarg->id + uarg->len) == next) {
- if (mm_account_pinned_pages(&uarg->mmp, size))
+ if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
+ if (mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
- uarg->len++;
- uarg->bytelen = bytelen;
+ uarg_zc->len++;
+ uarg_zc->bytelen = bytelen;
atomic_set(&sk->sk_zckey, ++next);
/* no extra ref when appending to datagram (MSG_MORE) */
@@ -1286,7 +1391,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
return true;
}
-static void __msg_zerocopy_callback(struct ubuf_info *uarg)
+static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock_exterr_skb *serr;
@@ -1339,19 +1444,21 @@ release:
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
bool success)
{
- uarg->zerocopy = uarg->zerocopy & success;
+ struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
+
+ uarg_zc->zerocopy = uarg_zc->zerocopy & success;
if (refcount_dec_and_test(&uarg->refcnt))
- __msg_zerocopy_callback(uarg);
+ __msg_zerocopy_callback(uarg_zc);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
- struct sock *sk = skb_from_uarg(uarg)->sk;
+ struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;
atomic_dec(&sk->sk_zckey);
- uarg->len--;
+ uarg_to_msgzc(uarg)->len--;
if (have_uref)
msg_zerocopy_callback(NULL, uarg, true);
@@ -3864,7 +3971,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
} else if (i < MAX_SKB_FRAGS) {
skb_zcopy_downgrade_managed(skb);
get_page(page);
- skb_fill_page_desc(skb, i, page, offset, size);
+ skb_fill_page_desc_noacc(skb, i, page, offset, size);
} else {
return -EMSGSIZE;
}
@@ -4027,23 +4134,25 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
int i = 0;
int pos;
- if (list_skb && !list_skb->head_frag && skb_headlen(list_skb) &&
- (skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY)) {
- /* gso_size is untrusted, and we have a frag_list with a linear
- * non head_frag head.
- *
- * (we assume checking the first list_skb member suffices;
- * i.e if either of the list_skb members have non head_frag
- * head, then the first one has too).
- *
- * If head_skb's headlen does not fit requested gso_size, it
- * means that the frag_list members do NOT terminate on exact
- * gso_size boundaries. Hence we cannot perform skb_frag_t page
- * sharing. Therefore we must fallback to copying the frag_list
- * skbs; we do so by disabling SG.
- */
- if (mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb))
- features &= ~NETIF_F_SG;
+ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
+ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
+ struct sk_buff *check_skb;
+
+ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
+ if (skb_headlen(check_skb) && !check_skb->head_frag) {
+ /* gso_size is untrusted, and we have a frag_list with
+ * a linear non head_frag item.
+ *
+ * If head_skb's headlen does not fit requested gso_size,
+ * it means that the frag_list members do NOT terminate
+ * on exact gso_size boundaries. Hence we cannot perform
+ * skb_frag_t page sharing. Therefore we must fallback to
+ * copying the frag_list skbs; we do so by disabling SG.
+ */
+ features &= ~NETIF_F_SG;
+ break;
+ }
+ }
}
__skb_push(head_skb, doffset);
@@ -4205,9 +4314,8 @@ normal:
SKB_GSO_CB(nskb)->csum_start =
skb_headroom(nskb) + doffset;
} else {
- skb_copy_bits(head_skb, offset,
- skb_put(nskb, len),
- len);
+ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
+ goto err;
}
continue;
}
@@ -4798,7 +4906,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
- if (likely(sysctl_tstamp_allow_data || tsonly))
+ if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
return true;
read_lock_bh(&sk->sk_callback_lock);
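
The drop_reasons[] hunk near the top of the skbuff.c diff replaces the awk-generated dropreason_str.c table (removed from the Makefile earlier in this series) with an X-macro: DEFINE_DROP_REASON expands the same reason list once into the enum and once into the matching string array, so the two cannot drift apart. The same trick in miniature, with illustrative names only:

/* Hedged sketch, not kernel code. */
#include <stdio.h>

#define DEMO_REASONS(FN)	\
	FN(NOT_SPECIFIED)	\
	FN(NO_SOCKET)		\
	FN(PKT_TOO_SMALL)

#define DEMO_ENUM(name)	DEMO_DROP_##name,
#define DEMO_NAME(name)	[DEMO_DROP_##name] = #name,

enum demo_drop_reason {
	DEMO_REASONS(DEMO_ENUM)
	DEMO_DROP_MAX,
};

static const char * const demo_drop_names[] = {
	DEMO_REASONS(DEMO_NAME)
};

int main(void)
{
	for (int i = 0; i < DEMO_DROP_MAX; i++)
		printf("%d = %s\n", i, demo_drop_names[i]);
	return 0;
}
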
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index f47338d89d5d..e6b9ced3eda8 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -434,8 +434,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
if (copied + copy > len)
copy = len - copied;
copy = copy_page_to_iter(page, sge->offset, copy, iter);
- if (!copy)
- return copied ? copied : -EFAULT;
+ if (!copy) {
+ copied = copied ? copied : -EFAULT;
+ goto out;
+ }
copied += copy;
if (likely(!peek)) {
@@ -455,13 +457,13 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
* didn't copy the entire length lets just break.
*/
if (copy != sge->length)
- return copied;
+ goto out;
sk_msg_iter_var_next(i);
}
if (copied == len)
break;
- } while (!sg_is_last(sge));
+ } while ((i != msg_rx->sg.end) && !sg_is_last(sge));
if (unlikely(peek)) {
msg_rx = sk_psock_next_msg(psock, msg_rx);
@@ -471,13 +473,15 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
}
msg_rx->sg.start = i;
- if (!sge->length && sg_is_last(sge)) {
+ if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) {
msg_rx = sk_psock_dequeue_msg(psock);
kfree_sk_msg(msg_rx);
}
msg_rx = sk_psock_peek_msg(psock);
}
-
+out:
+ if (psock->work_state.skb && copied > 0)
+ schedule_work(&psock->work);
return copied;
}
EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
@@ -496,11 +500,11 @@ bool sk_msg_is_readable(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_msg_is_readable);
-static struct sk_msg *alloc_sk_msg(void)
+static struct sk_msg *alloc_sk_msg(gfp_t gfp)
{
struct sk_msg *msg;
- msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL);
+ msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN);
if (unlikely(!msg))
return NULL;
sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS);
@@ -516,7 +520,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
if (!sk_rmem_schedule(sk, skb, skb->truesize))
return NULL;
- return alloc_sk_msg();
+ return alloc_sk_msg(GFP_KERNEL);
}
static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
@@ -593,7 +597,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len)
{
- struct sk_msg *msg = alloc_sk_msg();
+ struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
struct sock *sk = psock->sk;
int err;
@@ -799,16 +803,13 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
}
}
-void sk_psock_stop(struct sk_psock *psock, bool wait)
+void sk_psock_stop(struct sk_psock *psock)
{
spin_lock_bh(&psock->ingress_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
sk_psock_cork_free(psock);
__sk_psock_zap_ingress(psock);
spin_unlock_bh(&psock->ingress_lock);
-
- if (wait)
- cancel_work_sync(&psock->work);
}
static void sk_psock_done_strp(struct sk_psock *psock);
@@ -846,7 +847,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
- sk_psock_stop(psock, false);
+ sk_psock_stop(psock);
INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
queue_rcu_work(system_wq, &psock->rwork);
@@ -1194,8 +1195,9 @@ static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
ret = bpf_prog_run_pin_on_cpu(prog, skb);
ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
}
- if (sk_psock_verdict_apply(psock, skb, ret) < 0)
- len = 0;
+ ret = sk_psock_verdict_apply(psock, skb, ret);
+ if (ret < 0)
+ len = ret;
out:
rcu_read_unlock();
return len;
diff --git a/net/core/sock.c b/net/core/sock.c
index 4cb957d934a2..a3ba0358c77c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -703,15 +703,17 @@ static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
goto out;
}
- return sock_bindtoindex(sk, index, true);
+ sockopt_lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, index);
+ sockopt_release_sock(sk);
out:
#endif
return ret;
}
-static int sock_getbindtodevice(struct sock *sk, char __user *optval,
- int __user *optlen, int len)
+static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
@@ -735,12 +737,12 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval,
len = strlen(devname) + 1;
ret = -EFAULT;
- if (copy_to_user(optval, devname, len))
+ if (copy_to_sockptr(optval, devname, len))
goto out;
zero:
ret = -EFAULT;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
goto out;
ret = 0;
@@ -1036,17 +1038,51 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
return 0;
}
+void sockopt_lock_sock(struct sock *sk)
+{
+	/* When current->bpf_ctx is set, setsockopt() is being called
+	 * from a bpf prog. bpf has already ensured the sk lock is
+	 * acquired before calling setsockopt().
+	 */
+ if (has_current_bpf_ctx())
+ return;
+
+ lock_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_lock_sock);
+
+void sockopt_release_sock(struct sock *sk)
+{
+ if (has_current_bpf_ctx())
+ return;
+
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_release_sock);
+
+bool sockopt_ns_capable(struct user_namespace *ns, int cap)
+{
+ return has_current_bpf_ctx() || ns_capable(ns, cap);
+}
+EXPORT_SYMBOL(sockopt_ns_capable);
+
+bool sockopt_capable(int cap)
+{
+ return has_current_bpf_ctx() || capable(cap);
+}
+EXPORT_SYMBOL(sockopt_capable);
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/
-int sock_setsockopt(struct socket *sock, int level, int optname,
- sockptr_t optval, unsigned int optlen)
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct so_timestamping timestamping;
+ struct socket *sock = sk->sk_socket;
struct sock_txtime sk_txtime;
- struct sock *sk = sock->sk;
int val;
int valbool;
struct linger ling;
@@ -1067,11 +1103,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
valbool = val ? 1 : 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case SO_DEBUG:
- if (val && !capable(CAP_NET_ADMIN))
+ if (val && !sockopt_capable(CAP_NET_ADMIN))
ret = -EACCES;
else
sock_valbool_flag(sk, SOCK_DBG, valbool);
@@ -1101,7 +1137,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_wmem_max);
+ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
/* Ensure val * 2 fits into an int, to prevent max_t()
* from treating it as a negative value.
@@ -1115,7 +1151,7 @@ set_sndbuf:
break;
case SO_SNDBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1133,11 +1169,11 @@ set_sndbuf:
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
+ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
break;
case SO_RCVBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1164,8 +1200,8 @@ set_sndbuf:
case SO_PRIORITY:
if ((val >= 0 && val <= 6) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
sk->sk_priority = val;
else
ret = -EPERM;
@@ -1228,7 +1264,7 @@ set_sndbuf:
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
- if (sock->ops->set_rcvlowat)
+ if (sock && sock->ops->set_rcvlowat)
ret = sock->ops->set_rcvlowat(sk, val);
else
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
@@ -1310,8 +1346,8 @@ set_sndbuf:
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1319,8 +1355,8 @@ set_sndbuf:
__sock_set_mark(sk, val);
break;
case SO_RCVMARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1354,7 +1390,7 @@ set_sndbuf:
#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
/* allow unprivileged users to decrease the value */
- if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
+ if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else {
if (val < 0)
@@ -1364,13 +1400,13 @@ set_sndbuf:
}
break;
case SO_PREFER_BUSY_POLL:
- if (valbool && !capable(CAP_NET_ADMIN))
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
ret = -EPERM;
else
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
break;
case SO_BUSY_POLL_BUDGET:
- if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
} else {
if (val < 0 || val > U16_MAX)
@@ -1441,7 +1477,7 @@ set_sndbuf:
* scheduler has enough safe guards.
*/
if (sk_txtime.clockid != CLOCK_MONOTONIC &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
@@ -1496,9 +1532,16 @@ set_sndbuf:
ret = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
+
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ return sk_setsockopt(sock->sk, level, optname,
+ optval, optlen);
+}
EXPORT_SYMBOL(sock_setsockopt);
static const struct cred *sk_get_peer_cred(struct sock *sk)
@@ -1525,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
-static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
struct user_namespace *user_ns = current_user_ns();
int i;
- for (i = 0; i < src->ngroups; i++)
- if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ for (i = 0; i < src->ngroups; i++) {
+ gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
+
+ if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
return -EFAULT;
+ }
return 0;
}
-int sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
- struct sock *sk = sock->sk;
+ struct socket *sock = sk->sk_socket;
union {
int val;
@@ -1557,7 +1603,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
int lv = sizeof(int);
int len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0)
return -EINVAL;
@@ -1692,7 +1738,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
spin_unlock(&sk->sk_peer_lock);
- if (copy_to_user(optval, &peercred, len))
+ if (copy_to_sockptr(optval, &peercred, len))
return -EFAULT;
goto lenout;
}
@@ -1710,11 +1756,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len < n * sizeof(gid_t)) {
len = n * sizeof(gid_t);
put_cred(cred);
- return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
}
len = n * sizeof(gid_t);
- ret = groups_to_user((gid_t __user *)optval, cred->group_info);
+ ret = groups_to_user(optval, cred->group_info);
put_cred(cred);
if (ret)
return ret;
@@ -1730,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return -ENOTCONN;
if (lv < len)
return -EINVAL;
- if (copy_to_user(optval, address, len))
+ if (copy_to_sockptr(optval, address, len))
return -EFAULT;
goto lenout;
}
@@ -1747,7 +1793,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PEERSEC:
- return security_socket_getpeersec_stream(sock, optval, optlen, len);
+ return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len);
case SO_MARK:
v.val = sk->sk_mark;
@@ -1779,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return sock_getbindtodevice(sk, optval, optlen, len);
case SO_GET_FILTER:
- len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ len = sk_get_filter(sk, optval, len);
if (len < 0)
return len;
@@ -1827,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
sk_get_meminfo(sk, meminfo);
len = min_t(unsigned int, len, sizeof(meminfo));
- if (copy_to_user(optval, &meminfo, len))
+ if (copy_to_sockptr(optval, &meminfo, len))
return -EFAULT;
goto lenout;
@@ -1896,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len > lv)
len = lv;
- if (copy_to_user(optval, &v, len))
+ if (copy_to_sockptr(optval, &v, len))
return -EFAULT;
lenout:
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
+int sock_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ return sk_getsockopt(sock->sk, level, optname,
+ USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
+}
+
/*
* Initialize an sk_lock.
*
@@ -2536,7 +2590,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
- sysctl_optmem_max)
+ READ_ONCE(sysctl_optmem_max))
return NULL;
skb = alloc_skb(size, priority);
@@ -2554,8 +2608,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned int)size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ int optmem_max = READ_ONCE(sysctl_optmem_max);
+
+ if ((unsigned int)size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
* might sleep.
@@ -3309,8 +3365,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
timer_setup(&sk->sk_timer, NULL, 0);
sk->sk_allocation = GFP_KERNEL;
- sk->sk_rcvbuf = sysctl_rmem_default;
- sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
sk_set_socket(sk, sock);
@@ -3365,7 +3421,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
- sk->sk_ll_usec = sysctl_net_busy_read;
+ sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif
sk->sk_max_pacing_rate = ~0UL;
@@ -3554,7 +3610,8 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
@@ -3580,7 +3637,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
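As the added comments say, IPV6_ADDRFORM can swap sk->sk_prot while another thread is inside sock_common_getsockopt()/sock_common_setsockopt(), so the ops pointer is now loaded exactly once with READ_ONCE() and the call is made through that single load. A minimal sketch of "load the ops pointer once, then dereference the local copy", using C11 atomics and hypothetical names:

#include <stdatomic.h>

struct proto_ops_like {
	int (*getsockopt)(void *sk, int level, int optname);
};

struct sock_like {
	/* may be repointed at a different ops table by a concurrent thread */
	_Atomic(struct proto_ops_like *) prot;
};

static int common_getsockopt(struct sock_like *sk, int level, int optname)
{
	/* One load, one dereference: even if sk->prot changes underneath us,
	 * we call through a single, consistent ops table.
	 */
	struct proto_ops_like *prot =
		atomic_load_explicit(&sk->prot, memory_order_relaxed);

	return prot->getsockopt(sk, level, optname);
}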
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 9a9fb9487d63..81beb16ab1eb 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -41,7 +41,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
- stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT);
+ stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE);
if (!stab)
return ERR_PTR(-ENOMEM);
@@ -52,7 +52,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
sizeof(struct sock *),
stab->map.numa_node);
if (!stab->sks) {
- kfree(stab);
+ bpf_map_area_free(stab);
return ERR_PTR(-ENOMEM);
}
@@ -361,7 +361,7 @@ static void sock_map_free(struct bpf_map *map)
synchronize_rcu();
bpf_map_area_free(stab->sks);
- kfree(stab);
+ bpf_map_area_free(stab);
}
static void sock_map_release_progs(struct bpf_map *map)
@@ -1085,7 +1085,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
if (attr->key_size > MAX_BPF_STACK)
return ERR_PTR(-E2BIG);
- htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -1115,7 +1115,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
return &htab->map;
free_htab:
- kfree(htab);
+ bpf_map_area_free(htab);
return ERR_PTR(err);
}
@@ -1168,7 +1168,7 @@ static void sock_hash_free(struct bpf_map *map)
synchronize_rcu();
bpf_map_area_free(htab->buckets);
- kfree(htab);
+ bpf_map_area_free(htab);
}
static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
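The sock_map/sock_hash hunks move the map structs from kzalloc()/kfree() to bpf_map_area_alloc()/bpf_map_area_free(). That allocator may satisfy a request from more than one backend depending on size, so allocations made with it must always be released through the matching free helper; converting the structs keeps both map types on one symmetric alloc/free pair. A toy illustration of why such pairing matters, using malloc vs. mmap as the two backends (threshold and names are made up, this is not the kernel's implementation):

#include <stdlib.h>
#include <sys/mman.h>

#define AREA_MMAP_THRESHOLD 4096

struct area_hdr { size_t size; };

/* Small requests come from the heap, large ones from mmap(); the size is
 * recorded so the free helper knows which path to undo.  Calling plain
 * free() on the mmap()ed case (the analogue of kfree() on memory from
 * the fallback backend) would corrupt the heap.
 */
void *area_alloc(size_t size)
{
	struct area_hdr *hdr;
	size_t total = sizeof(*hdr) + size;

	if (total <= AREA_MMAP_THRESHOLD)
		hdr = calloc(1, total);
	else
		hdr = mmap(NULL, total, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (!hdr || hdr == MAP_FAILED)
		return NULL;
	hdr->size = total;
	return hdr + 1;
}

void area_free(void *p)
{
	struct area_hdr *hdr;

	if (!p)
		return;
	hdr = (struct area_hdr *)p - 1;
	if (hdr->size <= AREA_MMAP_THRESHOLD)
		free(hdr);
	else
		munmap(hdr, hdr->size);
}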
@@ -1596,7 +1596,7 @@ void sock_map_destroy(struct sock *sk)
saved_destroy = psock->saved_destroy;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
- sk_psock_stop(psock, false);
+ sk_psock_stop(psock);
sk_psock_put(sk, psock);
saved_destroy(sk);
}
@@ -1619,9 +1619,10 @@ void sock_map_close(struct sock *sk, long timeout)
saved_close = psock->saved_close;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
- sk_psock_stop(psock, true);
- sk_psock_put(sk, psock);
+ sk_psock_stop(psock);
release_sock(sk);
+ cancel_work_sync(&psock->work);
+ sk_psock_put(sk, psock);
saved_close(sk, timeout);
}
EXPORT_SYMBOL_GPL(sock_map_close);
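In sock_map_close(), sk_psock_stop() no longer takes a "wait" flag; instead the close path keeps its psock reference, drops the socket lock, runs cancel_work_sync(&psock->work) explicitly, and only then does the final sk_psock_put(). The effect visible in the hunk is that the deferred work is guaranteed to have finished before the reference that keeps the psock alive can be dropped. A generic sketch of that ordering, with pthreads standing in for the kernel workqueue and all names invented (object creation omitted):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct psock_like {
	pthread_t worker;
	_Atomic int stop;
};

static void *worker_fn(void *arg)
{
	struct psock_like *p = arg;

	while (!atomic_load_explicit(&p->stop, memory_order_acquire))
		;	/* process backlog; placeholder busy-wait */
	return NULL;
}

/* Stop the asynchronous worker and wait for it to finish *before*
 * releasing the object, so the worker can never touch freed memory.
 */
static void psock_close(struct psock_like *p)
{
	atomic_store_explicit(&p->stop, 1, memory_order_release);  /* sk_psock_stop()   */
	pthread_join(p->worker, NULL);                             /* cancel_work_sync() */
	free(p);                                                   /* final sk_psock_put() */
}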
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index 5daa1fa54249..fb90e1e00773 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -21,6 +21,22 @@ static DEFINE_IDA(reuseport_ida);
static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
struct sock_reuseport *reuse, bool bind_inany);
+void reuseport_has_conns_set(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (likely(reuse))
+ reuse->has_conns = 1;
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_has_conns_set);
+
static int reuseport_sock_index(struct sock *sk,
const struct sock_reuseport *reuse,
bool closed)
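reuseport_has_conns_set() moves the has_conns update under reuseport_lock: a cheap unlocked rcu_access_pointer() check bails out early when no group is attached, and otherwise the flag is written only after re-dereferencing the group pointer with the lock held, so a concurrent writer that replaces the group (which also runs under reuseport_lock) cannot lose the update. A compact sketch of that check/lock/recheck shape, with pthread primitives and hypothetical names:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct group {
	bool has_conns;
};

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;
static struct group * _Atomic current_group;

void group_set_has_conns(void)
{
	struct group *g;

	/* Fast path: nothing attached, nothing to mark. */
	if (!atomic_load_explicit(&current_group, memory_order_acquire))
		return;

	pthread_mutex_lock(&group_lock);
	/* Re-load under the lock: the group may have been replaced or
	 * detached since the unlocked check above.
	 */
	g = atomic_load_explicit(&current_group, memory_order_acquire);
	if (g)
		g->has_conns = true;
	pthread_mutex_unlock(&group_lock);
}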
diff --git a/net/core/stream.c b/net/core/stream.c
index ccc083cdef23..75fded8495f5 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -123,7 +123,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
- current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+ current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2;
add_wait_queue(sk_sleep(sk), &wait);
@@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- remove_wait_queue(sk_sleep(sk), &wait);
+ if (!sock_flag(sk, SOCK_DEAD))
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
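Two independent fixes in sk_stream_wait_memory(): the open-coded "prandom_u32() % (HZ / 5)" becomes prandom_u32_max(HZ / 5), which maps a full-range 32-bit random value into [0, bound) with a multiply and shift instead of a division; and remove_wait_queue() is skipped when the socket is marked SOCK_DEAD, since the wait queue sk_sleep() points at may no longer be safe to touch once the socket has been orphaned. The multiply-shift trick, shown standalone (rand32 is a placeholder for any 32-bit generator):

#include <stdint.h>

/* Scale a uniform 32-bit value into [0, bound) with one multiply and a
 * shift, avoiding the divide that the "% bound" form requires.
 */
static uint32_t bounded_rand(uint32_t rand32, uint32_t bound)
{
	return (uint32_t)(((uint64_t)rand32 * bound) >> 32);
}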
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 71a13596ea2b..5b1ce656baa1 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -29,7 +29,6 @@ static int int_3600 = 3600;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
-static long long_max __maybe_unused = LONG_MAX;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -234,14 +233,17 @@ static int set_default_qdisc(struct ctl_table *table, int write,
static int proc_do_dev_weight(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret;
+ static DEFINE_MUTEX(dev_weight_mutex);
+ int ret, weight;
+ mutex_lock(&dev_weight_mutex);
ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret != 0)
- return ret;
-
- dev_rx_weight = weight_p * dev_weight_rx_bias;
- dev_tx_weight = weight_p * dev_weight_tx_bias;
+ if (!ret && write) {
+ weight = READ_ONCE(weight_p);
+ WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias);
+ WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias);
+ }
+ mutex_unlock(&dev_weight_mutex);
return ret;
}
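proc_do_dev_weight() derives dev_rx_weight and dev_tx_weight from weight_p, so the handler now serializes concurrent sysctl writers with a local mutex, recomputes the derived values only on a successful write, and publishes them with WRITE_ONCE() for the lockless readers in the datapath. A userspace-flavoured sketch of "recompute derived tunables under a lock, publish with atomic stores", with invented names:

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t weight_lock = PTHREAD_MUTEX_INITIALIZER;

static _Atomic int weight    = 64;   /* the tunable itself            */
static _Atomic int rx_weight = 64;   /* derived: weight * rx_bias     */
static _Atomic int tx_weight = 64;   /* derived: weight * tx_bias     */

/* Update the base weight and its derived values as one logical step.
 * The mutex keeps two writers from interleaving; the atomic stores let
 * hot-path readers load the values without taking the lock.
 */
void set_weight(int new_weight, int rx_bias, int tx_bias)
{
	pthread_mutex_lock(&weight_lock);
	atomic_store_explicit(&weight, new_weight, memory_order_relaxed);
	atomic_store_explicit(&rx_weight, new_weight * rx_bias, memory_order_relaxed);
	atomic_store_explicit(&tx_weight, new_weight * tx_bias, memory_order_relaxed);
	pthread_mutex_unlock(&weight_lock);
}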
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 24420209bf0e..844c9d99dc0e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -375,19 +375,17 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
struct xdp_buff *xdp)
{
- struct xdp_mem_allocator *xa;
struct page *page;
switch (mem->type) {
case MEM_TYPE_PAGE_POOL:
- rcu_read_lock();
- /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
if (napi_direct && xdp_return_frame_no_direct())
napi_direct = false;
- page_pool_put_full_page(xa->page_pool, page, napi_direct);
- rcu_read_unlock();
+ /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
+ * as mem->type knows this is a page_pool page
+ */
+ page_pool_put_full_page(page->pp, page, napi_direct);
break;
case MEM_TYPE_PAGE_SHARED:
page_frag_free(data);