diff options
Diffstat (limited to 'net/core')
| -rw-r--r-- | net/core/Makefile | 2 | ||||
| -rw-r--r-- | net/core/dev.c | 105 | ||||
| -rw-r--r-- | net/core/devlink.c | 1923 | ||||
| -rw-r--r-- | net/core/dst.c | 6 | ||||
| -rw-r--r-- | net/core/ethtool.c | 262 | ||||
| -rw-r--r-- | net/core/filter.c | 640 | ||||
| -rw-r--r-- | net/core/flow_dissector.c | 92 | ||||
| -rw-r--r-- | net/core/flow_offload.c | 153 | ||||
| -rw-r--r-- | net/core/gen_stats.c | 2 | ||||
| -rw-r--r-- | net/core/lwt_bpf.c | 265 | ||||
| -rw-r--r-- | net/core/lwtunnel.c | 16 | ||||
| -rw-r--r-- | net/core/neighbour.c | 11 | ||||
| -rw-r--r-- | net/core/net-sysfs.c | 27 | ||||
| -rw-r--r-- | net/core/net-traces.c | 8 | ||||
| -rw-r--r-- | net/core/net_namespace.c | 38 | ||||
| -rw-r--r-- | net/core/page_pool.c | 22 | ||||
| -rw-r--r-- | net/core/pktgen.c | 3 | ||||
| -rw-r--r-- | net/core/rtnetlink.c | 128 | ||||
| -rw-r--r-- | net/core/scm.c | 27 | ||||
| -rw-r--r-- | net/core/skbuff.c | 4 | ||||
| -rw-r--r-- | net/core/skmsg.c | 28 | ||||
| -rw-r--r-- | net/core/sock.c | 255 | ||||
| -rw-r--r-- | net/core/sysctl_net_core.c | 18 | 
23 files changed, 3504 insertions, 531 deletions
diff --git a/net/core/Makefile b/net/core/Makefile index fccd31e0e7f7..f97d6254e564 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o  obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \  			neighbour.o rtnetlink.o utils.o link_watch.o filter.o \  			sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ -			fib_notifier.o xdp.o +			fib_notifier.o xdp.o flow_offload.o  obj-y += net-sysfs.o  obj-$(CONFIG_PAGE_POOL) += page_pool.o diff --git a/net/core/dev.c b/net/core/dev.c index 82f20022259d..2b67f2aa59dd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3421,7 +3421,7 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)  	/* To get more precise estimation of bytes sent on wire,  	 * we add to pkt_len the headers size of all segments  	 */ -	if (shinfo->gso_size)  { +	if (shinfo->gso_size && skb_transport_header_was_set(skb)) {  		unsigned int hdr_len;  		u16 gso_segs = shinfo->gso_segs; @@ -7878,6 +7878,63 @@ int dev_get_phys_port_name(struct net_device *dev,  EXPORT_SYMBOL(dev_get_phys_port_name);  /** + *	dev_get_port_parent_id - Get the device's port parent identifier + *	@dev: network device + *	@ppid: pointer to a storage for the port's parent identifier + *	@recurse: allow/disallow recursion to lower devices + * + *	Get the devices's port parent identifier + */ +int dev_get_port_parent_id(struct net_device *dev, +			   struct netdev_phys_item_id *ppid, +			   bool recurse) +{ +	const struct net_device_ops *ops = dev->netdev_ops; +	struct netdev_phys_item_id first = { }; +	struct net_device *lower_dev; +	struct list_head *iter; +	int err = -EOPNOTSUPP; + +	if (ops->ndo_get_port_parent_id) +		return ops->ndo_get_port_parent_id(dev, ppid); + +	if (!recurse) +		return err; + +	netdev_for_each_lower_dev(dev, lower_dev, iter) { +		err = dev_get_port_parent_id(lower_dev, ppid, recurse); +		if (err) +			break; +		if (!first.id_len) +			first = *ppid; +		else if (memcmp(&first, ppid, 
sizeof(*ppid))) +			return -ENODATA; +	} + +	return err; +} +EXPORT_SYMBOL(dev_get_port_parent_id); + +/** + *	netdev_port_same_parent_id - Indicate if two network devices have + *	the same port parent identifier + *	@a: first network device + *	@b: second network device + */ +bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) +{ +	struct netdev_phys_item_id a_id = { }; +	struct netdev_phys_item_id b_id = { }; + +	if (dev_get_port_parent_id(a, &a_id, true) || +	    dev_get_port_parent_id(b, &b_id, true)) +		return false; + +	return netdev_phys_item_id_same(&a_id, &b_id); +} +EXPORT_SYMBOL(netdev_port_same_parent_id); + +/**   *	dev_change_proto_down - update protocol port state information   *	@dev: device   *	@proto_down: new value @@ -7897,6 +7954,25 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)  }  EXPORT_SYMBOL(dev_change_proto_down); +/** + *	dev_change_proto_down_generic - generic implementation for + * 	ndo_change_proto_down that sets carrier according to + * 	proto_down. + * + *	@dev: device + *	@proto_down: new value + */ +int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) +{ +	if (proto_down) +		netif_carrier_off(dev); +	else +		netif_carrier_on(dev); +	dev->proto_down = proto_down; +	return 0; +} +EXPORT_SYMBOL(dev_change_proto_down_generic); +  u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,  		    enum bpf_netdev_command cmd)  { @@ -7976,35 +8052,41 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,  	enum bpf_netdev_command query;  	struct bpf_prog *prog = NULL;  	bpf_op_t bpf_op, bpf_chk; +	bool offload;  	int err;  	ASSERT_RTNL(); -	query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; +	offload = flags & XDP_FLAGS_HW_MODE; +	query = offload ? 
XDP_QUERY_PROG_HW : XDP_QUERY_PROG;  	bpf_op = bpf_chk = ops->ndo_bpf; -	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) +	if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) { +		NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode");  		return -EOPNOTSUPP; +	}  	if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))  		bpf_op = generic_xdp_install;  	if (bpf_op == bpf_chk)  		bpf_chk = generic_xdp_install;  	if (fd >= 0) { -		if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) || -		    __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW)) +		if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { +			NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");  			return -EEXIST; +		}  		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && -		    __dev_xdp_query(dev, bpf_op, query)) +		    __dev_xdp_query(dev, bpf_op, query)) { +			NL_SET_ERR_MSG(extack, "XDP program already attached");  			return -EBUSY; +		}  		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,  					     bpf_op == ops->ndo_bpf);  		if (IS_ERR(prog))  			return PTR_ERR(prog); -		if (!(flags & XDP_FLAGS_HW_MODE) && -		    bpf_prog_is_dev_bound(prog->aux)) { +		if (!offload && bpf_prog_is_dev_bound(prog->aux)) {  			NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");  			bpf_prog_put(prog);  			return -EINVAL; @@ -8152,7 +8234,7 @@ static netdev_features_t netdev_sync_upper_features(struct net_device *lower,  	netdev_features_t feature;  	int feature_bit; -	for_each_netdev_feature(&upper_disables, feature_bit) { +	for_each_netdev_feature(upper_disables, feature_bit) {  		feature = __NETIF_F_BIT(feature_bit);  		if (!(upper->wanted_features & feature)  		    && (features & feature)) { @@ -8172,7 +8254,7 @@ static void netdev_sync_lower_features(struct net_device *upper,  	netdev_features_t feature;  	int feature_bit; -	for_each_netdev_feature(&upper_disables, feature_bit) { +	
for_each_netdev_feature(upper_disables, feature_bit) {  		feature = __NETIF_F_BIT(feature_bit);  		if (!(features & feature) && (lower->features & feature)) {  			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", @@ -8712,6 +8794,9 @@ int init_dummy_netdev(struct net_device *dev)  	set_bit(__LINK_STATE_PRESENT, &dev->state);  	set_bit(__LINK_STATE_START, &dev->state); +	/* napi_busy_loop stats accounting wants this */ +	dev_net_set(dev, &init_net); +  	/* Note : We dont allocate pcpu_refcnt for dummy devices,  	 * because users of this 'device' dont need to change  	 * its refcount. diff --git a/net/core/devlink.c b/net/core/devlink.c index abb0da9d7b4b..78e22cea4cc7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -81,6 +81,7 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv6 = {  EXPORT_SYMBOL(devlink_dpipe_header_ipv6);  EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); +EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);  static LIST_HEAD(devlink_list); @@ -115,6 +116,8 @@ static struct devlink *devlink_get_from_attrs(struct net *net,  	busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);  	devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); +	lockdep_assert_held(&devlink_mutex); +  	list_for_each_entry(devlink, &devlink_list, list) {  		if (strcmp(devlink->dev->bus->name, busname) == 0 &&  		    strcmp(dev_name(devlink->dev), devname) == 0 && @@ -720,7 +723,7 @@ static int devlink_port_type_set(struct devlink *devlink,  {  	int err; -	if (devlink->ops && devlink->ops->port_type_set) { +	if (devlink->ops->port_type_set) {  		if (port_type == DEVLINK_PORT_TYPE_NOTSET)  			return -EINVAL;  		if (port_type == devlink_port->type) @@ -757,7 +760,7 @@ static int devlink_port_split(struct devlink *devlink, u32 port_index,  			      u32 count, struct netlink_ext_ack *extack)  { -	if (devlink->ops && devlink->ops->port_split) +	if (devlink->ops->port_split)  		return devlink->ops->port_split(devlink, port_index, count,  						extack);  	return -EOPNOTSUPP; 
@@ -783,7 +786,7 @@ static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,  				struct netlink_ext_ack *extack)  { -	if (devlink->ops && devlink->ops->port_unsplit) +	if (devlink->ops->port_unsplit)  		return devlink->ops->port_unsplit(devlink, port_index, extack);  	return -EOPNOTSUPP;  } @@ -932,6 +935,9 @@ static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,  	if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,  		       pool_info.threshold_type))  		goto nla_put_failure; +	if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE, +			pool_info.cell_size)) +		goto nla_put_failure;  	genlmsg_end(msg, hdr);  	return 0; @@ -955,7 +961,7 @@ static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb,  	if (err)  		return err; -	if (!devlink->ops || !devlink->ops->sb_pool_get) +	if (!devlink->ops->sb_pool_get)  		return -EOPNOTSUPP;  	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -1011,7 +1017,7 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,  	mutex_lock(&devlink_mutex);  	list_for_each_entry(devlink, &devlink_list, list) {  		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || -		    !devlink->ops || !devlink->ops->sb_pool_get) +		    !devlink->ops->sb_pool_get)  			continue;  		mutex_lock(&devlink->lock);  		list_for_each_entry(devlink_sb, &devlink->sb_list, list) { @@ -1040,7 +1046,7 @@ static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,  {  	const struct devlink_ops *ops = devlink->ops; -	if (ops && ops->sb_pool_set) +	if (ops->sb_pool_set)  		return ops->sb_pool_set(devlink, sb_index, pool_index,  					size, threshold_type);  	return -EOPNOTSUPP; @@ -1145,7 +1151,7 @@ static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,  	if (err)  		return err; -	if (!devlink->ops || !devlink->ops->sb_port_pool_get) +	if (!devlink->ops->sb_port_pool_get)  		return -EOPNOTSUPP;  	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -1207,7 +1213,7 @@ static 
int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,  	mutex_lock(&devlink_mutex);  	list_for_each_entry(devlink, &devlink_list, list) {  		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || -		    !devlink->ops || !devlink->ops->sb_port_pool_get) +		    !devlink->ops->sb_port_pool_get)  			continue;  		mutex_lock(&devlink->lock);  		list_for_each_entry(devlink_sb, &devlink->sb_list, list) { @@ -1236,7 +1242,7 @@ static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,  {  	const struct devlink_ops *ops = devlink_port->devlink->ops; -	if (ops && ops->sb_port_pool_set) +	if (ops->sb_port_pool_set)  		return ops->sb_port_pool_set(devlink_port, sb_index,  					     pool_index, threshold);  	return -EOPNOTSUPP; @@ -1349,7 +1355,7 @@ static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb,  	if (err)  		return err; -	if (!devlink->ops || !devlink->ops->sb_tc_pool_bind_get) +	if (!devlink->ops->sb_tc_pool_bind_get)  		return -EOPNOTSUPP;  	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); @@ -1433,7 +1439,7 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,  	mutex_lock(&devlink_mutex);  	list_for_each_entry(devlink, &devlink_list, list) {  		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || -		    !devlink->ops || !devlink->ops->sb_tc_pool_bind_get) +		    !devlink->ops->sb_tc_pool_bind_get)  			continue;  		mutex_lock(&devlink->lock); @@ -1465,7 +1471,7 @@ static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,  {  	const struct devlink_ops *ops = devlink_port->devlink->ops; -	if (ops && ops->sb_tc_pool_bind_set) +	if (ops->sb_tc_pool_bind_set)  		return ops->sb_tc_pool_bind_set(devlink_port, sb_index,  						tc_index, pool_type,  						pool_index, threshold); @@ -1513,7 +1519,7 @@ static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb,  	struct devlink_sb *devlink_sb = info->user_ptr[1];  	const struct devlink_ops *ops = devlink->ops; -	if (ops && ops->sb_occ_snapshot) +	if 
(ops->sb_occ_snapshot)  		return ops->sb_occ_snapshot(devlink, devlink_sb->index);  	return -EOPNOTSUPP;  } @@ -1525,7 +1531,7 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,  	struct devlink_sb *devlink_sb = info->user_ptr[1];  	const struct devlink_ops *ops = devlink->ops; -	if (ops && ops->sb_occ_max_clear) +	if (ops->sb_occ_max_clear)  		return ops->sb_occ_max_clear(devlink, devlink_sb->index);  	return -EOPNOTSUPP;  } @@ -1588,13 +1594,9 @@ static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,  					   struct genl_info *info)  {  	struct devlink *devlink = info->user_ptr[0]; -	const struct devlink_ops *ops = devlink->ops;  	struct sk_buff *msg;  	int err; -	if (!ops) -		return -EOPNOTSUPP; -  	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);  	if (!msg)  		return -ENOMEM; @@ -1619,9 +1621,6 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,  	int err = 0;  	u16 mode; -	if (!ops) -		return -EOPNOTSUPP; -  	if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {  		if (!ops->eswitch_mode_set)  			return -EOPNOTSUPP; @@ -2656,6 +2655,27 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)  	return devlink->ops->reload(devlink, info->extack);  } +static int devlink_nl_cmd_flash_update(struct sk_buff *skb, +				       struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; +	const char *file_name, *component; +	struct nlattr *nla_component; + +	if (!devlink->ops->flash_update) +		return -EOPNOTSUPP; + +	if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]) +		return -EINVAL; +	file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]); + +	nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; +	component = nla_component ? 
nla_data(nla_component) : NULL; + +	return devlink->ops->flash_update(devlink, file_name, component, +					  info->extack); +} +  static const struct devlink_param devlink_param_generic[] = {  	{  		.id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, @@ -2843,11 +2863,13 @@ nla_put_failure:  }  static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, +				 unsigned int port_index,  				 struct devlink_param_item *param_item,  				 enum devlink_command cmd,  				 u32 portid, u32 seq, int flags)  {  	union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; +	bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};  	const struct devlink_param *param = param_item->param;  	struct devlink_param_gset_ctx ctx;  	struct nlattr *param_values_list; @@ -2866,12 +2888,15 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,  				return -EOPNOTSUPP;  			param_value[i] = param_item->driverinit_value;  		} else { +			if (!param_item->published) +				continue;  			ctx.cmode = i;  			err = devlink_param_get(devlink, param, &ctx);  			if (err)  				return err;  			param_value[i] = ctx.val;  		} +		param_value_set[i] = true;  	}  	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -2880,6 +2905,13 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,  	if (devlink_nl_put_handle(msg, devlink))  		goto genlmsg_cancel; + +	if (cmd == DEVLINK_CMD_PORT_PARAM_GET || +	    cmd == DEVLINK_CMD_PORT_PARAM_NEW || +	    cmd == DEVLINK_CMD_PORT_PARAM_DEL) +		if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) +			goto genlmsg_cancel; +  	param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM);  	if (!param_attr)  		goto genlmsg_cancel; @@ -2899,7 +2931,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,  		goto param_nest_cancel;  	for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { -		if (!devlink_param_cmode_is_supported(param, i)) +		if (!param_value_set[i])  
			continue;  		err = devlink_nl_param_value_fill_one(msg, param->type,  						      i, param_value[i]); @@ -2922,18 +2954,22 @@ genlmsg_cancel:  }  static void devlink_param_notify(struct devlink *devlink, +				 unsigned int port_index,  				 struct devlink_param_item *param_item,  				 enum devlink_command cmd)  {  	struct sk_buff *msg;  	int err; -	WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL); +	WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && +		cmd != DEVLINK_CMD_PORT_PARAM_NEW && +		cmd != DEVLINK_CMD_PORT_PARAM_DEL);  	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);  	if (!msg)  		return; -	err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0); +	err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, +				    0, 0, 0);  	if (err) {  		nlmsg_free(msg);  		return; @@ -2962,7 +2998,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,  				idx++;  				continue;  			} -			err = devlink_nl_param_fill(msg, devlink, param_item, +			err = devlink_nl_param_fill(msg, devlink, 0, param_item,  						    DEVLINK_CMD_PARAM_GET,  						    NETLINK_CB(cb->skb).portid,  						    cb->nlh->nlmsg_seq, @@ -3051,7 +3087,7 @@ devlink_param_value_get_from_info(const struct devlink_param *param,  }  static struct devlink_param_item * -devlink_param_get_from_info(struct devlink *devlink, +devlink_param_get_from_info(struct list_head *param_list,  			    struct genl_info *info)  {  	char *param_name; @@ -3060,7 +3096,7 @@ devlink_param_get_from_info(struct devlink *devlink,  		return NULL;  	param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); -	return devlink_param_find_by_name(&devlink->param_list, param_name); +	return devlink_param_find_by_name(param_list, param_name);  }  static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, @@ -3071,7 +3107,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,  	struct sk_buff *msg;  	int err; -	param_item = 
devlink_param_get_from_info(devlink, info); +	param_item = devlink_param_get_from_info(&devlink->param_list, info);  	if (!param_item)  		return -EINVAL; @@ -3079,7 +3115,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,  	if (!msg)  		return -ENOMEM; -	err = devlink_nl_param_fill(msg, devlink, param_item, +	err = devlink_nl_param_fill(msg, devlink, 0, param_item,  				    DEVLINK_CMD_PARAM_GET,  				    info->snd_portid, info->snd_seq, 0);  	if (err) { @@ -3090,10 +3126,12 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,  	return genlmsg_reply(msg, info);  } -static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, -					 struct genl_info *info) +static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, +					   unsigned int port_index, +					   struct list_head *param_list, +					   struct genl_info *info, +					   enum devlink_command cmd)  { -	struct devlink *devlink = info->user_ptr[0];  	enum devlink_param_type param_type;  	struct devlink_param_gset_ctx ctx;  	enum devlink_param_cmode cmode; @@ -3102,7 +3140,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,  	union devlink_param_value value;  	int err = 0; -	param_item = devlink_param_get_from_info(devlink, info); +	param_item = devlink_param_get_from_info(param_list, info);  	if (!param_item)  		return -EINVAL;  	param = param_item->param; @@ -3142,17 +3180,28 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,  			return err;  	} -	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); +	devlink_param_notify(devlink, port_index, param_item, cmd);  	return 0;  } +static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, +					 struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; + +	return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->param_list, +					       info, DEVLINK_CMD_PARAM_NEW); +} +  static int devlink_param_register_one(struct devlink *devlink, -				      const struct 
devlink_param *param) +				      unsigned int port_index, +				      struct list_head *param_list, +				      const struct devlink_param *param, +				      enum devlink_command cmd)  {  	struct devlink_param_item *param_item; -	if (devlink_param_find_by_name(&devlink->param_list, -				       param->name)) +	if (devlink_param_find_by_name(param_list, param->name))  		return -EEXIST;  	if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) @@ -3165,24 +3214,111 @@ static int devlink_param_register_one(struct devlink *devlink,  		return -ENOMEM;  	param_item->param = param; -	list_add_tail(&param_item->list, &devlink->param_list); -	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); +	list_add_tail(&param_item->list, param_list); +	devlink_param_notify(devlink, port_index, param_item, cmd);  	return 0;  }  static void devlink_param_unregister_one(struct devlink *devlink, -					 const struct devlink_param *param) +					 unsigned int port_index, +					 struct list_head *param_list, +					 const struct devlink_param *param, +					 enum devlink_command cmd)  {  	struct devlink_param_item *param_item; -	param_item = devlink_param_find_by_name(&devlink->param_list, -						param->name); +	param_item = devlink_param_find_by_name(param_list, param->name);  	WARN_ON(!param_item); -	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL); +	devlink_param_notify(devlink, port_index, param_item, cmd);  	list_del(&param_item->list);  	kfree(param_item);  } +static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, +						struct netlink_callback *cb) +{ +	struct devlink_param_item *param_item; +	struct devlink_port *devlink_port; +	struct devlink *devlink; +	int start = cb->args[0]; +	int idx = 0; +	int err; + +	mutex_lock(&devlink_mutex); +	list_for_each_entry(devlink, &devlink_list, list) { +		if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) +			continue; +		mutex_lock(&devlink->lock); +		list_for_each_entry(devlink_port, 
&devlink->port_list, list) { +			list_for_each_entry(param_item, +					    &devlink_port->param_list, list) { +				if (idx < start) { +					idx++; +					continue; +				} +				err = devlink_nl_param_fill(msg, +						devlink_port->devlink, +						devlink_port->index, param_item, +						DEVLINK_CMD_PORT_PARAM_GET, +						NETLINK_CB(cb->skb).portid, +						cb->nlh->nlmsg_seq, +						NLM_F_MULTI); +				if (err) { +					mutex_unlock(&devlink->lock); +					goto out; +				} +				idx++; +			} +		} +		mutex_unlock(&devlink->lock); +	} +out: +	mutex_unlock(&devlink_mutex); + +	cb->args[0] = idx; +	return msg->len; +} + +static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, +					      struct genl_info *info) +{ +	struct devlink_port *devlink_port = info->user_ptr[0]; +	struct devlink_param_item *param_item; +	struct sk_buff *msg; +	int err; + +	param_item = devlink_param_get_from_info(&devlink_port->param_list, +						 info); +	if (!param_item) +		return -EINVAL; + +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (!msg) +		return -ENOMEM; + +	err = devlink_nl_param_fill(msg, devlink_port->devlink, +				    devlink_port->index, param_item, +				    DEVLINK_CMD_PORT_PARAM_GET, +				    info->snd_portid, info->snd_seq, 0); +	if (err) { +		nlmsg_free(msg); +		return err; +	} + +	return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, +					      struct genl_info *info) +{ +	struct devlink_port *devlink_port = info->user_ptr[0]; + +	return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, +					       devlink_port->index, +					       &devlink_port->param_list, info, +					       DEVLINK_CMD_PORT_PARAM_NEW); +} +  static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,  					     struct devlink *devlink,  					     struct devlink_snapshot *snapshot) @@ -3504,44 +3640,56 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,  					     struct netlink_callback *cb)  {  	u64 ret_offset, 
start_offset, end_offset = 0; -	struct nlattr *attrs[DEVLINK_ATTR_MAX + 1];  	const struct genl_ops *ops = cb->data;  	struct devlink_region *region;  	struct nlattr *chunks_attr;  	const char *region_name;  	struct devlink *devlink; +	struct nlattr **attrs;  	bool dump = true;  	void *hdr;  	int err;  	start_offset = *((u64 *)&cb->args[0]); +	attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL); +	if (!attrs) +		return -ENOMEM; +  	err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize,  			  attrs, DEVLINK_ATTR_MAX, ops->policy, cb->extack);  	if (err) -		goto out; +		goto out_free; +	mutex_lock(&devlink_mutex);  	devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs); -	if (IS_ERR(devlink)) -		goto out; +	if (IS_ERR(devlink)) { +		err = PTR_ERR(devlink); +		goto out_dev; +	} -	mutex_lock(&devlink_mutex);  	mutex_lock(&devlink->lock);  	if (!attrs[DEVLINK_ATTR_REGION_NAME] || -	    !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) +	    !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { +		err = -EINVAL;  		goto out_unlock; +	}  	region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);  	region = devlink_region_get_by_name(devlink, region_name); -	if (!region) +	if (!region) { +		err = -EINVAL;  		goto out_unlock; +	}  	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,  			  &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,  			  DEVLINK_CMD_REGION_READ); -	if (!hdr) +	if (!hdr) { +		err = -EMSGSIZE;  		goto out_unlock; +	}  	err = devlink_nl_put_handle(skb, devlink);  	if (err) @@ -3552,8 +3700,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,  		goto nla_put_failure;  	chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS); -	if (!chunks_attr) +	if (!chunks_attr) { +		err = -EMSGSIZE;  		goto nla_put_failure; +	}  	if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&  	    attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { @@ -3576,8 +3726,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,  		
goto nla_put_failure;  	/* Check if there was any progress done to prevent infinite loop */ -	if (ret_offset == start_offset) +	if (ret_offset == start_offset) { +		err = -EINVAL;  		goto nla_put_failure; +	}  	*((u64 *)&cb->args[0]) = ret_offset; @@ -3585,6 +3737,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,  	genlmsg_end(skb, hdr);  	mutex_unlock(&devlink->lock);  	mutex_unlock(&devlink_mutex); +	kfree(attrs);  	return skb->len; @@ -3592,8 +3745,1144 @@ nla_put_failure:  	genlmsg_cancel(skb, hdr);  out_unlock:  	mutex_unlock(&devlink->lock); +out_dev: +	mutex_unlock(&devlink_mutex); +out_free: +	kfree(attrs); +	return err; +} + +struct devlink_info_req { +	struct sk_buff *msg; +}; + +int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) +{ +	return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); +} +EXPORT_SYMBOL_GPL(devlink_info_driver_name_put); + +int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) +{ +	return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); +} +EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); + +static int devlink_info_version_put(struct devlink_info_req *req, int attr, +				    const char *version_name, +				    const char *version_value) +{ +	struct nlattr *nest; +	int err; + +	nest = nla_nest_start(req->msg, attr); +	if (!nest) +		return -EMSGSIZE; + +	err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, +			     version_name); +	if (err) +		goto nla_put_failure; + +	err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, +			     version_value); +	if (err) +		goto nla_put_failure; + +	nla_nest_end(req->msg, nest); + +	return 0; + +nla_put_failure: +	nla_nest_cancel(req->msg, nest); +	return err; +} + +int devlink_info_version_fixed_put(struct devlink_info_req *req, +				   const char *version_name, +				   const char *version_value) +{ +	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, +				
	version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); + +int devlink_info_version_stored_put(struct devlink_info_req *req, +				    const char *version_name, +				    const char *version_value) +{ +	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, +					version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); + +int devlink_info_version_running_put(struct devlink_info_req *req, +				     const char *version_name, +				     const char *version_value) +{ +	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, +					version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put); + +static int +devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, +		     enum devlink_command cmd, u32 portid, +		     u32 seq, int flags, struct netlink_ext_ack *extack) +{ +	struct devlink_info_req req; +	void *hdr; +	int err; + +	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); +	if (!hdr) +		return -EMSGSIZE; + +	err = -EMSGSIZE; +	if (devlink_nl_put_handle(msg, devlink)) +		goto err_cancel_msg; + +	req.msg = msg; +	err = devlink->ops->info_get(devlink, &req, extack); +	if (err) +		goto err_cancel_msg; + +	genlmsg_end(msg, hdr); +	return 0; + +err_cancel_msg: +	genlmsg_cancel(msg, hdr); +	return err; +} + +static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, +					struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; +	struct sk_buff *msg; +	int err; + +	if (!devlink->ops->info_get) +		return -EOPNOTSUPP; + +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (!msg) +		return -ENOMEM; + +	err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, +				   info->snd_portid, info->snd_seq, 0, +				   info->extack); +	if (err) { +		nlmsg_free(msg); +		return err; +	} + +	return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, +					  struct 
netlink_callback *cb) +{ +	struct devlink *devlink; +	int start = cb->args[0]; +	int idx = 0; +	int err; + +	mutex_lock(&devlink_mutex); +	list_for_each_entry(devlink, &devlink_list, list) { +		if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) +			continue; +		if (idx < start) { +			idx++; +			continue; +		} + +		mutex_lock(&devlink->lock); +		err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, +					   NETLINK_CB(cb->skb).portid, +					   cb->nlh->nlmsg_seq, NLM_F_MULTI, +					   cb->extack); +		mutex_unlock(&devlink->lock); +		if (err) +			break; +		idx++; +	} +	mutex_unlock(&devlink_mutex); + +	cb->args[0] = idx; +	return msg->len; +} + +struct devlink_fmsg_item { +	struct list_head list; +	int attrtype; +	u8 nla_type; +	u16 len; +	int value[0]; +}; + +struct devlink_fmsg { +	struct list_head item_list; +}; + +static struct devlink_fmsg *devlink_fmsg_alloc(void) +{ +	struct devlink_fmsg *fmsg; + +	fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); +	if (!fmsg) +		return NULL; + +	INIT_LIST_HEAD(&fmsg->item_list); + +	return fmsg; +} + +static void devlink_fmsg_free(struct devlink_fmsg *fmsg) +{ +	struct devlink_fmsg_item *item, *tmp; + +	list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) { +		list_del(&item->list); +		kfree(item); +	} +	kfree(fmsg); +} + +static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, +				    int attrtype) +{ +	struct devlink_fmsg_item *item; + +	item = kzalloc(sizeof(*item), GFP_KERNEL); +	if (!item) +		return -ENOMEM; + +	item->attrtype = attrtype; +	list_add_tail(&item->list, &fmsg->item_list); + +	return 0; +} + +int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) +{ +	return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); + +static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) +{ +	return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); +} + +int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) +{ +	return 
devlink_fmsg_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); + +#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) + +static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) +{ +	struct devlink_fmsg_item *item; + +	if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) +		return -EMSGSIZE; + +	item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); +	if (!item) +		return -ENOMEM; + +	item->nla_type = NLA_NUL_STRING; +	item->len = strlen(name) + 1; +	item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; +	memcpy(&item->value, name, item->len); +	list_add_tail(&item->list, &fmsg->item_list); + +	return 0; +} + +int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) +{ +	int err; + +	err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); +	if (err) +		return err; + +	err = devlink_fmsg_put_name(fmsg, name); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); + +int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) +{ +	return devlink_fmsg_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); + +int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, +				     const char *name) +{ +	int err; + +	err = devlink_fmsg_pair_nest_start(fmsg, name); +	if (err) +		return err; + +	err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); + +int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) +{ +	int err; + +	err = devlink_fmsg_nest_end(fmsg); +	if (err) +		return err; + +	err = devlink_fmsg_nest_end(fmsg); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); + +static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, +				  const void *value, u16 value_len, +				  u8 value_nla_type) +{ +	struct devlink_fmsg_item *item; + +	if (value_len > 
DEVLINK_FMSG_MAX_SIZE) +		return -EMSGSIZE; + +	item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); +	if (!item) +		return -ENOMEM; + +	item->nla_type = value_nla_type; +	item->len = value_len; +	item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; +	memcpy(&item->value, value, item->len); +	list_add_tail(&item->list, &fmsg->item_list); + +	return 0; +} + +int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) +{ +	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put); + +int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) +{ +	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put); + +int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) +{ +	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); + +int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) +{ +	return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put); + +int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) +{ +	return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, +				      NLA_NUL_STRING); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); + +int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, +			    u16 value_len) +{ +	return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); + +int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, +			       bool value) +{ +	int err; + +	err = devlink_fmsg_pair_nest_start(fmsg, name); +	if (err) +		return err; + +	err = devlink_fmsg_bool_put(fmsg, value); +	if (err) +		return err; + +	err = devlink_fmsg_pair_nest_end(fmsg); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); + +int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, 
const char *name, +			     u8 value) +{ +	int err; + +	err = devlink_fmsg_pair_nest_start(fmsg, name); +	if (err) +		return err; + +	err = devlink_fmsg_u8_put(fmsg, value); +	if (err) +		return err; + +	err = devlink_fmsg_pair_nest_end(fmsg); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); + +int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, +			      u32 value) +{ +	int err; + +	err = devlink_fmsg_pair_nest_start(fmsg, name); +	if (err) +		return err; + +	err = devlink_fmsg_u32_put(fmsg, value); +	if (err) +		return err; + +	err = devlink_fmsg_pair_nest_end(fmsg); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); + +int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, +			      u64 value) +{ +	int err; + +	err = devlink_fmsg_pair_nest_start(fmsg, name); +	if (err) +		return err; + +	err = devlink_fmsg_u64_put(fmsg, value); +	if (err) +		return err; + +	err = devlink_fmsg_pair_nest_end(fmsg); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); + +int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, +				 const char *value) +{ +	int err; + +	err = devlink_fmsg_pair_nest_start(fmsg, name); +	if (err) +		return err; + +	err = devlink_fmsg_string_put(fmsg, value); +	if (err) +		return err; + +	err = devlink_fmsg_pair_nest_end(fmsg); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); + +int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, +				 const void *value, u16 value_len) +{ +	int err; + +	err = devlink_fmsg_pair_nest_start(fmsg, name); +	if (err) +		return err; + +	err = devlink_fmsg_binary_put(fmsg, value, value_len); +	if (err) +		return err; + +	err = devlink_fmsg_pair_nest_end(fmsg); +	if (err) +		return err; + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); + +static int 
+devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) +{ +	switch (msg->nla_type) { +	case NLA_FLAG: +	case NLA_U8: +	case NLA_U32: +	case NLA_U64: +	case NLA_NUL_STRING: +	case NLA_BINARY: +		return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, +				  msg->nla_type); +	default: +		return -EINVAL; +	} +} + +static int +devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) +{ +	int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; +	u8 tmp; + +	switch (msg->nla_type) { +	case NLA_FLAG: +		/* Always provide flag data, regardless of its value */ +		tmp = *(bool *) msg->value; + +		return nla_put_u8(skb, attrtype, tmp); +	case NLA_U8: +		return nla_put_u8(skb, attrtype, *(u8 *) msg->value); +	case NLA_U32: +		return nla_put_u32(skb, attrtype, *(u32 *) msg->value); +	case NLA_U64: +		return nla_put_u64_64bit(skb, attrtype, *(u64 *) msg->value, +					 DEVLINK_ATTR_PAD); +	case NLA_NUL_STRING: +		return nla_put_string(skb, attrtype, (char *) &msg->value); +	case NLA_BINARY: +		return nla_put(skb, attrtype, msg->len, (void *) &msg->value); +	default: +		return -EINVAL; +	} +} + +static int +devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, +			 int *start) +{ +	struct devlink_fmsg_item *item; +	struct nlattr *fmsg_nlattr; +	int i = 0; +	int err; + +	fmsg_nlattr = nla_nest_start(skb, DEVLINK_ATTR_FMSG); +	if (!fmsg_nlattr) +		return -EMSGSIZE; + +	list_for_each_entry(item, &fmsg->item_list, list) { +		if (i < *start) { +			i++; +			continue; +		} + +		switch (item->attrtype) { +		case DEVLINK_ATTR_FMSG_OBJ_NEST_START: +		case DEVLINK_ATTR_FMSG_PAIR_NEST_START: +		case DEVLINK_ATTR_FMSG_ARR_NEST_START: +		case DEVLINK_ATTR_FMSG_NEST_END: +			err = nla_put_flag(skb, item->attrtype); +			break; +		case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: +			err = devlink_fmsg_item_fill_type(item, skb); +			if (err) +				break; +			err = devlink_fmsg_item_fill_data(item, skb); +			break; +		case 
DEVLINK_ATTR_FMSG_OBJ_NAME: +			err = nla_put_string(skb, item->attrtype, +					     (char *) &item->value); +			break; +		default: +			err = -EINVAL; +			break; +		} +		if (!err) +			*start = ++i; +		else +			break; +	} + +	nla_nest_end(skb, fmsg_nlattr); +	return err; +} + +static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, +			    struct genl_info *info, +			    enum devlink_command cmd, int flags) +{ +	struct nlmsghdr *nlh; +	struct sk_buff *skb; +	bool last = false; +	int index = 0; +	void *hdr; +	int err; + +	while (!last) { +		int tmp_index = index; + +		skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); +		if (!skb) +			return -ENOMEM; + +		hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, +				  &devlink_nl_family, flags | NLM_F_MULTI, cmd); +		if (!hdr) { +			err = -EMSGSIZE; +			goto nla_put_failure; +		} + +		err = devlink_fmsg_prepare_skb(fmsg, skb, &index); +		if (!err) +			last = true; +		else if (err != -EMSGSIZE || tmp_index == index) +			goto nla_put_failure; + +		genlmsg_end(skb, hdr); +		err = genlmsg_reply(skb, info); +		if (err) +			return err; +	} + +	skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (!skb) +		return -ENOMEM; +	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, +			NLMSG_DONE, 0, flags | NLM_F_MULTI); +	if (!nlh) { +		err = -EMSGSIZE; +		goto nla_put_failure; +	} + +	return genlmsg_reply(skb, info); + +nla_put_failure: +	nlmsg_free(skb); +	return err; +} + +struct devlink_health_reporter { +	struct list_head list; +	void *priv; +	const struct devlink_health_reporter_ops *ops; +	struct devlink *devlink; +	struct devlink_fmsg *dump_fmsg; +	struct mutex dump_lock; /* lock parallel read/write from dump buffers */ +	u64 graceful_period; +	bool auto_recover; +	u8 health_state; +	u64 dump_ts; +	u64 error_count; +	u64 recovery_count; +	u64 last_recovery_ts; +}; + +void * +devlink_health_reporter_priv(struct devlink_health_reporter *reporter) +{ +	return reporter->priv; +} 
+EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); + +static struct devlink_health_reporter * +devlink_health_reporter_find_by_name(struct devlink *devlink, +				     const char *reporter_name) +{ +	struct devlink_health_reporter *reporter; + +	list_for_each_entry(reporter, &devlink->reporter_list, list) +		if (!strcmp(reporter->ops->name, reporter_name)) +			return reporter; +	return NULL; +} + +/** + *	devlink_health_reporter_create - create devlink health reporter + * + *	@devlink: devlink + *	@ops: ops + *	@graceful_period: to avoid recovery loops, in msecs + *	@auto_recover: auto recover when error occurs + *	@priv: priv + */ +struct devlink_health_reporter * +devlink_health_reporter_create(struct devlink *devlink, +			       const struct devlink_health_reporter_ops *ops, +			       u64 graceful_period, bool auto_recover, +			       void *priv) +{ +	struct devlink_health_reporter *reporter; + +	mutex_lock(&devlink->lock); +	if (devlink_health_reporter_find_by_name(devlink, ops->name)) { +		reporter = ERR_PTR(-EEXIST); +		goto unlock; +	} + +	if (WARN_ON(auto_recover && !ops->recover) || +	    WARN_ON(graceful_period && !ops->recover)) { +		reporter = ERR_PTR(-EINVAL); +		goto unlock; +	} + +	reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); +	if (!reporter) { +		reporter = ERR_PTR(-ENOMEM); +		goto unlock; +	} + +	reporter->priv = priv; +	reporter->ops = ops; +	reporter->devlink = devlink; +	reporter->graceful_period = graceful_period; +	reporter->auto_recover = auto_recover; +	mutex_init(&reporter->dump_lock); +	list_add_tail(&reporter->list, &devlink->reporter_list); +unlock: +	mutex_unlock(&devlink->lock); +	return reporter; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_create); + +/** + *	devlink_health_reporter_destroy - destroy devlink health reporter + * + *	@reporter: devlink health reporter to destroy + */ +void +devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) +{ +	mutex_lock(&reporter->devlink->lock); +	
list_del(&reporter->list); +	mutex_unlock(&reporter->devlink->lock); +	if (reporter->dump_fmsg) +		devlink_fmsg_free(reporter->dump_fmsg); +	kfree(reporter); +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); + +void +devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, +				     enum devlink_health_reporter_state state) +{ +	if (WARN_ON(state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY && +		    state != DEVLINK_HEALTH_REPORTER_STATE_ERROR)) +		return; + +	if (reporter->health_state == state) +		return; + +	reporter->health_state = state; +	trace_devlink_health_reporter_state_update(reporter->devlink, +						   reporter->ops->name, state); +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update); + +static int +devlink_health_reporter_recover(struct devlink_health_reporter *reporter, +				void *priv_ctx) +{ +	int err; + +	if (!reporter->ops->recover) +		return -EOPNOTSUPP; + +	err = reporter->ops->recover(reporter, priv_ctx); +	if (err) +		return err; + +	reporter->recovery_count++; +	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; +	reporter->last_recovery_ts = jiffies; + +	return 0; +} + +static void +devlink_health_dump_clear(struct devlink_health_reporter *reporter) +{ +	if (!reporter->dump_fmsg) +		return; +	devlink_fmsg_free(reporter->dump_fmsg); +	reporter->dump_fmsg = NULL; +} + +static int devlink_health_do_dump(struct devlink_health_reporter *reporter, +				  void *priv_ctx) +{ +	int err; + +	if (!reporter->ops->dump) +		return 0; + +	if (reporter->dump_fmsg) +		return 0; + +	reporter->dump_fmsg = devlink_fmsg_alloc(); +	if (!reporter->dump_fmsg) { +		err = -ENOMEM; +		return err; +	} + +	err = devlink_fmsg_obj_nest_start(reporter->dump_fmsg); +	if (err) +		goto dump_err; + +	err = reporter->ops->dump(reporter, reporter->dump_fmsg, +				  priv_ctx); +	if (err) +		goto dump_err; + +	err = devlink_fmsg_obj_nest_end(reporter->dump_fmsg); +	if (err) +		goto dump_err; + +	reporter->dump_ts = jiffies; + +	return 
0; + +dump_err: +	devlink_health_dump_clear(reporter); +	return err; +} + +int devlink_health_report(struct devlink_health_reporter *reporter, +			  const char *msg, void *priv_ctx) +{ +	enum devlink_health_reporter_state prev_health_state; +	struct devlink *devlink = reporter->devlink; + +	/* write a log message of the current error */ +	WARN_ON(!msg); +	trace_devlink_health_report(devlink, reporter->ops->name, msg); +	reporter->error_count++; +	prev_health_state = reporter->health_state; +	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; + +	/* abort if the previous error wasn't recovered */ +	if (reporter->auto_recover && +	    (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || +	     jiffies - reporter->last_recovery_ts < +	     msecs_to_jiffies(reporter->graceful_period))) { +		trace_devlink_health_recover_aborted(devlink, +						     reporter->ops->name, +						     reporter->health_state, +						     jiffies - +						     reporter->last_recovery_ts); +		return -ECANCELED; +	} + +	reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; + +	mutex_lock(&reporter->dump_lock); +	/* store current dump of current error, for later analysis */ +	devlink_health_do_dump(reporter, priv_ctx); +	mutex_unlock(&reporter->dump_lock); + +	if (reporter->auto_recover) +		return devlink_health_reporter_recover(reporter, priv_ctx); + +	return 0; +} +EXPORT_SYMBOL_GPL(devlink_health_report); + +static struct devlink_health_reporter * +devlink_health_reporter_get_from_info(struct devlink *devlink, +				      struct genl_info *info) +{ +	char *reporter_name; + +	if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) +		return NULL; + +	reporter_name = +		nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); +	return devlink_health_reporter_find_by_name(devlink, reporter_name); +} + +static int +devlink_nl_health_reporter_fill(struct sk_buff *msg, +				struct devlink *devlink, +				struct devlink_health_reporter *reporter, +				enum devlink_command 
cmd, u32 portid, +				u32 seq, int flags) +{ +	struct nlattr *reporter_attr; +	void *hdr; + +	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); +	if (!hdr) +		return -EMSGSIZE; + +	if (devlink_nl_put_handle(msg, devlink)) +		goto genlmsg_cancel; + +	reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER); +	if (!reporter_attr) +		goto genlmsg_cancel; +	if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, +			   reporter->ops->name)) +		goto reporter_nest_cancel; +	if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, +		       reporter->health_state)) +		goto reporter_nest_cancel; +	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, +			      reporter->error_count, DEVLINK_ATTR_PAD)) +		goto reporter_nest_cancel; +	if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, +			      reporter->recovery_count, DEVLINK_ATTR_PAD)) +		goto reporter_nest_cancel; +	if (reporter->ops->recover && +	    nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, +			      reporter->graceful_period, +			      DEVLINK_ATTR_PAD)) +		goto reporter_nest_cancel; +	if (reporter->ops->recover && +	    nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, +		       reporter->auto_recover)) +		goto reporter_nest_cancel; +	if (reporter->dump_fmsg && +	    nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, +			      jiffies_to_msecs(reporter->dump_ts), +			      DEVLINK_ATTR_PAD)) +		goto reporter_nest_cancel; + +	nla_nest_end(msg, reporter_attr); +	genlmsg_end(msg, hdr); +	return 0; + +reporter_nest_cancel: +	nla_nest_end(msg, reporter_attr); +genlmsg_cancel: +	genlmsg_cancel(msg, hdr); +	return -EMSGSIZE; +} + +static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, +						   struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; +	struct devlink_health_reporter *reporter; +	struct sk_buff *msg; +	int err; + +	reporter = 
devlink_health_reporter_get_from_info(devlink, info); +	if (!reporter) +		return -EINVAL; + +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (!msg) +		return -ENOMEM; + +	err = devlink_nl_health_reporter_fill(msg, devlink, reporter, +					      DEVLINK_CMD_HEALTH_REPORTER_GET, +					      info->snd_portid, info->snd_seq, +					      0); +	if (err) { +		nlmsg_free(msg); +		return err; +	} + +	return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, +					  struct netlink_callback *cb) +{ +	struct devlink_health_reporter *reporter; +	struct devlink *devlink; +	int start = cb->args[0]; +	int idx = 0; +	int err; + +	mutex_lock(&devlink_mutex); +	list_for_each_entry(devlink, &devlink_list, list) { +		if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) +			continue; +		mutex_lock(&devlink->lock); +		list_for_each_entry(reporter, &devlink->reporter_list, +				    list) { +			if (idx < start) { +				idx++; +				continue; +			} +			err = devlink_nl_health_reporter_fill(msg, devlink, +							      reporter, +							      DEVLINK_CMD_HEALTH_REPORTER_GET, +							      NETLINK_CB(cb->skb).portid, +							      cb->nlh->nlmsg_seq, +							      NLM_F_MULTI); +			if (err) { +				mutex_unlock(&devlink->lock); +				goto out; +			} +			idx++; +		} +		mutex_unlock(&devlink->lock); +	} +out:  	mutex_unlock(&devlink_mutex); + +	cb->args[0] = idx; +	return msg->len; +} + +static int +devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, +					struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; +	struct devlink_health_reporter *reporter; + +	reporter = devlink_health_reporter_get_from_info(devlink, info); +	if (!reporter) +		return -EINVAL; + +	if (!reporter->ops->recover && +	    (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || +	     info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) +		return -EOPNOTSUPP; + +	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) 
+		reporter->graceful_period = +			nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + +	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) +		reporter->auto_recover = +			nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + +	return 0; +} + +static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, +						       struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; +	struct devlink_health_reporter *reporter; + +	reporter = devlink_health_reporter_get_from_info(devlink, info); +	if (!reporter) +		return -EINVAL; + +	return devlink_health_reporter_recover(reporter, NULL); +} + +static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, +							struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; +	struct devlink_health_reporter *reporter; +	struct devlink_fmsg *fmsg; +	int err; + +	reporter = devlink_health_reporter_get_from_info(devlink, info); +	if (!reporter) +		return -EINVAL; + +	if (!reporter->ops->diagnose) +		return -EOPNOTSUPP; + +	fmsg = devlink_fmsg_alloc(); +	if (!fmsg) +		return -ENOMEM; + +	err = devlink_fmsg_obj_nest_start(fmsg); +	if (err) +		goto out; + +	err = reporter->ops->diagnose(reporter, fmsg); +	if (err) +		goto out; + +	err = devlink_fmsg_obj_nest_end(fmsg); +	if (err) +		goto out; + +	err = devlink_fmsg_snd(fmsg, info, +			       DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); + +out: +	devlink_fmsg_free(fmsg); +	return err; +} + +static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, +							struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; +	struct devlink_health_reporter *reporter; +	int err; + +	reporter = devlink_health_reporter_get_from_info(devlink, info); +	if (!reporter) +		return -EINVAL; + +	if (!reporter->ops->dump) +		return -EOPNOTSUPP; + +	mutex_lock(&reporter->dump_lock); +	err = devlink_health_do_dump(reporter, NULL); +	if (err) +		goto out; + +	err = 
devlink_fmsg_snd(reporter->dump_fmsg, info, +			       DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0); +  out: +	mutex_unlock(&reporter->dump_lock); +	return err; +} + +static int +devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, +					       struct genl_info *info) +{ +	struct devlink *devlink = info->user_ptr[0]; +	struct devlink_health_reporter *reporter; + +	reporter = devlink_health_reporter_get_from_info(devlink, info); +	if (!reporter) +		return -EINVAL; + +	if (!reporter->ops->dump) +		return -EOPNOTSUPP; + +	mutex_lock(&reporter->dump_lock); +	devlink_health_dump_clear(reporter); +	mutex_unlock(&reporter->dump_lock);  	return 0;  } @@ -3622,6 +4911,11 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {  	[DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },  	[DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },  	[DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, +	[DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, +	[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, +	[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, +	[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, +	[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING },  };  static const struct genl_ops devlink_nl_ops[] = { @@ -3821,6 +5115,21 @@ static const struct genl_ops devlink_nl_ops[] = {  		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,  	},  	{ +		.cmd = DEVLINK_CMD_PORT_PARAM_GET, +		.doit = devlink_nl_cmd_port_param_get_doit, +		.dumpit = devlink_nl_cmd_port_param_get_dumpit, +		.policy = devlink_nl_policy, +		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT, +		/* can be retrieved by unprivileged users */ +	}, +	{ +		.cmd = DEVLINK_CMD_PORT_PARAM_SET, +		.doit = devlink_nl_cmd_port_param_set_doit, +		.policy = devlink_nl_policy, +		.flags = GENL_ADMIN_PERM, +		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT, +	}, +	{  		.cmd = DEVLINK_CMD_REGION_GET,  		.doit = 
devlink_nl_cmd_region_get_doit,  		.dumpit = devlink_nl_cmd_region_get_dumpit, @@ -3842,6 +5151,66 @@ static const struct genl_ops devlink_nl_ops[] = {  		.flags = GENL_ADMIN_PERM,  		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,  	}, +	{ +		.cmd = DEVLINK_CMD_INFO_GET, +		.doit = devlink_nl_cmd_info_get_doit, +		.dumpit = devlink_nl_cmd_info_get_dumpit, +		.policy = devlink_nl_policy, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, +		/* can be retrieved by unprivileged users */ +	}, +	{ +		.cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, +		.doit = devlink_nl_cmd_health_reporter_get_doit, +		.dumpit = devlink_nl_cmd_health_reporter_get_dumpit, +		.policy = devlink_nl_policy, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, +		/* can be retrieved by unprivileged users */ +	}, +	{ +		.cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, +		.doit = devlink_nl_cmd_health_reporter_set_doit, +		.policy = devlink_nl_policy, +		.flags = GENL_ADMIN_PERM, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, +	}, +	{ +		.cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, +		.doit = devlink_nl_cmd_health_reporter_recover_doit, +		.policy = devlink_nl_policy, +		.flags = GENL_ADMIN_PERM, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, +	}, +	{ +		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, +		.doit = devlink_nl_cmd_health_reporter_diagnose_doit, +		.policy = devlink_nl_policy, +		.flags = GENL_ADMIN_PERM, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, +	}, +	{ +		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, +		.doit = devlink_nl_cmd_health_reporter_dump_get_doit, +		.policy = devlink_nl_policy, +		.flags = GENL_ADMIN_PERM, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | +				  DEVLINK_NL_FLAG_NO_LOCK, +	}, +	{ +		.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, +		.doit = devlink_nl_cmd_health_reporter_dump_clear_doit, +		.policy = devlink_nl_policy, +		.flags = GENL_ADMIN_PERM, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | +				  DEVLINK_NL_FLAG_NO_LOCK, +	}, +	{ +		.cmd = 
DEVLINK_CMD_FLASH_UPDATE, +		.doit = devlink_nl_cmd_flash_update, +		.policy = devlink_nl_policy, +		.flags = GENL_ADMIN_PERM, +		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, +	},  };  static struct genl_family devlink_nl_family __ro_after_init = { @@ -3871,6 +5240,9 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)  {  	struct devlink *devlink; +	if (WARN_ON(!ops)) +		return NULL; +  	devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);  	if (!devlink)  		return NULL; @@ -3882,6 +5254,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)  	INIT_LIST_HEAD(&devlink->resource_list);  	INIT_LIST_HEAD(&devlink->param_list);  	INIT_LIST_HEAD(&devlink->region_list); +	INIT_LIST_HEAD(&devlink->reporter_list);  	mutex_init(&devlink->lock);  	return devlink;  } @@ -3891,6 +5264,7 @@ EXPORT_SYMBOL_GPL(devlink_alloc);   *	devlink_register - Register devlink instance   *   *	@devlink: devlink + *	@dev: parent device   */  int devlink_register(struct devlink *devlink, struct device *dev)  { @@ -3924,6 +5298,14 @@ EXPORT_SYMBOL_GPL(devlink_unregister);   */  void devlink_free(struct devlink *devlink)  { +	WARN_ON(!list_empty(&devlink->reporter_list)); +	WARN_ON(!list_empty(&devlink->region_list)); +	WARN_ON(!list_empty(&devlink->param_list)); +	WARN_ON(!list_empty(&devlink->resource_list)); +	WARN_ON(!list_empty(&devlink->dpipe_table_list)); +	WARN_ON(!list_empty(&devlink->sb_list)); +	WARN_ON(!list_empty(&devlink->port_list)); +  	kfree(devlink);  }  EXPORT_SYMBOL_GPL(devlink_free); @@ -3933,7 +5315,7 @@ EXPORT_SYMBOL_GPL(devlink_free);   *   *	@devlink: devlink   *	@devlink_port: devlink port - *	@port_index + *	@port_index: driver-specific numerical identifier of the port   *   *	Register devlink port with provided port index. User can use   *	any indexing, even hw-related one. 
devlink_port structure @@ -3954,6 +5336,7 @@ int devlink_port_register(struct devlink *devlink,  	devlink_port->index = port_index;  	devlink_port->registered = true;  	list_add_tail(&devlink_port->list, &devlink->port_list); +	INIT_LIST_HEAD(&devlink_port->param_list);  	mutex_unlock(&devlink->lock);  	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);  	return 0; @@ -4262,13 +5645,10 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);   *   *	@devlink: devlink   *	@resource_name: resource's name - *	@top_hierarchy: top hierarchy - *	@reload_required: reload is required for new configuration to - *			  apply   *	@resource_size: resource's size   *	@resource_id: resource's id - *	@parent_reosurce_id: resource's parent id - *	@size params: size parameters + *	@parent_resource_id: resource's parent id + *	@size_params: size parameters   */  int devlink_resource_register(struct devlink *devlink,  			      const char *resource_name, @@ -4471,18 +5851,23 @@ out:  }  EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); -/** - *	devlink_params_register - register configuration parameters - * - *	@devlink: devlink - *	@params: configuration parameters array - *	@params_count: number of parameters provided - * - *	Register the configuration parameters supported by the driver. 
- */ -int devlink_params_register(struct devlink *devlink, -			    const struct devlink_param *params, -			    size_t params_count) +static int devlink_param_verify(const struct devlink_param *param) +{ +	if (!param || !param->name || !param->supported_cmodes) +		return -EINVAL; +	if (param->generic) +		return devlink_param_generic_verify(param); +	else +		return devlink_param_driver_verify(param); +} + +static int __devlink_params_register(struct devlink *devlink, +				     unsigned int port_index, +				     struct list_head *param_list, +				     const struct devlink_param *params, +				     size_t params_count, +				     enum devlink_command reg_cmd, +				     enum devlink_command unreg_cmd)  {  	const struct devlink_param *param = params;  	int i; @@ -4490,20 +5875,12 @@ int devlink_params_register(struct devlink *devlink,  	mutex_lock(&devlink->lock);  	for (i = 0; i < params_count; i++, param++) { -		if (!param || !param->name || !param->supported_cmodes) { -			err = -EINVAL; +		err = devlink_param_verify(param); +		if (err)  			goto rollback; -		} -		if (param->generic) { -			err = devlink_param_generic_verify(param); -			if (err) -				goto rollback; -		} else { -			err = devlink_param_driver_verify(param); -			if (err) -				goto rollback; -		} -		err = devlink_param_register_one(devlink, param); + +		err = devlink_param_register_one(devlink, port_index, +						 param_list, param, reg_cmd);  		if (err)  			goto rollback;  	} @@ -4515,11 +5892,48 @@ rollback:  	if (!i)  		goto unlock;  	for (param--; i > 0; i--, param--) -		devlink_param_unregister_one(devlink, param); +		devlink_param_unregister_one(devlink, port_index, param_list, +					     param, unreg_cmd);  unlock:  	mutex_unlock(&devlink->lock);  	return err;  } + +static void __devlink_params_unregister(struct devlink *devlink, +					unsigned int port_index, +					struct list_head *param_list, +					const struct devlink_param *params, +					size_t params_count, +					enum devlink_command cmd) +{ +	
const struct devlink_param *param = params; +	int i; + +	mutex_lock(&devlink->lock); +	/* Forward the caller's port_index, as the rollback path of +	 * __devlink_params_register() does, instead of hard-coding port 0. +	 */ +	for (i = 0; i < params_count; i++, param++) +		devlink_param_unregister_one(devlink, port_index, param_list, +					     param, cmd); +	mutex_unlock(&devlink->lock); +} + +/** + *	devlink_params_register - register configuration parameters + * + *	@devlink: devlink + *	@params: configuration parameters array + *	@params_count: number of parameters provided + * + *	Register the configuration parameters supported by the driver. + */ +int devlink_params_register(struct devlink *devlink, +			    const struct devlink_param *params, +			    size_t params_count) +{ +	return __devlink_params_register(devlink, 0, &devlink->param_list, +					 params, params_count, +					 DEVLINK_CMD_PARAM_NEW, +					 DEVLINK_CMD_PARAM_DEL); +}  EXPORT_SYMBOL_GPL(devlink_params_register);  /** @@ -4532,36 +5946,103 @@ void devlink_params_unregister(struct devlink *devlink,  			       const struct devlink_param *params,  			       size_t params_count)  { -	const struct devlink_param *param = params; -	int i; - -	mutex_lock(&devlink->lock); -	for (i = 0; i < params_count; i++, param++) -		devlink_param_unregister_one(devlink, param); -	mutex_unlock(&devlink->lock); +	return __devlink_params_unregister(devlink, 0, &devlink->param_list, +					   params, params_count, +					   DEVLINK_CMD_PARAM_DEL);  }  EXPORT_SYMBOL_GPL(devlink_params_unregister);  /** - *	devlink_param_driverinit_value_get - get configuration parameter - *					     value for driver initializing + *	devlink_params_publish - publish configuration parameters   *   *	@devlink: devlink - *	@param_id: parameter ID - *	@init_val: value of parameter in driverinit configuration mode   * - *	This function should be used by the driver to get driverinit - *	configuration for initialization after reload command. + *	Publish previously registered configuration parameters.   
*/ -int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, -				       union devlink_param_value *init_val) +void devlink_params_publish(struct devlink *devlink)  {  	struct devlink_param_item *param_item; -	if (!devlink->ops || !devlink->ops->reload) -		return -EOPNOTSUPP; +	list_for_each_entry(param_item, &devlink->param_list, list) { +		if (param_item->published) +			continue; +		param_item->published = true; +		devlink_param_notify(devlink, 0, param_item, +				     DEVLINK_CMD_PARAM_NEW); +	} +} +EXPORT_SYMBOL_GPL(devlink_params_publish); -	param_item = devlink_param_find_by_id(&devlink->param_list, param_id); +/** + *	devlink_params_unpublish - unpublish configuration parameters + * + *	@devlink: devlink + * + *	Unpublish previously registered configuration parameters. + */ +void devlink_params_unpublish(struct devlink *devlink) +{ +	struct devlink_param_item *param_item; + +	list_for_each_entry(param_item, &devlink->param_list, list) { +		if (!param_item->published) +			continue; +		param_item->published = false; +		devlink_param_notify(devlink, 0, param_item, +				     DEVLINK_CMD_PARAM_DEL); +	} +} +EXPORT_SYMBOL_GPL(devlink_params_unpublish); + +/** + *	devlink_port_params_register - register port configuration parameters + * + *	@devlink_port: devlink port + *	@params: configuration parameters array + *	@params_count: number of parameters provided + * + *	Register the configuration parameters supported by the port. 
+ */ +int devlink_port_params_register(struct devlink_port *devlink_port, +				 const struct devlink_param *params, +				 size_t params_count) +{ +	return __devlink_params_register(devlink_port->devlink, +					 devlink_port->index, +					 &devlink_port->param_list, params, +					 params_count, +					 DEVLINK_CMD_PORT_PARAM_NEW, +					 DEVLINK_CMD_PORT_PARAM_DEL); +} +EXPORT_SYMBOL_GPL(devlink_port_params_register); + +/** + *	devlink_port_params_unregister - unregister port configuration + *	parameters + * + *	@devlink_port: devlink port + *	@params: configuration parameters array + *	@params_count: number of parameters provided + */ +void devlink_port_params_unregister(struct devlink_port *devlink_port, +				    const struct devlink_param *params, +				    size_t params_count) +{ +	return __devlink_params_unregister(devlink_port->devlink, +					   devlink_port->index, +					   &devlink_port->param_list, +					   params, params_count, +					   DEVLINK_CMD_PORT_PARAM_DEL); +} +EXPORT_SYMBOL_GPL(devlink_port_params_unregister); + +static int +__devlink_param_driverinit_value_get(struct list_head *param_list, u32 param_id, +				     union devlink_param_value *init_val) +{ +	struct devlink_param_item *param_item; + +	param_item = devlink_param_find_by_id(param_list, param_id);  	if (!param_item)  		return -EINVAL; @@ -4577,6 +6058,54 @@ int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,  	return 0;  } + +static int +__devlink_param_driverinit_value_set(struct devlink *devlink, +				     unsigned int port_index, +				     struct list_head *param_list, u32 param_id, +				     union devlink_param_value init_val, +				     enum devlink_command cmd) +{ +	struct devlink_param_item *param_item; + +	param_item = devlink_param_find_by_id(param_list, param_id); +	if (!param_item) +		return -EINVAL; + +	if (!devlink_param_cmode_is_supported(param_item->param, +					      DEVLINK_PARAM_CMODE_DRIVERINIT)) +		return -EOPNOTSUPP; + +	if 
(param_item->param->type == DEVLINK_PARAM_TYPE_STRING) +		strcpy(param_item->driverinit_value.vstr, init_val.vstr); +	else +		param_item->driverinit_value = init_val; +	param_item->driverinit_value_valid = true; + +	devlink_param_notify(devlink, port_index, param_item, cmd); +	return 0; +} + +/** + *	devlink_param_driverinit_value_get - get configuration parameter + *					     value for driver initializing + * + *	@devlink: devlink + *	@param_id: parameter ID + *	@init_val: value of parameter in driverinit configuration mode + * + *	This function should be used by the driver to get driverinit + *	configuration for initialization after reload command. + */ +int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, +				       union devlink_param_value *init_val) +{ +	if (!devlink->ops->reload) +		return -EOPNOTSUPP; + +	return __devlink_param_driverinit_value_get(&devlink->param_list, +						    param_id, init_val); +}  EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);  /** @@ -4594,26 +6123,61 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);  int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,  				       union devlink_param_value init_val)  { -	struct devlink_param_item *param_item; +	return __devlink_param_driverinit_value_set(devlink, 0, +						    &devlink->param_list, +						    param_id, init_val, +						    DEVLINK_CMD_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); -	param_item = devlink_param_find_by_id(&devlink->param_list, param_id); -	if (!param_item) -		return -EINVAL; +/** + *	devlink_port_param_driverinit_value_get - get configuration parameter + *						value for driver initializing + * + *	@devlink_port: devlink_port + *	@param_id: parameter ID + *	@init_val: value of parameter in driverinit configuration mode + * + *	This function should be used by the driver to get driverinit + *	configuration for initialization after reload command. 
+ */ +int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port, +					    u32 param_id, +					    union devlink_param_value *init_val) +{ +	struct devlink *devlink = devlink_port->devlink; -	if (!devlink_param_cmode_is_supported(param_item->param, -					      DEVLINK_PARAM_CMODE_DRIVERINIT)) +	if (!devlink->ops->reload)  		return -EOPNOTSUPP; -	if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) -		strcpy(param_item->driverinit_value.vstr, init_val.vstr); -	else -		param_item->driverinit_value = init_val; -	param_item->driverinit_value_valid = true; +	return __devlink_param_driverinit_value_get(&devlink_port->param_list, +						    param_id, init_val); +} +EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_get); -	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); -	return 0; +/** + *     devlink_port_param_driverinit_value_set - set value of configuration + *                                               parameter for driverinit + *                                               configuration mode + * + *     @devlink_port: devlink_port + *     @param_id: parameter ID + *     @init_val: value of parameter to set for driverinit configuration mode + * + *     This function should be used by the driver to set driverinit + *     configuration mode default value. 
+ */ +int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port, +					    u32 param_id, +					    union devlink_param_value init_val) +{ +	return __devlink_param_driverinit_value_set(devlink_port->devlink, +						    devlink_port->index, +						    &devlink_port->param_list, +						    param_id, init_val, +						    DEVLINK_CMD_PORT_PARAM_NEW);  } -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); +EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_set);  /**   *	devlink_param_value_changed - notify devlink on a parameter's value @@ -4626,7 +6190,6 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);   *	This function should be used by the driver to notify devlink on value   *	change, excluding driverinit configuration mode.   *	For driverinit configuration mode driver should use the function - *	devlink_param_driverinit_value_set() instead.   */  void devlink_param_value_changed(struct devlink *devlink, u32 param_id)  { @@ -4635,11 +6198,38 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id)  	param_item = devlink_param_find_by_id(&devlink->param_list, param_id);  	WARN_ON(!param_item); -	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); +	devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);  }  EXPORT_SYMBOL_GPL(devlink_param_value_changed);  /** + *     devlink_port_param_value_changed - notify devlink on a parameter's value + *                                      change. Should be called by the driver + *                                      right after the change. + * + *     @devlink_port: devlink_port + *     @param_id: parameter ID + * + *     This function should be used by the driver to notify devlink on value + *     change, excluding driverinit configuration mode. + *     For driverinit configuration mode driver should use the function + *     devlink_port_param_driverinit_value_set() instead. 
+ */ +void devlink_port_param_value_changed(struct devlink_port *devlink_port, +				      u32 param_id) +{ +	struct devlink_param_item *param_item; + +	param_item = devlink_param_find_by_id(&devlink_port->param_list, +					      param_id); +	WARN_ON(!param_item); + +	devlink_param_notify(devlink_port->devlink, devlink_port->index, +			     param_item, DEVLINK_CMD_PORT_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_port_param_value_changed); + +/**   *	devlink_param_value_str_fill - Safely fill-up the string preventing   *				       from overflow of the preallocated buffer   * @@ -4755,7 +6345,7 @@ EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get);   *	Multiple snapshots can be created on a region.   *	The @snapshot_id should be obtained using the getter function.   * - *	@devlink_region: devlink region of the snapshot + *	@region: devlink region of the snapshot   *	@data_len: size of snapshot data   *	@data: snapshot data   *	@snapshot_id: snapshot id to be created @@ -4808,20 +6398,93 @@ unlock:  }  EXPORT_SYMBOL_GPL(devlink_region_snapshot_create); -static int __init devlink_module_init(void) +static void __devlink_compat_running_version(struct devlink *devlink, +					     char *buf, size_t len)  { -	return genl_register_family(&devlink_nl_family); +	const struct nlattr *nlattr; +	struct devlink_info_req req; +	struct sk_buff *msg; +	int rem, err; + +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (!msg) +		return; + +	req.msg = msg; +	err = devlink->ops->info_get(devlink, &req, NULL); +	if (err) +		goto free_msg; + +	nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) { +		const struct nlattr *kv; +		int rem_kv; + +		if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING) +			continue; + +		nla_for_each_nested(kv, nlattr, rem_kv) { +			if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE) +				continue; + +			strlcat(buf, nla_data(kv), len); +			strlcat(buf, " ", len); +		} +	} +free_msg: +	nlmsg_free(msg); +} + +void 
devlink_compat_running_version(struct net_device *dev, +				    char *buf, size_t len) +{ +	struct devlink *devlink; + +	dev_hold(dev); +	rtnl_unlock(); + +	mutex_lock(&devlink_mutex); +	devlink = netdev_to_devlink(dev); +	if (!devlink || !devlink->ops->info_get) +		goto unlock_list; + +	mutex_lock(&devlink->lock); +	__devlink_compat_running_version(devlink, buf, len); +	mutex_unlock(&devlink->lock); +unlock_list: +	mutex_unlock(&devlink_mutex); + +	rtnl_lock(); +	dev_put(dev);  } -static void __exit devlink_module_exit(void) +int devlink_compat_flash_update(struct net_device *dev, const char *file_name)  { -	genl_unregister_family(&devlink_nl_family); +	struct devlink *devlink; +	int ret = -EOPNOTSUPP; + +	dev_hold(dev); +	rtnl_unlock(); + +	mutex_lock(&devlink_mutex); +	devlink = netdev_to_devlink(dev); +	if (!devlink || !devlink->ops->flash_update) +		goto unlock_list; + +	mutex_lock(&devlink->lock); +	ret = devlink->ops->flash_update(devlink, file_name, NULL, NULL); +	mutex_unlock(&devlink->lock); +unlock_list: +	mutex_unlock(&devlink_mutex); + +	rtnl_lock(); +	dev_put(dev); + +	return ret;  } -module_init(devlink_module_init); -module_exit(devlink_module_exit); +static int __init devlink_init(void) +{ +	return genl_register_family(&devlink_nl_family); +} -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Jiri Pirko <[email protected]>"); -MODULE_DESCRIPTION("Network physical device Netlink interface"); -MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME); +subsys_initcall(devlink_init); diff --git a/net/core/dst.c b/net/core/dst.c index 81ccf20e2826..a263309df115 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -98,8 +98,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,  	struct dst_entry *dst;  	if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { -		if (ops->gc(ops)) +		if (ops->gc(ops)) { +			printk_ratelimited(KERN_NOTICE "Route cache is full: " +					   "consider increasing sysctl " +					   "net.ipv[4|6].route.max_size.\n");  			return NULL; 
+		}  	}  	dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 158264f7cfaf..d4918ffddda8 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -27,7 +27,9 @@  #include <linux/rtnetlink.h>  #include <linux/sched/signal.h>  #include <linux/net.h> +#include <net/devlink.h>  #include <net/xdp_sock.h> +#include <net/flow_offload.h>  /*   * Some useful ethtool_ops methods that're device independent. @@ -803,6 +805,10 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,  	if (ops->get_eeprom_len)  		info.eedump_len = ops->get_eeprom_len(dev); +	if (!info.fw_version[0]) +		devlink_compat_running_version(dev, info.fw_version, +					       sizeof(info.fw_version)); +  	if (copy_to_user(useraddr, &info, sizeof(info)))  		return -EFAULT;  	return 0; @@ -1348,12 +1354,9 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)  	if (regs.len > reglen)  		regs.len = reglen; -	regbuf = NULL; -	if (reglen) { -		regbuf = vzalloc(reglen); -		if (!regbuf) -			return -ENOMEM; -	} +	regbuf = vzalloc(reglen); +	if (!regbuf) +		return -ENOMEM;  	ops->get_regs(dev, ®s, regbuf); @@ -1714,7 +1717,7 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,  static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)  { -	struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; +	struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM };  	if (!dev->ethtool_ops->get_pauseparam)  		return -EOPNOTSUPP; @@ -2033,11 +2036,10 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev,  	if (copy_from_user(&efl, useraddr, sizeof(efl)))  		return -EFAULT; +	efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;  	if (!dev->ethtool_ops->flash_device) -		return -EOPNOTSUPP; - -	efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; +		return devlink_compat_flash_update(dev, efl.data);  	return dev->ethtool_ops->flash_device(dev, 
&efl);  } @@ -2501,7 +2503,7 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)  static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)  { -	struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM }; +	struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM };  	int rc;  	if (!dev->ethtool_ops->get_fecparam) @@ -2816,3 +2818,241 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)  	return rc;  } + +struct ethtool_rx_flow_key { +	struct flow_dissector_key_basic			basic; +	union { +		struct flow_dissector_key_ipv4_addrs	ipv4; +		struct flow_dissector_key_ipv6_addrs	ipv6; +	}; +	struct flow_dissector_key_ports			tp; +	struct flow_dissector_key_ip			ip; +	struct flow_dissector_key_vlan			vlan; +	struct flow_dissector_key_eth_addrs		eth_addrs; +} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ + +struct ethtool_rx_flow_match { +	struct flow_dissector		dissector; +	struct ethtool_rx_flow_key	key; +	struct ethtool_rx_flow_key	mask; +}; + +struct ethtool_rx_flow_rule * +ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) +{ +	const struct ethtool_rx_flow_spec *fs = input->fs; +	static struct in6_addr zero_addr = {}; +	struct ethtool_rx_flow_match *match; +	struct ethtool_rx_flow_rule *flow; +	struct flow_action_entry *act; + +	flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) + +		       sizeof(struct ethtool_rx_flow_match), GFP_KERNEL); +	if (!flow) +		return ERR_PTR(-ENOMEM); + +	/* ethtool_rx supports only one single action per rule. 
*/ +	flow->rule = flow_rule_alloc(1); +	if (!flow->rule) { +		kfree(flow); +		return ERR_PTR(-ENOMEM); +	} + +	match = (struct ethtool_rx_flow_match *)flow->priv; +	flow->rule->match.dissector	= &match->dissector; +	flow->rule->match.mask		= &match->mask; +	flow->rule->match.key		= &match->key; + +	match->mask.basic.n_proto = htons(0xffff); + +	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { +	case TCP_V4_FLOW: +	case UDP_V4_FLOW: { +		const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; + +		match->key.basic.n_proto = htons(ETH_P_IP); + +		v4_spec = &fs->h_u.tcp_ip4_spec; +		v4_m_spec = &fs->m_u.tcp_ip4_spec; + +		if (v4_m_spec->ip4src) { +			match->key.ipv4.src = v4_spec->ip4src; +			match->mask.ipv4.src = v4_m_spec->ip4src; +		} +		if (v4_m_spec->ip4dst) { +			match->key.ipv4.dst = v4_spec->ip4dst; +			match->mask.ipv4.dst = v4_m_spec->ip4dst; +		} +		if (v4_m_spec->ip4src || +		    v4_m_spec->ip4dst) { +			match->dissector.used_keys |= +				BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS); +			match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = +				offsetof(struct ethtool_rx_flow_key, ipv4); +		} +		if (v4_m_spec->psrc) { +			match->key.tp.src = v4_spec->psrc; +			match->mask.tp.src = v4_m_spec->psrc; +		} +		if (v4_m_spec->pdst) { +			match->key.tp.dst = v4_spec->pdst; +			match->mask.tp.dst = v4_m_spec->pdst; +		} +		if (v4_m_spec->psrc || +		    v4_m_spec->pdst) { +			match->dissector.used_keys |= +				BIT(FLOW_DISSECTOR_KEY_PORTS); +			match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = +				offsetof(struct ethtool_rx_flow_key, tp); +		} +		if (v4_m_spec->tos) { +			match->key.ip.tos = v4_spec->tos; +			match->mask.ip.tos = v4_m_spec->tos; +			match->dissector.used_keys |= +				BIT(FLOW_DISSECTOR_KEY_IP); +			match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = +				offsetof(struct ethtool_rx_flow_key, ip); +		} +		} +		break; +	case TCP_V6_FLOW: +	case UDP_V6_FLOW: { +		const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec; + +		
match->key.basic.n_proto = htons(ETH_P_IPV6); + +		v6_spec = &fs->h_u.tcp_ip6_spec; +		v6_m_spec = &fs->m_u.tcp_ip6_spec; +		if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr))) { +			memcpy(&match->key.ipv6.src, v6_spec->ip6src, +			       sizeof(match->key.ipv6.src)); +			memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src, +			       sizeof(match->mask.ipv6.src)); +		} +		if (memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) { +			memcpy(&match->key.ipv6.dst, v6_spec->ip6dst, +			       sizeof(match->key.ipv6.dst)); +			memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst, +			       sizeof(match->mask.ipv6.dst)); +		} +		/* Register the IPv6 address key when either the source or the +		 * destination mask is non-zero. +		 */ +		if (memcmp(v6_m_spec->ip6src, &zero_addr, sizeof(zero_addr)) || +		    memcmp(v6_m_spec->ip6dst, &zero_addr, sizeof(zero_addr))) { +			match->dissector.used_keys |= +				BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS); +			match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = +				offsetof(struct ethtool_rx_flow_key, ipv6); +		} +		if (v6_m_spec->psrc) { +			match->key.tp.src = v6_spec->psrc; +			match->mask.tp.src = v6_m_spec->psrc; +		} +		if (v6_m_spec->pdst) { +			match->key.tp.dst = v6_spec->pdst; +			match->mask.tp.dst = v6_m_spec->pdst; +		} +		if (v6_m_spec->psrc || +		    v6_m_spec->pdst) { +			match->dissector.used_keys |= +				BIT(FLOW_DISSECTOR_KEY_PORTS); +			match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = +				offsetof(struct ethtool_rx_flow_key, tp); +		} +		if (v6_m_spec->tclass) { +			match->key.ip.tos = v6_spec->tclass; +			match->mask.ip.tos = v6_m_spec->tclass; +			match->dissector.used_keys |= +				BIT(FLOW_DISSECTOR_KEY_IP); +			match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = +				offsetof(struct ethtool_rx_flow_key, ip); +		} +		} +		break; +	default: +		ethtool_rx_flow_rule_destroy(flow); +		return ERR_PTR(-EINVAL); +	} + +	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { +	case TCP_V4_FLOW: +	case TCP_V6_FLOW: +		match->key.basic.ip_proto = IPPROTO_TCP; +		break; +	case UDP_V4_FLOW: +	case 
UDP_V6_FLOW: +		match->key.basic.ip_proto = IPPROTO_UDP; +		break; +	} +	match->mask.basic.ip_proto = 0xff; + +	match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_BASIC); +	match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = +		offsetof(struct ethtool_rx_flow_key, basic); + +	if (fs->flow_type & FLOW_EXT) { +		const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; +		const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; + +		/* Build the VLAN match when either the ethertype mask or the +		 * TCI mask is set, so a rule giving only one of them does not +		 * silently lose its VLAN match. +		 */ +		if (ext_m_spec->vlan_etype || +		    ext_m_spec->vlan_tci) { +			match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; +			match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; + +			match->key.vlan.vlan_id = +				ntohs(ext_h_spec->vlan_tci) & 0x0fff; +			match->mask.vlan.vlan_id = +				ntohs(ext_m_spec->vlan_tci) & 0x0fff; + +			match->key.vlan.vlan_priority = +				(ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; +			match->mask.vlan.vlan_priority = +				(ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; + +			match->dissector.used_keys |= +				BIT(FLOW_DISSECTOR_KEY_VLAN); +			match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = +				offsetof(struct ethtool_rx_flow_key, vlan); +		} +	} +	if (fs->flow_type & FLOW_MAC_EXT) { +		const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; +		const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; + +		memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest, +		       ETH_ALEN); +		memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest, +		       ETH_ALEN); + +		match->dissector.used_keys |= +			BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS); +		match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = +			offsetof(struct ethtool_rx_flow_key, eth_addrs); +	} + +	act = &flow->rule->action.entries[0]; +	switch (fs->ring_cookie) { +	case RX_CLS_FLOW_DISC: +		act->id = FLOW_ACTION_DROP; +		break; +	case RX_CLS_FLOW_WAKE: +		act->id = FLOW_ACTION_WAKE; +		break; +	default: +		act->id = FLOW_ACTION_QUEUE; +		if (fs->flow_type & FLOW_RSS) +			act->queue.ctx = input->rss_ctx; + +		act->queue.vf = 
ethtool_get_flow_spec_ring_vf(fs->ring_cookie); +		act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie); +		break; +	} + +	return flow; +} +EXPORT_SYMBOL(ethtool_rx_flow_rule_create); + +void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow) +{ +	kfree(flow->rule); +	kfree(flow); +} +EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy); diff --git a/net/core/filter.c b/net/core/filter.c index 7559d6835ecb..5ceba98069d4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,6 +73,7 @@  #include <linux/seg6_local.h>  #include <net/seg6.h>  #include <net/seg6_local.h> +#include <net/lwtunnel.h>  /**   *	sk_filter_trim_cap - run a packet through a socket filter @@ -1793,6 +1794,20 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {  	.arg2_type	= ARG_ANYTHING,  }; +BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) +{ +	sk = sk_to_full_sk(sk); + +	return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sk_fullsock_proto = { +	.func		= bpf_sk_fullsock, +	.gpl_only	= false, +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL, +	.arg1_type	= ARG_PTR_TO_SOCK_COMMON, +}; +  static inline int sk_skb_try_make_writable(struct sk_buff *skb,  					   unsigned int write_len)  { @@ -2789,8 +2804,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)  	u32 off = skb_mac_header_len(skb);  	int ret; -	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ -	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) +	if (!skb_is_gso_tcp(skb))  		return -ENOTSUPP;  	ret = skb_cow(skb, len_diff); @@ -2831,8 +2845,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)  	u32 off = skb_mac_header_len(skb);  	int ret; -	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. 
*/ -	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) +	if (!skb_is_gso_tcp(skb))  		return -ENOTSUPP;  	ret = skb_unclone(skb, GFP_ATOMIC); @@ -2957,8 +2970,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)  	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);  	int ret; -	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ -	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) +	if (!skb_is_gso_tcp(skb))  		return -ENOTSUPP;  	ret = skb_cow(skb, len_diff); @@ -2987,8 +2999,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)  	u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);  	int ret; -	/* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ -	if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) +	if (!skb_is_gso_tcp(skb))  		return -ENOTSUPP;  	ret = skb_unclone(skb, GFP_ATOMIC); @@ -4112,10 +4123,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,  		/* Only some socketops are supported */  		switch (optname) {  		case SO_RCVBUF: +			val = min_t(u32, val, sysctl_rmem_max);  			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;  			sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);  			break;  		case SO_SNDBUF: +			val = min_t(u32, val, sysctl_wmem_max);  			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;  			sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);  			break; @@ -4801,7 +4814,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len  }  #endif /* CONFIG_IPV6_SEG6_BPF */ -BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, +			     bool ingress) +{ +	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); +} +#endif + +BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,  	   u32, len)  {  	switch (type) { @@ -4810,13 +4831,40 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, 
void *, hdr,  	case BPF_LWT_ENCAP_SEG6_INLINE:  		return bpf_push_seg6_encap(skb, type, hdr, len);  #endif +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +	case BPF_LWT_ENCAP_IP: +		return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); +#endif +	default: +		return -EINVAL; +	} +} + +BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, +	   void *, hdr, u32, len) +{ +	switch (type) { +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +	case BPF_LWT_ENCAP_IP: +		return bpf_push_ip_encap(skb, hdr, len, false /* egress */); +#endif  	default:  		return -EINVAL;  	}  } -static const struct bpf_func_proto bpf_lwt_push_encap_proto = { -	.func		= bpf_lwt_push_encap, +static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { +	.func		= bpf_lwt_in_push_encap, +	.gpl_only	= false, +	.ret_type	= RET_INTEGER, +	.arg1_type	= ARG_PTR_TO_CTX, +	.arg2_type	= ARG_ANYTHING, +	.arg3_type	= ARG_PTR_TO_MEM, +	.arg4_type	= ARG_CONST_SIZE +}; + +static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { +	.func		= bpf_lwt_xmit_push_encap,  	.gpl_only	= false,  	.ret_type	= RET_INTEGER,  	.arg1_type	= ARG_PTR_TO_CTX, @@ -5016,6 +5064,54 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {  };  #endif /* CONFIG_IPV6_SEG6_BPF */ +#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT)		\ +do {									\ +	switch (si->off) {						\ +	case offsetof(md_type, snd_cwnd):				\ +		CONVERT(snd_cwnd); break;				\ +	case offsetof(md_type, srtt_us):				\ +		CONVERT(srtt_us); break;				\ +	case offsetof(md_type, snd_ssthresh):				\ +		CONVERT(snd_ssthresh); break;				\ +	case offsetof(md_type, rcv_nxt):				\ +		CONVERT(rcv_nxt); break;				\ +	case offsetof(md_type, snd_nxt):				\ +		CONVERT(snd_nxt); break;				\ +	case offsetof(md_type, snd_una):				\ +		CONVERT(snd_una); break;				\ +	case offsetof(md_type, mss_cache):				\ +		CONVERT(mss_cache); break;				\ +	case offsetof(md_type, ecn_flags):				\ +		CONVERT(ecn_flags); break;				\ +	case offsetof(md_type, 
rate_delivered):				\ +		CONVERT(rate_delivered); break;				\ +	case offsetof(md_type, rate_interval_us):			\ +		CONVERT(rate_interval_us); break;			\ +	case offsetof(md_type, packets_out):				\ +		CONVERT(packets_out); break;				\ +	case offsetof(md_type, retrans_out):				\ +		CONVERT(retrans_out); break;				\ +	case offsetof(md_type, total_retrans):				\ +		CONVERT(total_retrans); break;				\ +	case offsetof(md_type, segs_in):				\ +		CONVERT(segs_in); break;				\ +	case offsetof(md_type, data_segs_in):				\ +		CONVERT(data_segs_in); break;				\ +	case offsetof(md_type, segs_out):				\ +		CONVERT(segs_out); break;				\ +	case offsetof(md_type, data_segs_out):				\ +		CONVERT(data_segs_out); break;				\ +	case offsetof(md_type, lost_out):				\ +		CONVERT(lost_out); break;				\ +	case offsetof(md_type, sacked_out):				\ +		CONVERT(sacked_out); break;				\ +	case offsetof(md_type, bytes_received):				\ +		CONVERT(bytes_received); break;				\ +	case offsetof(md_type, bytes_acked):				\ +		CONVERT(bytes_acked); break;				\ +	}								\ +} while (0) +  #ifdef CONFIG_INET  static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,  			      int dif, int sdif, u8 family, u8 proto) @@ -5253,6 +5349,105 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {  	.arg5_type	= ARG_ANYTHING,  }; +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, +				  struct bpf_insn_access_aux *info) +{ +	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) +		return false; + +	if (off % size != 0) +		return false; + +	switch (off) { +	case offsetof(struct bpf_tcp_sock, bytes_received): +	case offsetof(struct bpf_tcp_sock, bytes_acked): +		return size == sizeof(__u64); +	default: +		return size == sizeof(__u32); +	} +} + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, +				    const struct bpf_insn *si, +				    struct bpf_insn *insn_buf, +				    struct bpf_prog *prog, u32 *target_size) +{ +	
struct bpf_insn *insn = insn_buf; + +#define BPF_TCP_SOCK_GET_COMMON(FIELD)					\ +	do {								\ +		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) >	\ +			     FIELD_SIZEOF(struct bpf_tcp_sock, FIELD));	\ +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ +				      si->dst_reg, si->src_reg,		\ +				      offsetof(struct tcp_sock, FIELD)); \ +	} while (0) + +	CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, +				       BPF_TCP_SOCK_GET_COMMON); + +	if (insn > insn_buf) +		return insn - insn_buf; + +	switch (si->off) { +	case offsetof(struct bpf_tcp_sock, rtt_min): +		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != +			     sizeof(struct minmax)); +		BUILD_BUG_ON(sizeof(struct minmax) < +			     sizeof(struct minmax_sample)); + +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, +				      offsetof(struct tcp_sock, rtt_min) + +				      offsetof(struct minmax_sample, v)); +		break; +	} + +	return insn - insn_buf; +} + +BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) +{ +	sk = sk_to_full_sk(sk); + +	if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) +		return (unsigned long)sk; + +	return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_tcp_sock_proto = { +	.func		= bpf_tcp_sock, +	.gpl_only	= false, +	.ret_type	= RET_PTR_TO_TCP_SOCK_OR_NULL, +	.arg1_type	= ARG_PTR_TO_SOCK_COMMON, +}; + +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ +	unsigned int iphdr_len; + +	if (skb->protocol == cpu_to_be16(ETH_P_IP)) +		iphdr_len = sizeof(struct iphdr); +	else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) +		iphdr_len = sizeof(struct ipv6hdr); +	else +		return 0; + +	if (skb_headlen(skb) < iphdr_len) +		return 0; + +	if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) +		return 0; + +	return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { +	.func           = bpf_skb_ecn_set_ce, +	.gpl_only       = false, +	.ret_type       = RET_INTEGER, +	.arg1_type      = 
ARG_PTR_TO_CTX, +};  #endif /* CONFIG_INET */  bool bpf_helper_changes_pkt_data(void *func) @@ -5282,7 +5477,8 @@ bool bpf_helper_changes_pkt_data(void *func)  	    func == bpf_lwt_seg6_adjust_srh ||  	    func == bpf_lwt_seg6_action ||  #endif -	    func == bpf_lwt_push_encap) +	    func == bpf_lwt_in_push_encap || +	    func == bpf_lwt_xmit_push_encap)  		return true;  	return false; @@ -5314,10 +5510,20 @@ bpf_base_func_proto(enum bpf_func_id func_id)  		return &bpf_tail_call_proto;  	case BPF_FUNC_ktime_get_ns:  		return &bpf_ktime_get_ns_proto; +	default: +		break; +	} + +	if (!capable(CAP_SYS_ADMIN)) +		return NULL; + +	switch (func_id) { +	case BPF_FUNC_spin_lock: +		return &bpf_spin_lock_proto; +	case BPF_FUNC_spin_unlock: +		return &bpf_spin_unlock_proto;  	case BPF_FUNC_trace_printk: -		if (capable(CAP_SYS_ADMIN)) -			return bpf_get_trace_printk_proto(); -		/* else, fall through */ +		return bpf_get_trace_printk_proto();  	default:  		return NULL;  	} @@ -5396,6 +5602,14 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  	switch (func_id) {  	case BPF_FUNC_get_local_storage:  		return &bpf_get_local_storage_proto; +	case BPF_FUNC_sk_fullsock: +		return &bpf_sk_fullsock_proto; +#ifdef CONFIG_INET +	case BPF_FUNC_tcp_sock: +		return &bpf_tcp_sock_proto; +	case BPF_FUNC_skb_ecn_set_ce: +		return &bpf_skb_ecn_set_ce_proto; +#endif  	default:  		return sk_filter_func_proto(func_id, prog);  	} @@ -5467,6 +5681,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_get_socket_uid_proto;  	case BPF_FUNC_fib_lookup:  		return &bpf_skb_fib_lookup_proto; +	case BPF_FUNC_sk_fullsock: +		return &bpf_sk_fullsock_proto;  #ifdef CONFIG_XFRM  	case BPF_FUNC_skb_get_xfrm_state:  		return &bpf_skb_get_xfrm_state_proto; @@ -5484,6 +5700,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_sk_lookup_udp_proto;  	case BPF_FUNC_sk_release:  		return 
&bpf_sk_release_proto; +	case BPF_FUNC_tcp_sock: +		return &bpf_tcp_sock_proto;  #endif  	default:  		return bpf_base_func_proto(func_id); @@ -5660,7 +5878,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  {  	switch (func_id) {  	case BPF_FUNC_lwt_push_encap: -		return &bpf_lwt_push_encap_proto; +		return &bpf_lwt_in_push_encap_proto;  	default:  		return lwt_out_func_proto(func_id, prog);  	} @@ -5696,6 +5914,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)  		return &bpf_l4_csum_replace_proto;  	case BPF_FUNC_set_hash_invalid:  		return &bpf_set_hash_invalid_proto; +	case BPF_FUNC_lwt_push_encap: +		return &bpf_lwt_xmit_push_encap_proto;  	default:  		return lwt_out_func_proto(func_id, prog);  	} @@ -5754,6 +5974,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type  		if (size != sizeof(__u64))  			return false;  		break; +	case offsetof(struct __sk_buff, sk): +		if (type == BPF_WRITE || size != sizeof(__u64)) +			return false; +		info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; +		break;  	default:  		/* Only narrow read access allowed for now. 
*/  		if (type == BPF_WRITE) { @@ -5925,31 +6150,44 @@ full_access:  	return true;  } -static bool __sock_filter_check_size(int off, int size, +bool bpf_sock_common_is_valid_access(int off, int size, +				     enum bpf_access_type type,  				     struct bpf_insn_access_aux *info)  { -	const int size_default = sizeof(__u32); -  	switch (off) { -	case bpf_ctx_range(struct bpf_sock, src_ip4): -	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): -		bpf_ctx_record_field_size(info, size_default); -		return bpf_ctx_narrow_access_ok(off, size, size_default); +	case bpf_ctx_range_till(struct bpf_sock, type, priority): +		return false; +	default: +		return bpf_sock_is_valid_access(off, size, type, info);  	} - -	return size == size_default;  }  bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,  			      struct bpf_insn_access_aux *info)  { +	const int size_default = sizeof(__u32); +  	if (off < 0 || off >= sizeof(struct bpf_sock))  		return false;  	if (off % size != 0)  		return false; -	if (!__sock_filter_check_size(off, size, info)) -		return false; -	return true; + +	switch (off) { +	case offsetof(struct bpf_sock, state): +	case offsetof(struct bpf_sock, family): +	case offsetof(struct bpf_sock, type): +	case offsetof(struct bpf_sock, protocol): +	case offsetof(struct bpf_sock, dst_port): +	case offsetof(struct bpf_sock, src_port): +	case bpf_ctx_range(struct bpf_sock, src_ip4): +	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +	case bpf_ctx_range(struct bpf_sock, dst_ip4): +	case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): +		bpf_ctx_record_field_size(info, size_default); +		return bpf_ctx_narrow_access_ok(off, size, size_default); +	} + +	return size == size_default;  }  static bool sock_filter_is_valid_access(int off, int size, @@ -6065,6 +6303,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,  		case bpf_ctx_range(struct __sk_buff, tc_classid):  		case bpf_ctx_range_till(struct 
__sk_buff, cb[0], cb[4]):  		case bpf_ctx_range(struct __sk_buff, tstamp): +		case bpf_ctx_range(struct __sk_buff, queue_mapping):  			break;  		default:  			return false; @@ -6469,9 +6708,18 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,  		break;  	case offsetof(struct __sk_buff, queue_mapping): -		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, -				      bpf_target_off(struct sk_buff, queue_mapping, 2, -						     target_size)); +		if (type == BPF_WRITE) { +			*insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); +			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, +					      bpf_target_off(struct sk_buff, +							     queue_mapping, +							     2, target_size)); +		} else { +			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, +					      bpf_target_off(struct sk_buff, +							     queue_mapping, +							     2, target_size)); +		}  		break;  	case offsetof(struct __sk_buff, vlan_present): @@ -6708,6 +6956,27 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,  							     target_size));  		break; +	case offsetof(struct __sk_buff, gso_segs): +		/* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_buff, head)); +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), +				      BPF_REG_AX, si->src_reg, +				      offsetof(struct sk_buff, end)); +		*insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_buff, end)); +#endif +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), +				      si->dst_reg, si->dst_reg, +				      bpf_target_off(struct skb_shared_info, +						     gso_segs, 2, +						     target_size)); +		break;  	case offsetof(struct __sk_buff, wire_len):  		
BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); @@ -6717,6 +6986,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,  		off += offsetof(struct qdisc_skb_cb, pkt_len);  		*target_size = 4;  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); +		break; + +	case offsetof(struct __sk_buff, sk): +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +				      si->dst_reg, si->src_reg, +				      offsetof(struct sk_buff, sk)); +		break;  	}  	return insn - insn_buf; @@ -6765,24 +7041,32 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,  		break;  	case offsetof(struct bpf_sock, family): -		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - -		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, -				      offsetof(struct sock, sk_family)); +		*insn++ = BPF_LDX_MEM( +			BPF_FIELD_SIZEOF(struct sock_common, skc_family), +			si->dst_reg, si->src_reg, +			bpf_target_off(struct sock_common, +				       skc_family, +				       FIELD_SIZEOF(struct sock_common, +						    skc_family), +				       target_size));  		break;  	case offsetof(struct bpf_sock, type): +		BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2);  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,  				      offsetof(struct sock, __sk_flags_offset));  		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);  		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); +		*target_size = 2;  		break;  	case offsetof(struct bpf_sock, protocol): +		BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);  		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,  				      offsetof(struct sock, __sk_flags_offset));  		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);  		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); +		*target_size = 1;  		break;  	case offsetof(struct bpf_sock, src_ip4): @@ -6794,6 +7078,15 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,  				    
   target_size));  		break; +	case offsetof(struct bpf_sock, dst_ip4): +		*insn++ = BPF_LDX_MEM( +			BPF_SIZE(si->code), si->dst_reg, si->src_reg, +			bpf_target_off(struct sock_common, skc_daddr, +				       FIELD_SIZEOF(struct sock_common, +						    skc_daddr), +				       target_size)); +		break; +  	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):  #if IS_ENABLED(CONFIG_IPV6)  		off = si->off; @@ -6812,6 +7105,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,  #endif  		break; +	case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) +		off = si->off; +		off -= offsetof(struct bpf_sock, dst_ip6[0]); +		*insn++ = BPF_LDX_MEM( +			BPF_SIZE(si->code), si->dst_reg, si->src_reg, +			bpf_target_off(struct sock_common, +				       skc_v6_daddr.s6_addr32[0], +				       FIELD_SIZEOF(struct sock_common, +						    skc_v6_daddr.s6_addr32[0]), +				       target_size) + off); +#else +		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +		*target_size = 4; +#endif +		break; +  	case offsetof(struct bpf_sock, src_port):  		*insn++ = BPF_LDX_MEM(  			BPF_FIELD_SIZEOF(struct sock_common, skc_num), @@ -6821,6 +7131,26 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,  						    skc_num),  				       target_size));  		break; + +	case offsetof(struct bpf_sock, dst_port): +		*insn++ = BPF_LDX_MEM( +			BPF_FIELD_SIZEOF(struct sock_common, skc_dport), +			si->dst_reg, si->src_reg, +			bpf_target_off(struct sock_common, skc_dport, +				       FIELD_SIZEOF(struct sock_common, +						    skc_dport), +				       target_size)); +		break; + +	case offsetof(struct bpf_sock, state): +		*insn++ = BPF_LDX_MEM( +			BPF_FIELD_SIZEOF(struct sock_common, skc_state), +			si->dst_reg, si->src_reg, +			bpf_target_off(struct sock_common, skc_state, +				       FIELD_SIZEOF(struct sock_common, +						    skc_state), +				       target_size)); +		break;  	}  	return insn - insn_buf; @@ -7068,6 +7398,85 @@ static u32 
sock_ops_convert_ctx_access(enum bpf_access_type type,  	struct bpf_insn *insn = insn_buf;  	int off; +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \ +	do {								      \ +		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \ +			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \ +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \ +						struct bpf_sock_ops_kern,     \ +						is_fullsock),		      \ +				      si->dst_reg, si->src_reg,		      \ +				      offsetof(struct bpf_sock_ops_kern,      \ +					       is_fullsock));		      \ +		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \ +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \ +						struct bpf_sock_ops_kern, sk),\ +				      si->dst_reg, si->src_reg,		      \ +				      offsetof(struct bpf_sock_ops_kern, sk));\ +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \ +						       OBJ_FIELD),	      \ +				      si->dst_reg, si->dst_reg,		      \ +				      offsetof(OBJ, OBJ_FIELD));	      \ +	} while (0) + +#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ +		SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. 
+ */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \ +	do {								      \ +		int reg = BPF_REG_9;					      \ +		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \ +			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \ +		if (si->dst_reg == reg || si->src_reg == reg)		      \ +			reg--;						      \ +		if (si->dst_reg == reg || si->src_reg == reg)		      \ +			reg--;						      \ +		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \ +				      offsetof(struct bpf_sock_ops_kern,      \ +					       temp));			      \ +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \ +						struct bpf_sock_ops_kern,     \ +						is_fullsock),		      \ +				      reg, si->dst_reg,			      \ +				      offsetof(struct bpf_sock_ops_kern,      \ +					       is_fullsock));		      \ +		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \ +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \ +						struct bpf_sock_ops_kern, sk),\ +				      reg, si->dst_reg,			      \ +				      offsetof(struct bpf_sock_ops_kern, sk));\ +		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),	      \ +				      reg, si->src_reg,			      \ +				      offsetof(OBJ, OBJ_FIELD));	      \ +		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \ +				      offsetof(struct bpf_sock_ops_kern,      \ +					       temp));			      \ +	} while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)	      \ +	do {								      \ +		if (TYPE == BPF_WRITE)					      \ +			SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \ +		else							      \ +			SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \ +	} while (0) + +	CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops, +				       SOCK_OPS_GET_TCP_SOCK_FIELD); + +	if (insn > insn_buf) +		return insn - insn_buf; +  	switch (si->off) {  	case offsetof(struct bpf_sock_ops, op) ...  	     
offsetof(struct bpf_sock_ops, replylong[3]): @@ -7225,175 +7634,15 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,  				      FIELD_SIZEOF(struct minmax_sample, t));  		break; -/* Helper macro for adding read access to tcp_sock or sock fields. */ -#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \ -	do {								      \ -		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \ -			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \ -		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \ -						struct bpf_sock_ops_kern,     \ -						is_fullsock),		      \ -				      si->dst_reg, si->src_reg,		      \ -				      offsetof(struct bpf_sock_ops_kern,      \ -					       is_fullsock));		      \ -		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \ -		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \ -						struct bpf_sock_ops_kern, sk),\ -				      si->dst_reg, si->src_reg,		      \ -				      offsetof(struct bpf_sock_ops_kern, sk));\ -		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \ -						       OBJ_FIELD),	      \ -				      si->dst_reg, si->dst_reg,		      \ -				      offsetof(OBJ, OBJ_FIELD));	      \ -	} while (0) - -/* Helper macro for adding write access to tcp_sock or sock fields. - * The macro is called with two registers, dst_reg which contains a pointer - * to ctx (context) and src_reg which contains the value that should be - * stored. However, we need an additional register since we cannot overwrite - * dst_reg because it may be used later in the program. - * Instead we "borrow" one of the other register. We first save its value - * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore - * it at the end of the macro. 
- */ -#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \ -	do {								      \ -		int reg = BPF_REG_9;					      \ -		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \ -			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \ -		if (si->dst_reg == reg || si->src_reg == reg)		      \ -			reg--;						      \ -		if (si->dst_reg == reg || si->src_reg == reg)		      \ -			reg--;						      \ -		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \ -				      offsetof(struct bpf_sock_ops_kern,      \ -					       temp));			      \ -		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \ -						struct bpf_sock_ops_kern,     \ -						is_fullsock),		      \ -				      reg, si->dst_reg,			      \ -				      offsetof(struct bpf_sock_ops_kern,      \ -					       is_fullsock));		      \ -		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \ -		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \ -						struct bpf_sock_ops_kern, sk),\ -				      reg, si->dst_reg,			      \ -				      offsetof(struct bpf_sock_ops_kern, sk));\ -		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),	      \ -				      reg, si->src_reg,			      \ -				      offsetof(OBJ, OBJ_FIELD));	      \ -		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \ -				      offsetof(struct bpf_sock_ops_kern,      \ -					       temp));			      \ -	} while (0) - -#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)	      \ -	do {								      \ -		if (TYPE == BPF_WRITE)					      \ -			SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \ -		else							      \ -			SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \ -	} while (0) - -	case offsetof(struct bpf_sock_ops, snd_cwnd): -		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, srtt_us): -		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); -		break; -  	case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):  		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, 
bpf_sock_ops_cb_flags,  				   struct tcp_sock);  		break; -	case offsetof(struct bpf_sock_ops, snd_ssthresh): -		SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, rcv_nxt): -		SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, snd_nxt): -		SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, snd_una): -		SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, mss_cache): -		SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, ecn_flags): -		SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, rate_delivered): -		SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, -				   struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, rate_interval_us): -		SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, -				   struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, packets_out): -		SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, retrans_out): -		SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, total_retrans): -		SOCK_OPS_GET_FIELD(total_retrans, total_retrans, -				   struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, segs_in): -		SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, data_segs_in): -		SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, segs_out): -		SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, data_segs_out): -		SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, -				   struct tcp_sock); -		break; - 
-	case offsetof(struct bpf_sock_ops, lost_out): -		SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, sacked_out): -		SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); -		break; -  	case offsetof(struct bpf_sock_ops, sk_txhash):  		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,  					  struct sock, type);  		break; - -	case offsetof(struct bpf_sock_ops, bytes_received): -		SOCK_OPS_GET_FIELD(bytes_received, bytes_received, -				   struct tcp_sock); -		break; - -	case offsetof(struct bpf_sock_ops, bytes_acked): -		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); -		break; -  	}  	return insn - insn_buf;  } @@ -7698,6 +7947,7 @@ const struct bpf_verifier_ops flow_dissector_verifier_ops = {  };  const struct bpf_prog_ops flow_dissector_prog_ops = { +	.test_run		= bpf_prog_test_run_flow_dissector,  };  int sk_detach_filter(struct sock *sk) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 9f2840510e63..bb1a54747d64 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -683,6 +683,46 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,  	}  } +bool __skb_flow_bpf_dissect(struct bpf_prog *prog, +			    const struct sk_buff *skb, +			    struct flow_dissector *flow_dissector, +			    struct bpf_flow_keys *flow_keys) +{ +	struct bpf_skb_data_end cb_saved; +	struct bpf_skb_data_end *cb; +	u32 result; + +	/* Note that even though the const qualifier is discarded +	 * throughout the execution of the BPF program, all changes(the +	 * control block) are reverted after the BPF program returns. +	 * Therefore, __skb_flow_dissect does not alter the skb. 
+	 */ + +	cb = (struct bpf_skb_data_end *)skb->cb; + +	/* Save Control Block */ +	memcpy(&cb_saved, cb, sizeof(cb_saved)); +	memset(cb, 0, sizeof(*cb)); + +	/* Pass parameters to the BPF program */ +	memset(flow_keys, 0, sizeof(*flow_keys)); +	cb->qdisc_cb.flow_keys = flow_keys; +	flow_keys->nhoff = skb_network_offset(skb); +	flow_keys->thoff = flow_keys->nhoff; + +	bpf_compute_data_pointers((struct sk_buff *)skb); +	result = BPF_PROG_RUN(prog, skb); + +	/* Restore state */ +	memcpy(cb, &cb_saved, sizeof(cb_saved)); + +	flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, 0, skb->len); +	flow_keys->thoff = clamp_t(u16, flow_keys->thoff, +				   flow_keys->nhoff, skb->len); + +	return result == BPF_OK; +} +  /**   * __skb_flow_dissect - extract the flow_keys struct and return it   * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -714,7 +754,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb,  	struct flow_dissector_key_vlan *key_vlan;  	enum flow_dissect_ret fdret;  	enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; -	struct bpf_prog *attached = NULL;  	int num_hdrs = 0;  	u8 ip_proto = 0;  	bool ret; @@ -754,53 +793,30 @@ bool __skb_flow_dissect(const struct sk_buff *skb,  					      FLOW_DISSECTOR_KEY_BASIC,  					      target_container); -	rcu_read_lock();  	if (skb) { +		struct bpf_flow_keys flow_keys; +		struct bpf_prog *attached = NULL; + +		rcu_read_lock(); +  		if (skb->dev)  			attached = rcu_dereference(dev_net(skb->dev)->flow_dissector_prog);  		else if (skb->sk)  			attached = rcu_dereference(sock_net(skb->sk)->flow_dissector_prog);  		else  			WARN_ON_ONCE(1); -	} -	if (attached) { -		/* Note that even though the const qualifier is discarded -		 * throughout the execution of the BPF program, all changes(the -		 * control block) are reverted after the BPF program returns. -		 * Therefore, __skb_flow_dissect does not alter the skb. 
-		 */ -		struct bpf_flow_keys flow_keys = {}; -		struct bpf_skb_data_end cb_saved; -		struct bpf_skb_data_end *cb; -		u32 result; - -		cb = (struct bpf_skb_data_end *)skb->cb; - -		/* Save Control Block */ -		memcpy(&cb_saved, cb, sizeof(cb_saved)); -		memset(cb, 0, sizeof(cb_saved)); -		/* Pass parameters to the BPF program */ -		cb->qdisc_cb.flow_keys = &flow_keys; -		flow_keys.nhoff = nhoff; -		flow_keys.thoff = nhoff; - -		bpf_compute_data_pointers((struct sk_buff *)skb); -		result = BPF_PROG_RUN(attached, skb); - -		/* Restore state */ -		memcpy(cb, &cb_saved, sizeof(cb_saved)); - -		flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len); -		flow_keys.thoff = clamp_t(u16, flow_keys.thoff, -					  flow_keys.nhoff, skb->len); - -		__skb_flow_bpf_to_target(&flow_keys, flow_dissector, -					 target_container); +		if (attached) { +			ret = __skb_flow_bpf_dissect(attached, skb, +						     flow_dissector, +						     &flow_keys); +			__skb_flow_bpf_to_target(&flow_keys, flow_dissector, +						 target_container); +			rcu_read_unlock(); +			return ret; +		}  		rcu_read_unlock(); -		return result == BPF_OK;  	} -	rcu_read_unlock();  	if (dissector_uses_key(flow_dissector,  			       FLOW_DISSECTOR_KEY_ETH_ADDRS)) { diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c new file mode 100644 index 000000000000..c3a00eac4804 --- /dev/null +++ b/net/core/flow_offload.c @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <net/flow_offload.h> + +struct flow_rule *flow_rule_alloc(unsigned int num_actions) +{ +	struct flow_rule *rule; + +	rule = kzalloc(sizeof(struct flow_rule) + +		       sizeof(struct flow_action_entry) * num_actions, +		       GFP_KERNEL); +	if (!rule) +		return NULL; + +	rule->action.num_entries = num_actions; + +	return rule; +} +EXPORT_SYMBOL(flow_rule_alloc); + +#define FLOW_DISSECTOR_MATCH(__rule, __type, __out)				\ +	const struct flow_match *__m = &(__rule)->match;			\ 
+	struct flow_dissector *__d = (__m)->dissector;				\ +										\ +	(__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key);	\ +	(__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask);	\ + +void flow_rule_match_basic(const struct flow_rule *rule, +			   struct flow_match_basic *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out); +} +EXPORT_SYMBOL(flow_rule_match_basic); + +void flow_rule_match_control(const struct flow_rule *rule, +			     struct flow_match_control *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_control); + +void flow_rule_match_eth_addrs(const struct flow_rule *rule, +			       struct flow_match_eth_addrs *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_eth_addrs); + +void flow_rule_match_vlan(const struct flow_rule *rule, +			  struct flow_match_vlan *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out); +} +EXPORT_SYMBOL(flow_rule_match_vlan); + +void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, +				struct flow_match_ipv4_addrs *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv4_addrs); + +void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, +				struct flow_match_ipv6_addrs *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_ipv6_addrs); + +void flow_rule_match_ip(const struct flow_rule *rule, +			struct flow_match_ip *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IP, out); +} +EXPORT_SYMBOL(flow_rule_match_ip); + +void flow_rule_match_ports(const struct flow_rule *rule, +			   struct flow_match_ports *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_ports); + +void flow_rule_match_tcp(const struct flow_rule *rule, +			 struct flow_match_tcp *out) +{ +	
FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_TCP, out); +} +EXPORT_SYMBOL(flow_rule_match_tcp); + +void flow_rule_match_icmp(const struct flow_rule *rule, +			  struct flow_match_icmp *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ICMP, out); +} +EXPORT_SYMBOL(flow_rule_match_icmp); + +void flow_rule_match_mpls(const struct flow_rule *rule, +			  struct flow_match_mpls *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_MPLS, out); +} +EXPORT_SYMBOL(flow_rule_match_mpls); + +void flow_rule_match_enc_control(const struct flow_rule *rule, +				 struct flow_match_control *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_control); + +void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, +				    struct flow_match_ipv4_addrs *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv4_addrs); + +void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, +				    struct flow_match_ipv6_addrs *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ipv6_addrs); + +void flow_rule_match_enc_ip(const struct flow_rule *rule, +			    struct flow_match_ip *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IP, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ip); + +void flow_rule_match_enc_ports(const struct flow_rule *rule, +			       struct flow_match_ports *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_ports); + +void flow_rule_match_enc_keyid(const struct flow_rule *rule, +			       struct flow_match_enc_keyid *out) +{ +	FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_keyid); + +void flow_rule_match_enc_opts(const struct flow_rule *rule, +			      struct flow_match_enc_opts *out) +{ +	FLOW_DISSECTOR_MATCH(rule, 
FLOW_DISSECTOR_KEY_ENC_OPTS, out); +} +EXPORT_SYMBOL(flow_rule_match_enc_opts); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 9bf1b9ad1780..ac679f74ba47 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -291,7 +291,6 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,  	for_each_possible_cpu(i) {  		const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); -		qstats->qlen = 0;  		qstats->backlog += qcpu->backlog;  		qstats->drops += qcpu->drops;  		qstats->requeues += qcpu->requeues; @@ -307,7 +306,6 @@ void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,  	if (cpu) {  		__gnet_stats_copy_queue_cpu(qstats, cpu);  	} else { -		qstats->qlen = q->qlen;  		qstats->backlog = q->backlog;  		qstats->drops = q->drops;  		qstats->requeues = q->requeues; diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index a648568c5e8f..cf2f8897ca19 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -16,6 +16,8 @@  #include <linux/types.h>  #include <linux/bpf.h>  #include <net/lwtunnel.h> +#include <net/gre.h> +#include <net/ip6_route.h>  struct bpf_lwt_prog {  	struct bpf_prog *prog; @@ -55,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,  	switch (ret) {  	case BPF_OK: +	case BPF_LWT_REROUTE:  		break;  	case BPF_REDIRECT: @@ -87,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,  	return ret;  } +static int bpf_lwt_input_reroute(struct sk_buff *skb) +{ +	int err = -EINVAL; + +	if (skb->protocol == htons(ETH_P_IP)) { +		struct iphdr *iph = ip_hdr(skb); + +		err = ip_route_input_noref(skb, iph->daddr, iph->saddr, +					   iph->tos, skb_dst(skb)->dev); +	} else if (skb->protocol == htons(ETH_P_IPV6)) { +		err = ipv6_stub->ipv6_route_input(skb); +	} else { +		err = -EAFNOSUPPORT; +	} + +	if (err) +		goto err; +	return dst_input(skb); + +err: +	kfree_skb(skb); +	return err; +} +  static int bpf_input(struct sk_buff *skb)  {  	struct dst_entry *dst = skb_dst(skb); @@ 
-98,11 +125,11 @@ static int bpf_input(struct sk_buff *skb)  		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);  		if (ret < 0)  			return ret; +		if (ret == BPF_LWT_REROUTE) +			return bpf_lwt_input_reroute(skb);  	}  	if (unlikely(!dst->lwtstate->orig_input)) { -		pr_warn_once("orig_input not set on dst for prog %s\n", -			     bpf->out.name);  		kfree_skb(skb);  		return -EINVAL;  	} @@ -147,6 +174,102 @@ static int xmit_check_hhlen(struct sk_buff *skb)  	return 0;  } +static int bpf_lwt_xmit_reroute(struct sk_buff *skb) +{ +	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); +	int oif = l3mdev ? l3mdev->ifindex : 0; +	struct dst_entry *dst = NULL; +	int err = -EAFNOSUPPORT; +	struct sock *sk; +	struct net *net; +	bool ipv4; + +	if (skb->protocol == htons(ETH_P_IP)) +		ipv4 = true; +	else if (skb->protocol == htons(ETH_P_IPV6)) +		ipv4 = false; +	else +		goto err; + +	sk = sk_to_full_sk(skb->sk); +	if (sk) { +		if (sk->sk_bound_dev_if) +			oif = sk->sk_bound_dev_if; +		net = sock_net(sk); +	} else { +		net = dev_net(skb_dst(skb)->dev); +	} + +	if (ipv4) { +		struct iphdr *iph = ip_hdr(skb); +		struct flowi4 fl4 = {}; +		struct rtable *rt; + +		fl4.flowi4_oif = oif; +		fl4.flowi4_mark = skb->mark; +		fl4.flowi4_uid = sock_net_uid(net, sk); +		fl4.flowi4_tos = RT_TOS(iph->tos); +		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; +		fl4.flowi4_proto = iph->protocol; +		fl4.daddr = iph->daddr; +		fl4.saddr = iph->saddr; + +		rt = ip_route_output_key(net, &fl4); +		if (IS_ERR(rt)) { +			err = PTR_ERR(rt); +			goto err; +		} +		dst = &rt->dst; +	} else { +		struct ipv6hdr *iph6 = ipv6_hdr(skb); +		struct flowi6 fl6 = {}; + +		fl6.flowi6_oif = oif; +		fl6.flowi6_mark = skb->mark; +		fl6.flowi6_uid = sock_net_uid(net, sk); +		fl6.flowlabel = ip6_flowinfo(iph6); +		fl6.flowi6_proto = iph6->nexthdr; +		fl6.daddr = iph6->daddr; +		fl6.saddr = iph6->saddr; + +		err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); +		if (unlikely(err)) +			goto err; +		if 
(IS_ERR(dst)) { +			err = PTR_ERR(dst); +			goto err; +		} +	} +	if (unlikely(dst->error)) { +		err = dst->error; +		dst_release(dst); +		goto err; +	} + +	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it +	 * was done for the previous dst, so we are doing it here again, in +	 * case the new dst needs much more space. The call below is a noop +	 * if there is enough header space in skb. +	 */ +	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); +	if (unlikely(err)) +		goto err; + +	skb_dst_drop(skb); +	skb_dst_set(skb, dst); + +	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); +	if (unlikely(err)) +		return err; + +	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ +	return LWTUNNEL_XMIT_DONE; + +err: +	kfree_skb(skb); +	return err; +} +  static int bpf_xmit(struct sk_buff *skb)  {  	struct dst_entry *dst = skb_dst(skb); @@ -154,11 +277,20 @@ static int bpf_xmit(struct sk_buff *skb)  	bpf = bpf_lwt_lwtunnel(dst->lwtstate);  	if (bpf->xmit.prog) { +		__be16 proto = skb->protocol;  		int ret;  		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);  		switch (ret) {  		case BPF_OK: +			/* If the header changed, e.g. via bpf_lwt_push_encap, +			 * BPF_LWT_REROUTE below should have been used if the +			 * protocol was also changed. +			 */ +			if (skb->protocol != proto) { +				kfree_skb(skb); +				return -EINVAL; +			}  			/* If the header was expanded, headroom might be too  			 * small for L2 header to come, expand as needed.  			 
*/ @@ -169,6 +301,8 @@ static int bpf_xmit(struct sk_buff *skb)  			return LWTUNNEL_XMIT_CONTINUE;  		case BPF_REDIRECT:  			return LWTUNNEL_XMIT_DONE; +		case BPF_LWT_REROUTE: +			return bpf_lwt_xmit_reroute(skb);  		default:  			return ret;  		} @@ -390,6 +524,133 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {  	.owner		= THIS_MODULE,  }; +static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, +			   int encap_len) +{ +	struct skb_shared_info *shinfo = skb_shinfo(skb); + +	gso_type |= SKB_GSO_DODGY; +	shinfo->gso_type |= gso_type; +	skb_decrease_gso_size(shinfo, encap_len); +	shinfo->gso_segs = 0; +	return 0; +} + +static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) +{ +	int next_hdr_offset; +	void *next_hdr; +	__u8 protocol; + +	/* SCTP and UDP_L4 gso need more nuanced handling than what +	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough. +	 * So at the moment only TCP GSO packets are let through. +	 */ +	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) +		return -ENOTSUPP; + +	if (ipv4) { +		protocol = ip_hdr(skb)->protocol; +		next_hdr_offset = sizeof(struct iphdr); +		next_hdr = skb_network_header(skb) + next_hdr_offset; +	} else { +		protocol = ipv6_hdr(skb)->nexthdr; +		next_hdr_offset = sizeof(struct ipv6hdr); +		next_hdr = skb_network_header(skb) + next_hdr_offset; +	} + +	switch (protocol) { +	case IPPROTO_GRE: +		next_hdr_offset += sizeof(struct gre_base_hdr); +		if (next_hdr_offset > encap_len) +			return -EINVAL; + +		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) +			return handle_gso_type(skb, SKB_GSO_GRE_CSUM, +					       encap_len); +		return handle_gso_type(skb, SKB_GSO_GRE, encap_len); + +	case IPPROTO_UDP: +		next_hdr_offset += sizeof(struct udphdr); +		if (next_hdr_offset > encap_len) +			return -EINVAL; + +		if (((struct udphdr *)next_hdr)->check) +			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, +					       encap_len); +		return 
handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); + +	case IPPROTO_IP: +	case IPPROTO_IPV6: +		if (ipv4) +			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); +		else +			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); + +	default: +		return -EPROTONOSUPPORT; +	} +} + +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) +{ +	struct iphdr *iph; +	bool ipv4; +	int err; + +	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) +		return -EINVAL; + +	/* validate protocol and length */ +	iph = (struct iphdr *)hdr; +	if (iph->version == 4) { +		ipv4 = true; +		if (unlikely(len < iph->ihl * 4)) +			return -EINVAL; +	} else if (iph->version == 6) { +		ipv4 = false; +		if (unlikely(len < sizeof(struct ipv6hdr))) +			return -EINVAL; +	} else { +		return -EINVAL; +	} + +	if (ingress) +		err = skb_cow_head(skb, len + skb->mac_len); +	else +		err = skb_cow_head(skb, +				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); +	if (unlikely(err)) +		return err; + +	/* push the encap headers and fix pointers */ +	skb_reset_inner_headers(skb); +	skb->encapsulation = 1; +	skb_push(skb, len); +	if (ingress) +		skb_postpush_rcsum(skb, iph, len); +	skb_reset_network_header(skb); +	memcpy(skb_network_header(skb), hdr, len); +	bpf_compute_data_pointers(skb); +	skb_clear_hash(skb); + +	if (ipv4) { +		skb->protocol = htons(ETH_P_IP); +		iph = ip_hdr(skb); + +		if (!iph->check) +			iph->check = ip_fast_csum((unsigned char *)iph, +						  iph->ihl); +	} else { +		skb->protocol = htons(ETH_P_IPV6); +	} + +	if (skb_is_gso(skb)) +		return handle_gso_encap(skb, ipv4, len); + +	return 0; +} +  static int __init bpf_lwt_init(void)  {  	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 0b171756453c..19b557bd294b 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -122,18 +122,18 @@ int lwtunnel_build_state(u16 encap_type,  	ret = -EOPNOTSUPP;  	
rcu_read_lock();  	ops = rcu_dereference(lwtun_encaps[encap_type]); -	if (likely(ops && ops->build_state && try_module_get(ops->owner))) { +	if (likely(ops && ops->build_state && try_module_get(ops->owner)))  		found = true; +	rcu_read_unlock(); + +	if (found) {  		ret = ops->build_state(encap, family, cfg, lws, extack);  		if (ret)  			module_put(ops->owner); -	} -	rcu_read_unlock(); - -	/* don't rely on -EOPNOTSUPP to detect match as build_state -	 * handlers could return it -	 */ -	if (!found) { +	} else { +		/* don't rely on -EOPNOTSUPP to detect match as build_state +		 * handlers could return it +		 */  		NL_SET_ERR_MSG_ATTR(extack, encap,  				    "LWT encapsulation type not supported");  	} diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 4230400b9a30..30f6fd8f68e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -42,6 +42,8 @@  #include <linux/inetdevice.h>  #include <net/addrconf.h> +#include <trace/events/neigh.h> +  #define DEBUG  #define NEIGH_DEBUG 1  #define neigh_dbg(level, fmt, ...)		
\ @@ -102,6 +104,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)  	if (neigh->parms->neigh_cleanup)  		neigh->parms->neigh_cleanup(neigh); +	trace_neigh_cleanup_and_release(neigh, 0);  	__neigh_notify(neigh, RTM_DELNEIGH, 0, 0);  	call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);  	neigh_release(neigh); @@ -1095,6 +1098,8 @@ out:  	if (notify)  		neigh_update_notify(neigh, 0); +	trace_neigh_timer_handler(neigh, 0); +  	neigh_release(neigh);  } @@ -1165,6 +1170,7 @@ out_unlock_bh:  	else  		write_unlock(&neigh->lock);  	local_bh_enable(); +	trace_neigh_event_send_done(neigh, rc);  	return rc;  out_dead: @@ -1172,6 +1178,7 @@ out_dead:  		goto out_unlock_bh;  	write_unlock_bh(&neigh->lock);  	kfree_skb(skb); +	trace_neigh_event_send_dead(neigh, 1);  	return 1;  }  EXPORT_SYMBOL(__neigh_event_send); @@ -1227,6 +1234,8 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,  	struct net_device *dev;  	int update_isrouter = 0; +	trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); +  	write_lock_bh(&neigh->lock);  	dev    = neigh->dev; @@ -1393,6 +1402,8 @@ out:  	if (notify)  		neigh_update_notify(neigh, nlmsg_pid); +	trace_neigh_update_done(neigh, err); +  	return err;  } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ff9fd2bb4ce4..4ff661f6f989 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -12,7 +12,6 @@  #include <linux/capability.h>  #include <linux/kernel.h>  #include <linux/netdevice.h> -#include <net/switchdev.h>  #include <linux/if_arp.h>  #include <linux/slab.h>  #include <linux/sched/signal.h> @@ -501,16 +500,11 @@ static ssize_t phys_switch_id_show(struct device *dev,  		return restart_syscall();  	if (dev_isalive(netdev)) { -		struct switchdev_attr attr = { -			.orig_dev = netdev, -			.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, -			.flags = SWITCHDEV_F_NO_RECURSE, -		}; +		struct netdev_phys_item_id ppid = { }; -		ret = switchdev_port_attr_get(netdev, &attr); +		ret = 
dev_get_port_parent_id(netdev, &ppid, false);  		if (!ret) -			ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len, -				      attr.u.ppid.id); +			ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);  	}  	rtnl_unlock(); @@ -1342,8 +1336,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)  		if (tc < 0)  			return -EINVAL;  	} -	mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), -		       GFP_KERNEL); +	mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);  	if (!mask)  		return -ENOMEM; @@ -1372,7 +1365,7 @@ out_no_maps:  	rcu_read_unlock();  	len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); -	kfree(mask); +	bitmap_free(mask);  	return len < PAGE_SIZE ? len : -EINVAL;  } @@ -1388,8 +1381,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,  	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))  		return -EPERM; -	mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long), -		       GFP_KERNEL); +	mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);  	if (!mask)  		return -ENOMEM; @@ -1397,7 +1389,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,  	err = bitmap_parse(buf, len, mask, dev->num_rx_queues);  	if (err) { -		kfree(mask); +		bitmap_free(mask);  		return err;  	} @@ -1405,7 +1397,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,  	err = __netif_set_xps_queue(dev, mask, index, true);  	cpus_read_unlock(); -	kfree(mask); +	bitmap_free(mask);  	return err ? 
: len;  } @@ -1547,6 +1539,9 @@ static int register_queue_kobjects(struct net_device *dev)  error:  	netdev_queue_update_kobjects(dev, txq, 0);  	net_rx_queue_update_kobjects(dev, rxq, 0); +#ifdef CONFIG_SYSFS +	kset_unregister(dev->queues_kset); +#endif  	return error;  } diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 419af6dfe29f..470b179d599e 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -43,6 +43,14 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);  EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);  #endif +#include <trace/events/neigh.h> +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_timer_handler); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_done); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead); +EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release); +  EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);  EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b02fb19df2cc..17f36317363d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -778,6 +778,41 @@ nla_put_failure:  	return -EMSGSIZE;  } +static int rtnl_net_valid_getid_req(struct sk_buff *skb, +				    const struct nlmsghdr *nlh, +				    struct nlattr **tb, +				    struct netlink_ext_ack *extack) +{ +	int i, err; + +	if (!netlink_strict_get_check(skb)) +		return nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, +				   rtnl_net_policy, extack); + +	err = nlmsg_parse_strict(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, +				 rtnl_net_policy, extack); +	if (err) +		return err; + +	for (i = 0; i <= NETNSA_MAX; i++) { +		if (!tb[i]) +			continue; + +		switch (i) { +		case NETNSA_PID: +		case NETNSA_FD: +		case NETNSA_NSID: +		case NETNSA_TARGET_NSID: +			break; +		default: +			NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request"); +			return -EINVAL; +		} +	} + +	return 0; +} +  static int 
rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,  			  struct netlink_ext_ack *extack)  { @@ -793,8 +828,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,  	struct sk_buff *msg;  	int err; -	err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, -			  rtnl_net_policy, extack); +	err = rtnl_net_valid_getid_req(skb, nlh, tb, extack);  	if (err < 0)  		return err;  	if (tb[NETNSA_PID]) { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 43a932cb609b..5b2252c6d49b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -136,17 +136,19 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,  	if (!(pool->p.flags & PP_FLAG_DMA_MAP))  		goto skip_dma_map; -	/* Setup DMA mapping: use page->private for DMA-addr +	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr +	 * since dma_addr_t can be either 32 or 64 bits and does not always fit +	 * into page private data (i.e 32bit cpu with 64bit DMA caps)  	 * This mapping is kept for lifetime of page, until leaving pool.  	 */ -	dma = dma_map_page(pool->p.dev, page, 0, -			   (PAGE_SIZE << pool->p.order), -			   pool->p.dma_dir); +	dma = dma_map_page_attrs(pool->p.dev, page, 0, +				 (PAGE_SIZE << pool->p.order), +				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);  	if (dma_mapping_error(pool->p.dev, dma)) {  		put_page(page);  		return NULL;  	} -	set_page_private(page, dma); /* page->private = dma; */ +	page->dma_addr = dma;  skip_dma_map:  	/* When page just alloc'ed is should/must have refcnt 1. 
*/ @@ -175,13 +177,17 @@ EXPORT_SYMBOL(page_pool_alloc_pages);  static void __page_pool_clean_page(struct page_pool *pool,  				   struct page *page)  { +	dma_addr_t dma; +  	if (!(pool->p.flags & PP_FLAG_DMA_MAP))  		return; +	dma = page->dma_addr;  	/* DMA unmap */ -	dma_unmap_page(pool->p.dev, page_private(page), -		       PAGE_SIZE << pool->p.order, pool->p.dma_dir); -	set_page_private(page, 0); +	dma_unmap_page_attrs(pool->p.dev, dma, +			     PAGE_SIZE << pool->p.order, pool->p.dma_dir, +			     DMA_ATTR_SKIP_CPU_SYNC); +	page->dma_addr = 0;  }  /* Return a page to the page allocator, cleaning up our state */ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6ac919847ce6..f3f5a78cd062 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -158,6 +158,7 @@  #include <linux/etherdevice.h>  #include <linux/kthread.h>  #include <linux/prefetch.h> +#include <linux/mmzone.h>  #include <net/net_namespace.h>  #include <net/checksum.h>  #include <net/ipv6.h> @@ -3625,7 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)  	pkt_dev->svlan_cfi = 0;  	pkt_dev->svlan_id = 0xffff;  	pkt_dev->burst = 1; -	pkt_dev->node = -1; +	pkt_dev->node = NUMA_NO_NODE;  	err = pktgen_setup_dev(t->net, pkt_dev, ifname);  	if (err) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5ea1bed08ede..a51cab95ba64 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -46,7 +46,6 @@  #include <linux/inet.h>  #include <linux/netdevice.h> -#include <net/switchdev.h>  #include <net/ip.h>  #include <net/protocol.h>  #include <net/arp.h> @@ -1146,22 +1145,17 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)  static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)  { +	struct netdev_phys_item_id ppid = { };  	int err; -	struct switchdev_attr attr = { -		.orig_dev = dev, -		.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID, -		.flags = SWITCHDEV_F_NO_RECURSE, -	}; -	err = 
switchdev_port_attr_get(dev, &attr); +	err = dev_get_port_parent_id(dev, &ppid, false);  	if (err) {  		if (err == -EOPNOTSUPP)  			return 0;  		return err;  	} -	if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len, -		    attr.u.ppid.id)) +	if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id))  		return -EMSGSIZE;  	return 0; @@ -3242,6 +3236,53 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,  	return ret;  } +static int rtnl_valid_getlink_req(struct sk_buff *skb, +				  const struct nlmsghdr *nlh, +				  struct nlattr **tb, +				  struct netlink_ext_ack *extack) +{ +	struct ifinfomsg *ifm; +	int i, err; + +	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { +		NL_SET_ERR_MSG(extack, "Invalid header for get link"); +		return -EINVAL; +	} + +	if (!netlink_strict_get_check(skb)) +		return nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, +				   extack); + +	ifm = nlmsg_data(nlh); +	if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || +	    ifm->ifi_change) { +		NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); +		return -EINVAL; +	} + +	err = nlmsg_parse_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, +				 extack); +	if (err) +		return err; + +	for (i = 0; i <= IFLA_MAX; i++) { +		if (!tb[i]) +			continue; + +		switch (i) { +		case IFLA_IFNAME: +		case IFLA_EXT_MASK: +		case IFLA_TARGET_NETNSID: +			break; +		default: +			NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); +			return -EINVAL; +		} +	} + +	return 0; +} +  static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,  			struct netlink_ext_ack *extack)  { @@ -3256,7 +3297,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,  	int err;  	u32 ext_filter_mask = 0; -	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); +	err = rtnl_valid_getlink_req(skb, nlh, tb, extack);  	if (err < 0)  		return err; @@ -3639,7 +3680,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, 
struct nlmsghdr *nlh,  		const struct net_device_ops *ops = br_dev->netdev_ops;  		err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid, -				       nlh->nlmsg_flags); +				       nlh->nlmsg_flags, extack);  		if (err)  			goto out;  		else @@ -3651,7 +3692,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,  		if (dev->netdev_ops->ndo_fdb_add)  			err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,  							   vid, -							   nlh->nlmsg_flags); +							   nlh->nlmsg_flags, +							   extack);  		else  			err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,  					       nlh->nlmsg_flags); @@ -4901,6 +4943,40 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,  	return size;  } +static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, +				bool is_dump, struct netlink_ext_ack *extack) +{ +	struct if_stats_msg *ifsm; + +	if (nlh->nlmsg_len < sizeof(*ifsm)) { +		NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); +		return -EINVAL; +	} + +	if (!strict_check) +		return 0; + +	ifsm = nlmsg_data(nlh); + +	/* only requests using strict checks can pass data to influence +	 * the dump. The legacy exception is filter_mask. 
+	 */ +	if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { +		NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); +		return -EINVAL; +	} +	if (nlmsg_attrlen(nlh, sizeof(*ifsm))) { +		NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); +		return -EINVAL; +	} +	if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { +		NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); +		return -EINVAL; +	} + +	return 0; +} +  static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,  			  struct netlink_ext_ack *extack)  { @@ -4912,8 +4988,10 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,  	u32 filter_mask;  	int err; -	if (nlmsg_len(nlh) < sizeof(*ifsm)) -		return -EINVAL; +	err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), +				   false, extack); +	if (err) +		return err;  	ifsm = nlmsg_data(nlh);  	if (ifsm->ifindex > 0) @@ -4965,27 +5043,11 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)  	cb->seq = net->dev_base_seq; -	if (nlmsg_len(cb->nlh) < sizeof(*ifsm)) { -		NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); -		return -EINVAL; -	} +	err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); +	if (err) +		return err;  	ifsm = nlmsg_data(cb->nlh); - -	/* only requests using strict checks can pass data to influence -	 * the dump. The legacy exception is filter_mask. 
-	 */ -	if (cb->strict_check) { -		if (ifsm->pad1 || ifsm->pad2 || ifsm->ifindex) { -			NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); -			return -EINVAL; -		} -		if (nlmsg_attrlen(cb->nlh, sizeof(*ifsm))) { -			NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); -			return -EINVAL; -		} -	} -  	filter_mask = ifsm->filter_mask;  	if (!filter_mask) {  		NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); diff --git a/net/core/scm.c b/net/core/scm.c index b1ff8a441748..52ef219cf6df 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -29,6 +29,7 @@  #include <linux/pid.h>  #include <linux/nsproxy.h>  #include <linux/slab.h> +#include <linux/errqueue.h>  #include <linux/uaccess.h> @@ -252,6 +253,32 @@ out:  }  EXPORT_SYMBOL(put_cmsg); +void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ +	struct scm_timestamping64 tss; +	int i; + +	for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { +		tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; +		tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; +	} + +	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping64); + +void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ +	struct scm_timestamping tss; +	int i; + +	for (i = 0; i < ARRAY_SIZE(tss.ts); i++) +		tss.ts[i] = timespec64_to_timespec(tss_internal->ts[i]); + +	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping); +  void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)  {  	struct cmsghdr __user *cm diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 26d848484912..2415d9cb9b89 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -356,6 +356,8 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)   */  void *netdev_alloc_frag(unsigned int fragsz)  { +	fragsz = SKB_DATA_ALIGN(fragsz); + 
 	return __netdev_alloc_frag(fragsz, GFP_ATOMIC);  }  EXPORT_SYMBOL(netdev_alloc_frag); @@ -369,6 +371,8 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)  void *napi_alloc_frag(unsigned int fragsz)  { +	fragsz = SKB_DATA_ALIGN(fragsz); +  	return __napi_alloc_frag(fragsz, GFP_ATOMIC);  }  EXPORT_SYMBOL(napi_alloc_frag); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d6d5c20d7044..ae6f06e45737 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -78,11 +78,9 @@ int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,  {  	int i = src->sg.start;  	struct scatterlist *sge = sk_msg_elem(src, i); +	struct scatterlist *sgd = NULL;  	u32 sge_len, sge_off; -	if (sk_msg_full(dst)) -		return -ENOSPC; -  	while (off) {  		if (sge->length > off)  			break; @@ -94,16 +92,27 @@ int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,  	}  	while (len) { -		if (sk_msg_full(dst)) -			return -ENOSPC; -  		sge_len = sge->length - off; -		sge_off = sge->offset + off;  		if (sge_len > len)  			sge_len = len; + +		if (dst->sg.end) +			sgd = sk_msg_elem(dst, dst->sg.end - 1); + +		if (sgd && +		    (sg_page(sge) == sg_page(sgd)) && +		    (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) { +			sgd->length += sge_len; +			dst->sg.size += sge_len; +		} else if (!sk_msg_full(dst)) { +			sge_off = sge->offset + off; +			sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off); +		} else { +			return -ENOSPC; +		} +  		off = 0;  		len -= sge_len; -		sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);  		sk_mem_charge(sk, sge_len);  		sk_msg_iter_var_next(i);  		if (i == src->sg.end && len) @@ -545,8 +554,7 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)  	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);  	/* No sk_callback_lock since already detached. 
*/ -	if (psock->parser.enabled) -		strp_done(&psock->parser.strp); +	strp_done(&psock->parser.strp);  	cancel_work_sync(&psock->work); diff --git a/net/core/sock.c b/net/core/sock.c index 6aa2e7e0b4fb..782343bb925b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -335,14 +335,68 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)  }  EXPORT_SYMBOL(__sk_backlog_rcv); -static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) +static int sock_get_timeout(long timeo, void *optval, bool old_timeval)  { -	struct timeval tv; +	struct __kernel_sock_timeval tv; +	int size; -	if (optlen < sizeof(tv)) -		return -EINVAL; -	if (copy_from_user(&tv, optval, sizeof(tv))) -		return -EFAULT; +	if (timeo == MAX_SCHEDULE_TIMEOUT) { +		tv.tv_sec = 0; +		tv.tv_usec = 0; +	} else { +		tv.tv_sec = timeo / HZ; +		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; +	} + +	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { +		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; +		*(struct old_timeval32 *)optval = tv32; +		return sizeof(tv32); +	} + +	if (old_timeval) { +		struct __kernel_old_timeval old_tv; +		old_tv.tv_sec = tv.tv_sec; +		old_tv.tv_usec = tv.tv_usec; +		*(struct __kernel_old_timeval *)optval = old_tv; +		size = sizeof(old_tv); +	} else { +		*(struct __kernel_sock_timeval *)optval = tv; +		size = sizeof(tv); +	} + +	return size; +} + +static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen, bool old_timeval) +{ +	struct __kernel_sock_timeval tv; + +	if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { +		struct old_timeval32 tv32; + +		if (optlen < sizeof(tv32)) +			return -EINVAL; + +		if (copy_from_user(&tv32, optval, sizeof(tv32))) +			return -EFAULT; +		tv.tv_sec = tv32.tv_sec; +		tv.tv_usec = tv32.tv_usec; +	} else if (old_timeval) { +		struct __kernel_old_timeval old_tv; + +		if (optlen < sizeof(old_tv)) +			return -EINVAL; +		if (copy_from_user(&old_tv, optval, sizeof(old_tv))) +			return -EFAULT; +		tv.tv_sec = 
old_tv.tv_sec; +		tv.tv_usec = old_tv.tv_usec; +	} else { +		if (optlen < sizeof(tv)) +			return -EINVAL; +		if (copy_from_user(&tv, optval, sizeof(tv))) +			return -EFAULT; +	}  	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)  		return -EDOM; @@ -360,8 +414,8 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)  	*timeo_p = MAX_SCHEDULE_TIMEOUT;  	if (tv.tv_sec == 0 && tv.tv_usec == 0)  		return 0; -	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) -		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ); +	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) +		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);  	return 0;  } @@ -520,14 +574,11 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)  }  EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice(struct sock *sk, char __user *optval, -				int optlen) +static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)  {  	int ret = -ENOPROTOOPT;  #ifdef CONFIG_NETDEVICES  	struct net *net = sock_net(sk); -	char devname[IFNAMSIZ]; -	int index;  	/* Sorry... 
*/  	ret = -EPERM; @@ -535,6 +586,32 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,  		goto out;  	ret = -EINVAL; +	if (ifindex < 0) +		goto out; + +	sk->sk_bound_dev_if = ifindex; +	if (sk->sk_prot->rehash) +		sk->sk_prot->rehash(sk); +	sk_dst_reset(sk); + +	ret = 0; + +out: +#endif + +	return ret; +} + +static int sock_setbindtodevice(struct sock *sk, char __user *optval, +				int optlen) +{ +	int ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES +	struct net *net = sock_net(sk); +	char devname[IFNAMSIZ]; +	int index; + +	ret = -EINVAL;  	if (optlen < 0)  		goto out; @@ -566,14 +643,9 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,  	}  	lock_sock(sk); -	sk->sk_bound_dev_if = index; -	if (sk->sk_prot->rehash) -		sk->sk_prot->rehash(sk); -	sk_dst_reset(sk); +	ret = sock_setbindtodevice_locked(sk, index);  	release_sock(sk); -	ret = 0; -  out:  #endif @@ -713,6 +785,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,  		 */  		val = min_t(u32, val, sysctl_wmem_max);  set_sndbuf: +		/* Ensure val * 2 fits into an int, to prevent max_t() +		 * from treating it as a negative value. +		 */ +		val = min_t(int, val, INT_MAX / 2);  		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;  		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);  		/* Wake up sending tasks if we upped the value. */ @@ -724,6 +800,12 @@ set_sndbuf:  			ret = -EPERM;  			break;  		} + +		/* No negative values (to prevent underflow, as val will be +		 * multiplied by 2). +		 */ +		if (val < 0) +			val = 0;  		goto set_sndbuf;  	case SO_RCVBUF: @@ -734,6 +816,10 @@ set_sndbuf:  		 */  		val = min_t(u32, val, sysctl_rmem_max);  set_rcvbuf: +		/* Ensure val * 2 fits into an int, to prevent max_t() +		 * from treating it as a negative value. 
+		 */ +		val = min_t(int, val, INT_MAX / 2);  		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;  		/*  		 * We double it on the way in to account for @@ -758,6 +844,12 @@ set_rcvbuf:  			ret = -EPERM;  			break;  		} + +		/* No negative values (to prevent underflow, as val will be +		 * multiplied by 2). +		 */ +		if (val < 0) +			val = 0;  		goto set_rcvbuf;  	case SO_KEEPALIVE: @@ -815,10 +907,17 @@ set_rcvbuf:  			clear_bit(SOCK_PASSCRED, &sock->flags);  		break; -	case SO_TIMESTAMP: -	case SO_TIMESTAMPNS: +	case SO_TIMESTAMP_OLD: +	case SO_TIMESTAMP_NEW: +	case SO_TIMESTAMPNS_OLD: +	case SO_TIMESTAMPNS_NEW:  		if (valbool)  { -			if (optname == SO_TIMESTAMP) +			if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) +				sock_set_flag(sk, SOCK_TSTAMP_NEW); +			else +				sock_reset_flag(sk, SOCK_TSTAMP_NEW); + +			if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW)  				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);  			else  				sock_set_flag(sk, SOCK_RCVTSTAMPNS); @@ -827,10 +926,14 @@ set_rcvbuf:  		} else {  			sock_reset_flag(sk, SOCK_RCVTSTAMP);  			sock_reset_flag(sk, SOCK_RCVTSTAMPNS); +			sock_reset_flag(sk, SOCK_TSTAMP_NEW);  		}  		break; -	case SO_TIMESTAMPING: +	case SO_TIMESTAMPING_NEW: +		sock_set_flag(sk, SOCK_TSTAMP_NEW); +		/* fall through */ +	case SO_TIMESTAMPING_OLD:  		if (val & ~SOF_TIMESTAMPING_MASK) {  			ret = -EINVAL;  			break; @@ -861,9 +964,13 @@ set_rcvbuf:  		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)  			sock_enable_timestamp(sk,  					      SOCK_TIMESTAMPING_RX_SOFTWARE); -		else +		else { +			if (optname == SO_TIMESTAMPING_NEW) +				sock_reset_flag(sk, SOCK_TSTAMP_NEW); +  			sock_disable_timestamp(sk,  					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); +		}  		break;  	case SO_RCVLOWAT: @@ -875,12 +982,14 @@ set_rcvbuf:  			sk->sk_rcvlowat = val ? 
: 1;  		break; -	case SO_RCVTIMEO: -		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); +	case SO_RCVTIMEO_OLD: +	case SO_RCVTIMEO_NEW: +		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen, optname == SO_RCVTIMEO_OLD);  		break; -	case SO_SNDTIMEO: -		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); +	case SO_SNDTIMEO_OLD: +	case SO_SNDTIMEO_NEW: +		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD);  		break;  	case SO_ATTACH_FILTER: @@ -999,15 +1108,23 @@ set_rcvbuf:  #endif  	case SO_MAX_PACING_RATE: -		if (val != ~0U) +		{ +		unsigned long ulval = (val == ~0U) ? ~0UL : val; + +		if (sizeof(ulval) != sizeof(val) && +		    optlen >= sizeof(ulval) && +		    get_user(ulval, (unsigned long __user *)optval)) { +			ret = -EFAULT; +			break; +		} +		if (ulval != ~0UL)  			cmpxchg(&sk->sk_pacing_status,  				SK_PACING_NONE,  				SK_PACING_NEEDED); -		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val; -		sk->sk_pacing_rate = min(sk->sk_pacing_rate, -					 sk->sk_max_pacing_rate); +		sk->sk_max_pacing_rate = ulval; +		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);  		break; - +		}  	case SO_INCOMING_CPU:  		sk->sk_incoming_cpu = val;  		break; @@ -1055,6 +1172,10 @@ set_rcvbuf:  		}  		break; +	case SO_BINDTOIFINDEX: +		ret = sock_setbindtodevice_locked(sk, val); +		break; +  	default:  		ret = -ENOPROTOOPT;  		break; @@ -1098,8 +1219,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  	union {  		int val;  		u64 val64; +		unsigned long ulval;  		struct linger ling; -		struct timeval tm; +		struct old_timeval32 tm32; +		struct __kernel_old_timeval tm; +		struct  __kernel_sock_timeval stm;  		struct sock_txtime txtime;  	} v; @@ -1186,39 +1310,36 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  		sock_warn_obsolete_bsdism("getsockopt");  		break; -	case SO_TIMESTAMP: +	case SO_TIMESTAMP_OLD:  		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && +				!sock_flag(sk, SOCK_TSTAMP_NEW) 
&&  				!sock_flag(sk, SOCK_RCVTSTAMPNS);  		break; -	case SO_TIMESTAMPNS: -		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); +	case SO_TIMESTAMPNS_OLD: +		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);  		break; -	case SO_TIMESTAMPING: +	case SO_TIMESTAMP_NEW: +		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); +		break; + +	case SO_TIMESTAMPNS_NEW: +		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); +		break; + +	case SO_TIMESTAMPING_OLD:  		v.val = sk->sk_tsflags;  		break; -	case SO_RCVTIMEO: -		lv = sizeof(struct timeval); -		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { -			v.tm.tv_sec = 0; -			v.tm.tv_usec = 0; -		} else { -			v.tm.tv_sec = sk->sk_rcvtimeo / HZ; -			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; -		} +	case SO_RCVTIMEO_OLD: +	case SO_RCVTIMEO_NEW: +		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);  		break; -	case SO_SNDTIMEO: -		lv = sizeof(struct timeval); -		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { -			v.tm.tv_sec = 0; -			v.tm.tv_usec = 0; -		} else { -			v.tm.tv_sec = sk->sk_sndtimeo / HZ; -			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; -		} +	case SO_SNDTIMEO_OLD: +	case SO_SNDTIMEO_NEW: +		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);  		break;  	case SO_RCVLOWAT: @@ -1344,8 +1465,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  #endif  	case SO_MAX_PACING_RATE: -		/* 32bit version */ -		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); +		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { +			lv = sizeof(v.ulval); +			v.ulval = sk->sk_max_pacing_rate; +		} else { +			/* 32bit version */ +			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); +		}  		break;  	case SO_INCOMING_CPU: @@ -1399,6 +1525,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,  				  SOF_TXTIME_REPORT_ERRORS : 0;  		break; +	case 
SO_BINDTOIFINDEX: +		v.val = sk->sk_bound_dev_if; +		break; +  	default:  		/* We implement the SO_SNDLOWAT etc to not be settable  		 * (1003.1g 7). @@ -1726,7 +1856,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)  		newsk->sk_err_soft = 0;  		newsk->sk_priority = 0;  		newsk->sk_incoming_cpu = raw_smp_processor_id(); -		atomic64_set(&newsk->sk_cookie, 0);  		if (likely(newsk->sk_net_refcnt))  			sock_inuse_add(sock_net(newsk), 1); @@ -1750,7 +1879,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)  		 */  		sk_refcnt_debug_inc(newsk);  		sk_set_socket(newsk, NULL); -		newsk->sk_wq = NULL; +		RCU_INIT_POINTER(newsk->sk_wq, NULL);  		if (newsk->sk_prot->sockets_allocated)  			sk_sockets_allocated_inc(newsk); @@ -2122,7 +2251,7 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,  			return -EINVAL;  		sockc->mark = *(u32 *)CMSG_DATA(cmsg);  		break; -	case SO_TIMESTAMPING: +	case SO_TIMESTAMPING_OLD:  		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))  			return -EINVAL; @@ -2380,7 +2509,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)  	}  	if (sk_has_memory_pressure(sk)) { -		int alloc; +		u64 alloc;  		if (!sk_under_memory_pressure(sk))  			return 1; @@ -2713,11 +2842,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)  	if (sock) {  		sk->sk_type	=	sock->type; -		sk->sk_wq	=	sock->wq; +		RCU_INIT_POINTER(sk->sk_wq, sock->wq);  		sock->sk	=	sk;  		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;  	} else { -		sk->sk_wq	=	NULL; +		RCU_INIT_POINTER(sk->sk_wq, NULL);  		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);  	} diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d67ec17f2cc8..84bf2861f45f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -36,6 +36,15 @@ static int net_msg_warn;	/* Unused, but still a sysctl */  int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;  
EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); +/* 0 - Keep current behavior: + *     IPv4: inherit all current settings from init_net + *     IPv6: reset all settings to default + * 1 - Both inherit all current settings from init_net + * 2 - Both reset all settings to default + */ +int sysctl_devconf_inherit_init_net __read_mostly; +EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); +  #ifdef CONFIG_RPS  static int rps_sock_flow_sysctl(struct ctl_table *table, int write,  				void __user *buffer, size_t *lenp, loff_t *ppos) @@ -544,6 +553,15 @@ static struct ctl_table net_core_table[] = {  		.extra1		= &zero,  		.extra2		= &one,  	}, +	{ +		.procname	= "devconf_inherit_init_net", +		.data		= &sysctl_devconf_inherit_init_net, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &two, +	},  	{ }  };  |