Diffstat (limited to 'drivers/infiniband/core')
34 files changed, 1786 insertions, 763 deletions
| diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 0dce94e3c495..2f7d14159841 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -42,9 +42,10 @@  #include <net/neighbour.h>  #include <net/route.h>  #include <net/netevent.h> -#include <net/addrconf.h> +#include <net/ipv6_stubs.h>  #include <net/ip6_route.h>  #include <rdma/ib_addr.h> +#include <rdma/ib_cache.h>  #include <rdma/ib_sa.h>  #include <rdma/ib.h>  #include <rdma/rdma_netlink.h> @@ -86,8 +87,8 @@ static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)  	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)  		return false; -	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), -			nlmsg_len(nlh), ib_nl_addr_policy, NULL); +	ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), +				   nlmsg_len(nlh), ib_nl_addr_policy, NULL);  	if (ret)  		return false; @@ -351,7 +352,7 @@ static bool has_gateway(const struct dst_entry *dst, sa_family_t family)  	if (family == AF_INET) {  		rt = container_of(dst, struct rtable, dst); -		return rt->rt_uses_gateway; +		return rt->rt_gw_family == AF_INET;  	}  	rt6 = container_of(dst, struct rt6_info, dst); @@ -730,8 +731,8 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec,  	if (rec->roce.route_resolved)  		return 0; -	rdma_gid2ip(&sgid._sockaddr, &rec->sgid); -	rdma_gid2ip(&dgid._sockaddr, &rec->dgid); +	rdma_gid2ip((struct sockaddr *)&sgid, &rec->sgid); +	rdma_gid2ip((struct sockaddr *)&dgid, &rec->dgid);  	if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family)  		return -EINVAL; @@ -742,7 +743,7 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec,  	dev_addr.net = &init_net;  	dev_addr.sgid_attr = attr; -	ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, +	ret = addr_resolve((struct sockaddr *)&sgid, (struct sockaddr *)&dgid,  			   &dev_addr, false, true, 0);  	if (ret)  		return ret; @@ -814,22 +815,22 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,  	struct rdma_dev_addr dev_addr;  	struct resolve_cb_context ctx;  	union { -		struct sockaddr     _sockaddr;  		struct sockaddr_in  _sockaddr_in;  		struct sockaddr_in6 _sockaddr_in6;  	} sgid_addr, dgid_addr;  	int ret; -	rdma_gid2ip(&sgid_addr._sockaddr, sgid); -	rdma_gid2ip(&dgid_addr._sockaddr, dgid); +	rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid); +	rdma_gid2ip((struct sockaddr *)&dgid_addr, dgid);  	memset(&dev_addr, 0, sizeof(dev_addr));  	dev_addr.net = &init_net;  	dev_addr.sgid_attr = sgid_attr;  	init_completion(&ctx.comp); -	ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, -			      &dev_addr, 1000, resolve_cb, true, &ctx); +	ret = rdma_resolve_ip((struct sockaddr *)&sgid_addr, +			      (struct sockaddr *)&dgid_addr, &dev_addr, 1000, +			      resolve_cb, true, &ctx);  	if (ret)  		return ret; diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 43c67e5f43c6..18e476b3ced0 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -78,11 +78,22 @@ enum gid_table_entry_state {  	GID_TABLE_ENTRY_PENDING_DEL	= 3,  }; +struct roce_gid_ndev_storage { +	struct rcu_head rcu_head; +	struct net_device *ndev; +}; +  struct ib_gid_table_entry {  	struct kref			kref;  	struct work_struct		del_work;  	struct ib_gid_attr		attr;  	void				*context; +	/* Store the ndev pointer to release reference later on in +	 * call_rcu context because by that time gid_table_entry +	 * and attr might be already freed. So keep a copy of it. 
+	 * ndev_storage is freed by rcu callback. +	 */ +	struct roce_gid_ndev_storage	*ndev_storage;  	enum gid_table_entry_state	state;  }; @@ -206,6 +217,20 @@ static void schedule_free_gid(struct kref *kref)  	queue_work(ib_wq, &entry->del_work);  } +static void put_gid_ndev(struct rcu_head *head) +{ +	struct roce_gid_ndev_storage *storage = +		container_of(head, struct roce_gid_ndev_storage, rcu_head); + +	WARN_ON(!storage->ndev); +	/* At this point its safe to release netdev reference, +	 * as all callers working on gid_attr->ndev are done +	 * using this netdev. +	 */ +	dev_put(storage->ndev); +	kfree(storage); +} +  static void free_gid_entry_locked(struct ib_gid_table_entry *entry)  {  	struct ib_device *device = entry->attr.device; @@ -228,8 +253,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry)  	/* Now this index is ready to be allocated */  	write_unlock_irq(&table->rwlock); -	if (entry->attr.ndev) -		dev_put(entry->attr.ndev); +	if (entry->ndev_storage) +		call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev);  	kfree(entry);  } @@ -266,14 +291,25 @@ static struct ib_gid_table_entry *  alloc_gid_entry(const struct ib_gid_attr *attr)  {  	struct ib_gid_table_entry *entry; +	struct net_device *ndev;  	entry = kzalloc(sizeof(*entry), GFP_KERNEL);  	if (!entry)  		return NULL; + +	ndev = rcu_dereference_protected(attr->ndev, 1); +	if (ndev) { +		entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage), +					      GFP_KERNEL); +		if (!entry->ndev_storage) { +			kfree(entry); +			return NULL; +		} +		dev_hold(ndev); +		entry->ndev_storage->ndev = ndev; +	}  	kref_init(&entry->kref);  	memcpy(&entry->attr, attr, sizeof(*attr)); -	if (entry->attr.ndev) -		dev_hold(entry->attr.ndev);  	INIT_WORK(&entry->del_work, free_gid_work);  	entry->state = GID_TABLE_ENTRY_INVALID;  	return entry; @@ -343,6 +379,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry)  static void del_gid(struct ib_device *ib_dev, u8 port,  		    struct ib_gid_table *table, int ix)  { +	struct roce_gid_ndev_storage *ndev_storage;  	struct ib_gid_table_entry *entry;  	lockdep_assert_held(&table->lock); @@ -360,6 +397,13 @@ static void del_gid(struct ib_device *ib_dev, u8 port,  		table->data_vec[ix] = NULL;  	write_unlock_irq(&table->rwlock); +	ndev_storage = entry->ndev_storage; +	if (ndev_storage) { +		entry->ndev_storage = NULL; +		rcu_assign_pointer(entry->attr.ndev, NULL); +		call_rcu(&ndev_storage->rcu_head, put_gid_ndev); +	} +  	if (rdma_cap_roce_gid_table(ib_dev, port))  		ib_dev->ops.del_gid(&entry->attr, &entry->context); @@ -543,30 +587,11 @@ out_unlock:  int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,  		     union ib_gid *gid, struct ib_gid_attr *attr)  { -	struct net_device *idev; -	unsigned long mask; -	int ret; - -	idev = ib_device_get_netdev(ib_dev, port); -	if (idev && attr->ndev != idev) { -		union ib_gid default_gid; - -		/* Adding default GIDs is not permitted */ -		make_default_gid(idev, &default_gid); -		if (!memcmp(gid, &default_gid, sizeof(*gid))) { -			dev_put(idev); -			return -EPERM; -		} -	} -	if (idev) -		dev_put(idev); - -	mask = GID_ATTR_FIND_MASK_GID | -	       GID_ATTR_FIND_MASK_GID_TYPE | -	       GID_ATTR_FIND_MASK_NETDEV; +	unsigned long mask = GID_ATTR_FIND_MASK_GID | +			     GID_ATTR_FIND_MASK_GID_TYPE | +			     GID_ATTR_FIND_MASK_NETDEV; -	ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); -	return ret; +	return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);  }  static int @@ -1263,11 +1288,72 @@ struct net_device 
*rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr)  	read_lock_irqsave(&table->rwlock, flags);  	valid = is_gid_entry_valid(table->data_vec[attr->index]); -	if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP)) -		ndev = attr->ndev; +	if (valid) { +		ndev = rcu_dereference(attr->ndev); +		if (!ndev || +		    (ndev && ((READ_ONCE(ndev->flags) & IFF_UP) == 0))) +			ndev = ERR_PTR(-ENODEV); +	}  	read_unlock_irqrestore(&table->rwlock, flags);  	return ndev;  } +EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); + +static int get_lower_dev_vlan(struct net_device *lower_dev, void *data) +{ +	u16 *vlan_id = data; + +	if (is_vlan_dev(lower_dev)) +		*vlan_id = vlan_dev_vlan_id(lower_dev); + +	/* We are interested only in first level vlan device, so +	 * always return 1 to stop iterating over next level devices. +	 */ +	return 1; +} + +/** + * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address + *			     of a GID entry. + * + * @attr:	GID attribute pointer whose L2 fields to be read + * @vlan_id:	Pointer to vlan id to fill up if the GID entry has + *		vlan id. It is optional. + * @smac:	Pointer to smac to fill up for a GID entry. It is optional. + * + * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id + * (if gid entry has vlan) and source MAC, or returns error. + */ +int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, +			    u16 *vlan_id, u8 *smac) +{ +	struct net_device *ndev; + +	rcu_read_lock(); +	ndev = rcu_dereference(attr->ndev); +	if (!ndev) { +		rcu_read_unlock(); +		return -ENODEV; +	} +	if (smac) +		ether_addr_copy(smac, ndev->dev_addr); +	if (vlan_id) { +		*vlan_id = 0xffff; +		if (is_vlan_dev(ndev)) { +			*vlan_id = vlan_dev_vlan_id(ndev); +		} else { +			/* If the netdev is upper device and if it's lower +			 * device is vlan device, consider vlan id of the +			 * the lower vlan device for this gid entry. 
+			 */ +			netdev_walk_all_lower_dev_rcu(attr->ndev, +					get_lower_dev_vlan, vlan_id); +		} +	} +	rcu_read_unlock(); +	return 0; +} +EXPORT_SYMBOL(rdma_read_gid_l2_fields);  static int config_non_roce_gid_cache(struct ib_device *device,  				     u8 port, int gid_tbl_len) @@ -1392,7 +1478,6 @@ static void ib_cache_event(struct ib_event_handler *handler,  	    event->event == IB_EVENT_PORT_ACTIVE ||  	    event->event == IB_EVENT_LID_CHANGE  ||  	    event->event == IB_EVENT_PKEY_CHANGE || -	    event->event == IB_EVENT_SM_CHANGE   ||  	    event->event == IB_EVENT_CLIENT_REREGISTER ||  	    event->event == IB_EVENT_GID_CHANGE) {  		work = kmalloc(sizeof *work, GFP_ATOMIC); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index b9416a6fca36..da10e6ccb43c 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -52,6 +52,7 @@  #include <rdma/ib_cache.h>  #include <rdma/ib_cm.h>  #include "cm_msgs.h" +#include "core_priv.h"  MODULE_AUTHOR("Sean Hefty");  MODULE_DESCRIPTION("InfiniBand CM"); @@ -124,7 +125,8 @@ static struct ib_cm {  	struct rb_root remote_qp_table;  	struct rb_root remote_id_table;  	struct rb_root remote_sidr_table; -	struct idr local_id_table; +	struct xarray local_id_table; +	u32 local_id_next;  	__be32 random_id_operand;  	struct list_head timewait_list;  	struct workqueue_struct *wq; @@ -219,7 +221,6 @@ struct cm_port {  struct cm_device {  	struct list_head list;  	struct ib_device *ib_device; -	struct device *device;  	u8 ack_delay;  	int going_down;  	struct cm_port *port[0]; @@ -598,35 +599,31 @@ static int cm_init_av_by_path(struct sa_path_rec *path,  static int cm_alloc_id(struct cm_id_private *cm_id_priv)  { -	unsigned long flags; -	int id; - -	idr_preload(GFP_KERNEL); -	spin_lock_irqsave(&cm.lock, flags); +	int err; +	u32 id; -	id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); - -	spin_unlock_irqrestore(&cm.lock, flags); -	idr_preload_end(); +	err = xa_alloc_cyclic_irq(&cm.local_id_table, &id, cm_id_priv, +			xa_limit_32b, &cm.local_id_next, GFP_KERNEL);  	cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; -	return id < 0 ? id : 0; +	return err; +} + +static u32 cm_local_id(__be32 local_id) +{ +	return (__force u32) (local_id ^ cm.random_id_operand);  }  static void cm_free_id(__be32 local_id)  { -	spin_lock_irq(&cm.lock); -	idr_remove(&cm.local_id_table, -		   (__force int) (local_id ^ cm.random_id_operand)); -	spin_unlock_irq(&cm.lock); +	xa_erase_irq(&cm.local_id_table, cm_local_id(local_id));  }  static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)  {  	struct cm_id_private *cm_id_priv; -	cm_id_priv = idr_find(&cm.local_id_table, -			      (__force int) (local_id ^ cm.random_id_operand)); +	cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id));  	if (cm_id_priv) {  		if (cm_id_priv->id.remote_id == remote_id)  			atomic_inc(&cm_id_priv->refcount); @@ -1988,11 +1985,12 @@ static int cm_req_handler(struct cm_work *work)  	grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr);  	gid_attr = grh->sgid_attr; -	if (gid_attr && gid_attr->ndev) { +	if (gid_attr && +	    rdma_protocol_roce(work->port->cm_dev->ib_device, +			       work->port->port_num)) {  		work->path[0].rec_type =  			sa_conv_gid_to_pathrec_type(gid_attr->gid_type);  	} else { -		/* If no GID attribute or ndev is null, it is not RoCE. 
*/  		cm_path_set_rec_type(work->port->cm_dev->ib_device,  				     work->port->port_num,  				     &work->path[0], @@ -2824,9 +2822,8 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)  			spin_unlock_irq(&cm.lock);  			return NULL;  		} -		cm_id_priv = idr_find(&cm.local_id_table, (__force int) -				      (timewait_info->work.local_id ^ -				       cm.random_id_operand)); +		cm_id_priv = xa_load(&cm.local_id_table, +				cm_local_id(timewait_info->work.local_id));  		if (cm_id_priv) {  			if (cm_id_priv->id.remote_id == remote_id)  				atomic_inc(&cm_id_priv->refcount); @@ -4276,18 +4273,6 @@ static struct kobj_type cm_counter_obj_type = {  	.default_attrs = cm_counter_default_attrs  }; -static void cm_release_port_obj(struct kobject *obj) -{ -	struct cm_port *cm_port; - -	cm_port = container_of(obj, struct cm_port, port_obj); -	kfree(cm_port); -} - -static struct kobj_type cm_port_obj_type = { -	.release = cm_release_port_obj -}; -  static char *cm_devnode(struct device *dev, umode_t *mode)  {  	if (mode) @@ -4306,19 +4291,12 @@ static int cm_create_port_fs(struct cm_port *port)  {  	int i, ret; -	ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type, -				   &port->cm_dev->device->kobj, -				   "%d", port->port_num); -	if (ret) { -		kfree(port); -		return ret; -	} -  	for (i = 0; i < CM_COUNTER_GROUPS; i++) { -		ret = kobject_init_and_add(&port->counter_group[i].obj, -					   &cm_counter_obj_type, -					   &port->port_obj, -					   "%s", counter_group_names[i]); +		ret = ib_port_register_module_stat(port->cm_dev->ib_device, +						   port->port_num, +						   &port->counter_group[i].obj, +						   &cm_counter_obj_type, +						   counter_group_names[i]);  		if (ret)  			goto error;  	} @@ -4327,8 +4305,7 @@ static int cm_create_port_fs(struct cm_port *port)  error:  	while (i--) -		kobject_put(&port->counter_group[i].obj); -	kobject_put(&port->port_obj); +		ib_port_unregister_module_stat(&port->counter_group[i].obj);  	return ret;  } @@ -4338,9 +4315,8 @@ static void cm_remove_port_fs(struct cm_port *port)  	int i;  	for (i = 0; i < CM_COUNTER_GROUPS; i++) -		kobject_put(&port->counter_group[i].obj); +		ib_port_unregister_module_stat(&port->counter_group[i].obj); -	kobject_put(&port->port_obj);  }  static void cm_add_one(struct ib_device *ib_device) @@ -4367,13 +4343,6 @@ static void cm_add_one(struct ib_device *ib_device)  	cm_dev->ib_device = ib_device;  	cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;  	cm_dev->going_down = 0; -	cm_dev->device = device_create(&cm_class, &ib_device->dev, -				       MKDEV(0, 0), NULL, -				       "%s", dev_name(&ib_device->dev)); -	if (IS_ERR(cm_dev->device)) { -		kfree(cm_dev); -		return; -	}  	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);  	for (i = 1; i <= ib_device->phys_port_cnt; i++) { @@ -4440,7 +4409,6 @@ error1:  		cm_remove_port_fs(port);  	}  free: -	device_unregister(cm_dev->device);  	kfree(cm_dev);  } @@ -4494,7 +4462,6 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)  		cm_remove_port_fs(port);  	} -	device_unregister(cm_dev->device);  	kfree(cm_dev);  } @@ -4502,7 +4469,6 @@ static int __init ib_cm_init(void)  {  	int ret; -	memset(&cm, 0, sizeof cm);  	INIT_LIST_HEAD(&cm.device_list);  	rwlock_init(&cm.device_lock);  	spin_lock_init(&cm.lock); @@ -4512,7 +4478,7 @@ static int __init ib_cm_init(void)  	cm.remote_id_table = RB_ROOT;  	cm.remote_qp_table = RB_ROOT;  	cm.remote_sidr_table = RB_ROOT; -	idr_init(&cm.local_id_table); +	
xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);  	get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);  	INIT_LIST_HEAD(&cm.timewait_list); @@ -4538,7 +4504,6 @@ error3:  error2:  	class_unregister(&cm_class);  error1: -	idr_destroy(&cm.local_id_table);  	return ret;  } @@ -4560,9 +4525,8 @@ static void __exit ib_cm_cleanup(void)  	}  	class_unregister(&cm_class); -	idr_destroy(&cm.local_id_table); +	WARN_ON(!xa_empty(&cm.local_id_table));  }  module_init(ib_cm_init);  module_exit(ib_cm_cleanup); - diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 476d4309576d..3d16d614aff6 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -98,7 +98,7 @@ struct cm_req_msg {  	u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; -} __attribute__ ((packed)); +} __packed;  static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)  { @@ -423,7 +423,7 @@ enum cm_msg_response {  	u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed;  static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)  { @@ -461,7 +461,7 @@ struct cm_rej_msg {  	u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed;  static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)  { @@ -506,7 +506,7 @@ struct cm_rep_msg {  	u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed;  static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)  { @@ -614,7 +614,7 @@ struct cm_rtu_msg {  	u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed;  struct cm_dreq_msg {  	struct ib_mad_hdr hdr; @@ -626,7 +626,7 @@ struct cm_dreq_msg {  	u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed;  static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)  { @@ -647,7 +647,7 @@ struct cm_drep_msg {  	u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed;  struct cm_lap_msg {  	struct ib_mad_hdr hdr; @@ -675,7 +675,7 @@ struct cm_lap_msg {  	u8 offset63;  	u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE]; -} __attribute__  ((packed)); +} __packed;  static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)  { @@ -784,7 +784,7 @@ struct cm_apr_msg {  	u8 info[IB_CM_APR_INFO_LENGTH];  	u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed;  struct cm_sidr_req_msg {  	struct ib_mad_hdr hdr; @@ -795,7 +795,7 @@ struct cm_sidr_req_msg {  	__be64 service_id;  	u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; -} __attribute__ ((packed)); +} __packed;  struct cm_sidr_rep_msg {  	struct ib_mad_hdr hdr; @@ -811,7 +811,7 @@ struct cm_sidr_rep_msg {  	u8 info[IB_CM_SIDR_REP_INFO_LENGTH];  	u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed;  static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)  { diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 68c997be2429..19f1730a4f24 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -39,7 +39,7 @@  #include <linux/mutex.h>  #include <linux/random.h>  #include <linux/igmp.h> -#include <linux/idr.h> +#include <linux/xarray.h>  #include <linux/inetdevice.h>  #include <linux/slab.h>  #include <linux/module.h> @@ -191,10 +191,10 @@ static struct workqueue_struct *cma_wq;  static 
unsigned int cma_pernet_id;  struct cma_pernet { -	struct idr tcp_ps; -	struct idr udp_ps; -	struct idr ipoib_ps; -	struct idr ib_ps; +	struct xarray tcp_ps; +	struct xarray udp_ps; +	struct xarray ipoib_ps; +	struct xarray ib_ps;  };  static struct cma_pernet *cma_pernet(struct net *net) @@ -202,7 +202,8 @@ static struct cma_pernet *cma_pernet(struct net *net)  	return net_generic(net, cma_pernet_id);  } -static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps) +static +struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)  {  	struct cma_pernet *pernet = cma_pernet(net); @@ -247,25 +248,25 @@ struct class_port_info_context {  static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps,  			struct rdma_bind_list *bind_list, int snum)  { -	struct idr *idr = cma_pernet_idr(net, ps); +	struct xarray *xa = cma_pernet_xa(net, ps); -	return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); +	return xa_insert(xa, snum, bind_list, GFP_KERNEL);  }  static struct rdma_bind_list *cma_ps_find(struct net *net,  					  enum rdma_ucm_port_space ps, int snum)  { -	struct idr *idr = cma_pernet_idr(net, ps); +	struct xarray *xa = cma_pernet_xa(net, ps); -	return idr_find(idr, snum); +	return xa_load(xa, snum);  }  static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps,  			  int snum)  { -	struct idr *idr = cma_pernet_idr(net, ps); +	struct xarray *xa = cma_pernet_xa(net, ps); -	idr_remove(idr, snum); +	xa_erase(xa, snum);  }  enum { @@ -615,6 +616,9 @@ cma_validate_port(struct ib_device *device, u8 port,  	int dev_type = dev_addr->dev_type;  	struct net_device *ndev = NULL; +	if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) +		return ERR_PTR(-ENODEV); +  	if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))  		return ERR_PTR(-ENODEV); @@ -1173,18 +1177,31 @@ static inline bool cma_any_addr(const struct sockaddr *addr)  	return cma_zero_addr(addr) || cma_loopback_addr(addr);  } -static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) +static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst)  {  	if (src->sa_family != dst->sa_family)  		return -1;  	switch (src->sa_family) {  	case AF_INET: -		return ((struct sockaddr_in *) src)->sin_addr.s_addr != -		       ((struct sockaddr_in *) dst)->sin_addr.s_addr; -	case AF_INET6: -		return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, -				     &((struct sockaddr_in6 *) dst)->sin6_addr); +		return ((struct sockaddr_in *)src)->sin_addr.s_addr != +		       ((struct sockaddr_in *)dst)->sin_addr.s_addr; +	case AF_INET6: { +		struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; +		struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; +		bool link_local; + +		if (ipv6_addr_cmp(&src_addr6->sin6_addr, +					  &dst_addr6->sin6_addr)) +			return 1; +		link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & +			     IPV6_ADDR_LINKLOCAL; +		/* Link local must match their scope_ids */ +		return link_local ? 
(src_addr6->sin6_scope_id != +				     dst_addr6->sin6_scope_id) : +				    0; +	} +  	default:  		return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,  				   &((struct sockaddr_ib *) dst)->sib_addr); @@ -1469,6 +1486,7 @@ static struct net_device *  roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)  {  	const struct ib_gid_attr *sgid_attr = NULL; +	struct net_device *ndev;  	if (ib_event->event == IB_CM_REQ_RECEIVED)  		sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; @@ -1477,8 +1495,15 @@ roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)  	if (!sgid_attr)  		return NULL; -	dev_hold(sgid_attr->ndev); -	return sgid_attr->ndev; + +	rcu_read_lock(); +	ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); +	if (IS_ERR(ndev)) +		ndev = NULL; +	else +		dev_hold(ndev); +	rcu_read_unlock(); +	return ndev;  }  static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, @@ -3247,7 +3272,7 @@ static int cma_alloc_port(enum rdma_ucm_port_space ps,  		goto err;  	bind_list->ps = ps; -	bind_list->port = (unsigned short)ret; +	bind_list->port = snum;  	cma_bind_port(bind_list, id_priv);  	return 0;  err: @@ -4655,10 +4680,10 @@ static int cma_init_net(struct net *net)  {  	struct cma_pernet *pernet = cma_pernet(net); -	idr_init(&pernet->tcp_ps); -	idr_init(&pernet->udp_ps); -	idr_init(&pernet->ipoib_ps); -	idr_init(&pernet->ib_ps); +	xa_init(&pernet->tcp_ps); +	xa_init(&pernet->udp_ps); +	xa_init(&pernet->ipoib_ps); +	xa_init(&pernet->ib_ps);  	return 0;  } @@ -4667,10 +4692,10 @@ static void cma_exit_net(struct net *net)  {  	struct cma_pernet *pernet = cma_pernet(net); -	idr_destroy(&pernet->tcp_ps); -	idr_destroy(&pernet->udp_ps); -	idr_destroy(&pernet->ipoib_ps); -	idr_destroy(&pernet->ib_ps); +	WARN_ON(!xa_empty(&pernet->tcp_ps)); +	WARN_ON(!xa_empty(&pernet->udp_ps)); +	WARN_ON(!xa_empty(&pernet->ipoib_ps)); +	WARN_ON(!xa_empty(&pernet->ib_ps));  }  static struct pernet_operations cma_pernet_operations = { diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 08c690249594..ff40a450b5d2 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -55,6 +55,7 @@ struct pkey_index_qp_list {  };  extern const struct attribute_group ib_dev_attr_group; +extern bool ib_devices_shared_netns;  int ib_device_register_sysfs(struct ib_device *device);  void ib_device_unregister_sysfs(struct ib_device *device); @@ -279,7 +280,8 @@ static inline void ib_mad_agent_security_change(void)  }  #endif -struct ib_device *ib_device_get_by_index(u32 ifindex); +struct ib_device *ib_device_get_by_index(const struct net *net, u32 index); +  /* RDMA device netlink */  void nldev_init(void);  void nldev_exit(void); @@ -302,6 +304,7 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,  	qp->device = dev;  	qp->pd = pd;  	qp->uobject = uobj; +	qp->real_qp = qp;  	/*  	 * We don't track XRC QPs for now, because they don't have PD  	 * and more importantly they are created internaly by driver, @@ -336,4 +339,17 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec,  				 const struct ib_gid_attr *attr);  struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); + +void ib_free_port_attrs(struct ib_core_device *coredev); +int ib_setup_port_attrs(struct ib_core_device *coredev); + +int rdma_compatdev_set(u8 enable); + +int ib_port_register_module_stat(struct ib_device *device, u8 port_num, +				 struct kobject *kobj, struct kobj_type *ktype, +				 const char 
*name); +void ib_port_unregister_module_stat(struct kobject *kobj); + +int ib_device_set_netns_put(struct sk_buff *skb, +			    struct ib_device *dev, u32 ns_fd);  #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index d61e5e1427c2..a4c81992267c 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -128,15 +128,17 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)   * @comp_vector:	HCA completion vectors for this CQ   * @poll_ctx:		context to poll the CQ from.   * @caller:		module owner name. + * @udata:		Valid user data or NULL for kernel object   *   * This is the proper interface to allocate a CQ for in-kernel users. A   * CQ allocated with this interface will automatically be polled from the   * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id   * to use this CQ abstraction.   */ -struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, -			    int nr_cqe, int comp_vector, -			    enum ib_poll_context poll_ctx, const char *caller) +struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, +				 int nr_cqe, int comp_vector, +				 enum ib_poll_context poll_ctx, +				 const char *caller, struct ib_udata *udata)  {  	struct ib_cq_init_attr cq_attr = {  		.cqe		= nr_cqe, @@ -145,7 +147,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,  	struct ib_cq *cq;  	int ret = -ENOMEM; -	cq = dev->ops.create_cq(dev, &cq_attr, NULL, NULL); +	cq = dev->ops.create_cq(dev, &cq_attr, NULL);  	if (IS_ERR(cq))  		return cq; @@ -193,16 +195,17 @@ out_free_wc:  	kfree(cq->wc);  	rdma_restrack_del(&cq->res);  out_destroy_cq: -	cq->device->ops.destroy_cq(cq); +	cq->device->ops.destroy_cq(cq, udata);  	return ERR_PTR(ret);  } -EXPORT_SYMBOL(__ib_alloc_cq); +EXPORT_SYMBOL(__ib_alloc_cq_user);  /**   * ib_free_cq - free a completion queue   * @cq:		completion queue to free. + * @udata:	User data or NULL for kernel object   */ -void ib_free_cq(struct ib_cq *cq) +void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)  {  	int ret; @@ -225,7 +228,7 @@ void ib_free_cq(struct ib_cq *cq)  	kfree(cq->wc);  	rdma_restrack_del(&cq->res); -	ret = cq->device->ops.destroy_cq(cq); +	ret = cq->device->ops.destroy_cq(cq, udata);  	WARN_ON_ONCE(ret);  } -EXPORT_SYMBOL(ib_free_cq); +EXPORT_SYMBOL(ib_free_cq_user); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7421ec4883fb..78dc07c6ac4b 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -38,6 +38,8 @@  #include <linux/slab.h>  #include <linux/init.h>  #include <linux/netdevice.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h>  #include <linux/security.h>  #include <linux/notifier.h>  #include <linux/hashtable.h> @@ -101,6 +103,54 @@ static DECLARE_RWSEM(clients_rwsem);   * be registered.   */  #define CLIENT_DATA_REGISTERED XA_MARK_1 + +/** + * struct rdma_dev_net - rdma net namespace metadata for a net + * @net:	Pointer to owner net namespace + * @id:		xarray id to identify the net namespace. + */ +struct rdma_dev_net { +	possible_net_t net; +	u32 id; +}; + +static unsigned int rdma_dev_net_id; + +/* + * A list of net namespaces is maintained in an xarray. This is necessary + * because we can't get the locking right using the existing net ns list. We + * would require a init_net callback after the list is updated. 
+ */ +static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); +/* + * rwsem to protect accessing the rdma_nets xarray entries. + */ +static DECLARE_RWSEM(rdma_nets_rwsem); + +bool ib_devices_shared_netns = true; +module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); +MODULE_PARM_DESC(netns_mode, +		 "Share device among net namespaces; default=1 (shared)"); +/** + * rdma_dev_access_netns() - Return whether a rdma device can be accessed + *			     from a specified net namespace or not. + * @device:	Pointer to rdma device which needs to be checked + * @net:	Pointer to net namesapce for which access to be checked + * + * rdma_dev_access_netns() - Return whether a rdma device can be accessed + *			     from a specified net namespace or not. When + *			     rdma device is in shared mode, it ignores the + *			     net namespace. When rdma device is exclusive + *			     to a net namespace, rdma device net namespace is + *			     checked against the specified one. + */ +bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) +{ +	return (ib_devices_shared_netns || +		net_eq(read_pnet(&dev->coredev.rdma_net), net)); +} +EXPORT_SYMBOL(rdma_dev_access_netns); +  /*   * xarray has this behavior where it won't iterate over NULL values stored in   * allocated arrays.  So we need our own iterator to see all values stored in @@ -147,10 +197,73 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,  static void ib_policy_change_task(struct work_struct *work);  static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); +static void __ibdev_printk(const char *level, const struct ib_device *ibdev, +			   struct va_format *vaf) +{ +	if (ibdev && ibdev->dev.parent) +		dev_printk_emit(level[1] - '0', +				ibdev->dev.parent, +				"%s %s %s: %pV", +				dev_driver_string(ibdev->dev.parent), +				dev_name(ibdev->dev.parent), +				dev_name(&ibdev->dev), +				vaf); +	else if (ibdev) +		printk("%s%s: %pV", +		       level, dev_name(&ibdev->dev), vaf); +	else +		printk("%s(NULL ib_device): %pV", level, vaf); +} + +void ibdev_printk(const char *level, const struct ib_device *ibdev, +		  const char *format, ...) +{ +	struct va_format vaf; +	va_list args; + +	va_start(args, format); + +	vaf.fmt = format; +	vaf.va = &args; + +	__ibdev_printk(level, ibdev, &vaf); + +	va_end(args); +} +EXPORT_SYMBOL(ibdev_printk); + +#define define_ibdev_printk_level(func, level)                  \ +void func(const struct ib_device *ibdev, const char *fmt, ...)  
\ +{                                                               \ +	struct va_format vaf;                                   \ +	va_list args;                                           \ +								\ +	va_start(args, fmt);                                    \ +								\ +	vaf.fmt = fmt;                                          \ +	vaf.va = &args;                                         \ +								\ +	__ibdev_printk(level, ibdev, &vaf);                     \ +								\ +	va_end(args);                                           \ +}                                                               \ +EXPORT_SYMBOL(func); + +define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); +define_ibdev_printk_level(ibdev_alert, KERN_ALERT); +define_ibdev_printk_level(ibdev_crit, KERN_CRIT); +define_ibdev_printk_level(ibdev_err, KERN_ERR); +define_ibdev_printk_level(ibdev_warn, KERN_WARNING); +define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); +define_ibdev_printk_level(ibdev_info, KERN_INFO); +  static struct notifier_block ibdev_lsm_nb = {  	.notifier_call = ib_security_change,  }; +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, +				 struct net *net); +  /* Pointer to the RCU head at the start of the ib_port_data array */  struct ib_port_data_rcu {  	struct rcu_head rcu_head; @@ -200,16 +313,22 @@ static int ib_device_check_mandatory(struct ib_device *device)   * Caller must perform ib_device_put() to return the device reference count   * when ib_device_get_by_index() returns valid device pointer.   */ -struct ib_device *ib_device_get_by_index(u32 index) +struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)  {  	struct ib_device *device;  	down_read(&devices_rwsem);  	device = xa_load(&devices, index);  	if (device) { +		if (!rdma_dev_access_netns(device, net)) { +			device = NULL; +			goto out; +		} +  		if (!ib_device_try_get(device))  			device = NULL;  	} +out:  	up_read(&devices_rwsem);  	return device;  } @@ -268,6 +387,26 @@ struct ib_device *ib_device_get_by_name(const char *name,  }  EXPORT_SYMBOL(ib_device_get_by_name); +static int rename_compat_devs(struct ib_device *device) +{ +	struct ib_core_device *cdev; +	unsigned long index; +	int ret = 0; + +	mutex_lock(&device->compat_devs_mutex); +	xa_for_each (&device->compat_devs, index, cdev) { +		ret = device_rename(&cdev->dev, dev_name(&device->dev)); +		if (ret) { +			dev_warn(&cdev->dev, +				 "Fail to rename compatdev to new name %s\n", +				 dev_name(&device->dev)); +			break; +		} +	} +	mutex_unlock(&device->compat_devs_mutex); +	return ret; +} +  int ib_device_rename(struct ib_device *ibdev, const char *name)  {  	int ret; @@ -287,6 +426,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)  	if (ret)  		goto out;  	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); +	ret = rename_compat_devs(ibdev);  out:  	up_write(&devices_rwsem);  	return ret; @@ -336,6 +476,7 @@ static void ib_device_release(struct device *device)  	WARN_ON(refcount_read(&dev->refcount));  	ib_cache_release_one(dev);  	ib_security_release_port_pkey_list(dev); +	xa_destroy(&dev->compat_devs);  	xa_destroy(&dev->client_data);  	if (dev->port_data)  		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, @@ -357,12 +498,42 @@ static int ib_device_uevent(struct device *device,  	return 0;  } +static const void *net_namespace(struct device *d) +{ +	struct ib_core_device *coredev = +			container_of(d, struct ib_core_device, dev); + +	return read_pnet(&coredev->rdma_net); +} +  static struct class 
ib_class = {  	.name    = "infiniband",  	.dev_release = ib_device_release,  	.dev_uevent = ib_device_uevent, +	.ns_type = &net_ns_type_operations, +	.namespace = net_namespace,  }; +static void rdma_init_coredev(struct ib_core_device *coredev, +			      struct ib_device *dev, struct net *net) +{ +	/* This BUILD_BUG_ON is intended to catch layout change +	 * of union of ib_core_device and device. +	 * dev must be the first element as ib_core and providers +	 * driver uses it. Adding anything in ib_core_device before +	 * device will break this assumption. +	 */ +	BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != +		     offsetof(struct ib_device, dev)); + +	coredev->dev.class = &ib_class; +	coredev->dev.groups = dev->groups; +	device_initialize(&coredev->dev); +	coredev->owner = dev; +	INIT_LIST_HEAD(&coredev->port_list); +	write_pnet(&coredev->rdma_net, net); +} +  /**   * _ib_alloc_device - allocate an IB device struct   * @size:size of structure to allocate @@ -389,10 +560,8 @@ struct ib_device *_ib_alloc_device(size_t size)  		return NULL;  	} -	device->dev.class = &ib_class;  	device->groups[0] = &ib_dev_attr_group; -	device->dev.groups = device->groups; -	device_initialize(&device->dev); +	rdma_init_coredev(&device->coredev, device, &init_net);  	INIT_LIST_HEAD(&device->event_handler_list);  	spin_lock_init(&device->event_handler_lock); @@ -403,7 +572,8 @@ struct ib_device *_ib_alloc_device(size_t size)  	 */  	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);  	init_rwsem(&device->client_data_rwsem); -	INIT_LIST_HEAD(&device->port_list); +	xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); +	mutex_init(&device->compat_devs_mutex);  	init_completion(&device->unreg_completion);  	INIT_WORK(&device->unregistration_work, ib_unregister_work); @@ -436,6 +606,7 @@ void ib_dealloc_device(struct ib_device *device)  	/* Expedite releasing netdev references */  	free_netdevs(device); +	WARN_ON(!xa_empty(&device->compat_devs));  	WARN_ON(!xa_empty(&device->client_data));  	WARN_ON(refcount_read(&device->refcount));  	rdma_restrack_clean(device); @@ -644,6 +815,283 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,  	return NOTIFY_OK;  } +static void compatdev_release(struct device *dev) +{ +	struct ib_core_device *cdev = +		container_of(dev, struct ib_core_device, dev); + +	kfree(cdev); +} + +static int add_one_compat_dev(struct ib_device *device, +			      struct rdma_dev_net *rnet) +{ +	struct ib_core_device *cdev; +	int ret; + +	lockdep_assert_held(&rdma_nets_rwsem); +	if (!ib_devices_shared_netns) +		return 0; + +	/* +	 * Create and add compat device in all namespaces other than where it +	 * is currently bound to. +	 */ +	if (net_eq(read_pnet(&rnet->net), +		   read_pnet(&device->coredev.rdma_net))) +		return 0; + +	/* +	 * The first of init_net() or ib_register_device() to take the +	 * compat_devs_mutex wins and gets to add the device. Others will wait +	 * for completion here. 
+	 */ +	mutex_lock(&device->compat_devs_mutex); +	cdev = xa_load(&device->compat_devs, rnet->id); +	if (cdev) { +		ret = 0; +		goto done; +	} +	ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); +	if (ret) +		goto done; + +	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); +	if (!cdev) { +		ret = -ENOMEM; +		goto cdev_err; +	} + +	cdev->dev.parent = device->dev.parent; +	rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); +	cdev->dev.release = compatdev_release; +	dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); + +	ret = device_add(&cdev->dev); +	if (ret) +		goto add_err; +	ret = ib_setup_port_attrs(cdev); +	if (ret) +		goto port_err; + +	ret = xa_err(xa_store(&device->compat_devs, rnet->id, +			      cdev, GFP_KERNEL)); +	if (ret) +		goto insert_err; + +	mutex_unlock(&device->compat_devs_mutex); +	return 0; + +insert_err: +	ib_free_port_attrs(cdev); +port_err: +	device_del(&cdev->dev); +add_err: +	put_device(&cdev->dev); +cdev_err: +	xa_release(&device->compat_devs, rnet->id); +done: +	mutex_unlock(&device->compat_devs_mutex); +	return ret; +} + +static void remove_one_compat_dev(struct ib_device *device, u32 id) +{ +	struct ib_core_device *cdev; + +	mutex_lock(&device->compat_devs_mutex); +	cdev = xa_erase(&device->compat_devs, id); +	mutex_unlock(&device->compat_devs_mutex); +	if (cdev) { +		ib_free_port_attrs(cdev); +		device_del(&cdev->dev); +		put_device(&cdev->dev); +	} +} + +static void remove_compat_devs(struct ib_device *device) +{ +	struct ib_core_device *cdev; +	unsigned long index; + +	xa_for_each (&device->compat_devs, index, cdev) +		remove_one_compat_dev(device, index); +} + +static int add_compat_devs(struct ib_device *device) +{ +	struct rdma_dev_net *rnet; +	unsigned long index; +	int ret = 0; + +	lockdep_assert_held(&devices_rwsem); + +	down_read(&rdma_nets_rwsem); +	xa_for_each (&rdma_nets, index, rnet) { +		ret = add_one_compat_dev(device, rnet); +		if (ret) +			break; +	} +	up_read(&rdma_nets_rwsem); +	return ret; +} + +static void remove_all_compat_devs(void) +{ +	struct ib_compat_device *cdev; +	struct ib_device *dev; +	unsigned long index; + +	down_read(&devices_rwsem); +	xa_for_each (&devices, index, dev) { +		unsigned long c_index = 0; + +		/* Hold nets_rwsem so that any other thread modifying this +		 * system param can sync with this thread. +		 */ +		down_read(&rdma_nets_rwsem); +		xa_for_each (&dev->compat_devs, c_index, cdev) +			remove_one_compat_dev(dev, c_index); +		up_read(&rdma_nets_rwsem); +	} +	up_read(&devices_rwsem); +} + +static int add_all_compat_devs(void) +{ +	struct rdma_dev_net *rnet; +	struct ib_device *dev; +	unsigned long index; +	int ret = 0; + +	down_read(&devices_rwsem); +	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { +		unsigned long net_index = 0; + +		/* Hold nets_rwsem so that any other thread modifying this +		 * system param can sync with this thread. +		 */ +		down_read(&rdma_nets_rwsem); +		xa_for_each (&rdma_nets, net_index, rnet) { +			ret = add_one_compat_dev(dev, rnet); +			if (ret) +				break; +		} +		up_read(&rdma_nets_rwsem); +	} +	up_read(&devices_rwsem); +	if (ret) +		remove_all_compat_devs(); +	return ret; +} + +int rdma_compatdev_set(u8 enable) +{ +	struct rdma_dev_net *rnet; +	unsigned long index; +	int ret = 0; + +	down_write(&rdma_nets_rwsem); +	if (ib_devices_shared_netns == enable) { +		up_write(&rdma_nets_rwsem); +		return 0; +	} + +	/* enable/disable of compat devices is not supported +	 * when more than default init_net exists. 
+	 */ +	xa_for_each (&rdma_nets, index, rnet) { +		ret++; +		break; +	} +	if (!ret) +		ib_devices_shared_netns = enable; +	up_write(&rdma_nets_rwsem); +	if (ret) +		return -EBUSY; + +	if (enable) +		ret = add_all_compat_devs(); +	else +		remove_all_compat_devs(); +	return ret; +} + +static void rdma_dev_exit_net(struct net *net) +{ +	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); +	struct ib_device *dev; +	unsigned long index; +	int ret; + +	down_write(&rdma_nets_rwsem); +	/* +	 * Prevent the ID from being re-used and hide the id from xa_for_each. +	 */ +	ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); +	WARN_ON(ret); +	up_write(&rdma_nets_rwsem); + +	down_read(&devices_rwsem); +	xa_for_each (&devices, index, dev) { +		get_device(&dev->dev); +		/* +		 * Release the devices_rwsem so that pontentially blocking +		 * device_del, doesn't hold the devices_rwsem for too long. +		 */ +		up_read(&devices_rwsem); + +		remove_one_compat_dev(dev, rnet->id); + +		/* +		 * If the real device is in the NS then move it back to init. +		 */ +		rdma_dev_change_netns(dev, net, &init_net); + +		put_device(&dev->dev); +		down_read(&devices_rwsem); +	} +	up_read(&devices_rwsem); + +	xa_erase(&rdma_nets, rnet->id); +} + +static __net_init int rdma_dev_init_net(struct net *net) +{ +	struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); +	unsigned long index; +	struct ib_device *dev; +	int ret; + +	/* No need to create any compat devices in default init_net. */ +	if (net_eq(net, &init_net)) +		return 0; + +	write_pnet(&rnet->net, net); + +	ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); +	if (ret) +		return ret; + +	down_read(&devices_rwsem); +	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { +		/* Hold nets_rwsem so that netlink command cannot change +		 * system configuration for device sharing mode. +		 */ +		down_read(&rdma_nets_rwsem); +		ret = add_one_compat_dev(dev, rnet); +		up_read(&rdma_nets_rwsem); +		if (ret) +			break; +	} +	up_read(&devices_rwsem); + +	if (ret) +		rdma_dev_exit_net(net); + +	return ret; +} +  /*   * Assign the unique string device name and the unique device index. This is   * undone by ib_dealloc_device. @@ -711,6 +1159,9 @@ static void setup_dma_device(struct ib_device *device)  		WARN_ON_ONCE(!parent);  		device->dma_device = parent;  	} +	/* Setup default max segment size for all IB devices */ +	dma_set_max_seg_size(device->dma_device, SZ_2G); +  }  /* @@ -765,8 +1216,12 @@ static void disable_device(struct ib_device *device)  	ib_device_put(device);  	wait_for_completion(&device->unreg_completion); -	/* Expedite removing unregistered pointers from the hash table */ -	free_netdevs(device); +	/* +	 * compat devices must be removed after device refcount drops to zero. +	 * Otherwise init_net() may add more compatdevs after removing compat +	 * devices and before device is disabled. +	 */ +	remove_compat_devs(device);  }  /* @@ -807,7 +1262,8 @@ static int enable_device_and_get(struct ib_device *device)  			break;  	}  	up_read(&clients_rwsem); - +	if (!ret) +		ret = add_compat_devs(device);  out:  	up_read(&devices_rwsem);  	return ret; @@ -847,6 +1303,11 @@ int ib_register_device(struct ib_device *device, const char *name)  	ib_device_register_rdmacg(device); +	/* +	 * Ensure that ADD uevent is not fired because it +	 * is too early amd device is not initialized yet. 
+	 */ +	dev_set_uevent_suppress(&device->dev, true);  	ret = device_add(&device->dev);  	if (ret)  		goto cg_cleanup; @@ -859,6 +1320,9 @@ int ib_register_device(struct ib_device *device, const char *name)  	}  	ret = enable_device_and_get(device); +	dev_set_uevent_suppress(&device->dev, false); +	/* Mark for userspace that device is ready */ +	kobject_uevent(&device->dev.kobj, KOBJ_ADD);  	if (ret) {  		void (*dealloc_fn)(struct ib_device *); @@ -887,6 +1351,7 @@ int ib_register_device(struct ib_device *device, const char *name)  dev_cleanup:  	device_del(&device->dev);  cg_cleanup: +	dev_set_uevent_suppress(&device->dev, false);  	ib_device_unregister_rdmacg(device);  	ib_cache_cleanup_one(device);  	return ret; @@ -908,6 +1373,10 @@ static void __ib_unregister_device(struct ib_device *ib_dev)  		goto out;  	disable_device(ib_dev); + +	/* Expedite removing unregistered pointers from the hash table */ +	free_netdevs(ib_dev); +  	ib_device_unregister_sysfs(ib_dev);  	device_del(&ib_dev->dev);  	ib_device_unregister_rdmacg(ib_dev); @@ -1038,6 +1507,126 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)  }  EXPORT_SYMBOL(ib_unregister_device_queued); +/* + * The caller must pass in a device that has the kref held and the refcount + * released. If the device is in cur_net and still registered then it is moved + * into net. + */ +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, +				 struct net *net) +{ +	int ret2 = -EINVAL; +	int ret; + +	mutex_lock(&device->unregistration_lock); + +	/* +	 * If a device not under ib_device_get() or if the unregistration_lock +	 * is not held, the namespace can be changed, or it can be unregistered. +	 * Check again under the lock. +	 */ +	if (refcount_read(&device->refcount) == 0 || +	    !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { +		ret = -ENODEV; +		goto out; +	} + +	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); +	disable_device(device); + +	/* +	 * At this point no one can be using the device, so it is safe to +	 * change the namespace. +	 */ +	write_pnet(&device->coredev.rdma_net, net); + +	down_read(&devices_rwsem); +	/* +	 * Currently rdma devices are system wide unique. So the device name +	 * is guaranteed free in the new namespace. Publish the new namespace +	 * at the sysfs level. +	 */ +	ret = device_rename(&device->dev, dev_name(&device->dev)); +	up_read(&devices_rwsem); +	if (ret) { +		dev_warn(&device->dev, +			 "%s: Couldn't rename device after namespace change\n", +			 __func__); +		/* Try and put things back and re-enable the device */ +		write_pnet(&device->coredev.rdma_net, cur_net); +	} + +	ret2 = enable_device_and_get(device); +	if (ret2) { +		/* +		 * This shouldn't really happen, but if it does, let the user +		 * retry at later point. So don't disable the device. 
+		 */ +		dev_warn(&device->dev, +			 "%s: Couldn't re-enable device after namespace change\n", +			 __func__); +	} +	kobject_uevent(&device->dev.kobj, KOBJ_ADD); + +	ib_device_put(device); +out: +	mutex_unlock(&device->unregistration_lock); +	if (ret) +		return ret; +	return ret2; +} + +int ib_device_set_netns_put(struct sk_buff *skb, +			    struct ib_device *dev, u32 ns_fd) +{ +	struct net *net; +	int ret; + +	net = get_net_ns_by_fd(ns_fd); +	if (IS_ERR(net)) { +		ret = PTR_ERR(net); +		goto net_err; +	} + +	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { +		ret = -EPERM; +		goto ns_err; +	} + +	/* +	 * Currently supported only for those providers which support +	 * disassociation and don't do port specific sysfs init. Once a +	 * port_cleanup infrastructure is implemented, this limitation will be +	 * removed. +	 */ +	if (!dev->ops.disassociate_ucontext || dev->ops.init_port || +	    ib_devices_shared_netns) { +		ret = -EOPNOTSUPP; +		goto ns_err; +	} + +	get_device(&dev->dev); +	ib_device_put(dev); +	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); +	put_device(&dev->dev); + +	put_net(net); +	return ret; + +ns_err: +	put_net(net); +net_err: +	ib_device_put(dev); +	return ret; +} + +static struct pernet_operations rdma_dev_net_ops = { +	.init = rdma_dev_init_net, +	.exit = rdma_dev_exit_net, +	.id = &rdma_dev_net_id, +	.size = sizeof(struct rdma_dev_net), +}; +  static int assign_client_id(struct ib_client *client)  {  	int ret; @@ -1515,6 +2104,9 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,  	down_read(&devices_rwsem);  	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { +		if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) +			continue; +  		ret = nldev_cb(dev, skb, cb, idx);  		if (ret)  			break; @@ -1787,6 +2379,14 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)  	SET_DEVICE_OP(dev_ops, get_vf_config);  	SET_DEVICE_OP(dev_ops, get_vf_stats);  	SET_DEVICE_OP(dev_ops, init_port); +	SET_DEVICE_OP(dev_ops, iw_accept); +	SET_DEVICE_OP(dev_ops, iw_add_ref); +	SET_DEVICE_OP(dev_ops, iw_connect); +	SET_DEVICE_OP(dev_ops, iw_create_listen); +	SET_DEVICE_OP(dev_ops, iw_destroy_listen); +	SET_DEVICE_OP(dev_ops, iw_get_qp); +	SET_DEVICE_OP(dev_ops, iw_reject); +	SET_DEVICE_OP(dev_ops, iw_rem_ref);  	SET_DEVICE_OP(dev_ops, map_mr_sg);  	SET_DEVICE_OP(dev_ops, map_phys_fmr);  	SET_DEVICE_OP(dev_ops, mmap); @@ -1823,7 +2423,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)  	SET_DEVICE_OP(dev_ops, set_vf_link_state);  	SET_DEVICE_OP(dev_ops, unmap_fmr); +	SET_OBJ_SIZE(dev_ops, ib_ah);  	SET_OBJ_SIZE(dev_ops, ib_pd); +	SET_OBJ_SIZE(dev_ops, ib_srq);  	SET_OBJ_SIZE(dev_ops, ib_ucontext);  }  EXPORT_SYMBOL(ib_set_device_ops); @@ -1903,12 +2505,20 @@ static int __init ib_core_init(void)  		goto err_sa;  	} +	ret = register_pernet_device(&rdma_dev_net_ops); +	if (ret) { +		pr_warn("Couldn't init compat dev. 
ret %d\n", ret); +		goto err_compat; +	} +  	nldev_init();  	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);  	roce_gid_mgmt_init();  	return 0; +err_compat: +	unregister_lsm_notifier(&ibdev_lsm_nb);  err_sa:  	ib_sa_cleanup();  err_mad: @@ -1933,6 +2543,7 @@ static void __exit ib_core_cleanup(void)  	roce_gid_mgmt_cleanup();  	nldev_exit();  	rdma_nl_unregister(RDMA_NL_LS); +	unregister_pernet_device(&rdma_dev_net_ops);  	unregister_lsm_notifier(&ibdev_lsm_nb);  	ib_sa_cleanup();  	ib_mad_cleanup(); @@ -1950,5 +2561,8 @@ static void __exit ib_core_cleanup(void)  MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); -subsys_initcall(ib_core_init); +/* ib core relies on netdev stack to first register net_ns_type_operations + * ns kobject type before ib_core initialization. + */ +fs_initcall(ib_core_init);  module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 732637c913d9..72141c5b7c95 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -394,7 +394,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)  		cm_id_priv->state = IW_CM_STATE_DESTROYING;  		spin_unlock_irqrestore(&cm_id_priv->lock, flags);  		/* destroy the listening endpoint */ -		cm_id->device->iwcm->destroy_listen(cm_id); +		cm_id->device->ops.iw_destroy_listen(cm_id);  		spin_lock_irqsave(&cm_id_priv->lock, flags);  		break;  	case IW_CM_STATE_ESTABLISHED: @@ -417,7 +417,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)  		 */  		cm_id_priv->state = IW_CM_STATE_DESTROYING;  		spin_unlock_irqrestore(&cm_id_priv->lock, flags); -		cm_id->device->iwcm->reject(cm_id, NULL, 0); +		cm_id->device->ops.iw_reject(cm_id, NULL, 0);  		spin_lock_irqsave(&cm_id_priv->lock, flags);  		break;  	case IW_CM_STATE_CONN_SENT: @@ -427,7 +427,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)  		break;  	}  	if (cm_id_priv->qp) { -		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); +		cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);  		cm_id_priv->qp = NULL;  	}  	spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -504,7 +504,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,  static int iw_cm_map(struct iw_cm_id *cm_id, bool active)  {  	const char *devname = dev_name(&cm_id->device->dev); -	const char *ifname = cm_id->device->iwcm->ifname; +	const char *ifname = cm_id->device->iw_ifname;  	struct iwpm_dev_data pm_reg_msg = {};  	struct iwpm_sa_data pm_msg;  	int status; @@ -526,7 +526,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)  	cm_id->mapped = true;  	pm_msg.loc_addr = cm_id->local_addr;  	pm_msg.rem_addr = cm_id->remote_addr; -	pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ? +	pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ?  		       
IWPM_FLAGS_NO_PORT_MAP : 0;  	if (active)  		status = iwpm_add_and_query_mapping(&pm_msg, @@ -577,7 +577,8 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)  		spin_unlock_irqrestore(&cm_id_priv->lock, flags);  		ret = iw_cm_map(cm_id, false);  		if (!ret) -			ret = cm_id->device->iwcm->create_listen(cm_id, backlog); +			ret = cm_id->device->ops.iw_create_listen(cm_id, +								  backlog);  		if (ret)  			cm_id_priv->state = IW_CM_STATE_IDLE;  		spin_lock_irqsave(&cm_id_priv->lock, flags); @@ -617,7 +618,7 @@ int iw_cm_reject(struct iw_cm_id *cm_id,  	cm_id_priv->state = IW_CM_STATE_IDLE;  	spin_unlock_irqrestore(&cm_id_priv->lock, flags); -	ret = cm_id->device->iwcm->reject(cm_id, private_data, +	ret = cm_id->device->ops.iw_reject(cm_id, private_data,  					  private_data_len);  	clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); @@ -653,25 +654,25 @@ int iw_cm_accept(struct iw_cm_id *cm_id,  		return -EINVAL;  	}  	/* Get the ib_qp given the QPN */ -	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); +	qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);  	if (!qp) {  		spin_unlock_irqrestore(&cm_id_priv->lock, flags);  		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);  		wake_up_all(&cm_id_priv->connect_wait);  		return -EINVAL;  	} -	cm_id->device->iwcm->add_ref(qp); +	cm_id->device->ops.iw_add_ref(qp);  	cm_id_priv->qp = qp;  	spin_unlock_irqrestore(&cm_id_priv->lock, flags); -	ret = cm_id->device->iwcm->accept(cm_id, iw_param); +	ret = cm_id->device->ops.iw_accept(cm_id, iw_param);  	if (ret) {  		/* An error on accept precludes provider events */  		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);  		cm_id_priv->state = IW_CM_STATE_IDLE;  		spin_lock_irqsave(&cm_id_priv->lock, flags);  		if (cm_id_priv->qp) { -			cm_id->device->iwcm->rem_ref(qp); +			cm_id->device->ops.iw_rem_ref(qp);  			cm_id_priv->qp = NULL;  		}  		spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -712,25 +713,25 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)  	}  	/* Get the ib_qp given the QPN */ -	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); +	qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);  	if (!qp) {  		ret = -EINVAL;  		goto err;  	} -	cm_id->device->iwcm->add_ref(qp); +	cm_id->device->ops.iw_add_ref(qp);  	cm_id_priv->qp = qp;  	cm_id_priv->state = IW_CM_STATE_CONN_SENT;  	spin_unlock_irqrestore(&cm_id_priv->lock, flags);  	ret = iw_cm_map(cm_id, true);  	if (!ret) -		ret = cm_id->device->iwcm->connect(cm_id, iw_param); +		ret = cm_id->device->ops.iw_connect(cm_id, iw_param);  	if (!ret)  		return 0;	/* success */  	spin_lock_irqsave(&cm_id_priv->lock, flags);  	if (cm_id_priv->qp) { -		cm_id->device->iwcm->rem_ref(qp); +		cm_id->device->ops.iw_rem_ref(qp);  		cm_id_priv->qp = NULL;  	}  	cm_id_priv->state = IW_CM_STATE_IDLE; @@ -895,7 +896,7 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,  		cm_id_priv->state = IW_CM_STATE_ESTABLISHED;  	} else {  		/* REJECTED or RESET */ -		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); +		cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);  		cm_id_priv->qp = NULL;  		cm_id_priv->state = IW_CM_STATE_IDLE;  	} @@ -946,7 +947,7 @@ static int cm_close_handler(struct iwcm_id_private *cm_id_priv,  	spin_lock_irqsave(&cm_id_priv->lock, flags);  	if (cm_id_priv->qp) { -		cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); +		cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);  		cm_id_priv->qp = NULL;  	}  	switch 
(cm_id_priv->state) { diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index a5d2a20ee697..41929bb83739 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -506,14 +506,14 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,  	int ret;  	const char *err_str = ""; -	ret = nlmsg_validate(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, -			     NULL); +	ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, +					nlmsg_policy, NULL);  	if (ret) {  		err_str = "Invalid attribute";  		goto parse_nlmsg_error;  	} -	ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max - 1, -			  nlmsg_policy, NULL); +	ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1, +				     nlmsg_policy, NULL);  	if (ret) {  		err_str = "Unable to parse the nlmsg";  		goto parse_nlmsg_error; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index e742a6a2c138..cc99479b2c09 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3,7 +3,7 @@   * Copyright (c) 2005 Intel Corporation.  All rights reserved.   * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.   * Copyright (c) 2009 HNR Consulting. All rights reserved. - * Copyright (c) 2014 Intel Corporation.  All rights reserved. + * Copyright (c) 2014,2018 Intel Corporation.  All rights reserved.   *   * This software is available to you under a choice of one of two   * licenses.  You may choose to be licensed under the terms of the GNU @@ -38,10 +38,10 @@  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt  #include <linux/dma-mapping.h> -#include <linux/idr.h>  #include <linux/slab.h>  #include <linux/module.h>  #include <linux/security.h> +#include <linux/xarray.h>  #include <rdma/ib_cache.h>  #include "mad_priv.h" @@ -51,6 +51,32 @@  #include "opa_smi.h"  #include "agent.h" +#define CREATE_TRACE_POINTS +#include <trace/events/ib_mad.h> + +#ifdef CONFIG_TRACEPOINTS +static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, +			  struct ib_mad_qp_info *qp_info, +			  struct trace_event_raw_ib_mad_send_template *entry) +{ +	u16 pkey; +	struct ib_device *dev = qp_info->port_priv->device; +	u8 pnum = qp_info->port_priv->port_num; +	struct ib_ud_wr *wr = &mad_send_wr->send_wr; +	struct rdma_ah_attr attr = {}; + +	rdma_query_ah(wr->ah, &attr); + +	/* These are common */ +	entry->sl = attr.sl; +	ib_query_pkey(dev, pnum, wr->pkey_index, &pkey); +	entry->pkey = pkey; +	entry->rqpn = wr->remote_qpn; +	entry->rqkey = wr->remote_qkey; +	entry->dlid = rdma_ah_get_dlid(&attr); +} +#endif +  static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;  static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; @@ -59,12 +85,9 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests  module_param_named(recv_queue_size, mad_recvq_size, int, 0444);  MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -/* - * The mlx4 driver uses the top byte to distinguish which virtual function - * generated the MAD, so we must avoid using it. 
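For context on the mad.c conversion around here: the idr_preload()/idr_alloc_cyclic() sequence being removed is replaced by a single bounded xa_alloc_cyclic() call, shown in the following hunk. A minimal sketch of that allocation idiom, using illustrative names that are not part of the patch (the patch itself uses DEFINE_XARRAY_ALLOC1() so that client ID 0 stays reserved; the sketch expresses that through the explicit limit instead):

	#include <linux/xarray.h>

	/* Illustrative only: a bounded, cyclic ID allocator. */
	static DEFINE_XARRAY_ALLOC(example_clients);
	static u32 example_next_id;

	static int example_register_client(void *client)
	{
		u32 id;
		int ret;

		/* Allocate the next free ID in [1, 2^24 - 1], wrapping around
		 * and keeping the search cursor in example_next_id; no
		 * idr_preload() or external lock is needed.
		 */
		ret = xa_alloc_cyclic(&example_clients, &id, client,
				      XA_LIMIT(1, (1 << 24) - 1),
				      &example_next_id, GFP_KERNEL);
		if (ret < 0)
			return ret;

		return id;
	}

xa_alloc_cyclic() returns a negative errno on failure and 0 or 1 on success (1 when the cursor wrapped), which is why only ret < 0 is treated as an error here.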
- */ -#define AGENT_ID_LIMIT		(1 << 24) -static DEFINE_IDR(ib_mad_clients); +/* Client ID 0 is used for snoop-only clients */ +static DEFINE_XARRAY_ALLOC1(ib_mad_clients); +static u32 ib_mad_client_next;  static struct list_head ib_mad_port_list;  /* Port list lock */ @@ -389,18 +412,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,  		goto error4;  	} -	idr_preload(GFP_KERNEL); -	idr_lock(&ib_mad_clients); -	ret2 = idr_alloc_cyclic(&ib_mad_clients, mad_agent_priv, 0, -			AGENT_ID_LIMIT, GFP_ATOMIC); -	idr_unlock(&ib_mad_clients); -	idr_preload_end(); - +	/* +	 * The mlx4 driver uses the top byte to distinguish which virtual +	 * function generated the MAD, so we must avoid using it. +	 */ +	ret2 = xa_alloc_cyclic(&ib_mad_clients, &mad_agent_priv->agent.hi_tid, +			mad_agent_priv, XA_LIMIT(0, (1 << 24) - 1), +			&ib_mad_client_next, GFP_KERNEL);  	if (ret2 < 0) {  		ret = ERR_PTR(ret2);  		goto error5;  	} -	mad_agent_priv->agent.hi_tid = ret2;  	/*  	 * Make sure MAD registration (if supplied) @@ -445,12 +467,11 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,  	}  	spin_unlock_irq(&port_priv->reg_lock); +	trace_ib_mad_create_agent(mad_agent_priv);  	return &mad_agent_priv->agent;  error6:  	spin_unlock_irq(&port_priv->reg_lock); -	idr_lock(&ib_mad_clients); -	idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid); -	idr_unlock(&ib_mad_clients); +	xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);  error5:  	ib_mad_agent_security_cleanup(&mad_agent_priv->agent);  error4: @@ -602,6 +623,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)  	struct ib_mad_port_private *port_priv;  	/* Note that we could still be handling received MADs */ +	trace_ib_mad_unregister_agent(mad_agent_priv);  	/*  	 * Canceling all sends results in dropping received response @@ -614,9 +636,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)  	spin_lock_irq(&port_priv->reg_lock);  	remove_mad_reg_req(mad_agent_priv);  	spin_unlock_irq(&port_priv->reg_lock); -	idr_lock(&ib_mad_clients); -	idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid); -	idr_unlock(&ib_mad_clients); +	xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);  	flush_workqueue(port_priv->wq);  	ib_cancel_rmpp_recvs(mad_agent_priv); @@ -821,6 +841,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,  	if (opa && smp->class_version == OPA_SM_CLASS_VERSION) {  		u32 opa_drslid; +		trace_ib_mad_handle_out_opa_smi(opa_smp); +  		if ((opa_get_smp_direction(opa_smp)  		     ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==  		     OPA_LID_PERMISSIVE && @@ -846,6 +868,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,  		    opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)  			goto out;  	} else { +		trace_ib_mad_handle_out_ib_smi(smp); +  		if ((ib_get_smp_direction(smp) ? 
smp->dr_dlid : smp->dr_slid) ==  		     IB_LID_PERMISSIVE &&  		     smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == @@ -1223,6 +1247,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)  	spin_lock_irqsave(&qp_info->send_queue.lock, flags);  	if (qp_info->send_queue.count < qp_info->send_queue.max_active) { +		trace_ib_mad_ib_send_mad(mad_send_wr, qp_info);  		ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr,  				   NULL);  		list = &qp_info->send_queue.list; @@ -1756,7 +1781,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv,  		 */  		hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;  		rcu_read_lock(); -		mad_agent = idr_find(&ib_mad_clients, hi_tid); +		mad_agent = xa_load(&ib_mad_clients, hi_tid);  		if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount))  			mad_agent = NULL;  		rcu_read_unlock(); @@ -2077,6 +2102,8 @@ static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv  	enum smi_forward_action retsmi;  	struct ib_smp *smp = (struct ib_smp *)recv->mad; +	trace_ib_mad_handle_ib_smi(smp); +  	if (smi_handle_dr_smp_recv(smp,  				   rdma_cap_ib_switch(port_priv->device),  				   port_num, @@ -2162,6 +2189,8 @@ handle_opa_smi(struct ib_mad_port_private *port_priv,  	enum smi_forward_action retsmi;  	struct opa_smp *smp = (struct opa_smp *)recv->mad; +	trace_ib_mad_handle_opa_smi(smp); +  	if (opa_smi_handle_dr_smp_recv(smp,  				   rdma_cap_ib_switch(port_priv->device),  				   port_num, @@ -2286,6 +2315,9 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)  	if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))  		goto out; +	trace_ib_mad_recv_done_handler(qp_info, wc, +				       (struct ib_mad_hdr *)recv->mad); +  	mad_size = recv->mad_size;  	response = alloc_mad_private(mad_size, GFP_KERNEL);  	if (!response) @@ -2332,6 +2364,7 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)  	mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad);  	if (mad_agent) { +		trace_ib_mad_recv_done_agent(mad_agent);  		ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);  		/*  		 * recv is freed up in error cases in ib_mad_complete_recv @@ -2496,6 +2529,9 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc)  	send_queue = mad_list->mad_queue;  	qp_info = send_queue->qp_info; +	trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv); +	trace_ib_mad_send_done_handler(mad_send_wr, wc); +  retry:  	ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,  			    mad_send_wr->header_mapping, @@ -2527,6 +2563,7 @@ retry:  	ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);  	if (queued_send_wr) { +		trace_ib_mad_send_done_resend(queued_send_wr, qp_info);  		ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr,  				   NULL);  		if (ret) { @@ -2574,6 +2611,7 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,  		if (mad_send_wr->retry) {  			/* Repost send */  			mad_send_wr->retry = 0; +			trace_ib_mad_error_handler(mad_send_wr, qp_info);  			ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,  					   NULL);  			if (!ret) @@ -3356,9 +3394,6 @@ int ib_mad_init(void)  	INIT_LIST_HEAD(&ib_mad_port_list); -	/* Client ID 0 is used for snoop-only clients */ -	idr_alloc(&ib_mad_clients, NULL, 0, 0, GFP_KERNEL); -  	if (ib_register_client(&mad_client)) {  		pr_err("Couldn't register ib_mad client\n");  		return -EINVAL; diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 
216509036aa8..956b3a7dfed7 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -73,14 +73,14 @@ struct ib_mad_private_header {  	struct ib_mad_recv_wc recv_wc;  	struct ib_wc wc;  	u64 mapping; -} __attribute__ ((packed)); +} __packed;  struct ib_mad_private {  	struct ib_mad_private_header header;  	size_t mad_size;  	struct ib_grh grh;  	u8 mad[0]; -} __attribute__ ((packed)); +} __packed;  struct ib_rmpp_segment {  	struct list_head list; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index d50ff70bb24b..cd338ddc4a39 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -804,7 +804,6 @@ static void mcast_event_handler(struct ib_event_handler *handler,  	switch (event->event) {  	case IB_EVENT_PORT_ERR:  	case IB_EVENT_LID_CHANGE: -	case IB_EVENT_SM_CHANGE:  	case IB_EVENT_CLIENT_REREGISTER:  		mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);  		break; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 11ed58d3fce5..69188cbbd99b 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -116,6 +116,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {  	[RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },  	[RDMA_NLDEV_ATTR_LINK_TYPE]		= { .type = NLA_NUL_STRING,  				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, +	[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]	= { .type = NLA_U8 }, +	[RDMA_NLDEV_ATTR_DEV_PROTOCOL]		= { .type = NLA_NUL_STRING, +				    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, +	[RDMA_NLDEV_NET_NS_FD]			= { .type = NLA_U32 },  };  static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -198,6 +202,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device)  static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)  {  	char fw[IB_FW_VERSION_NAME_MAX]; +	int ret = 0; +	u8 port;  	if (fill_nldev_handle(msg, device))  		return -EMSGSIZE; @@ -226,7 +232,25 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)  		return -EMSGSIZE;  	if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))  		return -EMSGSIZE; -	return 0; + +	/* +	 * Link type is determined on first port and mlx4 device +	 * which can potentially have two different link type for the same +	 * IB device is considered as better to be avoided in the future, +	 */ +	port = rdma_start_port(device); +	if (rdma_cap_opa_mad(device, port)) +		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa"); +	else if (rdma_protocol_ib(device, port)) +		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib"); +	else if (rdma_protocol_iwarp(device, port)) +		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw"); +	else if (rdma_protocol_roce(device, port)) +		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce"); +	else if (rdma_protocol_usnic(device, port)) +		ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, +				     "usnic"); +	return ret;  }  static int fill_port_info(struct sk_buff *msg, @@ -292,7 +316,8 @@ static int fill_res_info_entry(struct sk_buff *msg,  {  	struct nlattr *entry_attr; -	entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); +	entry_attr = nla_nest_start_noflag(msg, +					   RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY);  	if (!entry_attr)  		return -EMSGSIZE; @@ -327,7 +352,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)  	if 
(fill_nldev_handle(msg, device))  		return -EMSGSIZE; -	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); +	table_attr = nla_nest_start_noflag(msg, RDMA_NLDEV_ATTR_RES_SUMMARY);  	if (!table_attr)  		return -EMSGSIZE; @@ -607,14 +632,14 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,  	u32 index;  	int err; -	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, -			  nldev_policy, extack); +	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, extack);  	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])  		return -EINVAL;  	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); -	device = ib_device_get_by_index(index); +	device = ib_device_get_by_index(sock_net(skb->sk), index);  	if (!device)  		return -EINVAL; @@ -652,13 +677,13 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,  	u32 index;  	int err; -	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, -			  extack); +	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, extack);  	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])  		return -EINVAL;  	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); -	device = ib_device_get_by_index(index); +	device = ib_device_get_by_index(sock_net(skb->sk), index);  	if (!device)  		return -EINVAL; @@ -668,9 +693,20 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,  		nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME],  			    IB_DEVICE_NAME_MAX);  		err = ib_device_rename(device, name); +		goto done; +	} + +	if (tb[RDMA_NLDEV_NET_NS_FD]) { +		u32 ns_fd; + +		ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]); +		err = ib_device_set_netns_put(skb, device, ns_fd); +		goto put_done;  	} +done:  	ib_device_put(device); +put_done:  	return err;  } @@ -706,7 +742,7 @@ static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)  {  	/*  	 * There is no need to take lock, because -	 * we are relying on ib_core's lists_rwsem +	 * we are relying on ib_core's locking.  	 
*/  	return ib_enum_all_devs(_nldev_get_dumpit, skb, cb);  } @@ -721,15 +757,15 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,  	u32 port;  	int err; -	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, -			  nldev_policy, extack); +	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, extack);  	if (err ||  	    !tb[RDMA_NLDEV_ATTR_DEV_INDEX] ||  	    !tb[RDMA_NLDEV_ATTR_PORT_INDEX])  		return -EINVAL;  	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); -	device = ib_device_get_by_index(index); +	device = ib_device_get_by_index(sock_net(skb->sk), index);  	if (!device)  		return -EINVAL; @@ -777,13 +813,13 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,  	int err;  	unsigned int p; -	err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, -			  nldev_policy, NULL); +	err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, NULL);  	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])  		return -EINVAL;  	ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); -	device = ib_device_get_by_index(ifindex); +	device = ib_device_get_by_index(sock_net(skb->sk), ifindex);  	if (!device)  		return -EINVAL; @@ -832,13 +868,13 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,  	u32 index;  	int ret; -	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, -			  nldev_policy, extack); +	ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, extack);  	if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])  		return -EINVAL;  	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); -	device = ib_device_get_by_index(index); +	device = ib_device_get_by_index(sock_net(skb->sk), index);  	if (!device)  		return -EINVAL; @@ -886,7 +922,6 @@ static int _nldev_res_get_dumpit(struct ib_device *device,  		nlmsg_cancel(skb, nlh);  		goto out;  	} -  	nlmsg_end(skb, nlh);  	idx++; @@ -981,13 +1016,13 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,  	struct sk_buff *msg;  	int ret; -	ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, -			  nldev_policy, extack); +	ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, extack);  	if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id])  		return -EINVAL;  	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); -	device = ib_device_get_by_index(index); +	device = ib_device_get_by_index(sock_net(skb->sk), index);  	if (!device)  		return -EINVAL; @@ -1070,8 +1105,8 @@ static int res_get_common_dumpit(struct sk_buff *skb,  	u32 index, port = 0;  	bool filled = false; -	err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, -			  nldev_policy, NULL); +	err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, NULL);  	/*  	 * Right now, we are expecting the device index to get res information,  	 * but it is possible to extend this code to return all devices in @@ -1084,7 +1119,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,  		return -EINVAL;  	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); -	device = ib_device_get_by_index(index); +	device = ib_device_get_by_index(sock_net(skb->sk), index);  	if (!device)  		return -EINVAL; @@ -1108,7 +1143,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,  		goto err;  	} -	table_attr = nla_nest_start(skb, fe->nldev_attr); +	table_attr = nla_nest_start_noflag(skb, fe->nldev_attr);  	if (!table_attr) {  		ret = -EMSGSIZE;  		goto err; @@ 
-1134,7 +1169,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,  		filled = true; -		entry_attr = nla_nest_start(skb, fe->entry); +		entry_attr = nla_nest_start_noflag(skb, fe->entry);  		if (!entry_attr) {  			ret = -EMSGSIZE;  			rdma_restrack_put(res); @@ -1249,8 +1284,8 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,  	char type[IFNAMSIZ];  	int err; -	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, -			  nldev_policy, extack); +	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, extack);  	if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] ||  	    !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME])  		return -EINVAL; @@ -1293,13 +1328,13 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,  	u32 index;  	int err; -	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, -			  nldev_policy, extack); +	err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +				     nldev_policy, extack);  	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])  		return -EINVAL;  	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); -	device = ib_device_get_by_index(index); +	device = ib_device_get_by_index(sock_net(skb->sk), index);  	if (!device)  		return -EINVAL; @@ -1312,6 +1347,58 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,  	return 0;  } +static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, +			      struct netlink_ext_ack *extack) +{ +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; +	struct sk_buff *msg; +	int err; + +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +			  nldev_policy, extack); +	if (err) +		return err; + +	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +	if (!msg) +		return -ENOMEM; + +	nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, +			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, +					 RDMA_NLDEV_CMD_SYS_GET), +			0, 0); + +	err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, +			 (u8)ib_devices_shared_netns); +	if (err) { +		nlmsg_free(msg); +		return err; +	} +	nlmsg_end(msg, nlh); +	return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); +} + +static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, +				  struct netlink_ext_ack *extack) +{ +	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; +	u8 enable; +	int err; + +	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, +			  nldev_policy, extack); +	if (err || !tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]) +		return -EINVAL; + +	enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]); +	/* Only 0 and 1 are supported */ +	if (enable > 1) +		return -EINVAL; + +	err = rdma_compatdev_set(enable); +	return err; +} +  static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {  	[RDMA_NLDEV_CMD_GET] = {  		.doit = nldev_get_doit, @@ -1357,6 +1444,13 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {  		.doit = nldev_res_get_pd_doit,  		.dump = nldev_res_get_pd_dumpit,  	}, +	[RDMA_NLDEV_CMD_SYS_GET] = { +		.doit = nldev_sys_get_doit, +	}, +	[RDMA_NLDEV_CMD_SYS_SET] = { +		.doit = nldev_set_sys_set_doit, +		.flags = RDMA_NL_ADMIN_PERM, +	},  };  void __init nldev_init(void) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 778375ff664e..ccf4d069c25c 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -125,9 +125,10 @@ static void assert_uverbs_usecnt(struct ib_uobject *uobj,   * and consumes the kref on the uobj.   
*/  static int uverbs_destroy_uobject(struct ib_uobject *uobj, -				  enum rdma_remove_reason reason) +				  enum rdma_remove_reason reason, +				  struct uverbs_attr_bundle *attrs)  { -	struct ib_uverbs_file *ufile = uobj->ufile; +	struct ib_uverbs_file *ufile = attrs->ufile;  	unsigned long flags;  	int ret; @@ -135,7 +136,8 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj,  	assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE);  	if (uobj->object) { -		ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason); +		ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason, +								attrs);  		if (ret) {  			if (ib_is_destroy_retryable(ret, reason, uobj))  				return ret; @@ -196,9 +198,9 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj,   * version requires the caller to have already obtained an   * LOOKUP_DESTROY uobject kref.   */ -int uobj_destroy(struct ib_uobject *uobj) +int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs)  { -	struct ib_uverbs_file *ufile = uobj->ufile; +	struct ib_uverbs_file *ufile = attrs->ufile;  	int ret;  	down_read(&ufile->hw_destroy_rwsem); @@ -207,7 +209,7 @@ int uobj_destroy(struct ib_uobject *uobj)  	if (ret)  		goto out_unlock; -	ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY); +	ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY, attrs);  	if (ret) {  		atomic_set(&uobj->usecnt, 0);  		goto out_unlock; @@ -224,18 +226,17 @@ out_unlock:   * uverbs_put_destroy.   */  struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj, -				      u32 id, -				      const struct uverbs_attr_bundle *attrs) +				      u32 id, struct uverbs_attr_bundle *attrs)  {  	struct ib_uobject *uobj;  	int ret;  	uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id, -				       UVERBS_LOOKUP_DESTROY); +				       UVERBS_LOOKUP_DESTROY, attrs);  	if (IS_ERR(uobj))  		return uobj; -	ret = uobj_destroy(uobj); +	ret = uobj_destroy(uobj, attrs);  	if (ret) {  		rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);  		return ERR_PTR(ret); @@ -249,7 +250,7 @@ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,   * (negative errno on failure). For use by callers that do not need the uobj.   */  int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id, -			   const struct uverbs_attr_bundle *attrs) +			   struct uverbs_attr_bundle *attrs)  {  	struct ib_uobject *uobj; @@ -296,25 +297,13 @@ static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile,  static int idr_add_uobj(struct ib_uobject *uobj)  { -	int ret; - -	idr_preload(GFP_KERNEL); -	spin_lock(&uobj->ufile->idr_lock); - -	/* -	 * We start with allocating an idr pointing to NULL. This represents an -	 * object which isn't initialized yet. We'll replace it later on with -	 * the real object once we commit. -	 */ -	ret = idr_alloc(&uobj->ufile->idr, NULL, 0, -			min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT); -	if (ret >= 0) -		uobj->id = ret; - -	spin_unlock(&uobj->ufile->idr_lock); -	idr_preload_end(); - -	return ret < 0 ? ret : 0; +       /* +        * We start with allocating an idr pointing to NULL. This represents an +        * object which isn't initialized yet. We'll replace it later on with +        * the real object once we commit. +        */ +	return xa_alloc(&uobj->ufile->idr, &uobj->id, NULL, xa_limit_32b, +			GFP_KERNEL);  }  /* Returns the ib_uobject or an error. The caller should check for IS_ERR. 
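The idr_alloc(NULL)-then-idr_replace() dance that this rdma_core.c hunk removes becomes a reserve-then-publish idiom on the XArray. A rough, self-contained sketch with hypothetical names (not the patch's code):

	#include <linux/kernel.h>
	#include <linux/xarray.h>

	static DEFINE_XARRAY_ALLOC(example_objects);

	/* Reserve an ID while the object is still being set up. Storing NULL
	 * keeps the slot allocated but makes lookups that test for a non-NULL
	 * entry fail.
	 */
	static int example_reserve(u32 *id)
	{
		return xa_alloc(&example_objects, id, NULL, xa_limit_32b,
				GFP_KERNEL);
	}

	/* Publish the fully initialized object under the reserved ID. */
	static void example_publish(u32 id, void *obj)
	{
		void *old;

		old = xa_store(&example_objects, id, obj, GFP_KERNEL);
		WARN_ON(old != NULL);	/* slot was reserved with NULL above */
	}

On the abort path the reservation is simply dropped with xa_erase(), which matches what the converted alloc_abort_idr_uobject() does.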
*/ @@ -324,29 +313,20 @@ lookup_get_idr_uobject(const struct uverbs_api_object *obj,  		       enum rdma_lookup_mode mode)  {  	struct ib_uobject *uobj; -	unsigned long idrno = id;  	if (id < 0 || id > ULONG_MAX)  		return ERR_PTR(-EINVAL);  	rcu_read_lock(); -	/* object won't be released as we're protected in rcu */ -	uobj = idr_find(&ufile->idr, idrno); -	if (!uobj) { -		uobj = ERR_PTR(-ENOENT); -		goto free; -	} -  	/*  	 * The idr_find is guaranteed to return a pointer to something that  	 * isn't freed yet, or NULL, as the free after idr_remove goes through  	 * kfree_rcu(). However the object may still have been released and  	 * kfree() could be called at any time.  	 */ -	if (!kref_get_unless_zero(&uobj->ref)) +	uobj = xa_load(&ufile->idr, id); +	if (!uobj || !kref_get_unless_zero(&uobj->ref))  		uobj = ERR_PTR(-ENOENT); - -free:  	rcu_read_unlock();  	return uobj;  } @@ -393,12 +373,13 @@ lookup_get_fd_uobject(const struct uverbs_api_object *obj,  struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,  					   struct ib_uverbs_file *ufile, s64 id, -					   enum rdma_lookup_mode mode) +					   enum rdma_lookup_mode mode, +					   struct uverbs_attr_bundle *attrs)  {  	struct ib_uobject *uobj;  	int ret; -	if (IS_ERR(obj) && PTR_ERR(obj) == -ENOMSG) { +	if (obj == ERR_PTR(-ENOMSG)) {  		/* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */  		uobj = lookup_get_idr_uobject(NULL, ufile, id, mode);  		if (IS_ERR(uobj)) @@ -431,6 +412,8 @@ struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,  	ret = uverbs_try_lock_object(uobj, mode);  	if (ret)  		goto free; +	if (attrs) +		attrs->context = uobj->context;  	return uobj;  free: @@ -438,38 +421,6 @@ free:  	uverbs_uobject_put(uobj);  	return ERR_PTR(ret);  } -struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type, -				  u32 object_id, -				  struct uverbs_attr_bundle *attrs) -{ -	struct ib_uobject *uobj; - -	uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile, -				       object_id, UVERBS_LOOKUP_READ); -	if (IS_ERR(uobj)) -		return uobj; - -	attrs->context = uobj->context; - -	return uobj; -} - -struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type, -				   u32 object_id, -				   struct uverbs_attr_bundle *attrs) -{ -	struct ib_uobject *uobj; - -	uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile, -				       object_id, UVERBS_LOOKUP_WRITE); - -	if (IS_ERR(uobj)) -		return uobj; - -	attrs->context = uobj->context; - -	return uobj; -}  static struct ib_uobject *  alloc_begin_idr_uobject(const struct uverbs_api_object *obj, @@ -489,14 +440,12 @@ alloc_begin_idr_uobject(const struct uverbs_api_object *obj,  	ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,  				   RDMACG_RESOURCE_HCA_OBJECT);  	if (ret) -		goto idr_remove; +		goto remove;  	return uobj; -idr_remove: -	spin_lock(&ufile->idr_lock); -	idr_remove(&ufile->idr, uobj->id); -	spin_unlock(&ufile->idr_lock); +remove: +	xa_erase(&ufile->idr, uobj->id);  uobj_put:  	uverbs_uobject_put(uobj);  	return ERR_PTR(ret); @@ -526,7 +475,8 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj,  }  struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, -					    struct ib_uverbs_file *ufile) +					    struct ib_uverbs_file *ufile, +					    struct uverbs_attr_bundle *attrs)  {  	struct ib_uobject *ret; @@ -546,6 +496,8 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,  		
up_read(&ufile->hw_destroy_rwsem);  		return ret;  	} +	if (attrs) +		attrs->context = ret->context;  	return ret;  } @@ -554,18 +506,17 @@ static void alloc_abort_idr_uobject(struct ib_uobject *uobj)  	ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,  			   RDMACG_RESOURCE_HCA_OBJECT); -	spin_lock(&uobj->ufile->idr_lock); -	idr_remove(&uobj->ufile->idr, uobj->id); -	spin_unlock(&uobj->ufile->idr_lock); +	xa_erase(&uobj->ufile->idr, uobj->id);  }  static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj, -					       enum rdma_remove_reason why) +					       enum rdma_remove_reason why, +					       struct uverbs_attr_bundle *attrs)  {  	const struct uverbs_obj_idr_type *idr_type =  		container_of(uobj->uapi_object->type_attrs,  			     struct uverbs_obj_idr_type, type); -	int ret = idr_type->destroy_object(uobj, why); +	int ret = idr_type->destroy_object(uobj, why, attrs);  	/*  	 * We can only fail gracefully if the user requested to destroy the @@ -586,9 +537,7 @@ static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,  static void remove_handle_idr_uobject(struct ib_uobject *uobj)  { -	spin_lock(&uobj->ufile->idr_lock); -	idr_remove(&uobj->ufile->idr, uobj->id); -	spin_unlock(&uobj->ufile->idr_lock); +	xa_erase(&uobj->ufile->idr, uobj->id);  	/* Matches the kref in alloc_commit_idr_uobject */  	uverbs_uobject_put(uobj);  } @@ -599,7 +548,8 @@ static void alloc_abort_fd_uobject(struct ib_uobject *uobj)  }  static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj, -					      enum rdma_remove_reason why) +					      enum rdma_remove_reason why, +					      struct uverbs_attr_bundle *attrs)  {  	const struct uverbs_obj_fd_type *fd_type = container_of(  		uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type); @@ -618,17 +568,17 @@ static void remove_handle_fd_uobject(struct ib_uobject *uobj)  static int alloc_commit_idr_uobject(struct ib_uobject *uobj)  {  	struct ib_uverbs_file *ufile = uobj->ufile; +	void *old; -	spin_lock(&ufile->idr_lock);  	/*  	 * We already allocated this IDR with a NULL object, so  	 * this shouldn't fail.  	 * -	 * NOTE: Once we set the IDR we loose ownership of our kref on uobj. +	 * NOTE: Storing the uobj transfers our kref on uobj to the XArray.  	 * It will be put by remove_commit_idr_uobject()  	 */ -	WARN_ON(idr_replace(&ufile->idr, uobj, uobj->id)); -	spin_unlock(&ufile->idr_lock); +	old = xa_store(&ufile->idr, uobj->id, uobj, GFP_KERNEL); +	WARN_ON(old != NULL);  	return 0;  } @@ -675,15 +625,16 @@ static int alloc_commit_fd_uobject(struct ib_uobject *uobj)   * caller can no longer assume uobj is valid. If this function fails it   * destroys the uboject, including the attached HW object.   */ -int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj) +int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj, +					   struct uverbs_attr_bundle *attrs)  { -	struct ib_uverbs_file *ufile = uobj->ufile; +	struct ib_uverbs_file *ufile = attrs->ufile;  	int ret;  	/* alloc_commit consumes the uobj kref */  	ret = uobj->uapi_object->type_class->alloc_commit(uobj);  	if (ret) { -		uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT); +		uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);  		up_read(&ufile->hw_destroy_rwsem);  		return ret;  	} @@ -707,12 +658,13 @@ int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj)   * This consumes the kref for uobj. It is up to the caller to unwind the HW   * object and anything else connected to uobj before calling this.   
*/ -void rdma_alloc_abort_uobject(struct ib_uobject *uobj) +void rdma_alloc_abort_uobject(struct ib_uobject *uobj, +			      struct uverbs_attr_bundle *attrs)  {  	struct ib_uverbs_file *ufile = uobj->ufile;  	uobj->object = NULL; -	uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT); +	uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);  	/* Matches the down_read in rdma_alloc_begin_uobject */  	up_read(&ufile->hw_destroy_rwsem); @@ -760,29 +712,28 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj,  void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile)  { -	spin_lock_init(&ufile->idr_lock); -	idr_init(&ufile->idr); +	xa_init_flags(&ufile->idr, XA_FLAGS_ALLOC);  }  void release_ufile_idr_uobject(struct ib_uverbs_file *ufile)  {  	struct ib_uobject *entry; -	int id; +	unsigned long id;  	/*  	 * At this point uverbs_cleanup_ufile() is guaranteed to have run, and -	 * there are no HW objects left, however the IDR is still populated +	 * there are no HW objects left, however the xarray is still populated  	 * with anything that has not been cleaned up by userspace. Since the  	 * kref on ufile is 0, nothing is allowed to call lookup_get.  	 *  	 * This is an optimized equivalent to remove_handle_idr_uobject  	 */ -	idr_for_each_entry(&ufile->idr, entry, id) { +	xa_for_each(&ufile->idr, id, entry) {  		WARN_ON(entry->object);  		uverbs_uobject_put(entry);  	} -	idr_destroy(&ufile->idr); +	xa_destroy(&ufile->idr);  }  const struct uverbs_obj_type_class uverbs_idr_class = { @@ -814,6 +765,10 @@ void uverbs_close_fd(struct file *f)  {  	struct ib_uobject *uobj = f->private_data;  	struct ib_uverbs_file *ufile = uobj->ufile; +	struct uverbs_attr_bundle attrs = { +		.context = uobj->context, +		.ufile = ufile, +	};  	if (down_read_trylock(&ufile->hw_destroy_rwsem)) {  		/* @@ -823,7 +778,7 @@ void uverbs_close_fd(struct file *f)  		 * write lock here, or we have a kernel bug.  		 */  		WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE)); -		uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE); +		uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE, &attrs);  		up_read(&ufile->hw_destroy_rwsem);  	} @@ -872,6 +827,7 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,  {  	struct ib_uobject *obj, *next_obj;  	int ret = -EINVAL; +	struct uverbs_attr_bundle attrs = { .ufile = ufile };  	/*  	 * This shouldn't run while executing other commands on this @@ -883,12 +839,13 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,  	 * other threads (which might still use the FDs) chance to run.  	 */  	list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) { +		attrs.context = obj->context;  		/*  		 * if we hit this WARN_ON, that means we are  		 * racing with a lookup_get.  		 
*/  		WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE)); -		if (!uverbs_destroy_uobject(obj, reason)) +		if (!uverbs_destroy_uobject(obj, reason, &attrs))  			ret = 0;  		else  			atomic_set(&obj->usecnt, 0); @@ -967,26 +924,25 @@ const struct uverbs_obj_type_class uverbs_fd_class = {  EXPORT_SYMBOL(uverbs_fd_class);  struct ib_uobject * -uverbs_get_uobject_from_file(u16 object_id, -			     struct ib_uverbs_file *ufile, -			     enum uverbs_obj_access access, s64 id) +uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, +			     s64 id, struct uverbs_attr_bundle *attrs)  {  	const struct uverbs_api_object *obj = -		uapi_get_object(ufile->device->uapi, object_id); +		uapi_get_object(attrs->ufile->device->uapi, object_id);  	switch (access) {  	case UVERBS_ACCESS_READ: -		return rdma_lookup_get_uobject(obj, ufile, id, -					       UVERBS_LOOKUP_READ); +		return rdma_lookup_get_uobject(obj, attrs->ufile, id, +					       UVERBS_LOOKUP_READ, attrs);  	case UVERBS_ACCESS_DESTROY:  		/* Actual destruction is done inside uverbs_handle_method */ -		return rdma_lookup_get_uobject(obj, ufile, id, -					       UVERBS_LOOKUP_DESTROY); +		return rdma_lookup_get_uobject(obj, attrs->ufile, id, +					       UVERBS_LOOKUP_DESTROY, attrs);  	case UVERBS_ACCESS_WRITE: -		return rdma_lookup_get_uobject(obj, ufile, id, -					       UVERBS_LOOKUP_WRITE); +		return rdma_lookup_get_uobject(obj, attrs->ufile, id, +					       UVERBS_LOOKUP_WRITE, attrs);  	case UVERBS_ACCESS_NEW: -		return rdma_alloc_begin_uobject(obj, ufile); +		return rdma_alloc_begin_uobject(obj, attrs->ufile, attrs);  	default:  		WARN_ON(true);  		return ERR_PTR(-EOPNOTSUPP); @@ -994,8 +950,8 @@ uverbs_get_uobject_from_file(u16 object_id,  }  int uverbs_finalize_object(struct ib_uobject *uobj, -			   enum uverbs_obj_access access, -			   bool commit) +			   enum uverbs_obj_access access, bool commit, +			   struct uverbs_attr_bundle *attrs)  {  	int ret = 0; @@ -1018,9 +974,9 @@ int uverbs_finalize_object(struct ib_uobject *uobj,  		break;  	case UVERBS_ACCESS_NEW:  		if (commit) -			ret = rdma_alloc_commit_uobject(uobj); +			ret = rdma_alloc_commit_uobject(uobj, attrs);  		else -			rdma_alloc_abort_uobject(uobj); +			rdma_alloc_abort_uobject(uobj, attrs);  		break;  	default:  		WARN_ON(true); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 69f8db66925e..5445323629b5 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -48,7 +48,7 @@ struct ib_uverbs_device;  void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile,  			     enum rdma_remove_reason reason); -int uobj_destroy(struct ib_uobject *uobj); +int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs);  /*   * uverbs_uobject_get is called in order to increase the reference count on @@ -83,9 +83,8 @@ void uverbs_close_fd(struct file *f);   * uverbs_finalize_objects are called.   */  struct ib_uobject * -uverbs_get_uobject_from_file(u16 object_id, -			     struct ib_uverbs_file *ufile, -			     enum uverbs_obj_access access, s64 id); +uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, +			     s64 id, struct uverbs_attr_bundle *attrs);  /*   * Note that certain finalize stages could return a status: @@ -103,8 +102,8 @@ uverbs_get_uobject_from_file(u16 object_id,   * object.   
*/  int uverbs_finalize_object(struct ib_uobject *uobj, -			   enum uverbs_obj_access access, -			   bool commit); +			   enum uverbs_obj_access access, bool commit, +			   struct uverbs_attr_bundle *attrs);  int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7925e45ea88a..7d8071c7e564 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -40,7 +40,7 @@  #include <linux/slab.h>  #include <linux/dma-mapping.h>  #include <linux/kref.h> -#include <linux/idr.h> +#include <linux/xarray.h>  #include <linux/workqueue.h>  #include <uapi/linux/if_ether.h>  #include <rdma/ib_pack.h> @@ -183,8 +183,7 @@ static struct ib_client sa_client = {  	.remove = ib_sa_remove_one  }; -static DEFINE_SPINLOCK(idr_lock); -static DEFINE_IDR(query_idr); +static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);  static DEFINE_SPINLOCK(tid_lock);  static u32 tid; @@ -1028,8 +1027,8 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,  	    !(NETLINK_CB(skb).sk))  		return -EPERM; -	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), -			nlmsg_len(nlh), ib_nl_policy, NULL); +	ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), +				   nlmsg_len(nlh), ib_nl_policy, NULL);  	attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];  	if (ret || !attr)  		goto settimeout_out; @@ -1080,8 +1079,8 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)  	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)  		return 0; -	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), -			nlmsg_len(nlh), ib_nl_policy, NULL); +	ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), +				   nlmsg_len(nlh), ib_nl_policy, NULL);  	if (ret)  		return 0; @@ -1180,14 +1179,14 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)  	struct ib_mad_agent *agent;  	struct ib_mad_send_buf *mad_buf; -	spin_lock_irqsave(&idr_lock, flags); -	if (idr_find(&query_idr, id) != query) { -		spin_unlock_irqrestore(&idr_lock, flags); +	xa_lock_irqsave(&queries, flags); +	if (xa_load(&queries, id) != query) { +		xa_unlock_irqrestore(&queries, flags);  		return;  	}  	agent = query->port->agent;  	mad_buf = query->mad_buf; -	spin_unlock_irqrestore(&idr_lock, flags); +	xa_unlock_irqrestore(&queries, flags);  	/*  	 * If the query is still on the netlink request list, schedule @@ -1363,21 +1362,14 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent)  static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,  		    gfp_t gfp_mask)  { -	bool preload = gfpflags_allow_blocking(gfp_mask);  	unsigned long flags;  	int ret, id; -	if (preload) -		idr_preload(gfp_mask); -	spin_lock_irqsave(&idr_lock, flags); - -	id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); - -	spin_unlock_irqrestore(&idr_lock, flags); -	if (preload) -		idr_preload_end(); -	if (id < 0) -		return id; +	xa_lock_irqsave(&queries, flags); +	ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask); +	xa_unlock_irqrestore(&queries, flags); +	if (ret < 0) +		return ret;  	query->mad_buf->timeout_ms  = timeout_ms;  	query->mad_buf->context[0] = query; @@ -1394,9 +1386,9 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,  	ret = ib_post_send_mad(query->mad_buf, NULL);  	if (ret) { -		spin_lock_irqsave(&idr_lock, flags); -		idr_remove(&query_idr, id); -		spin_unlock_irqrestore(&idr_lock, flags); +		
xa_lock_irqsave(&queries, flags); +		__xa_erase(&queries, id); +		xa_unlock_irqrestore(&queries, flags);  	}  	/* @@ -2188,9 +2180,9 @@ static void send_handler(struct ib_mad_agent *agent,  			break;  		} -	spin_lock_irqsave(&idr_lock, flags); -	idr_remove(&query_idr, query->id); -	spin_unlock_irqrestore(&idr_lock, flags); +	xa_lock_irqsave(&queries, flags); +	__xa_erase(&queries, query->id); +	xa_unlock_irqrestore(&queries, flags);  	free_mad(query);  	if (query->client) @@ -2475,5 +2467,5 @@ void ib_sa_cleanup(void)  	destroy_workqueue(ib_nl_wq);  	mcast_cleanup();  	ib_unregister_client(&sa_client); -	idr_destroy(&query_idr); +	WARN_ON(!xa_empty(&queries));  } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 9b6a065bdfa5..c78d0c9646ae 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -349,10 +349,15 @@ static struct attribute *port_default_attrs[] = {  static size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf)  { -	if (!gid_attr->ndev) -		return -EINVAL; - -	return sprintf(buf, "%s\n", gid_attr->ndev->name); +	struct net_device *ndev; +	size_t ret = -EINVAL; + +	rcu_read_lock(); +	ndev = rcu_dereference(gid_attr->ndev); +	if (ndev) +		ret = sprintf(buf, "%s\n", ndev->name); +	rcu_read_unlock(); +	return ret;  }  static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) @@ -1015,8 +1020,10 @@ err_free_stats:  	return;  } -static int add_port(struct ib_device *device, int port_num) +static int add_port(struct ib_core_device *coredev, int port_num)  { +	struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); +	bool is_full_dev = &device->coredev == coredev;  	struct ib_port *p;  	struct ib_port_attr attr;  	int i; @@ -1034,7 +1041,7 @@ static int add_port(struct ib_device *device, int port_num)  	p->port_num   = port_num;  	ret = kobject_init_and_add(&p->kobj, &port_type, -				   device->ports_kobj, +				   coredev->ports_kobj,  				   "%d", port_num);  	if (ret) {  		kfree(p); @@ -1055,7 +1062,7 @@ static int add_port(struct ib_device *device, int port_num)  		goto err_put;  	} -	if (device->ops.process_mad) { +	if (device->ops.process_mad && is_full_dev) {  		p->pma_table = get_counter_table(device, port_num);  		ret = sysfs_create_group(&p->kobj, p->pma_table);  		if (ret) @@ -1111,7 +1118,7 @@ static int add_port(struct ib_device *device, int port_num)  	if (ret)  		goto err_free_pkey; -	if (device->ops.init_port) { +	if (device->ops.init_port && is_full_dev) {  		ret = device->ops.init_port(device, port_num, &p->kobj);  		if (ret)  			goto err_remove_pkey; @@ -1122,10 +1129,10 @@ static int add_port(struct ib_device *device, int port_num)  	 * port, so holder should be device. Therefore skip per port conunter  	 * initialization.  	 
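The sa_query.c hunks above keep the query table consistent with the MAD completion path by pairing the XArray's built-in spinlock with the locked (__xa_*) helpers. A stripped-down sketch of that pattern, with illustrative names only:

	#include <linux/xarray.h>

	static DEFINE_XARRAY_FLAGS(example_queries,
				   XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);

	static int example_track_query(void *query, u32 *id)
	{
		unsigned long flags;
		int ret;

		/* Entries are also erased from the send/receive handlers, so
		 * take the XArray's own lock IRQ-safe and use __xa_alloc(),
		 * which expects the lock to be held already (and may drop it
		 * internally to allocate memory).
		 */
		xa_lock_irqsave(&example_queries, flags);
		ret = __xa_alloc(&example_queries, id, query, xa_limit_32b,
				 GFP_KERNEL);
		xa_unlock_irqrestore(&example_queries, flags);

		return ret;
	}

This replaces the separate DEFINE_SPINLOCK()/DEFINE_IDR() pair with a single structure that carries its own lock.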
*/ -	if (device->ops.alloc_hw_stats && port_num) +	if (device->ops.alloc_hw_stats && port_num && is_full_dev)  		setup_hw_stats(device, p, port_num); -	list_add_tail(&p->kobj.entry, &device->port_list); +	list_add_tail(&p->kobj.entry, &coredev->port_list);  	kobject_uevent(&p->kobj, KOBJ_ADD);  	return 0; @@ -1194,6 +1201,7 @@ static ssize_t node_type_show(struct device *device,  	case RDMA_NODE_RNIC:	  return sprintf(buf, "%d: RNIC\n", dev->node_type);  	case RDMA_NODE_USNIC:	  return sprintf(buf, "%d: usNIC\n", dev->node_type);  	case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); +	case RDMA_NODE_UNSPECIFIED: return sprintf(buf, "%d: unspecified\n", dev->node_type);  	case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);  	case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);  	default:		  return sprintf(buf, "%d: <unknown>\n", dev->node_type); @@ -1279,11 +1287,11 @@ const struct attribute_group ib_dev_attr_group = {  	.attrs = ib_dev_attrs,  }; -static void ib_free_port_attrs(struct ib_device *device) +void ib_free_port_attrs(struct ib_core_device *coredev)  {  	struct kobject *p, *t; -	list_for_each_entry_safe(p, t, &device->port_list, entry) { +	list_for_each_entry_safe(p, t, &coredev->port_list, entry) {  		struct ib_port *port = container_of(p, struct ib_port, kobj);  		list_del(&p->entry); @@ -1303,20 +1311,22 @@ static void ib_free_port_attrs(struct ib_device *device)  		kobject_put(p);  	} -	kobject_put(device->ports_kobj); +	kobject_put(coredev->ports_kobj);  } -static int ib_setup_port_attrs(struct ib_device *device) +int ib_setup_port_attrs(struct ib_core_device *coredev)  { +	struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);  	unsigned int port;  	int ret; -	device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj); -	if (!device->ports_kobj) +	coredev->ports_kobj = kobject_create_and_add("ports", +						     &coredev->dev.kobj); +	if (!coredev->ports_kobj)  		return -ENOMEM;  	rdma_for_each_port (device, port) { -		ret = add_port(device, port); +		ret = add_port(coredev, port);  		if (ret)  			goto err_put;  	} @@ -1324,7 +1334,7 @@ static int ib_setup_port_attrs(struct ib_device *device)  	return 0;  err_put: -	ib_free_port_attrs(device); +	ib_free_port_attrs(coredev);  	return ret;  } @@ -1332,7 +1342,7 @@ int ib_device_register_sysfs(struct ib_device *device)  {  	int ret; -	ret = ib_setup_port_attrs(device); +	ret = ib_setup_port_attrs(&device->coredev);  	if (ret)  		return ret; @@ -1348,5 +1358,48 @@ void ib_device_unregister_sysfs(struct ib_device *device)  		free_hsag(&device->dev.kobj, device->hw_stats_ag);  	kfree(device->hw_stats); -	ib_free_port_attrs(device); +	ib_free_port_attrs(&device->coredev); +} + +/** + * ib_port_register_module_stat - add module counters under relevant port + *  of IB device. + * + * @device: IB device to add counters + * @port_num: valid port number + * @kobj: pointer to the kobject to initialize + * @ktype: pointer to the ktype for this kobject. 
+ * @name: the name of the kobject + */ +int ib_port_register_module_stat(struct ib_device *device, u8 port_num, +				 struct kobject *kobj, struct kobj_type *ktype, +				 const char *name) +{ +	struct kobject *p, *t; +	int ret; + +	list_for_each_entry_safe(p, t, &device->coredev.port_list, entry) { +		struct ib_port *port = container_of(p, struct ib_port, kobj); + +		if (port->port_num != port_num) +			continue; + +		ret = kobject_init_and_add(kobj, ktype, &port->kobj, "%s", +					   name); +		if (ret) +			return ret; +	} + +	return 0; +} +EXPORT_SYMBOL(ib_port_register_module_stat); + +/** + * ib_port_unregister_module_stat - release module counters + * @kobj: pointer to the kobject to release + */ +void ib_port_unregister_module_stat(struct kobject *kobj) +{ +	kobject_put(kobj);  } +EXPORT_SYMBOL(ib_port_unregister_module_stat); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 7541fbaf58a3..8e7da2d41fd8 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -42,7 +42,7 @@  #include <linux/file.h>  #include <linux/mount.h>  #include <linux/cdev.h> -#include <linux/idr.h> +#include <linux/xarray.h>  #include <linux/mutex.h>  #include <linux/slab.h> @@ -125,23 +125,22 @@ static struct ib_client ucm_client = {  	.remove = ib_ucm_remove_one  }; -static DEFINE_MUTEX(ctx_id_mutex); -static DEFINE_IDR(ctx_id_table); +static DEFINE_XARRAY_ALLOC(ctx_id_table);  static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);  static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)  {  	struct ib_ucm_context *ctx; -	mutex_lock(&ctx_id_mutex); -	ctx = idr_find(&ctx_id_table, id); +	xa_lock(&ctx_id_table); +	ctx = xa_load(&ctx_id_table, id);  	if (!ctx)  		ctx = ERR_PTR(-ENOENT);  	else if (ctx->file != file)  		ctx = ERR_PTR(-EINVAL);  	else  		atomic_inc(&ctx->ref); -	mutex_unlock(&ctx_id_mutex); +	xa_unlock(&ctx_id_table);  	return ctx;  } @@ -194,10 +193,7 @@ static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)  	ctx->file = file;  	INIT_LIST_HEAD(&ctx->events); -	mutex_lock(&ctx_id_mutex); -	ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL); -	mutex_unlock(&ctx_id_mutex); -	if (ctx->id < 0) +	if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))  		goto error;  	list_add_tail(&ctx->file_list, &file->ctxs); @@ -514,9 +510,7 @@ static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,  err2:  	ib_destroy_cm_id(ctx->cm_id);  err1: -	mutex_lock(&ctx_id_mutex); -	idr_remove(&ctx_id_table, ctx->id); -	mutex_unlock(&ctx_id_mutex); +	xa_erase(&ctx_id_table, ctx->id);  	kfree(ctx);  	return result;  } @@ -536,15 +530,15 @@ static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,  	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))  		return -EFAULT; -	mutex_lock(&ctx_id_mutex); -	ctx = idr_find(&ctx_id_table, cmd.id); +	xa_lock(&ctx_id_table); +	ctx = xa_load(&ctx_id_table, cmd.id);  	if (!ctx)  		ctx = ERR_PTR(-ENOENT);  	else if (ctx->file != file)  		ctx = ERR_PTR(-EINVAL);  	else -		idr_remove(&ctx_id_table, ctx->id); -	mutex_unlock(&ctx_id_mutex); +		__xa_erase(&ctx_id_table, ctx->id); +	xa_unlock(&ctx_id_table);  	if (IS_ERR(ctx))  		return PTR_ERR(ctx); @@ -1175,7 +1169,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp)  	file->filp = filp;  	file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); -	return nonseekable_open(inode, filp); +	return stream_open(inode, filp);  }  static int ib_ucm_close(struct inode *inode, struct file *filp) @@ -1189,10 
+1183,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)  				 struct ib_ucm_context, file_list);  		mutex_unlock(&file->file_mutex); -		mutex_lock(&ctx_id_mutex); -		idr_remove(&ctx_id_table, ctx->id); -		mutex_unlock(&ctx_id_mutex); - +		xa_erase(&ctx_id_table, ctx->id);  		ib_destroy_cm_id(ctx->cm_id);  		ib_ucm_cleanup_events(ctx);  		kfree(ctx); @@ -1352,7 +1343,7 @@ static void __exit ib_ucm_cleanup(void)  	class_remove_file(&cm_class, &class_attr_abi_version.attr);  	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);  	unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); -	idr_destroy(&ctx_id_table); +	WARN_ON(!xa_empty(&ctx_id_table));  }  module_init(ib_ucm_init); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 7468b26b8a01..140a338a135f 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1744,7 +1744,7 @@ static int ucma_open(struct inode *inode, struct file *filp)  	filp->private_data = file;  	file->filp = filp; -	return nonseekable_open(inode, filp); +	return stream_open(inode, filp);  }  static int ucma_close(struct inode *inode, struct file *filp) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index fe5551562dbc..e7ea819fcb11 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -37,27 +37,23 @@  #include <linux/sched/signal.h>  #include <linux/sched/mm.h>  #include <linux/export.h> -#include <linux/hugetlb.h>  #include <linux/slab.h> +#include <linux/pagemap.h>  #include <rdma/ib_umem_odp.h>  #include "uverbs.h" -  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)  { -	struct scatterlist *sg; +	struct sg_page_iter sg_iter;  	struct page *page; -	int i;  	if (umem->nmap > 0) -		ib_dma_unmap_sg(dev, umem->sg_head.sgl, -				umem->npages, +		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,  				DMA_BIDIRECTIONAL); -	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { - -		page = sg_page(sg); +	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { +		page = sg_page_iter_page(&sg_iter);  		if (!PageDirty(page) && umem->writable && dirty)  			set_page_dirty_lock(page);  		put_page(page); @@ -66,6 +62,124 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d  	sg_free_table(&umem->sg_head);  } +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table + * + * sg: current scatterlist entry + * page_list: array of npage struct page pointers + * npages: number of pages in page_list + * max_seg_sz: maximum segment size in bytes + * nents: [out] number of entries in the scatterlist + * + * Return new end of scatterlist + */ +static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg, +						struct page **page_list, +						unsigned long npages, +						unsigned int max_seg_sz, +						int *nents) +{ +	unsigned long first_pfn; +	unsigned long i = 0; +	bool update_cur_sg = false; +	bool first = !sg_page(sg); + +	/* Check if new page_list is contiguous with end of previous page_list. +	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0. 
+	 */ +	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) == +		       page_to_pfn(page_list[0]))) +		update_cur_sg = true; + +	while (i != npages) { +		unsigned long len; +		struct page *first_page = page_list[i]; + +		first_pfn = page_to_pfn(first_page); + +		/* Compute the number of contiguous pages we have starting +		 * at i +		 */ +		for (len = 0; i != npages && +			      first_pfn + len == page_to_pfn(page_list[i]) && +			      len < (max_seg_sz >> PAGE_SHIFT); +		     len++) +			i++; + +		/* Squash N contiguous pages from page_list into current sge */ +		if (update_cur_sg) { +			if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) { +				sg_set_page(sg, sg_page(sg), +					    sg->length + (len << PAGE_SHIFT), +					    0); +				update_cur_sg = false; +				continue; +			} +			update_cur_sg = false; +		} + +		/* Squash N contiguous pages into next sge or first sge */ +		if (!first) +			sg = sg_next(sg); + +		(*nents)++; +		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); +		first = false; +	} + +	return sg; +} + +/** + * ib_umem_find_best_pgsz - Find best HW page size to use for this MR + * + * @umem: umem struct + * @pgsz_bitmap: bitmap of HW supported page sizes + * @virt: IOVA + * + * This helper is intended for HW that support multiple page + * sizes but can do only a single page size in an MR. + * + * Returns 0 if the umem requires page sizes not supported by + * the driver to be mapped. Drivers always supporting PAGE_SIZE + * or smaller will never see a 0 result. + */ +unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, +				     unsigned long pgsz_bitmap, +				     unsigned long virt) +{ +	struct scatterlist *sg; +	unsigned int best_pg_bit; +	unsigned long va, pgoff; +	dma_addr_t mask; +	int i; + +	/* At minimum, drivers must support PAGE_SIZE or smaller */ +	if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0)))) +		return 0; + +	va = virt; +	/* max page size not to exceed MR length */ +	mask = roundup_pow_of_two(umem->length); +	/* offset into first SGL */ +	pgoff = umem->address & ~PAGE_MASK; + +	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { +		/* Walk SGL and reduce max page size if VA/PA bits differ +		 * for any address. +		 */ +		mask |= (sg_dma_address(sg) + pgoff) ^ va; +		if (i && i != (umem->nmap - 1)) +			/* restrict by length as well for interior SGEs */ +			mask |= sg_dma_len(sg); +		va += sg_dma_len(sg) - pgoff; +		pgoff = 0; +	} +	best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap); + +	return BIT_ULL(best_pg_bit); +} +EXPORT_SYMBOL(ib_umem_find_best_pgsz); +  /**   * ib_umem_get - Pin and DMA map userspace memory.   
* @@ -84,16 +198,14 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,  	struct ib_ucontext *context;  	struct ib_umem *umem;  	struct page **page_list; -	struct vm_area_struct **vma_list;  	unsigned long lock_limit;  	unsigned long new_pinned;  	unsigned long cur_base;  	struct mm_struct *mm;  	unsigned long npages;  	int ret; -	int i;  	unsigned long dma_attrs = 0; -	struct scatterlist *sg, *sg_list_start; +	struct scatterlist *sg;  	unsigned int gup_flags = FOLL_WRITE;  	if (!udata) @@ -138,29 +250,23 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,  	mmgrab(mm);  	if (access & IB_ACCESS_ON_DEMAND) { +		if (WARN_ON_ONCE(!context->invalidate_range)) { +			ret = -EINVAL; +			goto umem_kfree; +		} +  		ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);  		if (ret)  			goto umem_kfree;  		return umem;  	} -	/* We assume the memory is from hugetlb until proved otherwise */ -	umem->hugetlb   = 1; -  	page_list = (struct page **) __get_free_page(GFP_KERNEL);  	if (!page_list) {  		ret = -ENOMEM;  		goto umem_kfree;  	} -	/* -	 * if we can't alloc the vma_list, it's not so bad; -	 * just assume the memory is not hugetlb memory -	 */ -	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); -	if (!vma_list) -		umem->hugetlb = 0; -  	npages = ib_umem_num_pages(umem);  	if (npages == 0 || npages > UINT_MAX) {  		ret = -EINVAL; @@ -185,41 +291,35 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,  	if (!umem->writable)  		gup_flags |= FOLL_FORCE; -	sg_list_start = umem->sg_head.sgl; +	sg = umem->sg_head.sgl;  	while (npages) {  		down_read(&mm->mmap_sem); -		ret = get_user_pages_longterm(cur_base, +		ret = get_user_pages(cur_base,  				     min_t(unsigned long, npages,  					   PAGE_SIZE / sizeof (struct page *)), -				     gup_flags, page_list, vma_list); +				     gup_flags | FOLL_LONGTERM, +				     page_list, NULL);  		if (ret < 0) {  			up_read(&mm->mmap_sem);  			goto umem_release;  		} -		umem->npages += ret;  		cur_base += ret * PAGE_SIZE;  		npages   -= ret; -		/* Continue to hold the mmap_sem as vma_list access -		 * needs to be protected. 
-		 */ -		for_each_sg(sg_list_start, sg, ret, i) { -			if (vma_list && !is_vm_hugetlb_page(vma_list[i])) -				umem->hugetlb = 0; +		sg = ib_umem_add_sg_table(sg, page_list, ret, +			dma_get_max_seg_size(context->device->dma_device), +			&umem->sg_nents); -			sg_set_page(sg, page_list[i], PAGE_SIZE, 0); -		}  		up_read(&mm->mmap_sem); - -		/* preparing for next loop */ -		sg_list_start = sg;  	} +	sg_mark_end(sg); +  	umem->nmap = ib_dma_map_sg_attrs(context->device,  				  umem->sg_head.sgl, -				  umem->npages, +				  umem->sg_nents,  				  DMA_BIDIRECTIONAL,  				  dma_attrs); @@ -236,8 +336,6 @@ umem_release:  vma:  	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);  out: -	if (vma_list) -		free_page((unsigned long) vma_list);  	free_page((unsigned long) page_list);  umem_kfree:  	if (ret) { @@ -315,7 +413,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,  		return -EINVAL;  	} -	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, +	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,  				 offset + ib_umem_offset(umem));  	if (ret < 0) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e6ec79ad9cc8..f962b5bbfa40 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -152,7 +152,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,  	struct ib_ucontext_per_mm *per_mm =  		container_of(mn, struct ib_ucontext_per_mm, mn); -	if (range->blockable) +	if (mmu_notifier_range_blockable(range))  		down_read(&per_mm->umem_rwsem);  	else if (!down_read_trylock(&per_mm->umem_rwsem))  		return -EAGAIN; @@ -170,7 +170,8 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,  	return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,  					     range->end,  					     invalidate_range_start_trampoline, -					     range->blockable, NULL); +					     mmu_notifier_range_blockable(range), +					     NULL);  }  static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, @@ -241,7 +242,7 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,  	per_mm->mm = mm;  	per_mm->umem_tree = RB_ROOT_CACHED;  	init_rwsem(&per_mm->umem_rwsem); -	per_mm->active = ctx->invalidate_range; +	per_mm->active = true;  	rcu_read_lock();  	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); @@ -417,9 +418,6 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)  		h = hstate_vma(vma);  		umem->page_shift = huge_page_shift(h);  		up_read(&mm->mmap_sem); -		umem->hugetlb = 1; -	} else { -		umem->hugetlb = 0;  	}  	mutex_init(&umem_odp->umem_mutex); @@ -503,7 +501,6 @@ static int ib_umem_odp_map_dma_single_page(  	struct ib_umem *umem = &umem_odp->umem;  	struct ib_device *dev = umem->context->device;  	dma_addr_t dma_addr; -	int stored_page = 0;  	int remove_existing_mapping = 0;  	int ret = 0; @@ -527,8 +524,7 @@ static int ib_umem_odp_map_dma_single_page(  		}  		umem_odp->dma_list[page_index] = dma_addr | access_mask;  		umem_odp->page_list[page_index] = page; -		umem->npages++; -		stored_page = 1; +		umem_odp->npages++;  	} else if (umem_odp->page_list[page_index] == page) {  		umem_odp->dma_list[page_index] |= access_mask;  	} else { @@ -540,11 +536,9 @@ static int ib_umem_odp_map_dma_single_page(  	}  out: -	/* On Demand Paging - avoid pinning the page */ -	if (umem->context->invalidate_range || !stored_page) -		put_page(page); +	put_page(page); -	if 
(remove_existing_mapping && umem->context->invalidate_range) { +	if (remove_existing_mapping) {  		ib_umem_notifier_start_account(umem_odp);  		umem->context->invalidate_range(  			umem_odp, @@ -754,12 +748,9 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,  				 */  				set_page_dirty(head_page);  			} -			/* on demand pinning support */ -			if (!umem->context->invalidate_range) -				put_page(page);  			umem_odp->page_list[idx] = NULL;  			umem_odp->dma_list[idx] = 0; -			umem->npages--; +			umem_odp->npages--;  		}  	}  	mutex_unlock(&umem_odp->umem_mutex); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 02b7947ab215..671f07ba1fad 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -129,6 +129,9 @@ struct ib_umad_packet {  	struct ib_user_mad mad;  }; +#define CREATE_TRACE_POINTS +#include <trace/events/ib_umad.h> +  static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);  static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +  				   IB_UMAD_NUM_FIXED_MINOR; @@ -334,6 +337,9 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,  				return -EFAULT;  		}  	} + +	trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr); +  	return hdr_size(file) + packet->length;  } @@ -353,6 +359,9 @@ static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,  	if (copy_to_user(buf, packet->mad.data, packet->length))  		return -EFAULT; +	trace_ib_umad_read_send(file, &packet->mad.hdr, +				(struct ib_mad_hdr *)&packet->mad.data); +  	return size;  } @@ -508,6 +517,9 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,  	mutex_lock(&file->mutex); +	trace_ib_umad_write(file, &packet->mad.hdr, +			    (struct ib_mad_hdr *)&packet->mad.data); +  	agent = __get_agent(file, packet->mad.hdr.id);  	if (!agent) {  		ret = -EINVAL; @@ -968,6 +980,11 @@ static int ib_umad_open(struct inode *inode, struct file *filp)  		goto out;  	} +	if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { +		ret = -EPERM; +		goto out; +	} +  	file = kzalloc(sizeof(*file), GFP_KERNEL);  	if (!file) {  		ret = -ENOMEM; @@ -985,7 +1002,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)  	list_add_tail(&file->port_list, &port->file_list); -	nonseekable_open(inode, filp); +	stream_open(inode, filp);  out:  	mutex_unlock(&port->file_mutex);  	return ret; @@ -1061,6 +1078,11 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)  		}  	} +	if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { +		ret = -EPERM; +		goto err_up_sem; +	} +  	ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);  	if (ret)  		goto err_up_sem; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index ea0bc6885517..1e5aeb39f774 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -160,10 +160,9 @@ struct ib_uverbs_file {  	struct mutex umap_lock;  	struct list_head umaps; +	struct page *disassociate_page; -	struct idr		idr; -	/* spinlock protects write access to idr */ -	spinlock_t		idr_lock; +	struct xarray		idr;  };  struct ib_uverbs_event { @@ -240,7 +239,8 @@ void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);  void ib_uverbs_event_handler(struct ib_event_handler *handler,  			     struct ib_event *event);  int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct 
ib_xrcd *xrcd, -			   enum rdma_remove_reason why); +			   enum rdma_remove_reason why, +			   struct uverbs_attr_bundle *attrs);  int uverbs_dealloc_mw(struct ib_mw *mw);  void ib_uverbs_detach_umcast(struct ib_qp *qp, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 062a86c04123..5a3a1780ceea 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -162,7 +162,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,  	const void __user *res = iter->cur;  	if (iter->cur + len > iter->end) -		return ERR_PTR(-ENOSPC); +		return (void __force __user *)ERR_PTR(-ENOSPC);  	iter->cur += len;  	return res;  } @@ -175,7 +175,7 @@ static int uverbs_request_finish(struct uverbs_req_iter *iter)  }  static struct ib_uverbs_completion_event_file * -_ib_uverbs_lookup_comp_file(s32 fd, const struct uverbs_attr_bundle *attrs) +_ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)  {  	struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL,  					       fd, attrs); @@ -230,6 +230,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)  		goto err_alloc;  	} +	attrs->context = ucontext; +  	ucontext->res.type = RDMA_RESTRACK_CTX;  	ucontext->device = ib_dev;  	ucontext->cg_obj = cg_obj; @@ -423,7 +425,7 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)  	atomic_set(&pd->usecnt, 0);  	pd->res.type = RDMA_RESTRACK_PD; -	ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata); +	ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata);  	if (ret)  		goto err_alloc; @@ -436,15 +438,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)  	if (ret)  		goto err_copy; -	return uobj_alloc_commit(uobj); +	return uobj_alloc_commit(uobj, attrs);  err_copy: -	ib_dealloc_pd(pd); +	ib_dealloc_pd_user(pd, &attrs->driver_udata);  	pd = NULL;  err_alloc:  	kfree(pd);  err: -	uobj_alloc_abort(uobj); +	uobj_alloc_abort(uobj, attrs);  	return ret;  } @@ -594,8 +596,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)  	}  	if (!xrcd) { -		xrcd = ib_dev->ops.alloc_xrcd(ib_dev, obj->uobject.context, -					      &attrs->driver_udata); +		xrcd = ib_dev->ops.alloc_xrcd(ib_dev, &attrs->driver_udata);  		if (IS_ERR(xrcd)) {  			ret = PTR_ERR(xrcd);  			goto err; @@ -633,7 +634,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)  	mutex_unlock(&ibudev->xrcd_tree_mutex); -	return uobj_alloc_commit(&obj->uobject); +	return uobj_alloc_commit(&obj->uobject, attrs);  err_copy:  	if (inode) { @@ -643,10 +644,10 @@ err_copy:  	}  err_dealloc_xrcd: -	ib_dealloc_xrcd(xrcd); +	ib_dealloc_xrcd(xrcd, &attrs->driver_udata);  err: -	uobj_alloc_abort(&obj->uobject); +	uobj_alloc_abort(&obj->uobject, attrs);  err_tree_mutex_unlock:  	if (f.file) @@ -669,19 +670,19 @@ static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs)  	return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs);  } -int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, -			   struct ib_xrcd *xrcd, -			   enum rdma_remove_reason why) +int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, +			   enum rdma_remove_reason why, +			   struct uverbs_attr_bundle *attrs)  {  	struct inode *inode;  	int ret; -	struct ib_uverbs_device *dev = uobject->context->ufile->device; +	struct ib_uverbs_device *dev = attrs->ufile->device;  	inode = xrcd->inode;  	if (inode && !atomic_dec_and_test(&xrcd->usecnt))  		return 0; -	
ret = ib_dealloc_xrcd(xrcd); +	ret = ib_dealloc_xrcd(xrcd, &attrs->driver_udata);  	if (ib_is_destroy_retryable(ret, why, uobject)) {  		atomic_inc(&xrcd->usecnt); @@ -763,16 +764,16 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)  	uobj_put_obj_read(pd); -	return uobj_alloc_commit(uobj); +	return uobj_alloc_commit(uobj, attrs);  err_copy: -	ib_dereg_mr(mr); +	ib_dereg_mr_user(mr, &attrs->driver_udata);  err_put:  	uobj_put_obj_read(pd);  err_free: -	uobj_alloc_abort(uobj); +	uobj_alloc_abort(uobj, attrs);  	return ret;  } @@ -917,14 +918,14 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)  		goto err_copy;  	uobj_put_obj_read(pd); -	return uobj_alloc_commit(uobj); +	return uobj_alloc_commit(uobj, attrs);  err_copy:  	uverbs_dealloc_mw(mw);  err_put:  	uobj_put_obj_read(pd);  err_free: -	uobj_alloc_abort(uobj); +	uobj_alloc_abort(uobj, attrs);  	return ret;  } @@ -965,11 +966,11 @@ static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs)  	ret = uverbs_response(attrs, &resp, sizeof(resp));  	if (ret) { -		uobj_alloc_abort(uobj); +		uobj_alloc_abort(uobj, attrs);  		return ret;  	} -	return uobj_alloc_commit(uobj); +	return uobj_alloc_commit(uobj, attrs);  }  static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, @@ -1009,8 +1010,7 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,  	attr.comp_vector = cmd->comp_vector;  	attr.flags = cmd->flags; -	cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context, -				   &attrs->driver_udata); +	cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);  	if (IS_ERR(cq)) {  		ret = PTR_ERR(cq);  		goto err_file; @@ -1036,7 +1036,7 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,  	if (ret)  		goto err_cb; -	ret = uobj_alloc_commit(&obj->uobject); +	ret = uobj_alloc_commit(&obj->uobject, attrs);  	if (ret)  		return ERR_PTR(ret);  	return obj; @@ -1049,7 +1049,7 @@ err_file:  		ib_uverbs_release_ucq(attrs->ufile, ev_file, obj);  err: -	uobj_alloc_abort(&obj->uobject); +	uobj_alloc_abort(&obj->uobject, attrs);  	return ERR_PTR(ret);  } @@ -1418,7 +1418,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs,  		if (ret)  			goto err_cb; -		qp->real_qp	  = qp;  		qp->pd		  = pd;  		qp->send_cq	  = attr.send_cq;  		qp->recv_cq	  = attr.recv_cq; @@ -1477,7 +1476,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs,  	if (ind_tbl)  		uobj_put_obj_read(ind_tbl); -	return uobj_alloc_commit(&obj->uevent.uobject); +	return uobj_alloc_commit(&obj->uevent.uobject, attrs);  err_cb:  	ib_destroy_qp(qp); @@ -1495,7 +1494,7 @@ err_put:  	if (ind_tbl)  		uobj_put_obj_read(ind_tbl); -	uobj_alloc_abort(&obj->uevent.uobject); +	uobj_alloc_abort(&obj->uevent.uobject, attrs);  	return ret;  } @@ -1609,14 +1608,14 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs)  	qp->uobject = &obj->uevent.uobject;  	uobj_put_read(xrcd_uobj); -	return uobj_alloc_commit(&obj->uevent.uobject); +	return uobj_alloc_commit(&obj->uevent.uobject, attrs);  err_destroy:  	ib_destroy_qp(qp);  err_xrcd:  	uobj_put_read(xrcd_uobj);  err_put: -	uobj_alloc_abort(&obj->uevent.uobject); +	uobj_alloc_abort(&obj->uevent.uobject, attrs);  	return ret;  } @@ -2451,7 +2450,7 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)  		goto err_copy;  	uobj_put_obj_read(pd); -	return uobj_alloc_commit(uobj); +	return uobj_alloc_commit(uobj, attrs);  err_copy:  	rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); @@ -2460,7 +2459,7 @@ err_put:  	
uobj_put_obj_read(pd);  err: -	uobj_alloc_abort(uobj); +	uobj_alloc_abort(uobj, attrs);  	return ret;  } @@ -2962,16 +2961,16 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)  	uobj_put_obj_read(pd);  	uobj_put_obj_read(cq); -	return uobj_alloc_commit(&obj->uevent.uobject); +	return uobj_alloc_commit(&obj->uevent.uobject, attrs);  err_copy: -	ib_destroy_wq(wq); +	ib_destroy_wq(wq, &attrs->driver_udata);  err_put_cq:  	uobj_put_obj_read(cq);  err_put_pd:  	uobj_put_obj_read(pd);  err_uobj: -	uobj_alloc_abort(&obj->uevent.uobject); +	uobj_alloc_abort(&obj->uevent.uobject, attrs);  	return err;  } @@ -3136,12 +3135,12 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)  	for (j = 0; j < num_read_wqs; j++)  		uobj_put_obj_read(wqs[j]); -	return uobj_alloc_commit(uobj); +	return uobj_alloc_commit(uobj, attrs);  err_copy:  	ib_destroy_rwq_ind_table(rwq_ind_tbl);  err_uobj: -	uobj_alloc_abort(uobj); +	uobj_alloc_abort(uobj, attrs);  put_wqs:  	for (j = 0; j < num_read_wqs; j++)  		uobj_put_obj_read(wqs[j]); @@ -3314,7 +3313,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)  	kfree(flow_attr);  	if (cmd.flow_attr.num_of_specs)  		kfree(kern_flow_attr); -	return uobj_alloc_commit(uobj); +	return uobj_alloc_commit(uobj, attrs);  err_copy:  	if (!qp->device->ops.destroy_flow(flow_id))  		atomic_dec(&qp->usecnt); @@ -3325,7 +3324,7 @@ err_free_flow_attr:  err_put:  	uobj_put_obj_read(qp);  err_uobj: -	uobj_alloc_abort(uobj); +	uobj_alloc_abort(uobj, attrs);  err_free_attr:  	if (cmd.flow_attr.num_of_specs)  		kfree(kern_flow_attr); @@ -3411,9 +3410,9 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,  	obj->uevent.events_reported = 0;  	INIT_LIST_HEAD(&obj->uevent.event_list); -	srq = pd->device->ops.create_srq(pd, &attr, udata); -	if (IS_ERR(srq)) { -		ret = PTR_ERR(srq); +	srq = rdma_zalloc_drv_obj(ib_dev, ib_srq); +	if (!srq) { +		ret = -ENOMEM;  		goto err_put;  	} @@ -3424,6 +3423,10 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,  	srq->event_handler = attr.event_handler;  	srq->srq_context   = attr.srq_context; +	ret = pd->device->ops.create_srq(srq, &attr, udata); +	if (ret) +		goto err_free; +  	if (ib_srq_has_cq(cmd->srq_type)) {  		srq->ext.cq       = attr.ext.cq;  		atomic_inc(&attr.ext.cq->usecnt); @@ -3458,11 +3461,13 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,  		uobj_put_obj_read(attr.ext.cq);  	uobj_put_obj_read(pd); -	return uobj_alloc_commit(&obj->uevent.uobject); +	return uobj_alloc_commit(&obj->uevent.uobject, attrs);  err_copy: -	ib_destroy_srq(srq); +	ib_destroy_srq_user(srq, &attrs->driver_udata); +err_free: +	kfree(srq);  err_put:  	uobj_put_obj_read(pd); @@ -3477,7 +3482,7 @@ err_put_xrcd:  	}  err: -	uobj_alloc_abort(&obj->uevent.uobject); +	uobj_alloc_abort(&obj->uevent.uobject, attrs);  	return ret;  } diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index e1379949e663..829b0c6944d8 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -207,13 +207,12 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,  	for (i = 0; i != array_len; i++) {  		attr->uobjects[i] = uverbs_get_uobject_from_file( -			spec->u2.objs_arr.obj_type, pbundle->bundle.ufile, -			spec->u2.objs_arr.access, idr_vals[i]); +			spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access, +			idr_vals[i], &pbundle->bundle);  		if (IS_ERR(attr->uobjects[i])) {  			ret = 
PTR_ERR(attr->uobjects[i]);  			break;  		} -		pbundle->bundle.context = attr->uobjects[i]->context;  	}  	attr->len = i; @@ -223,7 +222,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,  static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,  				  struct uverbs_objs_arr_attr *attr, -				  bool commit) +				  bool commit, struct uverbs_attr_bundle *attrs)  {  	const struct uverbs_attr_spec *spec = &attr_uapi->spec;  	int current_ret; @@ -231,8 +230,9 @@ static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,  	size_t i;  	for (i = 0; i != attr->len; i++) { -		current_ret = uverbs_finalize_object( -			attr->uobjects[i], spec->u2.objs_arr.access, commit); +		current_ret = uverbs_finalize_object(attr->uobjects[i], +						     spec->u2.objs_arr.access, +						     commit, attrs);  		if (!ret)  			ret = current_ret;  	} @@ -325,13 +325,10 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,  		 * IDR implementation today rejects negative IDs  		 */  		o_attr->uobject = uverbs_get_uobject_from_file( -					spec->u.obj.obj_type, -					pbundle->bundle.ufile, -					spec->u.obj.access, -					uattr->data_s64); +			spec->u.obj.obj_type, spec->u.obj.access, +			uattr->data_s64, &pbundle->bundle);  		if (IS_ERR(o_attr->uobject))  			return PTR_ERR(o_attr->uobject); -		pbundle->bundle.context = o_attr->uobject->context;  		__set_bit(attr_bkey, pbundle->uobj_finalize);  		if (spec->u.obj.access == UVERBS_ACCESS_NEW) { @@ -456,12 +453,14 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,  		uverbs_fill_udata(&pbundle->bundle,  				  &pbundle->bundle.driver_udata,  				  UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT); +	else +		pbundle->bundle.driver_udata = (struct ib_udata){};  	if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {  		struct uverbs_obj_attr *destroy_attr =  			&pbundle->bundle.attrs[destroy_bkey].obj_attr; -		ret = uobj_destroy(destroy_attr->uobject); +		ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);  		if (ret)  			return ret;  		__clear_bit(destroy_bkey, pbundle->uobj_finalize); @@ -512,7 +511,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit)  		current_ret = uverbs_finalize_object(  			attr->obj_attr.uobject, -			attr->obj_attr.attr_elm->spec.u.obj.access, commit); +			attr->obj_attr.attr_elm->spec.u.obj.access, commit, +			&pbundle->bundle);  		if (!ret)  			ret = current_ret;  	} @@ -535,7 +535,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit)  		if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {  			current_ret = uverbs_free_idrs_array( -				attr_uapi, &attr->objs_arr_attr, commit); +				attr_uapi, &attr->objs_arr_attr, commit, +				&pbundle->bundle);  			if (!ret)  				ret = current_ret;  		} diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 70b7d80431a9..84a5e9a6d483 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -208,6 +208,9 @@ void ib_uverbs_release_file(struct kref *ref)  		kref_put(&file->async_file->ref,  			 ib_uverbs_release_async_event_file);  	put_device(&file->device->dev); + +	if (file->disassociate_page) +		__free_pages(file->disassociate_page, 0);  	kfree(file);  } @@ -720,7 +723,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,  			 * then the command request structure starts  			 * with a '__aligned u64 response' member.  			 
*/ -			ret = get_user(response, (const u64 *)buf); +			ret = get_user(response, (const u64 __user *)buf);  			if (ret)  				goto out_unlock; @@ -877,45 +880,78 @@ static void rdma_umap_close(struct vm_area_struct *vma)  	kfree(priv);  } +/* + * Once the zap_vma_ptes has been called touches to the VMA will come here and + * we return a dummy writable zero page for all the pfns. + */ +static vm_fault_t rdma_umap_fault(struct vm_fault *vmf) +{ +	struct ib_uverbs_file *ufile = vmf->vma->vm_file->private_data; +	struct rdma_umap_priv *priv = vmf->vma->vm_private_data; +	vm_fault_t ret = 0; + +	if (!priv) +		return VM_FAULT_SIGBUS; + +	/* Read only pages can just use the system zero page. */ +	if (!(vmf->vma->vm_flags & (VM_WRITE | VM_MAYWRITE))) { +		vmf->page = ZERO_PAGE(vmf->address); +		get_page(vmf->page); +		return 0; +	} + +	mutex_lock(&ufile->umap_lock); +	if (!ufile->disassociate_page) +		ufile->disassociate_page = +			alloc_pages(vmf->gfp_mask | __GFP_ZERO, 0); + +	if (ufile->disassociate_page) { +		/* +		 * This VMA is forced to always be shared so this doesn't have +		 * to worry about COW. +		 */ +		vmf->page = ufile->disassociate_page; +		get_page(vmf->page); +	} else { +		ret = VM_FAULT_SIGBUS; +	} +	mutex_unlock(&ufile->umap_lock); + +	return ret; +} +  static const struct vm_operations_struct rdma_umap_ops = {  	.open = rdma_umap_open,  	.close = rdma_umap_close, +	.fault = rdma_umap_fault,  }; -static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, -						 struct vm_area_struct *vma, -						 unsigned long size) +/* + * Map IO memory into a process. This is to be called by drivers as part of + * their mmap() functions if they wish to send something like PCI-E BAR memory + * to userspace. + */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, +		      unsigned long pfn, unsigned long size, pgprot_t prot)  {  	struct ib_uverbs_file *ufile = ucontext->ufile;  	struct rdma_umap_priv *priv; +	if (!(vma->vm_flags & VM_SHARED)) +		return -EINVAL; +  	if (vma->vm_end - vma->vm_start != size) -		return ERR_PTR(-EINVAL); +		return -EINVAL;  	/* Driver is using this wrong, must be called by ib_uverbs_mmap */  	if (WARN_ON(!vma->vm_file ||  		    vma->vm_file->private_data != ufile)) -		return ERR_PTR(-EINVAL); +		return -EINVAL;  	lockdep_assert_held(&ufile->device->disassociate_srcu);  	priv = kzalloc(sizeof(*priv), GFP_KERNEL);  	if (!priv) -		return ERR_PTR(-ENOMEM); -	return priv; -} - -/* - * Map IO memory into a process. This is to be called by drivers as part of - * their mmap() functions if they wish to send something like PCI-E BAR memory - * to userspace. - */ -int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, -		      unsigned long pfn, unsigned long size, pgprot_t prot) -{ -	struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); - -	if (IS_ERR(priv)) -		return PTR_ERR(priv); +		return -ENOMEM;  	vma->vm_page_prot = prot;  	if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { @@ -928,35 +964,6 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,  }  EXPORT_SYMBOL(rdma_user_mmap_io); -/* - * The page case is here for a slightly different reason, the driver expects - * to be able to free the page it is sharing to user space when it destroys - * its ucontext, which means we need to zap the user space references. 
- * - * We could handle this differently by providing an API to allocate a shared - * page and then only freeing the shared page when the last ufile is - * destroyed. - */ -int rdma_user_mmap_page(struct ib_ucontext *ucontext, -			struct vm_area_struct *vma, struct page *page, -			unsigned long size) -{ -	struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); - -	if (IS_ERR(priv)) -		return PTR_ERR(priv); - -	if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, -			    vma->vm_page_prot)) { -		kfree(priv); -		return -EAGAIN; -	} - -	rdma_umap_priv_init(priv, vma); -	return 0; -} -EXPORT_SYMBOL(rdma_user_mmap_page); -  void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)  {  	struct rdma_umap_priv *priv, *next_priv; @@ -992,7 +999,9 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)  		 * at a time to get the lock ordering right. Typically there  		 * will only be one mm, so no big deal.  		 */ -		down_write(&mm->mmap_sem); +		down_read(&mm->mmap_sem); +		if (!mmget_still_valid(mm)) +			goto skip_mm;  		mutex_lock(&ufile->umap_lock);  		list_for_each_entry_safe (priv, next_priv, &ufile->umaps,  					  list) { @@ -1004,10 +1013,10 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)  			zap_vma_ptes(vma, vma->vm_start,  				     vma->vm_end - vma->vm_start); -			vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);  		}  		mutex_unlock(&ufile->umap_lock); -		up_write(&mm->mmap_sem); +	skip_mm: +		up_read(&mm->mmap_sem);  		mmput(mm);  	}  } @@ -1045,6 +1054,11 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)  		goto err;  	} +	if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) { +		ret = -EPERM; +		goto err; +	} +  	/* In case IB device supports disassociate ucontext, there is no hard  	 * dependency between uverbs device and its low level device.  	 
*/ @@ -1083,7 +1097,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)  	setup_ufile_idr_uobject(file); -	return nonseekable_open(inode, filp); +	return stream_open(inode, filp);  err_module:  	module_put(ib_dev->owner); diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index f224cb727224..35b2e2c640cc 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -40,14 +40,17 @@  #include "uverbs.h"  static int uverbs_free_ah(struct ib_uobject *uobject, -			  enum rdma_remove_reason why) +			  enum rdma_remove_reason why, +			  struct uverbs_attr_bundle *attrs)  { -	return rdma_destroy_ah((struct ib_ah *)uobject->object, -			       RDMA_DESTROY_AH_SLEEPABLE); +	return rdma_destroy_ah_user((struct ib_ah *)uobject->object, +				    RDMA_DESTROY_AH_SLEEPABLE, +				    &attrs->driver_udata);  }  static int uverbs_free_flow(struct ib_uobject *uobject, -			    enum rdma_remove_reason why) +			    enum rdma_remove_reason why, +			    struct uverbs_attr_bundle *attrs)  {  	struct ib_flow *flow = (struct ib_flow *)uobject->object;  	struct ib_uflow_object *uflow = @@ -66,13 +69,15 @@ static int uverbs_free_flow(struct ib_uobject *uobject,  }  static int uverbs_free_mw(struct ib_uobject *uobject, -			  enum rdma_remove_reason why) +			  enum rdma_remove_reason why, +			  struct uverbs_attr_bundle *attrs)  {  	return uverbs_dealloc_mw((struct ib_mw *)uobject->object);  }  static int uverbs_free_qp(struct ib_uobject *uobject, -			  enum rdma_remove_reason why) +			  enum rdma_remove_reason why, +			  struct uverbs_attr_bundle *attrs)  {  	struct ib_qp *qp = uobject->object;  	struct ib_uqp_object *uqp = @@ -93,19 +98,20 @@ static int uverbs_free_qp(struct ib_uobject *uobject,  		ib_uverbs_detach_umcast(qp, uqp);  	} -	ret = ib_destroy_qp(qp); +	ret = ib_destroy_qp_user(qp, &attrs->driver_udata);  	if (ib_is_destroy_retryable(ret, why, uobject))  		return ret;  	if (uqp->uxrcd)  		atomic_dec(&uqp->uxrcd->refcnt); -	ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent); +	ib_uverbs_release_uevent(attrs->ufile, &uqp->uevent);  	return ret;  }  static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, -				   enum rdma_remove_reason why) +				   enum rdma_remove_reason why, +				   struct uverbs_attr_bundle *attrs)  {  	struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object;  	struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; @@ -120,23 +126,25 @@ static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject,  }  static int uverbs_free_wq(struct ib_uobject *uobject, -			  enum rdma_remove_reason why) +			  enum rdma_remove_reason why, +			  struct uverbs_attr_bundle *attrs)  {  	struct ib_wq *wq = uobject->object;  	struct ib_uwq_object *uwq =  		container_of(uobject, struct ib_uwq_object, uevent.uobject);  	int ret; -	ret = ib_destroy_wq(wq); +	ret = ib_destroy_wq(wq, &attrs->driver_udata);  	if (ib_is_destroy_retryable(ret, why, uobject))  		return ret; -	ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent); +	ib_uverbs_release_uevent(attrs->ufile, &uwq->uevent);  	return ret;  }  static int uverbs_free_srq(struct ib_uobject *uobject, -			   enum rdma_remove_reason why) +			   enum rdma_remove_reason why, +			   struct uverbs_attr_bundle *attrs)  {  	struct ib_srq *srq = uobject->object;  	struct ib_uevent_object *uevent = @@ -144,7 +152,7 @@ static int uverbs_free_srq(struct ib_uobject *uobject,  	enum ib_srq_type  srq_type = srq->srq_type;  	int ret; -	ret = 
ib_destroy_srq(srq); +	ret = ib_destroy_srq_user(srq, &attrs->driver_udata);  	if (ib_is_destroy_retryable(ret, why, uobject))  		return ret; @@ -155,12 +163,13 @@ static int uverbs_free_srq(struct ib_uobject *uobject,  		atomic_dec(&us->uxrcd->refcnt);  	} -	ib_uverbs_release_uevent(uobject->context->ufile, uevent); +	ib_uverbs_release_uevent(attrs->ufile, uevent);  	return ret;  }  static int uverbs_free_xrcd(struct ib_uobject *uobject, -			    enum rdma_remove_reason why) +			    enum rdma_remove_reason why, +			    struct uverbs_attr_bundle *attrs)  {  	struct ib_xrcd *xrcd = uobject->object;  	struct ib_uxrcd_object *uxrcd = @@ -171,15 +180,16 @@ static int uverbs_free_xrcd(struct ib_uobject *uobject,  	if (ret)  		return ret; -	mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex); -	ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why); -	mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex); +	mutex_lock(&attrs->ufile->device->xrcd_tree_mutex); +	ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs); +	mutex_unlock(&attrs->ufile->device->xrcd_tree_mutex);  	return ret;  }  static int uverbs_free_pd(struct ib_uobject *uobject, -			  enum rdma_remove_reason why) +			  enum rdma_remove_reason why, +			  struct uverbs_attr_bundle *attrs)  {  	struct ib_pd *pd = uobject->object;  	int ret; @@ -188,7 +198,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject,  	if (ret)  		return ret; -	ib_dealloc_pd(pd); +	ib_dealloc_pd_user(pd, &attrs->driver_udata);  	return 0;  } diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c index 309c5e80988d..9f013304e677 100644 --- a/drivers/infiniband/core/uverbs_std_types_counters.c +++ b/drivers/infiniband/core/uverbs_std_types_counters.c @@ -31,11 +31,13 @@   * SOFTWARE.   */ +#include "rdma_core.h"  #include "uverbs.h"  #include <rdma/uverbs_std_types.h>  static int uverbs_free_counters(struct ib_uobject *uobject, -				enum rdma_remove_reason why) +				enum rdma_remove_reason why, +				struct uverbs_attr_bundle *attrs)  {  	struct ib_counters *counters = uobject->object;  	int ret; @@ -52,7 +54,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(  {  	struct ib_uobject *uobj = uverbs_attr_get_uobject(  		attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE); -	struct ib_device *ib_dev = uobj->context->device; +	struct ib_device *ib_dev = attrs->context->device;  	struct ib_counters *counters;  	int ret; diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index a59ea89e3f2b..db5c46a1bb2d 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -35,7 +35,8 @@  #include "uverbs.h"  static int uverbs_free_cq(struct ib_uobject *uobject, -			  enum rdma_remove_reason why) +			  enum rdma_remove_reason why, +			  struct uverbs_attr_bundle *attrs)  {  	struct ib_cq *cq = uobject->object;  	struct ib_uverbs_event_queue *ev_queue = cq->cq_context; @@ -43,12 +44,12 @@ static int uverbs_free_cq(struct ib_uobject *uobject,  		container_of(uobject, struct ib_ucq_object, uobject);  	int ret; -	ret = ib_destroy_cq(cq); +	ret = ib_destroy_cq_user(cq, &attrs->driver_udata);  	if (ib_is_destroy_retryable(ret, why, uobject))  		return ret;  	ib_uverbs_release_ucq( -		uobject->context->ufile, +		attrs->ufile,  		ev_queue ? 
container_of(ev_queue,  					struct ib_uverbs_completion_event_file,  					ev_queue) : @@ -63,7 +64,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(  	struct ib_ucq_object *obj = container_of(  		uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),  		typeof(*obj), uobject); -	struct ib_device *ib_dev = obj->uobject.context->device; +	struct ib_device *ib_dev = attrs->context->device;  	int ret;  	u64 user_handle;  	struct ib_cq_init_attr attr = {}; @@ -110,8 +111,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(  	INIT_LIST_HEAD(&obj->comp_list);  	INIT_LIST_HEAD(&obj->async_list); -	cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context, -				   &attrs->driver_udata); +	cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);  	if (IS_ERR(cq)) {  		ret = PTR_ERR(cq);  		goto err_event_file; diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c index 2ef70637bee1..d5a1de33c2c9 100644 --- a/drivers/infiniband/core/uverbs_std_types_dm.c +++ b/drivers/infiniband/core/uverbs_std_types_dm.c @@ -30,11 +30,13 @@   * SOFTWARE.   */ +#include "rdma_core.h"  #include "uverbs.h"  #include <rdma/uverbs_std_types.h>  static int uverbs_free_dm(struct ib_uobject *uobject, -			  enum rdma_remove_reason why) +			  enum rdma_remove_reason why, +			  struct uverbs_attr_bundle *attrs)  {  	struct ib_dm *dm = uobject->object;  	int ret; @@ -43,7 +45,7 @@ static int uverbs_free_dm(struct ib_uobject *uobject,  	if (ret)  		return ret; -	return dm->device->ops.dealloc_dm(dm); +	return dm->device->ops.dealloc_dm(dm, attrs);  }  static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)( @@ -53,7 +55,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(  	struct ib_uobject *uobj =  		uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)  			->obj_attr.uobject; -	struct ib_device *ib_dev = uobj->context->device; +	struct ib_device *ib_dev = attrs->context->device;  	struct ib_dm *dm;  	int ret; @@ -70,7 +72,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(  	if (ret)  		return ret; -	dm = ib_dev->ops.alloc_dm(ib_dev, uobj->context, &attr, attrs); +	dm = ib_dev->ops.alloc_dm(ib_dev, attrs->context, &attr, attrs);  	if (IS_ERR(dm))  		return PTR_ERR(dm); diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index 4962b87fa600..459cf165b231 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -30,11 +30,13 @@   * SOFTWARE.   
*/ +#include "rdma_core.h"  #include "uverbs.h"  #include <rdma/uverbs_std_types.h>  static int uverbs_free_flow_action(struct ib_uobject *uobject, -				   enum rdma_remove_reason why) +				   enum rdma_remove_reason why, +				   struct uverbs_attr_bundle *attrs)  {  	struct ib_flow_action *action = uobject->object;  	int ret; @@ -308,7 +310,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(  {  	struct ib_uobject *uobj = uverbs_attr_get_uobject(  		attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE); -	struct ib_device *ib_dev = uobj->context->device; +	struct ib_device *ib_dev = attrs->context->device;  	int				  ret;  	struct ib_flow_action		  *action;  	struct ib_flow_action_esp_attr	  esp_attr = {}; diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 4d4be0c2b752..610d3b9f7654 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -30,13 +30,16 @@   * SOFTWARE.   */ +#include "rdma_core.h"  #include "uverbs.h"  #include <rdma/uverbs_std_types.h>  static int uverbs_free_mr(struct ib_uobject *uobject, -			  enum rdma_remove_reason why) +			  enum rdma_remove_reason why, +			  struct uverbs_attr_bundle *attrs)  { -	return ib_dereg_mr((struct ib_mr *)uobject->object); +	return ib_dereg_mr_user((struct ib_mr *)uobject->object, +				&attrs->driver_udata);  }  static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)( @@ -145,7 +148,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(  	return 0;  err_dereg: -	ib_dereg_mr(mr); +	ib_dereg_mr_user(mr, &attrs->driver_udata);  	return ret;  } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 5a5e83f5f0fc..e666a1f7608d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -218,6 +218,8 @@ rdma_node_get_transport(enum rdma_node_type node_type)  		return RDMA_TRANSPORT_USNIC_UDP;  	if (node_type == RDMA_NODE_RNIC)  		return RDMA_TRANSPORT_IWARP; +	if (node_type == RDMA_NODE_UNSPECIFIED) +		return RDMA_TRANSPORT_UNSPECIFIED;  	return RDMA_TRANSPORT_IB;  } @@ -269,7 +271,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,  	pd->res.type = RDMA_RESTRACK_PD;  	rdma_restrack_set_task(&pd->res, caller); -	ret = device->ops.alloc_pd(pd, NULL, NULL); +	ret = device->ops.alloc_pd(pd, NULL);  	if (ret) {  		kfree(pd);  		return ERR_PTR(ret); @@ -316,17 +318,18 @@ EXPORT_SYMBOL(__ib_alloc_pd);  /**   * ib_dealloc_pd - Deallocates a protection domain.   * @pd: The protection domain to deallocate. + * @udata: Valid user data or NULL for kernel object   *   * It is an error to call this function while any resources in the pd still   * exist.  The caller is responsible to synchronously destroy them and   * guarantee no new allocations will happen.   
*/ -void ib_dealloc_pd(struct ib_pd *pd) +void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)  {  	int ret;  	if (pd->__internal_mr) { -		ret = pd->device->ops.dereg_mr(pd->__internal_mr); +		ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL);  		WARN_ON(ret);  		pd->__internal_mr = NULL;  	} @@ -336,10 +339,10 @@ void ib_dealloc_pd(struct ib_pd *pd)  	WARN_ON(atomic_read(&pd->usecnt));  	rdma_restrack_del(&pd->res); -	pd->device->ops.dealloc_pd(pd); +	pd->device->ops.dealloc_pd(pd, udata);  	kfree(pd);  } -EXPORT_SYMBOL(ib_dealloc_pd); +EXPORT_SYMBOL(ib_dealloc_pd_user);  /* Address handles */ @@ -495,25 +498,33 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,  				     u32 flags,  				     struct ib_udata *udata)  { +	struct ib_device *device = pd->device;  	struct ib_ah *ah; +	int ret;  	might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE); -	if (!pd->device->ops.create_ah) +	if (!device->ops.create_ah)  		return ERR_PTR(-EOPNOTSUPP); -	ah = pd->device->ops.create_ah(pd, ah_attr, flags, udata); +	ah = rdma_zalloc_drv_obj_gfp( +		device, ib_ah, +		(flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC); +	if (!ah) +		return ERR_PTR(-ENOMEM); -	if (!IS_ERR(ah)) { -		ah->device  = pd->device; -		ah->pd      = pd; -		ah->uobject = NULL; -		ah->type    = ah_attr->type; -		ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); +	ah->device = device; +	ah->pd = pd; +	ah->type = ah_attr->type; +	ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); -		atomic_inc(&pd->usecnt); +	ret = device->ops.create_ah(ah, ah_attr, flags, udata); +	if (ret) { +		kfree(ah); +		return ERR_PTR(ret);  	} +	atomic_inc(&pd->usecnt);  	return ah;  } @@ -930,25 +941,24 @@ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)  }  EXPORT_SYMBOL(rdma_query_ah); -int rdma_destroy_ah(struct ib_ah *ah, u32 flags) +int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata)  {  	const struct ib_gid_attr *sgid_attr = ah->sgid_attr;  	struct ib_pd *pd; -	int ret;  	might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE);  	pd = ah->pd; -	ret = ah->device->ops.destroy_ah(ah, flags); -	if (!ret) { -		atomic_dec(&pd->usecnt); -		if (sgid_attr) -			rdma_put_gid_attr(sgid_attr); -	} -	return ret; +	ah->device->ops.destroy_ah(ah, flags); +	atomic_dec(&pd->usecnt); +	if (sgid_attr) +		rdma_put_gid_attr(sgid_attr); + +	kfree(ah); +	return 0;  } -EXPORT_SYMBOL(rdma_destroy_ah); +EXPORT_SYMBOL(rdma_destroy_ah_user);  /* Shared receive queues */ @@ -956,29 +966,40 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd,  			     struct ib_srq_init_attr *srq_init_attr)  {  	struct ib_srq *srq; +	int ret;  	if (!pd->device->ops.create_srq)  		return ERR_PTR(-EOPNOTSUPP); -	srq = pd->device->ops.create_srq(pd, srq_init_attr, NULL); - -	if (!IS_ERR(srq)) { -		srq->device    	   = pd->device; -		srq->pd        	   = pd; -		srq->uobject       = NULL; -		srq->event_handler = srq_init_attr->event_handler; -		srq->srq_context   = srq_init_attr->srq_context; -		srq->srq_type      = srq_init_attr->srq_type; -		if (ib_srq_has_cq(srq->srq_type)) { -			srq->ext.cq   = srq_init_attr->ext.cq; -			atomic_inc(&srq->ext.cq->usecnt); -		} -		if (srq->srq_type == IB_SRQT_XRC) { -			srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; -			atomic_inc(&srq->ext.xrc.xrcd->usecnt); -		} -		atomic_inc(&pd->usecnt); -		atomic_set(&srq->usecnt, 0); +	srq = rdma_zalloc_drv_obj(pd->device, ib_srq); +	if (!srq) +		return ERR_PTR(-ENOMEM); + +	srq->device = pd->device; +	srq->pd = pd; +	srq->event_handler = 
srq_init_attr->event_handler; +	srq->srq_context = srq_init_attr->srq_context; +	srq->srq_type = srq_init_attr->srq_type; + +	if (ib_srq_has_cq(srq->srq_type)) { +		srq->ext.cq = srq_init_attr->ext.cq; +		atomic_inc(&srq->ext.cq->usecnt); +	} +	if (srq->srq_type == IB_SRQT_XRC) { +		srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; +		atomic_inc(&srq->ext.xrc.xrcd->usecnt); +	} +	atomic_inc(&pd->usecnt); + +	ret = pd->device->ops.create_srq(srq, srq_init_attr, NULL); +	if (ret) { +		atomic_dec(&srq->pd->usecnt); +		if (srq->srq_type == IB_SRQT_XRC) +			atomic_dec(&srq->ext.xrc.xrcd->usecnt); +		if (ib_srq_has_cq(srq->srq_type)) +			atomic_dec(&srq->ext.cq->usecnt); +		kfree(srq); +		return ERR_PTR(ret);  	}  	return srq; @@ -1003,36 +1024,23 @@ int ib_query_srq(struct ib_srq *srq,  }  EXPORT_SYMBOL(ib_query_srq); -int ib_destroy_srq(struct ib_srq *srq) +int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)  { -	struct ib_pd *pd; -	enum ib_srq_type srq_type; -	struct ib_xrcd *uninitialized_var(xrcd); -	struct ib_cq *uninitialized_var(cq); -	int ret; -  	if (atomic_read(&srq->usecnt))  		return -EBUSY; -	pd = srq->pd; -	srq_type = srq->srq_type; -	if (ib_srq_has_cq(srq_type)) -		cq = srq->ext.cq; -	if (srq_type == IB_SRQT_XRC) -		xrcd = srq->ext.xrc.xrcd; +	srq->device->ops.destroy_srq(srq, udata); -	ret = srq->device->ops.destroy_srq(srq); -	if (!ret) { -		atomic_dec(&pd->usecnt); -		if (srq_type == IB_SRQT_XRC) -			atomic_dec(&xrcd->usecnt); -		if (ib_srq_has_cq(srq_type)) -			atomic_dec(&cq->usecnt); -	} +	atomic_dec(&srq->pd->usecnt); +	if (srq->srq_type == IB_SRQT_XRC) +		atomic_dec(&srq->ext.xrc.xrcd->usecnt); +	if (ib_srq_has_cq(srq->srq_type)) +		atomic_dec(&srq->ext.cq->usecnt); +	kfree(srq); -	return ret; +	return 0;  } -EXPORT_SYMBOL(ib_destroy_srq); +EXPORT_SYMBOL(ib_destroy_srq_user);  /* Queue pairs */ @@ -1111,8 +1119,9 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,  }  EXPORT_SYMBOL(ib_open_qp); -static struct ib_qp *create_xrc_qp(struct ib_qp *qp, -				   struct ib_qp_init_attr *qp_init_attr) +static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp, +					struct ib_qp_init_attr *qp_init_attr, +					struct ib_udata *udata)  {  	struct ib_qp *real_qp = qp; @@ -1134,8 +1143,9 @@ static struct ib_qp *create_xrc_qp(struct ib_qp *qp,  	return qp;  } -struct ib_qp *ib_create_qp(struct ib_pd *pd, -			   struct ib_qp_init_attr *qp_init_attr) +struct ib_qp *ib_create_qp_user(struct ib_pd *pd, +				struct ib_qp_init_attr *qp_init_attr, +				struct ib_udata *udata)  {  	struct ib_device *device = pd ? 
pd->device : qp_init_attr->xrcd->device;  	struct ib_qp *qp; @@ -1164,7 +1174,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,  	if (ret)  		goto err; -	qp->real_qp    = qp;  	qp->qp_type    = qp_init_attr->qp_type;  	qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl; @@ -1176,7 +1185,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,  	qp->port = 0;  	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { -		struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr); +		struct ib_qp *xrc_qp = +			create_xrc_qp_user(qp, qp_init_attr, udata);  		if (IS_ERR(xrc_qp)) {  			ret = PTR_ERR(xrc_qp); @@ -1230,7 +1240,7 @@ err:  	return ERR_PTR(ret);  } -EXPORT_SYMBOL(ib_create_qp); +EXPORT_SYMBOL(ib_create_qp_user);  static const struct {  	int			valid; @@ -1837,7 +1847,7 @@ static int __ib_destroy_shared_qp(struct ib_qp *qp)  	return 0;  } -int ib_destroy_qp(struct ib_qp *qp) +int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)  {  	const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;  	const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr; @@ -1869,7 +1879,7 @@ int ib_destroy_qp(struct ib_qp *qp)  		rdma_rw_cleanup_mrs(qp);  	rdma_restrack_del(&qp->res); -	ret = qp->device->ops.destroy_qp(qp); +	ret = qp->device->ops.destroy_qp(qp, udata);  	if (!ret) {  		if (alt_path_sgid_attr)  			rdma_put_gid_attr(alt_path_sgid_attr); @@ -1894,7 +1904,7 @@ int ib_destroy_qp(struct ib_qp *qp)  	return ret;  } -EXPORT_SYMBOL(ib_destroy_qp); +EXPORT_SYMBOL(ib_destroy_qp_user);  /* Completion queues */ @@ -1907,7 +1917,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,  {  	struct ib_cq *cq; -	cq = device->ops.create_cq(device, cq_attr, NULL, NULL); +	cq = device->ops.create_cq(device, cq_attr, NULL);  	if (!IS_ERR(cq)) {  		cq->device        = device; @@ -1933,15 +1943,15 @@ int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)  }  EXPORT_SYMBOL(rdma_set_cq_moderation); -int ib_destroy_cq(struct ib_cq *cq) +int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)  {  	if (atomic_read(&cq->usecnt))  		return -EBUSY;  	rdma_restrack_del(&cq->res); -	return cq->device->ops.destroy_cq(cq); +	return cq->device->ops.destroy_cq(cq, udata);  } -EXPORT_SYMBOL(ib_destroy_cq); +EXPORT_SYMBOL(ib_destroy_cq_user);  int ib_resize_cq(struct ib_cq *cq, int cqe)  { @@ -1952,14 +1962,14 @@ EXPORT_SYMBOL(ib_resize_cq);  /* Memory regions */ -int ib_dereg_mr(struct ib_mr *mr) +int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)  {  	struct ib_pd *pd = mr->pd;  	struct ib_dm *dm = mr->dm;  	int ret;  	rdma_restrack_del(&mr->res); -	ret = mr->device->ops.dereg_mr(mr); +	ret = mr->device->ops.dereg_mr(mr, udata);  	if (!ret) {  		atomic_dec(&pd->usecnt);  		if (dm) @@ -1968,13 +1978,14 @@ int ib_dereg_mr(struct ib_mr *mr)  	return ret;  } -EXPORT_SYMBOL(ib_dereg_mr); +EXPORT_SYMBOL(ib_dereg_mr_user);  /**   * ib_alloc_mr() - Allocates a memory region   * @pd:            protection domain associated with the region   * @mr_type:       memory region type   * @max_num_sg:    maximum sg entries available for registration. + * @udata:	   user data or null for kernel objects   *   * Notes:   * Memory registeration page/sg lists must not exceed max_num_sg. @@ -1982,16 +1993,15 @@ EXPORT_SYMBOL(ib_dereg_mr);   * max_num_sg * used_page_size.   
*   */ -struct ib_mr *ib_alloc_mr(struct ib_pd *pd, -			  enum ib_mr_type mr_type, -			  u32 max_num_sg) +struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, +			       u32 max_num_sg, struct ib_udata *udata)  {  	struct ib_mr *mr;  	if (!pd->device->ops.alloc_mr)  		return ERR_PTR(-EOPNOTSUPP); -	mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg); +	mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata);  	if (!IS_ERR(mr)) {  		mr->device  = pd->device;  		mr->pd      = pd; @@ -2005,7 +2015,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,  	return mr;  } -EXPORT_SYMBOL(ib_alloc_mr); +EXPORT_SYMBOL(ib_alloc_mr_user);  /* "Fast" memory regions */ @@ -2138,7 +2148,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)  	if (!device->ops.alloc_xrcd)  		return ERR_PTR(-EOPNOTSUPP); -	xrcd = device->ops.alloc_xrcd(device, NULL, NULL); +	xrcd = device->ops.alloc_xrcd(device, NULL);  	if (!IS_ERR(xrcd)) {  		xrcd->device = device;  		xrcd->inode = NULL; @@ -2151,7 +2161,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)  }  EXPORT_SYMBOL(__ib_alloc_xrcd); -int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)  {  	struct ib_qp *qp;  	int ret; @@ -2166,7 +2176,7 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)  			return ret;  	} -	return xrcd->device->ops.dealloc_xrcd(xrcd); +	return xrcd->device->ops.dealloc_xrcd(xrcd, udata);  }  EXPORT_SYMBOL(ib_dealloc_xrcd); @@ -2210,10 +2220,11 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd,  EXPORT_SYMBOL(ib_create_wq);  /** - * ib_destroy_wq - Destroys the specified WQ. + * ib_destroy_wq - Destroys the specified user WQ.   * @wq: The WQ to destroy. + * @udata: Valid user data   */ -int ib_destroy_wq(struct ib_wq *wq) +int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)  {  	int err;  	struct ib_cq *cq = wq->cq; @@ -2222,7 +2233,7 @@ int ib_destroy_wq(struct ib_wq *wq)  	if (atomic_read(&wq->usecnt))  		return -EBUSY; -	err = wq->device->ops.destroy_wq(wq); +	err = wq->device->ops.destroy_wq(wq, udata);  	if (!err) {  		atomic_dec(&pd->usecnt);  		atomic_dec(&cq->usecnt); @@ -2701,3 +2712,37 @@ int rdma_init_netdev(struct ib_device *device, u8 port_num,  					     netdev, params.param);  }  EXPORT_SYMBOL(rdma_init_netdev); + +void __rdma_block_iter_start(struct ib_block_iter *biter, +			     struct scatterlist *sglist, unsigned int nents, +			     unsigned long pgsz) +{ +	memset(biter, 0, sizeof(struct ib_block_iter)); +	biter->__sg = sglist; +	biter->__sg_nents = nents; + +	/* Driver provides best block size to use */ +	biter->__pg_bit = __fls(pgsz); +} +EXPORT_SYMBOL(__rdma_block_iter_start); + +bool __rdma_block_iter_next(struct ib_block_iter *biter) +{ +	unsigned int block_offset; + +	if (!biter->__sg_nents || !biter->__sg) +		return false; + +	biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; +	block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); +	biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset; + +	if (biter->__sg_advance >= sg_dma_len(biter->__sg)) { +		biter->__sg_advance = 0; +		biter->__sg = sg_next(biter->__sg); +		biter->__sg_nents--; +	} + +	return true; +} +EXPORT_SYMBOL(__rdma_block_iter_next); |
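
The umem.c and verbs.c hunks above introduce a matched pair of helpers: ib_umem_find_best_pgsz() accumulates a mask of the bits in which the IOVA and each DMA address differ (plus the interior SGE lengths) and feeds it to rdma_find_pg_bit() to pick the largest HW page size that can map the pinned region, while __rdma_block_iter_start()/__rdma_block_iter_next() walk the DMA-mapped SGL in aligned blocks of that size. Below is a minimal sketch of how a driver might combine the two when building its MR page table; my_mr_fill_page_list(), page_list, max_pages and supported_pgsz_bitmap are hypothetical driver-side names, and the rdma_for_each_block()/rdma_block_iter_dma_address() wrappers over the two exported symbols are assumed from the corresponding ib_verbs.h additions.

#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

/*
 * Illustrative driver-side consumer of the helpers added above.  Picks the
 * best HW page size for the pinned umem and records the DMA address of each
 * aligned block into a hypothetical page_list[] array.
 */
static int my_mr_fill_page_list(struct ib_umem *umem, u64 iova,
				unsigned long supported_pgsz_bitmap,
				dma_addr_t *page_list, unsigned int max_pages)
{
	struct ib_block_iter biter;
	unsigned long page_size;
	unsigned int n = 0;

	/*
	 * Largest HW page size that keeps every block consistently aligned
	 * with the requested IOVA.  A return of 0 means the HW cannot map
	 * this umem with any of the advertised page sizes.
	 */
	page_size = ib_umem_find_best_pgsz(umem, supported_pgsz_bitmap, iova);
	if (!page_size)
		return -EINVAL;

	/* Walk the DMA-mapped SGL in page_size sized, aligned blocks. */
	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, page_size) {
		if (n == max_pages)
			return -ENOMEM;
		page_list[n++] = rdma_block_iter_dma_address(&biter);
	}

	return n;
}

Because ib_umem_find_best_pgsz() warns and bails out only when the bitmap contains no bit at or below PAGE_SHIFT, a driver that always advertises PAGE_SIZE in its bitmap never sees the 0 result and keeps its existing single-page behaviour; larger page sizes are purely an optimization on top of the per-segment squashing now done by ib_umem_add_sg_table().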