Diffstat (limited to 'drivers/infiniband/core')
27 files changed, 3772 insertions, 877 deletions
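The patch body below reworks the GID cache in drivers/infiniband/core: it adds roce_gid_mgmt.o, replaces the flat per-port gid_cache array with a per-port ib_gid_table whose entries carry a GID, its ib_gid_attr (including the associated net_device) and invalid/default properties, and matches entries via the GID_ATTR_FIND_MASK_* bits in find_gid(). As a reading aid, here is a minimal self-contained userspace sketch of that masked-lookup idea; it is not kernel code, and every name in it (gid_entry, find_entry, the MASK_* flags, the ifindex stand-in for a net_device) is illustrative only.

/*
 * Minimal userspace sketch (not kernel code) of the masked lookup the
 * reworked per-port GID table performs: each slot holds a GID, an
 * associated interface, and "default"/"invalid" properties, and a
 * lookup only compares the fields selected by the caller's mask.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MASK_GID	(1u << 0)	/* compare the 16-byte GID value      */
#define MASK_NETDEV	(1u << 1)	/* compare the associated interface   */
#define MASK_DEFAULT	(1u << 2)	/* require a matching "default" flag  */

struct gid_entry {
	uint8_t	gid[16];	/* an all-zero GID marks a free slot        */
	int	ifindex;	/* stand-in for the entry's net_device      */
	bool	is_default;
	bool	invalid;	/* entry is being rewritten, skip it        */
};

/* Return the first matching index, or -1 when nothing matches. */
static int find_entry(const struct gid_entry *tbl, int sz,
		      const uint8_t *gid, int ifindex,
		      bool want_default, unsigned mask)
{
	static const uint8_t zgid[16];
	int i;

	for (i = 0; i < sz; i++) {
		if (tbl[i].invalid)
			continue;
		if ((mask & MASK_GID) &&
		    memcmp(gid ? gid : zgid, tbl[i].gid, 16))
			continue;
		if ((mask & MASK_NETDEV) && tbl[i].ifindex != ifindex)
			continue;
		if ((mask & MASK_DEFAULT) && tbl[i].is_default != want_default)
			continue;
		return i;
	}
	return -1;
}

int main(void)
{
	/* Slot 0 is reserved as the "default GID" entry, like in the patch. */
	struct gid_entry tbl[4] = { { .is_default = true } };
	uint8_t gid[16] = { [0] = 0xfe, [1] = 0x80, [8] = 0x02 };

	/* "Add": find a free (all-zero, non-default) slot, then fill it. */
	int ix = find_entry(tbl, 4, NULL, 0, false, MASK_GID | MASK_DEFAULT);
	if (ix >= 0) {
		memcpy(tbl[ix].gid, gid, 16);
		tbl[ix].ifindex = 3;
	}

	/* Lookup by GID plus interface, the RoCE-style search. */
	printf("found at index %d\n",
	       find_entry(tbl, 4, gid, 3, false, MASK_GID | MASK_NETDEV));
	return 0;
}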
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index acf736764445..d43a8994ac5c 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \  					$(user_access-y)  ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \ -				device.o fmr_pool.o cache.o netlink.o +				device.o fmr_pool.o cache.o netlink.o \ +				roce_gid_mgmt.o  ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o  ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index c7dcfe4ca5f1..0429040304fd 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -88,7 +88,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *  	struct ib_ah *ah;  	struct ib_mad_send_wr_private *mad_send_wr; -	if (device->node_type == RDMA_NODE_IB_SWITCH) +	if (rdma_cap_ib_switch(device))  		port_priv = ib_get_agent_port(device, 0);  	else  		port_priv = ib_get_agent_port(device, port_num); @@ -122,7 +122,7 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh *  	memcpy(send_buf->mad, mad_hdr, resp_mad_len);  	send_buf->ah = ah; -	if (device->node_type == RDMA_NODE_IB_SWITCH) { +	if (rdma_cap_ib_switch(device)) {  		mad_send_wr = container_of(send_buf,  					   struct ib_mad_send_wr_private,  					   send_buf); diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 871da832d016..8f66c67ff0df 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -37,6 +37,8 @@  #include <linux/errno.h>  #include <linux/slab.h>  #include <linux/workqueue.h> +#include <linux/netdevice.h> +#include <net/addrconf.h>  #include <rdma/ib_cache.h> @@ -47,76 +49,621 @@ struct ib_pkey_cache {  	u16             table[0];  }; -struct ib_gid_cache { -	int             table_len; -	union ib_gid    table[0]; -}; -  struct ib_update_work {  	struct work_struct work;  	struct ib_device  *device;  	u8                 port_num;  }; -int ib_get_cached_gid(struct ib_device *device, -		      u8                port_num, -		      int               index, -		      union ib_gid     *gid) +union ib_gid zgid; +EXPORT_SYMBOL(zgid); + +static const struct ib_gid_attr zattr; + +enum gid_attr_find_mask { +	GID_ATTR_FIND_MASK_GID          = 1UL << 0, +	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 1, +	GID_ATTR_FIND_MASK_DEFAULT	= 1UL << 2, +}; + +enum gid_table_entry_props { +	GID_TABLE_ENTRY_INVALID		= 1UL << 0, +	GID_TABLE_ENTRY_DEFAULT		= 1UL << 1, +}; + +enum gid_table_write_action { +	GID_TABLE_WRITE_ACTION_ADD, +	GID_TABLE_WRITE_ACTION_DEL, +	/* MODIFY only updates the GID table. Currently only used by +	 * ib_cache_update. +	 */ +	GID_TABLE_WRITE_ACTION_MODIFY +}; + +struct ib_gid_table_entry { +	/* This lock protects an entry from being +	 * read and written simultaneously. +	 */ +	rwlock_t	    lock; +	unsigned long	    props; +	union ib_gid        gid; +	struct ib_gid_attr  attr; +	void		   *context; +}; + +struct ib_gid_table { +	int                  sz; +	/* In RoCE, adding a GID to the table requires: +	 * (a) Find if this GID is already exists. +	 * (b) Find a free space. +	 * (c) Write the new GID +	 * +	 * Delete requires different set of operations: +	 * (a) Find the GID +	 * (b) Delete it. +	 * +	 * Add/delete should be carried out atomically. +	 * This is done by locking this mutex from multiple +	 * writers. 
We don't need this lock for IB, as the MAD +	 * layer replaces all entries. All data_vec entries +	 * are locked by this lock. +	 **/ +	struct mutex         lock; +	struct ib_gid_table_entry *data_vec; +}; + +static int write_gid(struct ib_device *ib_dev, u8 port, +		     struct ib_gid_table *table, int ix, +		     const union ib_gid *gid, +		     const struct ib_gid_attr *attr, +		     enum gid_table_write_action action, +		     bool  default_gid)  { -	struct ib_gid_cache *cache; +	int ret = 0; +	struct net_device *old_net_dev;  	unsigned long flags; + +	/* in rdma_cap_roce_gid_table, this funciton should be protected by a +	 * sleep-able lock. +	 */ +	write_lock_irqsave(&table->data_vec[ix].lock, flags); + +	if (rdma_cap_roce_gid_table(ib_dev, port)) { +		table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; +		write_unlock_irqrestore(&table->data_vec[ix].lock, flags); +		/* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by +		 * RoCE providers and thus only updates the cache. +		 */ +		if (action == GID_TABLE_WRITE_ACTION_ADD) +			ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr, +					      &table->data_vec[ix].context); +		else if (action == GID_TABLE_WRITE_ACTION_DEL) +			ret = ib_dev->del_gid(ib_dev, port, ix, +					      &table->data_vec[ix].context); +		write_lock_irqsave(&table->data_vec[ix].lock, flags); +	} + +	old_net_dev = table->data_vec[ix].attr.ndev; +	if (old_net_dev && old_net_dev != attr->ndev) +		dev_put(old_net_dev); +	/* if modify_gid failed, just delete the old gid */ +	if (ret || action == GID_TABLE_WRITE_ACTION_DEL) { +		gid = &zgid; +		attr = &zattr; +		table->data_vec[ix].context = NULL; +	} +	if (default_gid) +		table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; +	memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid)); +	memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr)); +	if (table->data_vec[ix].attr.ndev && +	    table->data_vec[ix].attr.ndev != old_net_dev) +		dev_hold(table->data_vec[ix].attr.ndev); + +	table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; + +	write_unlock_irqrestore(&table->data_vec[ix].lock, flags); + +	if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) { +		struct ib_event event; + +		event.device		= ib_dev; +		event.element.port_num	= port; +		event.event		= IB_EVENT_GID_CHANGE; + +		ib_dispatch_event(&event); +	} +	return ret; +} + +static int add_gid(struct ib_device *ib_dev, u8 port, +		   struct ib_gid_table *table, int ix, +		   const union ib_gid *gid, +		   const struct ib_gid_attr *attr, +		   bool  default_gid) { +	return write_gid(ib_dev, port, table, ix, gid, attr, +			 GID_TABLE_WRITE_ACTION_ADD, default_gid); +} + +static int modify_gid(struct ib_device *ib_dev, u8 port, +		      struct ib_gid_table *table, int ix, +		      const union ib_gid *gid, +		      const struct ib_gid_attr *attr, +		      bool  default_gid) { +	return write_gid(ib_dev, port, table, ix, gid, attr, +			 GID_TABLE_WRITE_ACTION_MODIFY, default_gid); +} + +static int del_gid(struct ib_device *ib_dev, u8 port, +		   struct ib_gid_table *table, int ix, +		   bool  default_gid) { +	return write_gid(ib_dev, port, table, ix, &zgid, &zattr, +			 GID_TABLE_WRITE_ACTION_DEL, default_gid); +} + +static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, +		    const struct ib_gid_attr *val, bool default_gid, +		    unsigned long mask) +{ +	int i; + +	for (i = 0; i < table->sz; i++) { +		unsigned long flags; +		struct ib_gid_attr *attr = &table->data_vec[i].attr; + +		read_lock_irqsave(&table->data_vec[i].lock, flags); + +		if 
(table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) +			goto next; + +		if (mask & GID_ATTR_FIND_MASK_GID && +		    memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) +			goto next; + +		if (mask & GID_ATTR_FIND_MASK_NETDEV && +		    attr->ndev != val->ndev) +			goto next; + +		if (mask & GID_ATTR_FIND_MASK_DEFAULT && +		    !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) != +		    default_gid) +			goto next; + +		read_unlock_irqrestore(&table->data_vec[i].lock, flags); +		return i; +next: +		read_unlock_irqrestore(&table->data_vec[i].lock, flags); +	} + +	return -1; +} + +static void make_default_gid(struct  net_device *dev, union ib_gid *gid) +{ +	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); +	addrconf_ifid_eui48(&gid->raw[8], dev); +} + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, +		     union ib_gid *gid, struct ib_gid_attr *attr) +{ +	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; +	struct ib_gid_table *table; +	int ix;  	int ret = 0; +	struct net_device *idev; -	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) +	table = ports_table[port - rdma_start_port(ib_dev)]; + +	if (!memcmp(gid, &zgid, sizeof(*gid)))  		return -EINVAL; -	read_lock_irqsave(&device->cache.lock, flags); +	if (ib_dev->get_netdev) { +		idev = ib_dev->get_netdev(ib_dev, port); +		if (idev && attr->ndev != idev) { +			union ib_gid default_gid; -	cache = device->cache.gid_cache[port_num - rdma_start_port(device)]; +			/* Adding default GIDs in not permitted */ +			make_default_gid(idev, &default_gid); +			if (!memcmp(gid, &default_gid, sizeof(*gid))) { +				dev_put(idev); +				return -EPERM; +			} +		} +		if (idev) +			dev_put(idev); +	} -	if (index < 0 || index >= cache->table_len) -		ret = -EINVAL; -	else -		*gid = cache->table[index]; +	mutex_lock(&table->lock); -	read_unlock_irqrestore(&device->cache.lock, flags); +	ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | +		      GID_ATTR_FIND_MASK_NETDEV); +	if (ix >= 0) +		goto out_unlock; +	ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID | +		      GID_ATTR_FIND_MASK_DEFAULT); +	if (ix < 0) { +		ret = -ENOSPC; +		goto out_unlock; +	} + +	add_gid(ib_dev, port, table, ix, gid, attr, false); + +out_unlock: +	mutex_unlock(&table->lock);  	return ret;  } -EXPORT_SYMBOL(ib_get_cached_gid); -int ib_find_cached_gid(struct ib_device   *device, -		       const union ib_gid *gid, -		       u8                 *port_num, -		       u16                *index) +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, +		     union ib_gid *gid, struct ib_gid_attr *attr)  { -	struct ib_gid_cache *cache; +	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; +	struct ib_gid_table *table; +	int ix; + +	table = ports_table[port - rdma_start_port(ib_dev)]; + +	mutex_lock(&table->lock); + +	ix = find_gid(table, gid, attr, false, +		      GID_ATTR_FIND_MASK_GID	  | +		      GID_ATTR_FIND_MASK_NETDEV	  | +		      GID_ATTR_FIND_MASK_DEFAULT); +	if (ix < 0) +		goto out_unlock; + +	del_gid(ib_dev, port, table, ix, false); + +out_unlock: +	mutex_unlock(&table->lock); +	return 0; +} + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, +				     struct net_device *ndev) +{ +	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; +	struct ib_gid_table *table; +	int ix; + +	table  = ports_table[port - rdma_start_port(ib_dev)]; + +	mutex_lock(&table->lock); + +	for (ix = 0; ix < table->sz; ix++) +		if (table->data_vec[ix].attr.ndev == ndev) +			
del_gid(ib_dev, port, table, ix, false); + +	mutex_unlock(&table->lock); +	return 0; +} + +static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, +			      union ib_gid *gid, struct ib_gid_attr *attr) +{ +	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; +	struct ib_gid_table *table;  	unsigned long flags; -	int p, i; -	int ret = -ENOENT; -	*port_num = -1; -	if (index) -		*index = -1; +	table = ports_table[port - rdma_start_port(ib_dev)]; -	read_lock_irqsave(&device->cache.lock, flags); +	if (index < 0 || index >= table->sz) +		return -EINVAL; -	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) { -		cache = device->cache.gid_cache[p]; -		for (i = 0; i < cache->table_len; ++i) { -			if (!memcmp(gid, &cache->table[i], sizeof *gid)) { -				*port_num = p + rdma_start_port(device); -				if (index) -					*index = i; -				ret = 0; -				goto found; -			} +	read_lock_irqsave(&table->data_vec[index].lock, flags); +	if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) { +		read_unlock_irqrestore(&table->data_vec[index].lock, flags); +		return -EAGAIN; +	} + +	memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); +	if (attr) { +		memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); +		if (attr->ndev) +			dev_hold(attr->ndev); +	} + +	read_unlock_irqrestore(&table->data_vec[index].lock, flags); +	return 0; +} + +static int _ib_cache_gid_table_find(struct ib_device *ib_dev, +				    const union ib_gid *gid, +				    const struct ib_gid_attr *val, +				    unsigned long mask, +				    u8 *port, u16 *index) +{ +	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; +	struct ib_gid_table *table; +	u8 p; +	int local_index; + +	for (p = 0; p < ib_dev->phys_port_cnt; p++) { +		table = ports_table[p]; +		local_index = find_gid(table, gid, val, false, mask); +		if (local_index >= 0) { +			if (index) +				*index = local_index; +			if (port) +				*port = p + rdma_start_port(ib_dev); +			return 0;  		}  	} -found: -	read_unlock_irqrestore(&device->cache.lock, flags); -	return ret; +	return -ENOENT; +} + +static int ib_cache_gid_find(struct ib_device *ib_dev, +			     const union ib_gid *gid, +			     struct net_device *ndev, u8 *port, +			     u16 *index) +{ +	unsigned long mask = GID_ATTR_FIND_MASK_GID; +	struct ib_gid_attr gid_attr_val = {.ndev = ndev}; + +	if (ndev) +		mask |= GID_ATTR_FIND_MASK_NETDEV; + +	return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, +					mask, port, index); +} + +int ib_cache_gid_find_by_port(struct ib_device *ib_dev, +			      const union ib_gid *gid, +			      u8 port, struct net_device *ndev, +			      u16 *index) +{ +	int local_index; +	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; +	struct ib_gid_table *table; +	unsigned long mask = GID_ATTR_FIND_MASK_GID; +	struct ib_gid_attr val = {.ndev = ndev}; + +	if (port < rdma_start_port(ib_dev) || +	    port > rdma_end_port(ib_dev)) +		return -ENOENT; + +	table = ports_table[port - rdma_start_port(ib_dev)]; + +	if (ndev) +		mask |= GID_ATTR_FIND_MASK_NETDEV; + +	local_index = find_gid(table, gid, &val, false, mask); +	if (local_index >= 0) { +		if (index) +			*index = local_index; +		return 0; +	} + +	return -ENOENT; +} + +static struct ib_gid_table *alloc_gid_table(int sz) +{ +	unsigned int i; +	struct ib_gid_table *table = +		kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); +	if (!table) +		return NULL; + +	table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL); +	if (!table->data_vec) +		goto err_free_table; + +	
mutex_init(&table->lock); + +	table->sz = sz; + +	for (i = 0; i < sz; i++) +		rwlock_init(&table->data_vec[i].lock); + +	return table; + +err_free_table: +	kfree(table); +	return NULL; +} + +static void release_gid_table(struct ib_gid_table *table) +{ +	if (table) { +		kfree(table->data_vec); +		kfree(table); +	} +} + +static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, +				   struct ib_gid_table *table) +{ +	int i; + +	if (!table) +		return; + +	for (i = 0; i < table->sz; ++i) { +		if (memcmp(&table->data_vec[i].gid, &zgid, +			   sizeof(table->data_vec[i].gid))) +			del_gid(ib_dev, port, table, i, +				table->data_vec[i].props & +				GID_ATTR_FIND_MASK_DEFAULT); +	} +} + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, +				  struct net_device *ndev, +				  enum ib_cache_gid_default_mode mode) +{ +	struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; +	union ib_gid gid; +	struct ib_gid_attr gid_attr; +	struct ib_gid_table *table; +	int ix; +	union ib_gid current_gid; +	struct ib_gid_attr current_gid_attr = {}; + +	table  = ports_table[port - rdma_start_port(ib_dev)]; + +	make_default_gid(ndev, &gid); +	memset(&gid_attr, 0, sizeof(gid_attr)); +	gid_attr.ndev = ndev; + +	ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT); + +	/* Coudn't find default GID location */ +	WARN_ON(ix < 0); + +	mutex_lock(&table->lock); +	if (!__ib_cache_gid_get(ib_dev, port, ix, +				¤t_gid, ¤t_gid_attr) && +	    mode == IB_CACHE_GID_DEFAULT_MODE_SET && +	    !memcmp(&gid, ¤t_gid, sizeof(gid)) && +	    !memcmp(&gid_attr, ¤t_gid_attr, sizeof(gid_attr))) +		goto unlock; + +	if ((memcmp(¤t_gid, &zgid, sizeof(current_gid)) || +	     memcmp(¤t_gid_attr, &zattr, +		    sizeof(current_gid_attr))) && +	    del_gid(ib_dev, port, table, ix, true)) { +		pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", +			ix, gid.raw); +		goto unlock; +	} + +	if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) +		if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) +			pr_warn("ib_cache_gid: unable to add default gid %pI6\n", +				gid.raw); + +unlock: +	if (current_gid_attr.ndev) +		dev_put(current_gid_attr.ndev); +	mutex_unlock(&table->lock); +} + +static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, +				     struct ib_gid_table *table) +{ +	if (rdma_protocol_roce(ib_dev, port)) { +		struct ib_gid_table_entry *entry = &table->data_vec[0]; + +		entry->props |= GID_TABLE_ENTRY_DEFAULT; +	} + +	return 0; +} + +static int _gid_table_setup_one(struct ib_device *ib_dev) +{ +	u8 port; +	struct ib_gid_table **table; +	int err = 0; + +	table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL); + +	if (!table) { +		pr_warn("failed to allocate ib gid cache for %s\n", +			ib_dev->name); +		return -ENOMEM; +	} + +	for (port = 0; port < ib_dev->phys_port_cnt; port++) { +		u8 rdma_port = port + rdma_start_port(ib_dev); + +		table[port] = +			alloc_gid_table( +				ib_dev->port_immutable[rdma_port].gid_tbl_len); +		if (!table[port]) { +			err = -ENOMEM; +			goto rollback_table_setup; +		} + +		err = gid_table_reserve_default(ib_dev, +						port + rdma_start_port(ib_dev), +						table[port]); +		if (err) +			goto rollback_table_setup; +	} + +	ib_dev->cache.gid_cache = table; +	return 0; + +rollback_table_setup: +	for (port = 0; port < ib_dev->phys_port_cnt; port++) { +		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), +				       table[port]); +		release_gid_table(table[port]); +	} + +	kfree(table); +	return err; +} + +static void 
gid_table_release_one(struct ib_device *ib_dev) +{ +	struct ib_gid_table **table = ib_dev->cache.gid_cache; +	u8 port; + +	if (!table) +		return; + +	for (port = 0; port < ib_dev->phys_port_cnt; port++) +		release_gid_table(table[port]); + +	kfree(table); +	ib_dev->cache.gid_cache = NULL; +} + +static void gid_table_cleanup_one(struct ib_device *ib_dev) +{ +	struct ib_gid_table **table = ib_dev->cache.gid_cache; +	u8 port; + +	if (!table) +		return; + +	for (port = 0; port < ib_dev->phys_port_cnt; port++) +		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), +				       table[port]); +} + +static int gid_table_setup_one(struct ib_device *ib_dev) +{ +	int err; + +	err = _gid_table_setup_one(ib_dev); + +	if (err) +		return err; + +	err = roce_rescan_device(ib_dev); + +	if (err) { +		gid_table_cleanup_one(ib_dev); +		gid_table_release_one(ib_dev); +	} + +	return err; +} + +int ib_get_cached_gid(struct ib_device *device, +		      u8                port_num, +		      int               index, +		      union ib_gid     *gid) +{ +	if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) +		return -EINVAL; + +	return __ib_cache_gid_get(device, port_num, index, gid, NULL); +} +EXPORT_SYMBOL(ib_get_cached_gid); + +int ib_find_cached_gid(struct ib_device *device, +		       const union ib_gid *gid, +		       u8               *port_num, +		       u16              *index) +{ +	return ib_cache_gid_find(device, gid, NULL, port_num, index);  }  EXPORT_SYMBOL(ib_find_cached_gid); @@ -243,9 +790,21 @@ static void ib_cache_update(struct ib_device *device,  {  	struct ib_port_attr       *tprops = NULL;  	struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache; -	struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache; +	struct ib_gid_cache { +		int             table_len; +		union ib_gid    table[0]; +	}			  *gid_cache = NULL;  	int                        i;  	int                        ret; +	struct ib_gid_table	  *table; +	struct ib_gid_table	 **ports_table = device->cache.gid_cache; +	bool			   use_roce_gid_table = +					rdma_cap_roce_gid_table(device, port); + +	if (port < rdma_start_port(device) || port > rdma_end_port(device)) +		return; + +	table = ports_table[port - rdma_start_port(device)];  	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);  	if (!tprops) @@ -265,12 +824,14 @@ static void ib_cache_update(struct ib_device *device,  	pkey_cache->table_len = tprops->pkey_tbl_len; -	gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len * -			    sizeof *gid_cache->table, GFP_KERNEL); -	if (!gid_cache) -		goto err; +	if (!use_roce_gid_table) { +		gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len * +			    sizeof(*gid_cache->table), GFP_KERNEL); +		if (!gid_cache) +			goto err; -	gid_cache->table_len = tprops->gid_tbl_len; +		gid_cache->table_len = tprops->gid_tbl_len; +	}  	for (i = 0; i < pkey_cache->table_len; ++i) {  		ret = ib_query_pkey(device, port, i, pkey_cache->table + i); @@ -281,29 +842,36 @@ static void ib_cache_update(struct ib_device *device,  		}  	} -	for (i = 0; i < gid_cache->table_len; ++i) { -		ret = ib_query_gid(device, port, i, gid_cache->table + i); -		if (ret) { -			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", -			       ret, device->name, i); -			goto err; +	if (!use_roce_gid_table) { +		for (i = 0;  i < gid_cache->table_len; ++i) { +			ret = ib_query_gid(device, port, i, +					   gid_cache->table + i); +			if (ret) { +				printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n", +				       ret, 
device->name, i); +				goto err; +			}  		}  	}  	write_lock_irq(&device->cache.lock);  	old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)]; -	old_gid_cache  = device->cache.gid_cache [port - rdma_start_port(device)];  	device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; -	device->cache.gid_cache [port - rdma_start_port(device)] = gid_cache; +	if (!use_roce_gid_table) { +		for (i = 0; i < gid_cache->table_len; i++) { +			modify_gid(device, port, table, i, gid_cache->table + i, +				   &zattr, false); +		} +	}  	device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;  	write_unlock_irq(&device->cache.lock); +	kfree(gid_cache);  	kfree(old_pkey_cache); -	kfree(old_gid_cache);  	kfree(tprops);  	return; @@ -344,85 +912,88 @@ static void ib_cache_event(struct ib_event_handler *handler,  	}  } -static void ib_cache_setup_one(struct ib_device *device) +int ib_cache_setup_one(struct ib_device *device)  {  	int p; +	int err;  	rwlock_init(&device->cache.lock);  	device->cache.pkey_cache = -		kmalloc(sizeof *device->cache.pkey_cache * -			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); -	device->cache.gid_cache = -		kmalloc(sizeof *device->cache.gid_cache * +		kzalloc(sizeof *device->cache.pkey_cache *  			(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); -  	device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *  					  (rdma_end_port(device) -  					   rdma_start_port(device) + 1),  					  GFP_KERNEL); - -	if (!device->cache.pkey_cache || !device->cache.gid_cache || +	if (!device->cache.pkey_cache ||  	    !device->cache.lmc_cache) {  		printk(KERN_WARNING "Couldn't allocate cache "  		       "for %s\n", device->name); -		goto err; +		return -ENOMEM;  	} -	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) { -		device->cache.pkey_cache[p] = NULL; -		device->cache.gid_cache [p] = NULL; +	err = gid_table_setup_one(device); +	if (err) +		/* Allocated memory will be cleaned in the release function */ +		return err; + +	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)  		ib_cache_update(device, p + rdma_start_port(device)); -	}  	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,  			      device, ib_cache_event); -	if (ib_register_event_handler(&device->cache.event_handler)) -		goto err_cache; - -	return; +	err = ib_register_event_handler(&device->cache.event_handler); +	if (err) +		goto err; -err_cache: -	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) { -		kfree(device->cache.pkey_cache[p]); -		kfree(device->cache.gid_cache[p]); -	} +	return 0;  err: -	kfree(device->cache.pkey_cache); -	kfree(device->cache.gid_cache); -	kfree(device->cache.lmc_cache); +	gid_table_cleanup_one(device); +	return err;  } -static void ib_cache_cleanup_one(struct ib_device *device) +void ib_cache_release_one(struct ib_device *device)  {  	int p; -	ib_unregister_event_handler(&device->cache.event_handler); -	flush_workqueue(ib_wq); - -	for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) { -		kfree(device->cache.pkey_cache[p]); -		kfree(device->cache.gid_cache[p]); -	} - +	/* +	 * The release function frees all the cache elements. +	 * This function should be called as part of freeing +	 * all the device's resources when the cache could no +	 * longer be accessed. 
+	 */ +	if (device->cache.pkey_cache) +		for (p = 0; +		     p <= rdma_end_port(device) - rdma_start_port(device); ++p) +			kfree(device->cache.pkey_cache[p]); + +	gid_table_release_one(device);  	kfree(device->cache.pkey_cache); -	kfree(device->cache.gid_cache);  	kfree(device->cache.lmc_cache);  } -static struct ib_client cache_client = { -	.name   = "cache", -	.add    = ib_cache_setup_one, -	.remove = ib_cache_cleanup_one -}; +void ib_cache_cleanup_one(struct ib_device *device) +{ +	/* The cleanup function unregisters the event handler, +	 * waits for all in-progress workqueue elements and cleans +	 * up the GID cache. This function should be called after +	 * the device was removed from the devices list and all +	 * clients were removed, so the cache exists but is +	 * non-functional and shouldn't be updated anymore. +	 */ +	ib_unregister_event_handler(&device->cache.event_handler); +	flush_workqueue(ib_wq); +	gid_table_cleanup_one(device); +} -int __init ib_cache_setup(void) +void __init ib_cache_setup(void)  { -	return ib_register_client(&cache_client); +	roce_gid_mgmt_init();  }  void __exit ib_cache_cleanup(void)  { -	ib_unregister_client(&cache_client); +	roce_gid_mgmt_cleanup();  } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index dbddddd6fb5d..ea4db9c1d44f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM");  MODULE_LICENSE("Dual BSD/GPL");  static void cm_add_one(struct ib_device *device); -static void cm_remove_one(struct ib_device *device); +static void cm_remove_one(struct ib_device *device, void *client_data);  static struct ib_client cm_client = {  	.name   = "cm", @@ -169,6 +169,7 @@ struct cm_device {  	struct ib_device *ib_device;  	struct device *device;  	u8 ack_delay; +	int going_down;  	struct cm_port *port[0];  }; @@ -212,13 +213,15 @@ struct cm_id_private {  	spinlock_t lock;	/* Do not acquire inside cm.lock */  	struct completion comp;  	atomic_t refcount; +	/* Number of clients sharing this ib_cm_id. Only valid for listeners. +	 * Protected by the cm.lock spinlock. 
*/ +	int listen_sharecount;  	struct ib_mad_send_buf *msg;  	struct cm_timewait_info *timewait_info;  	/* todo: use alternate port on send failure */  	struct cm_av av;  	struct cm_av alt_av; -	struct ib_cm_compare_data *compare_data;  	void *private_data;  	__be64 tid; @@ -439,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)  	return cm_id_priv;  } -static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask) -{ -	int i; - -	for (i = 0; i < IB_CM_COMPARE_SIZE; i++) -		dst[i] = src[i] & mask[i]; -} - -static int cm_compare_data(struct ib_cm_compare_data *src_data, -			   struct ib_cm_compare_data *dst_data) -{ -	u32 src[IB_CM_COMPARE_SIZE]; -	u32 dst[IB_CM_COMPARE_SIZE]; - -	if (!src_data || !dst_data) -		return 0; - -	cm_mask_copy(src, src_data->data, dst_data->mask); -	cm_mask_copy(dst, dst_data->data, src_data->mask); -	return memcmp(src, dst, sizeof(src)); -} - -static int cm_compare_private_data(u32 *private_data, -				   struct ib_cm_compare_data *dst_data) -{ -	u32 src[IB_CM_COMPARE_SIZE]; - -	if (!dst_data) -		return 0; - -	cm_mask_copy(src, private_data, dst_data->mask); -	return memcmp(src, dst_data->data, sizeof(src)); -} -  /*   * Trivial helpers to strip endian annotation and compare; the   * endianness doesn't actually matter since we just need a stable @@ -505,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)  	struct cm_id_private *cur_cm_id_priv;  	__be64 service_id = cm_id_priv->id.service_id;  	__be64 service_mask = cm_id_priv->id.service_mask; -	int data_cmp;  	while (*link) {  		parent = *link;  		cur_cm_id_priv = rb_entry(parent, struct cm_id_private,  					  service_node); -		data_cmp = cm_compare_data(cm_id_priv->compare_data, -					   cur_cm_id_priv->compare_data);  		if ((cur_cm_id_priv->id.service_mask & service_id) ==  		    (service_mask & cur_cm_id_priv->id.service_id) && -		    (cm_id_priv->id.device == cur_cm_id_priv->id.device) && -		    !data_cmp) +		    (cm_id_priv->id.device == cur_cm_id_priv->id.device))  			return cur_cm_id_priv;  		if (cm_id_priv->id.device < cur_cm_id_priv->id.device) @@ -527,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)  			link = &(*link)->rb_left;  		else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))  			link = &(*link)->rb_right; -		else if (data_cmp < 0) -			link = &(*link)->rb_left;  		else  			link = &(*link)->rb_right;  	} @@ -538,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)  }  static struct cm_id_private * cm_find_listen(struct ib_device *device, -					     __be64 service_id, -					     u32 *private_data) +					     __be64 service_id)  {  	struct rb_node *node = cm.listen_service_table.rb_node;  	struct cm_id_private *cm_id_priv; -	int data_cmp;  	while (node) {  		cm_id_priv = rb_entry(node, struct cm_id_private, service_node); -		data_cmp = cm_compare_private_data(private_data, -						   cm_id_priv->compare_data);  		if ((cm_id_priv->id.service_mask & service_id) ==  		     cm_id_priv->id.service_id && -		    (cm_id_priv->id.device == device) && !data_cmp) +		    (cm_id_priv->id.device == device))  			return cm_id_priv;  		if (device < cm_id_priv->id.device) @@ -562,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,  			node = node->rb_left;  		else if (be64_gt(service_id, cm_id_priv->id.service_id))  			node = node->rb_right; -		else if (data_cmp < 0) -			node = node->rb_left;  		else  			
node = node->rb_right;  	} @@ -805,6 +762,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)  {  	int wait_time;  	unsigned long flags; +	struct cm_device *cm_dev; + +	cm_dev = ib_get_client_data(cm_id_priv->id.device, &cm_client); +	if (!cm_dev) +		return;  	spin_lock_irqsave(&cm.lock, flags);  	cm_cleanup_timewait(cm_id_priv->timewait_info); @@ -818,8 +780,14 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv)  	 */  	cm_id_priv->id.state = IB_CM_TIMEWAIT;  	wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); -	queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, -			   msecs_to_jiffies(wait_time)); + +	/* Check if the device started its remove_one */ +	spin_lock_irq(&cm.lock); +	if (!cm_dev->going_down) +		queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, +				   msecs_to_jiffies(wait_time)); +	spin_unlock_irq(&cm.lock); +  	cm_id_priv->timewait_info = NULL;  } @@ -847,9 +815,15 @@ retest:  	spin_lock_irq(&cm_id_priv->lock);  	switch (cm_id->state) {  	case IB_CM_LISTEN: -		cm_id->state = IB_CM_IDLE;  		spin_unlock_irq(&cm_id_priv->lock); +  		spin_lock_irq(&cm.lock); +		if (--cm_id_priv->listen_sharecount > 0) { +			/* The id is still shared. */ +			cm_deref_id(cm_id_priv); +			spin_unlock_irq(&cm.lock); +			return; +		}  		rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);  		spin_unlock_irq(&cm.lock);  		break; @@ -918,7 +892,6 @@ retest:  	wait_for_completion(&cm_id_priv->comp);  	while ((work = cm_dequeue_work(cm_id_priv)) != NULL)  		cm_free_work(work); -	kfree(cm_id_priv->compare_data);  	kfree(cm_id_priv->private_data);  	kfree(cm_id_priv);  } @@ -929,11 +902,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)  }  EXPORT_SYMBOL(ib_destroy_cm_id); -int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask, -		 struct ib_cm_compare_data *compare_data) +/** + * __ib_cm_listen - Initiates listening on the specified service ID for + *   connection and service ID resolution requests. + * @cm_id: Connection identifier associated with the listen request. + * @service_id: Service identifier matched against incoming connection + *   and service ID resolution requests.  The service ID should be specified + *   network-byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + *   assign a service ID to the caller. + * @service_mask: Mask applied to service ID used to listen across a + *   range of service IDs.  If set to 0, the service ID is matched + *   exactly.  This parameter is ignored if %service_id is set to + *   IB_CM_ASSIGN_SERVICE_ID. + */ +static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, +			  __be64 service_mask)  {  	struct cm_id_private *cm_id_priv, *cur_cm_id_priv; -	unsigned long flags;  	int ret = 0;  	service_mask = service_mask ? 
service_mask : ~cpu_to_be64(0); @@ -946,20 +931,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,  	if (cm_id->state != IB_CM_IDLE)  		return -EINVAL; -	if (compare_data) { -		cm_id_priv->compare_data = kzalloc(sizeof *compare_data, -						   GFP_KERNEL); -		if (!cm_id_priv->compare_data) -			return -ENOMEM; -		cm_mask_copy(cm_id_priv->compare_data->data, -			     compare_data->data, compare_data->mask); -		memcpy(cm_id_priv->compare_data->mask, compare_data->mask, -		       sizeof(compare_data->mask)); -	} -  	cm_id->state = IB_CM_LISTEN; +	++cm_id_priv->listen_sharecount; -	spin_lock_irqsave(&cm.lock, flags);  	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {  		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);  		cm_id->service_mask = ~cpu_to_be64(0); @@ -968,18 +942,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,  		cm_id->service_mask = service_mask;  	}  	cur_cm_id_priv = cm_insert_listen(cm_id_priv); -	spin_unlock_irqrestore(&cm.lock, flags);  	if (cur_cm_id_priv) {  		cm_id->state = IB_CM_IDLE; -		kfree(cm_id_priv->compare_data); -		cm_id_priv->compare_data = NULL; +		--cm_id_priv->listen_sharecount;  		ret = -EBUSY;  	}  	return ret;  } + +int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask) +{ +	unsigned long flags; +	int ret; + +	spin_lock_irqsave(&cm.lock, flags); +	ret = __ib_cm_listen(cm_id, service_id, service_mask); +	spin_unlock_irqrestore(&cm.lock, flags); + +	return ret; +}  EXPORT_SYMBOL(ib_cm_listen); +/** + * Create a new listening ib_cm_id and listen on the given service ID. + * + * If there's an existing ID listening on that same device and service ID, + * return it. + * + * @device: Device associated with the cm_id.  All related communication will + * be associated with the specified device. + * @cm_handler: Callback invoked to notify the user of CM events. + * @service_id: Service identifier matched against incoming connection + *   and service ID resolution requests.  The service ID should be specified + *   network-byte order.  If set to IB_CM_ASSIGN_SERVICE_ID, the CM will + *   assign a service ID to the caller. + * + * Callers should call ib_destroy_cm_id when done with the listener ID. 
+ */ +struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, +				     ib_cm_handler cm_handler, +				     __be64 service_id) +{ +	struct cm_id_private *cm_id_priv; +	struct ib_cm_id *cm_id; +	unsigned long flags; +	int err = 0; + +	/* Create an ID in advance, since the creation may sleep */ +	cm_id = ib_create_cm_id(device, cm_handler, NULL); +	if (IS_ERR(cm_id)) +		return cm_id; + +	spin_lock_irqsave(&cm.lock, flags); + +	if (service_id == IB_CM_ASSIGN_SERVICE_ID) +		goto new_id; + +	/* Find an existing ID */ +	cm_id_priv = cm_find_listen(device, service_id); +	if (cm_id_priv) { +		if (cm_id->cm_handler != cm_handler || cm_id->context) { +			/* Sharing an ib_cm_id with different handlers is not +			 * supported */ +			spin_unlock_irqrestore(&cm.lock, flags); +			return ERR_PTR(-EINVAL); +		} +		atomic_inc(&cm_id_priv->refcount); +		++cm_id_priv->listen_sharecount; +		spin_unlock_irqrestore(&cm.lock, flags); + +		ib_destroy_cm_id(cm_id); +		cm_id = &cm_id_priv->id; +		return cm_id; +	} + +new_id: +	/* Use newly created ID */ +	err = __ib_cm_listen(cm_id, service_id, 0); + +	spin_unlock_irqrestore(&cm.lock, flags); + +	if (err) { +		ib_destroy_cm_id(cm_id); +		return ERR_PTR(err); +	} +	return cm_id; +} +EXPORT_SYMBOL(ib_cm_insert_listen); +  static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,  			  enum cm_msg_sequence msg_seq)  { @@ -1256,6 +1307,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,  	primary_path->packet_life_time =  		cm_req_get_primary_local_ack_timeout(req_msg);  	primary_path->packet_life_time -= (primary_path->packet_life_time > 0); +	primary_path->service_id = req_msg->service_id;  	if (req_msg->alt_local_lid) {  		memset(alt_path, 0, sizeof *alt_path); @@ -1277,9 +1329,28 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,  		alt_path->packet_life_time =  			cm_req_get_alt_local_ack_timeout(req_msg);  		alt_path->packet_life_time -= (alt_path->packet_life_time > 0); +		alt_path->service_id = req_msg->service_id;  	}  } +static u16 cm_get_bth_pkey(struct cm_work *work) +{ +	struct ib_device *ib_dev = work->port->cm_dev->ib_device; +	u8 port_num = work->port->port_num; +	u16 pkey_index = work->mad_recv_wc->wc->pkey_index; +	u16 pkey; +	int ret; + +	ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey); +	if (ret) { +		dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n", +				     port_num, pkey_index, ret); +		return 0; +	} + +	return pkey; +} +  static void cm_format_req_event(struct cm_work *work,  				struct cm_id_private *cm_id_priv,  				struct ib_cm_id *listen_id) @@ -1290,6 +1361,7 @@ static void cm_format_req_event(struct cm_work *work,  	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;  	param = &work->cm_event.param.req_rcvd;  	param->listen_id = listen_id; +	param->bth_pkey = cm_get_bth_pkey(work);  	param->port = cm_id_priv->av.port->port_num;  	param->primary_path = &work->path[0];  	if (req_msg->alt_local_lid) @@ -1472,8 +1544,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,  	/* Find matching listen request. 
*/  	listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device, -					   req_msg->service_id, -					   req_msg->private_data); +					   req_msg->service_id);  	if (!listen_cm_id_priv) {  		cm_cleanup_timewait(cm_id_priv->timewait_info);  		spin_unlock_irq(&cm.lock); @@ -2980,6 +3051,8 @@ static void cm_format_sidr_req_event(struct cm_work *work,  	param = &work->cm_event.param.sidr_req_rcvd;  	param->pkey = __be16_to_cpu(sidr_req_msg->pkey);  	param->listen_id = listen_id; +	param->service_id = sidr_req_msg->service_id; +	param->bth_pkey = cm_get_bth_pkey(work);  	param->port = work->port->port_num;  	work->cm_event.private_data = &sidr_req_msg->private_data;  } @@ -3019,8 +3092,7 @@ static int cm_sidr_req_handler(struct cm_work *work)  	}  	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;  	cur_cm_id_priv = cm_find_listen(cm_id->device, -					sidr_req_msg->service_id, -					sidr_req_msg->private_data); +					sidr_req_msg->service_id);  	if (!cur_cm_id_priv) {  		spin_unlock_irq(&cm.lock);  		cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED); @@ -3305,6 +3377,11 @@ static int cm_establish(struct ib_cm_id *cm_id)  	struct cm_work *work;  	unsigned long flags;  	int ret = 0; +	struct cm_device *cm_dev; + +	cm_dev = ib_get_client_data(cm_id->device, &cm_client); +	if (!cm_dev) +		return -ENODEV;  	work = kmalloc(sizeof *work, GFP_ATOMIC);  	if (!work) @@ -3343,7 +3420,17 @@ static int cm_establish(struct ib_cm_id *cm_id)  	work->remote_id = cm_id->remote_id;  	work->mad_recv_wc = NULL;  	work->cm_event.event = IB_CM_USER_ESTABLISHED; -	queue_delayed_work(cm.wq, &work->work, 0); + +	/* Check if the device started its remove_one */ +	spin_lock_irq(&cm.lock); +	if (!cm_dev->going_down) { +		queue_delayed_work(cm.wq, &work->work, 0); +	} else { +		kfree(work); +		ret = -ENODEV; +	} +	spin_unlock_irq(&cm.lock); +  out:  	return ret;  } @@ -3394,6 +3481,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,  	enum ib_cm_event_type event;  	u16 attr_id;  	int paths = 0; +	int going_down = 0;  	switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {  	case CM_REQ_ATTR_ID: @@ -3452,7 +3540,19 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,  	work->cm_event.event = event;  	work->mad_recv_wc = mad_recv_wc;  	work->port = port; -	queue_delayed_work(cm.wq, &work->work, 0); + +	/* Check if the device started its remove_one */ +	spin_lock_irq(&cm.lock); +	if (!port->cm_dev->going_down) +		queue_delayed_work(cm.wq, &work->work, 0); +	else +		going_down = 1; +	spin_unlock_irq(&cm.lock); + +	if (going_down) { +		kfree(work); +		ib_free_recv_mad(mad_recv_wc); +	}  }  static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, @@ -3771,7 +3871,7 @@ static void cm_add_one(struct ib_device *ib_device)  	cm_dev->ib_device = ib_device;  	cm_get_ack_delay(cm_dev); - +	cm_dev->going_down = 0;  	cm_dev->device = device_create(&cm_class, &ib_device->dev,  				       MKDEV(0, 0), NULL,  				       "%s", ib_device->name); @@ -3846,9 +3946,9 @@ free:  	kfree(cm_dev);  } -static void cm_remove_one(struct ib_device *ib_device) +static void cm_remove_one(struct ib_device *ib_device, void *client_data)  { -	struct cm_device *cm_dev; +	struct cm_device *cm_dev = client_data;  	struct cm_port *port;  	struct ib_port_modify port_modify = {  		.clr_port_cap_mask = IB_PORT_CM_SUP @@ -3856,7 +3956,6 @@ static void cm_remove_one(struct ib_device *ib_device)  	unsigned long flags;  	int i; -	cm_dev = ib_get_client_data(ib_device, &cm_client);  	if (!cm_dev)  		return; @@ -3864,14 +3963,23 @@ static void 
cm_remove_one(struct ib_device *ib_device)  	list_del(&cm_dev->list);  	write_unlock_irqrestore(&cm.device_lock, flags); +	spin_lock_irq(&cm.lock); +	cm_dev->going_down = 1; +	spin_unlock_irq(&cm.lock); +  	for (i = 1; i <= ib_device->phys_port_cnt; i++) {  		if (!rdma_cap_ib_cm(ib_device, i))  			continue;  		port = cm_dev->port[i-1];  		ib_modify_port(ib_device, port->port_num, 0, &port_modify); -		ib_unregister_mad_agent(port->mad_agent); +		/* +		 * We flush the queue here after the going_down set, this +		 * verify that no new works will be queued in the recv handler, +		 * after that we can call the unregister_mad_agent +		 */  		flush_workqueue(cm.wq); +		ib_unregister_mad_agent(port->mad_agent);  		cm_remove_port_fs(port);  	}  	device_unregister(cm_dev->device); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 143ded2bbe7c..b1ab13f3e182 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -46,6 +46,8 @@  #include <net/tcp.h>  #include <net/ipv6.h> +#include <net/ip_fib.h> +#include <net/ip6_route.h>  #include <rdma/rdma_cm.h>  #include <rdma/rdma_cm_ib.h> @@ -94,7 +96,7 @@ const char *rdma_event_msg(enum rdma_cm_event_type event)  EXPORT_SYMBOL(rdma_event_msg);  static void cma_add_one(struct ib_device *device); -static void cma_remove_one(struct ib_device *device); +static void cma_remove_one(struct ib_device *device, void *client_data);  static struct ib_client cma_client = {  	.name   = "cma", @@ -113,6 +115,22 @@ static DEFINE_IDR(udp_ps);  static DEFINE_IDR(ipoib_ps);  static DEFINE_IDR(ib_ps); +static struct idr *cma_idr(enum rdma_port_space ps) +{ +	switch (ps) { +	case RDMA_PS_TCP: +		return &tcp_ps; +	case RDMA_PS_UDP: +		return &udp_ps; +	case RDMA_PS_IPOIB: +		return &ipoib_ps; +	case RDMA_PS_IB: +		return &ib_ps; +	default: +		return NULL; +	} +} +  struct cma_device {  	struct list_head	list;  	struct ib_device	*device; @@ -122,11 +140,33 @@ struct cma_device {  };  struct rdma_bind_list { -	struct idr		*ps; +	enum rdma_port_space	ps;  	struct hlist_head	owners;  	unsigned short		port;  }; +static int cma_ps_alloc(enum rdma_port_space ps, +			struct rdma_bind_list *bind_list, int snum) +{ +	struct idr *idr = cma_idr(ps); + +	return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); +} + +static struct rdma_bind_list *cma_ps_find(enum rdma_port_space ps, int snum) +{ +	struct idr *idr = cma_idr(ps); + +	return idr_find(idr, snum); +} + +static void cma_ps_remove(enum rdma_port_space ps, int snum) +{ +	struct idr *idr = cma_idr(ps); + +	idr_remove(idr, snum); +} +  enum {  	CMA_OPTION_AFONLY,  }; @@ -225,6 +265,15 @@ struct cma_hdr {  #define CMA_VERSION 0x00 +struct cma_req_info { +	struct ib_device *device; +	int port; +	union ib_gid local_gid; +	__be64 service_id; +	u16 pkey; +	bool has_gid:1; +}; +  static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)  {  	unsigned long flags; @@ -262,7 +311,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,  	return old;  } -static inline u8 cma_get_ip_ver(struct cma_hdr *hdr) +static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)  {  	return hdr->ip_version >> 4;  } @@ -870,107 +919,397 @@ static inline int cma_any_port(struct sockaddr *addr)  	return !cma_port(addr);  } -static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, +static void cma_save_ib_info(struct sockaddr *src_addr, +			     struct sockaddr *dst_addr, +			     struct rdma_cm_id *listen_id,  			     struct ib_sa_path_rec 
*path)  {  	struct sockaddr_ib *listen_ib, *ib;  	listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; -	ib = (struct sockaddr_ib *) &id->route.addr.src_addr; -	ib->sib_family = listen_ib->sib_family; -	if (path) { -		ib->sib_pkey = path->pkey; -		ib->sib_flowinfo = path->flow_label; -		memcpy(&ib->sib_addr, &path->sgid, 16); -	} else { -		ib->sib_pkey = listen_ib->sib_pkey; -		ib->sib_flowinfo = listen_ib->sib_flowinfo; -		ib->sib_addr = listen_ib->sib_addr; -	} -	ib->sib_sid = listen_ib->sib_sid; -	ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); -	ib->sib_scope_id = listen_ib->sib_scope_id; - -	if (path) { -		ib = (struct sockaddr_ib *) &id->route.addr.dst_addr; -		ib->sib_family = listen_ib->sib_family; -		ib->sib_pkey = path->pkey; -		ib->sib_flowinfo = path->flow_label; -		memcpy(&ib->sib_addr, &path->dgid, 16); +	if (src_addr) { +		ib = (struct sockaddr_ib *)src_addr; +		ib->sib_family = AF_IB; +		if (path) { +			ib->sib_pkey = path->pkey; +			ib->sib_flowinfo = path->flow_label; +			memcpy(&ib->sib_addr, &path->sgid, 16); +			ib->sib_sid = path->service_id; +			ib->sib_scope_id = 0; +		} else { +			ib->sib_pkey = listen_ib->sib_pkey; +			ib->sib_flowinfo = listen_ib->sib_flowinfo; +			ib->sib_addr = listen_ib->sib_addr; +			ib->sib_sid = listen_ib->sib_sid; +			ib->sib_scope_id = listen_ib->sib_scope_id; +		} +		ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); +	} +	if (dst_addr) { +		ib = (struct sockaddr_ib *)dst_addr; +		ib->sib_family = AF_IB; +		if (path) { +			ib->sib_pkey = path->pkey; +			ib->sib_flowinfo = path->flow_label; +			memcpy(&ib->sib_addr, &path->dgid, 16); +		}  	}  } -static __be16 ss_get_port(const struct sockaddr_storage *ss) -{ -	if (ss->ss_family == AF_INET) -		return ((struct sockaddr_in *)ss)->sin_port; -	else if (ss->ss_family == AF_INET6) -		return ((struct sockaddr_in6 *)ss)->sin6_port; -	BUG(); -} - -static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, -			      struct cma_hdr *hdr) +static void cma_save_ip4_info(struct sockaddr *src_addr, +			      struct sockaddr *dst_addr, +			      struct cma_hdr *hdr, +			      __be16 local_port)  {  	struct sockaddr_in *ip4; -	ip4 = (struct sockaddr_in *) &id->route.addr.src_addr; -	ip4->sin_family = AF_INET; -	ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; -	ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr); +	if (src_addr) { +		ip4 = (struct sockaddr_in *)src_addr; +		ip4->sin_family = AF_INET; +		ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr; +		ip4->sin_port = local_port; +	} -	ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr; -	ip4->sin_family = AF_INET; -	ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; -	ip4->sin_port = hdr->port; +	if (dst_addr) { +		ip4 = (struct sockaddr_in *)dst_addr; +		ip4->sin_family = AF_INET; +		ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr; +		ip4->sin_port = hdr->port; +	}  } -static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, -			      struct cma_hdr *hdr) +static void cma_save_ip6_info(struct sockaddr *src_addr, +			      struct sockaddr *dst_addr, +			      struct cma_hdr *hdr, +			      __be16 local_port)  {  	struct sockaddr_in6 *ip6; -	ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr; -	ip6->sin6_family = AF_INET6; -	ip6->sin6_addr = hdr->dst_addr.ip6; -	ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr); +	if (src_addr) { +		ip6 = (struct sockaddr_in6 *)src_addr; +		ip6->sin6_family = AF_INET6; +		ip6->sin6_addr = hdr->dst_addr.ip6; +		ip6->sin6_port = 
local_port; +	} -	ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr; -	ip6->sin6_family = AF_INET6; -	ip6->sin6_addr = hdr->src_addr.ip6; -	ip6->sin6_port = hdr->port; +	if (dst_addr) { +		ip6 = (struct sockaddr_in6 *)dst_addr; +		ip6->sin6_family = AF_INET6; +		ip6->sin6_addr = hdr->src_addr.ip6; +		ip6->sin6_port = hdr->port; +	}  } -static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id, -			     struct ib_cm_event *ib_event) +static u16 cma_port_from_service_id(__be64 service_id)  { -	struct cma_hdr *hdr; +	return (u16)be64_to_cpu(service_id); +} -	if (listen_id->route.addr.src_addr.ss_family == AF_IB) { -		if (ib_event->event == IB_CM_REQ_RECEIVED) -			cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path); -		else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) -			cma_save_ib_info(id, listen_id, NULL); -		return 0; -	} +static int cma_save_ip_info(struct sockaddr *src_addr, +			    struct sockaddr *dst_addr, +			    struct ib_cm_event *ib_event, +			    __be64 service_id) +{ +	struct cma_hdr *hdr; +	__be16 port;  	hdr = ib_event->private_data;  	if (hdr->cma_version != CMA_VERSION)  		return -EINVAL; +	port = htons(cma_port_from_service_id(service_id)); +  	switch (cma_get_ip_ver(hdr)) {  	case 4: -		cma_save_ip4_info(id, listen_id, hdr); +		cma_save_ip4_info(src_addr, dst_addr, hdr, port);  		break;  	case 6: -		cma_save_ip6_info(id, listen_id, hdr); +		cma_save_ip6_info(src_addr, dst_addr, hdr, port); +		break; +	default: +		return -EAFNOSUPPORT; +	} + +	return 0; +} + +static int cma_save_net_info(struct sockaddr *src_addr, +			     struct sockaddr *dst_addr, +			     struct rdma_cm_id *listen_id, +			     struct ib_cm_event *ib_event, +			     sa_family_t sa_family, __be64 service_id) +{ +	if (sa_family == AF_IB) { +		if (ib_event->event == IB_CM_REQ_RECEIVED) +			cma_save_ib_info(src_addr, dst_addr, listen_id, +					 ib_event->param.req_rcvd.primary_path); +		else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) +			cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); +		return 0; +	} + +	return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); +} + +static int cma_save_req_info(const struct ib_cm_event *ib_event, +			     struct cma_req_info *req) +{ +	const struct ib_cm_req_event_param *req_param = +		&ib_event->param.req_rcvd; +	const struct ib_cm_sidr_req_event_param *sidr_param = +		&ib_event->param.sidr_req_rcvd; + +	switch (ib_event->event) { +	case IB_CM_REQ_RECEIVED: +		req->device	= req_param->listen_id->device; +		req->port	= req_param->port; +		memcpy(&req->local_gid, &req_param->primary_path->sgid, +		       sizeof(req->local_gid)); +		req->has_gid	= true; +		req->service_id	= req_param->primary_path->service_id; +		req->pkey	= req_param->bth_pkey; +		break; +	case IB_CM_SIDR_REQ_RECEIVED: +		req->device	= sidr_param->listen_id->device; +		req->port	= sidr_param->port; +		req->has_gid	= false; +		req->service_id	= sidr_param->service_id; +		req->pkey	= sidr_param->bth_pkey;  		break;  	default:  		return -EINVAL;  	} +  	return 0;  } +static bool validate_ipv4_net_dev(struct net_device *net_dev, +				  const struct sockaddr_in *dst_addr, +				  const struct sockaddr_in *src_addr) +{ +	__be32 daddr = dst_addr->sin_addr.s_addr, +	       saddr = src_addr->sin_addr.s_addr; +	struct fib_result res; +	struct flowi4 fl4; +	int err; +	bool ret; + +	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || +	    ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || +	    ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) 
|| +	    ipv4_is_loopback(saddr)) +		return false; + +	memset(&fl4, 0, sizeof(fl4)); +	fl4.flowi4_iif = net_dev->ifindex; +	fl4.daddr = daddr; +	fl4.saddr = saddr; + +	rcu_read_lock(); +	err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); +	if (err) +		return false; + +	ret = FIB_RES_DEV(res) == net_dev; +	rcu_read_unlock(); + +	return ret; +} + +static bool validate_ipv6_net_dev(struct net_device *net_dev, +				  const struct sockaddr_in6 *dst_addr, +				  const struct sockaddr_in6 *src_addr) +{ +#if IS_ENABLED(CONFIG_IPV6) +	const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & +			   IPV6_ADDR_LINKLOCAL; +	struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, +					 &src_addr->sin6_addr, net_dev->ifindex, +					 strict); +	bool ret; + +	if (!rt) +		return false; + +	ret = rt->rt6i_idev->dev == net_dev; +	ip6_rt_put(rt); + +	return ret; +#else +	return false; +#endif +} + +static bool validate_net_dev(struct net_device *net_dev, +			     const struct sockaddr *daddr, +			     const struct sockaddr *saddr) +{ +	const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; +	const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; +	const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; +	const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; + +	switch (daddr->sa_family) { +	case AF_INET: +		return saddr->sa_family == AF_INET && +		       validate_ipv4_net_dev(net_dev, daddr4, saddr4); + +	case AF_INET6: +		return saddr->sa_family == AF_INET6 && +		       validate_ipv6_net_dev(net_dev, daddr6, saddr6); + +	default: +		return false; +	} +} + +static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, +					  const struct cma_req_info *req) +{ +	struct sockaddr_storage listen_addr_storage, src_addr_storage; +	struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage, +			*src_addr = (struct sockaddr *)&src_addr_storage; +	struct net_device *net_dev; +	const union ib_gid *gid = req->has_gid ? 
&req->local_gid : NULL; +	int err; + +	err = cma_save_ip_info(listen_addr, src_addr, ib_event, +			       req->service_id); +	if (err) +		return ERR_PTR(err); + +	net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, +					   gid, listen_addr); +	if (!net_dev) +		return ERR_PTR(-ENODEV); + +	if (!validate_net_dev(net_dev, listen_addr, src_addr)) { +		dev_put(net_dev); +		return ERR_PTR(-EHOSTUNREACH); +	} + +	return net_dev; +} + +static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id) +{ +	return (be64_to_cpu(service_id) >> 16) & 0xffff; +} + +static bool cma_match_private_data(struct rdma_id_private *id_priv, +				   const struct cma_hdr *hdr) +{ +	struct sockaddr *addr = cma_src_addr(id_priv); +	__be32 ip4_addr; +	struct in6_addr ip6_addr; + +	if (cma_any_addr(addr) && !id_priv->afonly) +		return true; + +	switch (addr->sa_family) { +	case AF_INET: +		ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; +		if (cma_get_ip_ver(hdr) != 4) +			return false; +		if (!cma_any_addr(addr) && +		    hdr->dst_addr.ip4.addr != ip4_addr) +			return false; +		break; +	case AF_INET6: +		ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; +		if (cma_get_ip_ver(hdr) != 6) +			return false; +		if (!cma_any_addr(addr) && +		    memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) +			return false; +		break; +	case AF_IB: +		return true; +	default: +		return false; +	} + +	return true; +} + +static bool cma_match_net_dev(const struct rdma_id_private *id_priv, +			      const struct net_device *net_dev) +{ +	const struct rdma_addr *addr = &id_priv->id.route.addr; + +	if (!net_dev) +		/* This request is an AF_IB request */ +		return addr->src_addr.ss_family == AF_IB; + +	return !addr->dev_addr.bound_dev_if || +	       (net_eq(dev_net(net_dev), &init_net) && +		addr->dev_addr.bound_dev_if == net_dev->ifindex); +} + +static struct rdma_id_private *cma_find_listener( +		const struct rdma_bind_list *bind_list, +		const struct ib_cm_id *cm_id, +		const struct ib_cm_event *ib_event, +		const struct cma_req_info *req, +		const struct net_device *net_dev) +{ +	struct rdma_id_private *id_priv, *id_priv_dev; + +	if (!bind_list) +		return ERR_PTR(-EINVAL); + +	hlist_for_each_entry(id_priv, &bind_list->owners, node) { +		if (cma_match_private_data(id_priv, ib_event->private_data)) { +			if (id_priv->id.device == cm_id->device && +			    cma_match_net_dev(id_priv, net_dev)) +				return id_priv; +			list_for_each_entry(id_priv_dev, +					    &id_priv->listen_list, +					    listen_list) { +				if (id_priv_dev->id.device == cm_id->device && +				    cma_match_net_dev(id_priv_dev, net_dev)) +					return id_priv_dev; +			} +		} +	} + +	return ERR_PTR(-EINVAL); +} + +static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, +						 struct ib_cm_event *ib_event, +						 struct net_device **net_dev) +{ +	struct cma_req_info req; +	struct rdma_bind_list *bind_list; +	struct rdma_id_private *id_priv; +	int err; + +	err = cma_save_req_info(ib_event, &req); +	if (err) +		return ERR_PTR(err); + +	*net_dev = cma_get_net_dev(ib_event, &req); +	if (IS_ERR(*net_dev)) { +		if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { +			/* Assuming the protocol is AF_IB */ +			*net_dev = NULL; +		} else { +			return ERR_CAST(*net_dev); +		} +	} + +	bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id), +				cma_port_from_service_id(req.service_id)); +	id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); +	if (IS_ERR(id_priv)) { +		dev_put(*net_dev); +		*net_dev = NULL; +	
} + +	return id_priv; +} +  static inline int cma_user_data_offset(struct rdma_id_private *id_priv)  {  	return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); @@ -1038,7 +1377,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)  	mutex_lock(&lock);  	hlist_del(&id_priv->node);  	if (hlist_empty(&bind_list->owners)) { -		idr_remove(bind_list->ps, bind_list->port); +		cma_ps_remove(bind_list->ps, bind_list->port);  		kfree(bind_list);  	}  	mutex_unlock(&lock); @@ -1216,11 +1555,15 @@ out:  }  static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, -					       struct ib_cm_event *ib_event) +					       struct ib_cm_event *ib_event, +					       struct net_device *net_dev)  {  	struct rdma_id_private *id_priv;  	struct rdma_cm_id *id;  	struct rdma_route *rt; +	const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; +	const __be64 service_id = +		      ib_event->param.req_rcvd.primary_path->service_id;  	int ret;  	id = rdma_create_id(listen_id->event_handler, listen_id->context, @@ -1229,7 +1572,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,  		return NULL;  	id_priv = container_of(id, struct rdma_id_private, id); -	if (cma_save_net_info(id, listen_id, ib_event)) +	if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, +			      (struct sockaddr *)&id->route.addr.dst_addr, +			      listen_id, ib_event, ss_family, service_id))  		goto err;  	rt = &id->route; @@ -1243,14 +1588,16 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,  	if (rt->num_paths == 2)  		rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; -	if (cma_any_addr(cma_src_addr(id_priv))) { -		rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; -		rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); -		ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); -	} else { -		ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); +	if (net_dev) { +		ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);  		if (ret)  			goto err; +	} else { +		/* An AF_IB connection */ +		WARN_ON_ONCE(ss_family != AF_IB); + +		cma_translate_ib((struct sockaddr_ib *)cma_src_addr(id_priv), +				 &rt->addr.dev_addr);  	}  	rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); @@ -1263,10 +1610,12 @@ err:  }  static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, -					      struct ib_cm_event *ib_event) +					      struct ib_cm_event *ib_event, +					      struct net_device *net_dev)  {  	struct rdma_id_private *id_priv;  	struct rdma_cm_id *id; +	const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;  	int ret;  	id = rdma_create_id(listen_id->event_handler, listen_id->context, @@ -1275,13 +1624,24 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,  		return NULL;  	id_priv = container_of(id, struct rdma_id_private, id); -	if (cma_save_net_info(id, listen_id, ib_event)) +	if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, +			      (struct sockaddr *)&id->route.addr.dst_addr, +			      listen_id, ib_event, ss_family, +			      ib_event->param.sidr_req_rcvd.service_id))  		goto err; -	if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) { -		ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); +	if (net_dev) { +		ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);  		if (ret)  			goto err; +	} else { +		/* An AF_IB connection */ +		
WARN_ON_ONCE(ss_family != AF_IB); + +		if (!cma_any_addr(cma_src_addr(id_priv))) +			cma_translate_ib((struct sockaddr_ib *) +						cma_src_addr(id_priv), +					 &id->route.addr.dev_addr);  	}  	id_priv->state = RDMA_CM_CONNECT; @@ -1319,25 +1679,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)  {  	struct rdma_id_private *listen_id, *conn_id;  	struct rdma_cm_event event; +	struct net_device *net_dev;  	int offset, ret; -	listen_id = cm_id->context; -	if (!cma_check_req_qp_type(&listen_id->id, ib_event)) -		return -EINVAL; +	listen_id = cma_id_from_event(cm_id, ib_event, &net_dev); +	if (IS_ERR(listen_id)) +		return PTR_ERR(listen_id); -	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) -		return -ECONNABORTED; +	if (!cma_check_req_qp_type(&listen_id->id, ib_event)) { +		ret = -EINVAL; +		goto net_dev_put; +	} + +	if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) { +		ret = -ECONNABORTED; +		goto net_dev_put; +	}  	memset(&event, 0, sizeof event);  	offset = cma_user_data_offset(listen_id);  	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;  	if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { -		conn_id = cma_new_udp_id(&listen_id->id, ib_event); +		conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);  		event.param.ud.private_data = ib_event->private_data + offset;  		event.param.ud.private_data_len =  				IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;  	} else { -		conn_id = cma_new_conn_id(&listen_id->id, ib_event); +		conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);  		cma_set_req_event_data(&event, &ib_event->param.req_rcvd,  				       ib_event->private_data, offset);  	} @@ -1375,6 +1743,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)  	mutex_unlock(&conn_id->handler_mutex);  	mutex_unlock(&listen_id->handler_mutex);  	cma_deref_id(conn_id); +	if (net_dev) +		dev_put(net_dev);  	return 0;  err3: @@ -1388,6 +1758,11 @@ err1:  	mutex_unlock(&listen_id->handler_mutex);  	if (conn_id)  		rdma_destroy_id(&conn_id->id); + +net_dev_put: +	if (net_dev) +		dev_put(net_dev); +  	return ret;  } @@ -1400,42 +1775,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)  }  EXPORT_SYMBOL(rdma_get_service_id); -static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr, -				 struct ib_cm_compare_data *compare) -{ -	struct cma_hdr *cma_data, *cma_mask; -	__be32 ip4_addr; -	struct in6_addr ip6_addr; - -	memset(compare, 0, sizeof *compare); -	cma_data = (void *) compare->data; -	cma_mask = (void *) compare->mask; - -	switch (addr->sa_family) { -	case AF_INET: -		ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr; -		cma_set_ip_ver(cma_data, 4); -		cma_set_ip_ver(cma_mask, 0xF); -		if (!cma_any_addr(addr)) { -			cma_data->dst_addr.ip4.addr = ip4_addr; -			cma_mask->dst_addr.ip4.addr = htonl(~0); -		} -		break; -	case AF_INET6: -		ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr; -		cma_set_ip_ver(cma_data, 6); -		cma_set_ip_ver(cma_mask, 0xF); -		if (!cma_any_addr(addr)) { -			cma_data->dst_addr.ip6 = ip6_addr; -			memset(&cma_mask->dst_addr.ip6, 0xFF, -			       sizeof cma_mask->dst_addr.ip6); -		} -		break; -	default: -		break; -	} -} -  static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)  {  	struct rdma_id_private *id_priv = iw_id->context; @@ -1589,33 +1928,18 @@ out:  static int cma_ib_listen(struct rdma_id_private *id_priv)  { -	struct ib_cm_compare_data compare_data;  	struct sockaddr *addr;  	struct ib_cm_id	*id;  	
__be64 svc_id; -	int ret; -	id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); +	addr = cma_src_addr(id_priv); +	svc_id = rdma_get_service_id(&id_priv->id, addr); +	id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);  	if (IS_ERR(id))  		return PTR_ERR(id); -  	id_priv->cm_id.ib = id; -	addr = cma_src_addr(id_priv); -	svc_id = rdma_get_service_id(&id_priv->id, addr); -	if (cma_any_addr(addr) && !id_priv->afonly) -		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL); -	else { -		cma_set_compare_data(id_priv->id.ps, addr, &compare_data); -		ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data); -	} - -	if (ret) { -		ib_destroy_cm_id(id_priv->cm_id.ib); -		id_priv->cm_id.ib = NULL; -	} - -	return ret; +	return 0;  }  static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) @@ -2203,8 +2527,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,  		src_addr = (struct sockaddr *) &id->route.addr.src_addr;  		src_addr->sa_family = dst_addr->sa_family;  		if (dst_addr->sa_family == AF_INET6) { -			((struct sockaddr_in6 *) src_addr)->sin6_scope_id = -				((struct sockaddr_in6 *) dst_addr)->sin6_scope_id; +			struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr; +			struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr; +			src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; +			if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) +				id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;  		} else if (dst_addr->sa_family == AF_IB) {  			((struct sockaddr_ib *) src_addr)->sib_pkey =  				((struct sockaddr_ib *) dst_addr)->sib_pkey; @@ -2325,8 +2652,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,  	hlist_add_head(&id_priv->node, &bind_list->owners);  } -static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv, -			  unsigned short snum) +static int cma_alloc_port(enum rdma_port_space ps, +			  struct rdma_id_private *id_priv, unsigned short snum)  {  	struct rdma_bind_list *bind_list;  	int ret; @@ -2335,7 +2662,7 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,  	if (!bind_list)  		return -ENOMEM; -	ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL); +	ret = cma_ps_alloc(ps, bind_list, snum);  	if (ret < 0)  		goto err; @@ -2348,7 +2675,8 @@ err:  	return ret == -ENOSPC ? 
-EADDRNOTAVAIL : ret;  } -static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) +static int cma_alloc_any_port(enum rdma_port_space ps, +			      struct rdma_id_private *id_priv)  {  	static unsigned int last_used_port;  	int low, high, remaining; @@ -2359,7 +2687,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)  	rover = prandom_u32() % remaining + low;  retry:  	if (last_used_port != rover && -	    !idr_find(ps, (unsigned short) rover)) { +	    !cma_ps_find(ps, (unsigned short)rover)) {  		int ret = cma_alloc_port(ps, id_priv, rover);  		/*  		 * Remember previously used port number in order to avoid @@ -2414,7 +2742,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,  	return 0;  } -static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) +static int cma_use_port(enum rdma_port_space ps, +			struct rdma_id_private *id_priv)  {  	struct rdma_bind_list *bind_list;  	unsigned short snum; @@ -2424,7 +2753,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)  	if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))  		return -EACCES; -	bind_list = idr_find(ps, snum); +	bind_list = cma_ps_find(ps, snum);  	if (!bind_list) {  		ret = cma_alloc_port(ps, id_priv, snum);  	} else { @@ -2447,25 +2776,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)  	return ret;  } -static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv) +static enum rdma_port_space cma_select_inet_ps( +		struct rdma_id_private *id_priv)  {  	switch (id_priv->id.ps) {  	case RDMA_PS_TCP: -		return &tcp_ps;  	case RDMA_PS_UDP: -		return &udp_ps;  	case RDMA_PS_IPOIB: -		return &ipoib_ps;  	case RDMA_PS_IB: -		return &ib_ps; +		return id_priv->id.ps;  	default: -		return NULL; + +		return 0;  	}  } -static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv) +static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)  { -	struct idr *ps = NULL; +	enum rdma_port_space ps = 0;  	struct sockaddr_ib *sib;  	u64 sid_ps, mask, sid; @@ -2475,15 +2803,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)  	if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {  		sid_ps = RDMA_IB_IP_PS_IB; -		ps = &ib_ps; +		ps = RDMA_PS_IB;  	} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&  		   (sid == (RDMA_IB_IP_PS_TCP & mask))) {  		sid_ps = RDMA_IB_IP_PS_TCP; -		ps = &tcp_ps; +		ps = RDMA_PS_TCP;  	} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&  		   (sid == (RDMA_IB_IP_PS_UDP & mask))) {  		sid_ps = RDMA_IB_IP_PS_UDP; -		ps = &udp_ps; +		ps = RDMA_PS_UDP;  	}  	if (ps) { @@ -2496,7 +2824,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)  static int cma_get_port(struct rdma_id_private *id_priv)  { -	struct idr *ps; +	enum rdma_port_space ps;  	int ret;  	if (cma_family(id_priv) != AF_IB) @@ -3551,11 +3879,10 @@ static void cma_process_remove(struct cma_device *cma_dev)  	wait_for_completion(&cma_dev->comp);  } -static void cma_remove_one(struct ib_device *device) +static void cma_remove_one(struct ib_device *device, void *client_data)  { -	struct cma_device *cma_dev; +	struct cma_device *cma_dev = client_data; -	cma_dev = ib_get_client_data(device, &cma_client);  	if (!cma_dev)  		return; diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 87d1936f5c1c..70bb36ebb03b 100644 --- 
a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -43,12 +43,58 @@ int  ib_device_register_sysfs(struct ib_device *device,  						   u8, struct kobject *));  void ib_device_unregister_sysfs(struct ib_device *device); -int  ib_sysfs_setup(void); -void ib_sysfs_cleanup(void); - -int  ib_cache_setup(void); +void ib_cache_setup(void);  void ib_cache_cleanup(void);  int ib_resolve_eth_l2_attrs(struct ib_qp *qp,  			    struct ib_qp_attr *qp_attr, int *qp_attr_mask); + +typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, +	      struct net_device *idev, void *cookie); + +typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port, +	     struct net_device *idev, void *cookie); + +void ib_enum_roce_netdev(struct ib_device *ib_dev, +			 roce_netdev_filter filter, +			 void *filter_cookie, +			 roce_netdev_callback cb, +			 void *cookie); +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, +			      void *filter_cookie, +			      roce_netdev_callback cb, +			      void *cookie); + +int ib_cache_gid_find_by_port(struct ib_device *ib_dev, +			      const union ib_gid *gid, +			      u8 port, struct net_device *ndev, +			      u16 *index); + +enum ib_cache_gid_default_mode { +	IB_CACHE_GID_DEFAULT_MODE_SET, +	IB_CACHE_GID_DEFAULT_MODE_DELETE +}; + +void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, +				  struct net_device *ndev, +				  enum ib_cache_gid_default_mode mode); + +int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, +		     union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, +		     union ib_gid *gid, struct ib_gid_attr *attr); + +int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, +				     struct net_device *ndev); + +int roce_gid_mgmt_init(void); +void roce_gid_mgmt_cleanup(void); + +int roce_rescan_device(struct ib_device *ib_dev); + +int ib_cache_setup_one(struct ib_device *device); +void ib_cache_cleanup_one(struct ib_device *device); +void ib_cache_release_one(struct ib_device *device); +  #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 9567756ca4f9..17639117afc6 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -38,7 +38,10 @@  #include <linux/slab.h>  #include <linux/init.h>  #include <linux/mutex.h> +#include <linux/netdevice.h>  #include <rdma/rdma_netlink.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h>  #include "core_priv.h" @@ -50,22 +53,34 @@ struct ib_client_data {  	struct list_head  list;  	struct ib_client *client;  	void *            data; +	/* The device or client is going down. Do not call client or device +	 * callbacks other than remove(). */ +	bool		  going_down;  };  struct workqueue_struct *ib_wq;  EXPORT_SYMBOL_GPL(ib_wq); +/* The device_list and client_list contain devices and clients after their + * registration has completed, and the devices and clients are removed + * during unregistration. */  static LIST_HEAD(device_list);  static LIST_HEAD(client_list);  /* - * device_mutex protects access to both device_list and client_list. - * There's no real point to using multiple locks or something fancier - * like an rwsem: we always access both lists, and we're always - * modifying one list or the other list.  In any case this is not a - * hot path so there's no point in trying to optimize. + * device_mutex and lists_rwsem protect access to both device_list and + * client_list.  
device_mutex protects writer access by device and client + * registration / de-registration.  lists_rwsem protects reader access to + * these lists.  Iterators of these lists must lock it for read, while updates + * to the lists must be done with a write lock. A special case is when the + * device_mutex is locked. In this case locking the lists for read access is + * not necessary as the device_mutex implies it. + * + * lists_rwsem also protects access to the client data list.   */  static DEFINE_MUTEX(device_mutex); +static DECLARE_RWSEM(lists_rwsem); +  static int ib_device_check_mandatory(struct ib_device *device)  { @@ -152,6 +167,36 @@ static int alloc_name(char *name)  	return 0;  } +static void ib_device_release(struct device *device) +{ +	struct ib_device *dev = container_of(device, struct ib_device, dev); + +	ib_cache_release_one(dev); +	kfree(dev->port_immutable); +	kfree(dev); +} + +static int ib_device_uevent(struct device *device, +			    struct kobj_uevent_env *env) +{ +	struct ib_device *dev = container_of(device, struct ib_device, dev); + +	if (add_uevent_var(env, "NAME=%s", dev->name)) +		return -ENOMEM; + +	/* +	 * It would be nice to pass the node GUID with the event... +	 */ + +	return 0; +} + +static struct class ib_class = { +	.name    = "infiniband", +	.dev_release = ib_device_release, +	.dev_uevent = ib_device_uevent, +}; +  /**   * ib_alloc_device - allocate an IB device struct   * @size:size of structure to allocate @@ -164,9 +209,27 @@ static int alloc_name(char *name)   */  struct ib_device *ib_alloc_device(size_t size)  { -	BUG_ON(size < sizeof (struct ib_device)); +	struct ib_device *device; + +	if (WARN_ON(size < sizeof(struct ib_device))) +		return NULL; + +	device = kzalloc(size, GFP_KERNEL); +	if (!device) +		return NULL; + +	device->dev.class = &ib_class; +	device_initialize(&device->dev); + +	dev_set_drvdata(&device->dev, device); + +	INIT_LIST_HEAD(&device->event_handler_list); +	spin_lock_init(&device->event_handler_lock); +	spin_lock_init(&device->client_data_lock); +	INIT_LIST_HEAD(&device->client_data_list); +	INIT_LIST_HEAD(&device->port_list); -	return kzalloc(size, GFP_KERNEL); +	return device;  }  EXPORT_SYMBOL(ib_alloc_device); @@ -178,13 +241,8 @@ EXPORT_SYMBOL(ib_alloc_device);   */  void ib_dealloc_device(struct ib_device *device)  { -	if (device->reg_state == IB_DEV_UNINITIALIZED) { -		kfree(device); -		return; -	} - -	BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); - +	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && +		device->reg_state != IB_DEV_UNINITIALIZED);  	kobject_put(&device->dev.kobj);  }  EXPORT_SYMBOL(ib_dealloc_device); @@ -203,10 +261,13 @@ static int add_client_context(struct ib_device *device, struct ib_client *client  	context->client = client;  	context->data   = NULL; +	context->going_down = false; +	down_write(&lists_rwsem);  	spin_lock_irqsave(&device->client_data_lock, flags);  	list_add(&context->list, &device->client_data_list);  	spin_unlock_irqrestore(&device->client_data_lock, flags); +	up_write(&lists_rwsem);  	return 0;  } @@ -219,7 +280,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port)  static int read_port_immutable(struct ib_device *device)  { -	int ret = -ENOMEM; +	int ret;  	u8 start_port = rdma_start_port(device);  	u8 end_port = rdma_end_port(device);  	u8 port; @@ -235,26 +296,18 @@ static int read_port_immutable(struct ib_device *device)  					 * (end_port + 1),  					 GFP_KERNEL);  	if (!device->port_immutable) -		goto err; +		return -ENOMEM;  	for (port = start_port; port 
<= end_port; ++port) {  		ret = device->get_port_immutable(device, port,  						 &device->port_immutable[port]);  		if (ret) -			goto err; +			return ret; -		if (verify_immutable(device, port)) { -			ret = -EINVAL; -			goto err; -		} +		if (verify_immutable(device, port)) +			return -EINVAL;  	} - -	ret = 0; -	goto out; -err: -	kfree(device->port_immutable); -out: -	return ret; +	return 0;  }  /** @@ -271,6 +324,7 @@ int ib_register_device(struct ib_device *device,  					    u8, struct kobject *))  {  	int ret; +	struct ib_client *client;  	mutex_lock(&device_mutex); @@ -285,11 +339,6 @@ int ib_register_device(struct ib_device *device,  		goto out;  	} -	INIT_LIST_HEAD(&device->event_handler_list); -	INIT_LIST_HEAD(&device->client_data_list); -	spin_lock_init(&device->event_handler_lock); -	spin_lock_init(&device->client_data_lock); -  	ret = read_port_immutable(device);  	if (ret) {  		printk(KERN_WARNING "Couldn't create per port immutable data %s\n", @@ -297,27 +346,30 @@ int ib_register_device(struct ib_device *device,  		goto out;  	} +	ret = ib_cache_setup_one(device); +	if (ret) { +		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); +		goto out; +	} +  	ret = ib_device_register_sysfs(device, port_callback);  	if (ret) {  		printk(KERN_WARNING "Couldn't register device %s with driver model\n",  		       device->name); -		kfree(device->port_immutable); +		ib_cache_cleanup_one(device);  		goto out;  	} -	list_add_tail(&device->core_list, &device_list); -  	device->reg_state = IB_DEV_REGISTERED; -	{ -		struct ib_client *client; - -		list_for_each_entry(client, &client_list, list) -			if (client->add && !add_client_context(device, client)) -				client->add(device); -	} +	list_for_each_entry(client, &client_list, list) +		if (client->add && !add_client_context(device, client)) +			client->add(device); - out: +	down_write(&lists_rwsem); +	list_add_tail(&device->core_list, &device_list); +	up_write(&lists_rwsem); +out:  	mutex_unlock(&device_mutex);  	return ret;  } @@ -331,26 +383,37 @@ EXPORT_SYMBOL(ib_register_device);   */  void ib_unregister_device(struct ib_device *device)  { -	struct ib_client *client;  	struct ib_client_data *context, *tmp;  	unsigned long flags;  	mutex_lock(&device_mutex); -	list_for_each_entry_reverse(client, &client_list, list) -		if (client->remove) -			client->remove(device); - +	down_write(&lists_rwsem);  	list_del(&device->core_list); +	spin_lock_irqsave(&device->client_data_lock, flags); +	list_for_each_entry_safe(context, tmp, &device->client_data_list, list) +		context->going_down = true; +	spin_unlock_irqrestore(&device->client_data_lock, flags); +	downgrade_write(&lists_rwsem); + +	list_for_each_entry_safe(context, tmp, &device->client_data_list, +				 list) { +		if (context->client->remove) +			context->client->remove(device, context->data); +	} +	up_read(&lists_rwsem);  	mutex_unlock(&device_mutex);  	ib_device_unregister_sysfs(device); +	ib_cache_cleanup_one(device); +	down_write(&lists_rwsem);  	spin_lock_irqsave(&device->client_data_lock, flags);  	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)  		kfree(context);  	spin_unlock_irqrestore(&device->client_data_lock, flags); +	up_write(&lists_rwsem);  	device->reg_state = IB_DEV_UNREGISTERED;  } @@ -375,11 +438,14 @@ int ib_register_client(struct ib_client *client)  	mutex_lock(&device_mutex); -	list_add_tail(&client->list, &client_list);  	list_for_each_entry(device, &device_list, core_list)  		if (client->add && !add_client_context(device, client))  			
client->add(device); +	down_write(&lists_rwsem); +	list_add_tail(&client->list, &client_list); +	up_write(&lists_rwsem); +  	mutex_unlock(&device_mutex);  	return 0; @@ -402,19 +468,41 @@ void ib_unregister_client(struct ib_client *client)  	mutex_lock(&device_mutex); +	down_write(&lists_rwsem); +	list_del(&client->list); +	up_write(&lists_rwsem); +  	list_for_each_entry(device, &device_list, core_list) { -		if (client->remove) -			client->remove(device); +		struct ib_client_data *found_context = NULL; +		down_write(&lists_rwsem);  		spin_lock_irqsave(&device->client_data_lock, flags);  		list_for_each_entry_safe(context, tmp, &device->client_data_list, list)  			if (context->client == client) { -				list_del(&context->list); -				kfree(context); +				context->going_down = true; +				found_context = context; +				break;  			}  		spin_unlock_irqrestore(&device->client_data_lock, flags); +		up_write(&lists_rwsem); + +		if (client->remove) +			client->remove(device, found_context ? +					       found_context->data : NULL); + +		if (!found_context) { +			pr_warn("No client context found for %s/%s\n", +				device->name, client->name); +			continue; +		} + +		down_write(&lists_rwsem); +		spin_lock_irqsave(&device->client_data_lock, flags); +		list_del(&found_context->list); +		kfree(found_context); +		spin_unlock_irqrestore(&device->client_data_lock, flags); +		up_write(&lists_rwsem);  	} -	list_del(&client->list);  	mutex_unlock(&device_mutex);  } @@ -590,11 +678,80 @@ EXPORT_SYMBOL(ib_query_port);  int ib_query_gid(struct ib_device *device,  		 u8 port_num, int index, union ib_gid *gid)  { +	if (rdma_cap_roce_gid_table(device, port_num)) +		return ib_get_cached_gid(device, port_num, index, gid); +  	return device->query_gid(device, port_num, index, gid);  }  EXPORT_SYMBOL(ib_query_gid);  /** + * ib_enum_roce_netdev - enumerate all RoCE ports + * @ib_dev : IB device we want to query + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all of the physical RoCE ports of ib_dev + * which are related to netdevice and calls callback() on each + * device for which filter() function returns non zero. + */ +void ib_enum_roce_netdev(struct ib_device *ib_dev, +			 roce_netdev_filter filter, +			 void *filter_cookie, +			 roce_netdev_callback cb, +			 void *cookie) +{ +	u8 port; + +	for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev); +	     port++) +		if (rdma_protocol_roce(ib_dev, port)) { +			struct net_device *idev = NULL; + +			if (ib_dev->get_netdev) +				idev = ib_dev->get_netdev(ib_dev, port); + +			if (idev && +			    idev->reg_state >= NETREG_UNREGISTERED) { +				dev_put(idev); +				idev = NULL; +			} + +			if (filter(ib_dev, port, idev, filter_cookie)) +				cb(ib_dev, port, idev, cookie); + +			if (idev) +				dev_put(idev); +		} +} + +/** + * ib_enum_all_roce_netdevs - enumerate all RoCE devices + * @filter: Should we call the callback? + * @filter_cookie: Cookie passed to filter + * @cb: Callback to call for each found RoCE ports + * @cookie: Cookie passed back to the callback + * + * Enumerates all RoCE devices' physical ports which are related + * to netdevices and calls callback() on each device for which + * filter() function returns non zero. 
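+ *
+ * The filter/callback pairing is the same as for ib_enum_roce_netdev()
+ * above; for instance, roce_rescan_device() in roce_gid_mgmt.c walks a
+ * single device with a pass-all filter and a GID-refresh callback:
+ *
+ *	ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+ *			    enum_all_gids_of_dev_cb, NULL);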
+ */ +void ib_enum_all_roce_netdevs(roce_netdev_filter filter, +			      void *filter_cookie, +			      roce_netdev_callback cb, +			      void *cookie) +{ +	struct ib_device *dev; + +	down_read(&lists_rwsem); +	list_for_each_entry(dev, &device_list, core_list) +		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); +	up_read(&lists_rwsem); +} + +/**   * ib_query_pkey - Get P_Key table entry   * @device:Device to query   * @port_num:Port number to query @@ -673,6 +830,14 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,  	int ret, port, i;  	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { +		if (rdma_cap_roce_gid_table(device, port)) { +			if (!ib_cache_gid_find_by_port(device, gid, port, +						       NULL, index)) { +				*port_num = port; +				return 0; +			} +		} +  		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {  			ret = ib_query_gid(device, port, i, &tmp_gid);  			if (ret) @@ -729,6 +894,51 @@ int ib_find_pkey(struct ib_device *device,  }  EXPORT_SYMBOL(ib_find_pkey); +/** + * ib_get_net_dev_by_params() - Return the appropriate net_dev + * for a received CM request + * @dev:	An RDMA device on which the request has been received. + * @port:	Port number on the RDMA device. + * @pkey:	The Pkey the request came on. + * @gid:	A GID that the net_dev uses to communicate. + * @addr:	Contains the IP address that the request specified as its + *		destination. + */ +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, +					    u8 port, +					    u16 pkey, +					    const union ib_gid *gid, +					    const struct sockaddr *addr) +{ +	struct net_device *net_dev = NULL; +	struct ib_client_data *context; + +	if (!rdma_protocol_ib(dev, port)) +		return NULL; + +	down_read(&lists_rwsem); + +	list_for_each_entry(context, &dev->client_data_list, list) { +		struct ib_client *client = context->client; + +		if (context->going_down) +			continue; + +		if (client->get_net_dev_by_params) { +			net_dev = client->get_net_dev_by_params(dev, port, pkey, +								gid, addr, +								context->data); +			if (net_dev) +				break; +		} +	} + +	up_read(&lists_rwsem); + +	return net_dev; +} +EXPORT_SYMBOL(ib_get_net_dev_by_params); +  static int __init ib_core_init(void)  {  	int ret; @@ -737,7 +947,7 @@ static int __init ib_core_init(void)  	if (!ib_wq)  		return -ENOMEM; -	ret = ib_sysfs_setup(); +	ret = class_register(&ib_class);  	if (ret) {  		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");  		goto err; @@ -749,19 +959,12 @@ static int __init ib_core_init(void)  		goto err_sysfs;  	} -	ret = ib_cache_setup(); -	if (ret) { -		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); -		goto err_nl; -	} +	ib_cache_setup();  	return 0; -err_nl: -	ibnl_cleanup(); -  err_sysfs: -	ib_sysfs_cleanup(); +	class_unregister(&ib_class);  err:  	destroy_workqueue(ib_wq); @@ -772,7 +975,7 @@ static void __exit ib_core_cleanup(void)  {  	ib_cache_cleanup();  	ibnl_cleanup(); -	ib_sysfs_cleanup(); +	class_unregister(&ib_class);  	/* Make sure that any pending umem accounting work is done. 
*/  	destroy_workqueue(ib_wq);  } diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index e6ffa2e66c1a..22a3abee2a54 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -67,7 +67,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)  		err_str = "Invalid port mapper client";  		goto pid_query_error;  	} -	if (iwpm_registered_client(nl_client)) +	if (iwpm_check_registration(nl_client, IWPM_REG_VALID) || +			iwpm_user_pid == IWPM_PID_UNAVAILABLE)  		return 0;  	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client);  	if (!skb) { @@ -106,7 +107,6 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)  	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL);  	if (ret) {  		skb = NULL; /* skb is freed in the netlink send-op handling */ -		iwpm_set_registered(nl_client, 1);  		iwpm_user_pid = IWPM_PID_UNAVAILABLE;  		err_str = "Unable to send a nlmsg";  		goto pid_query_error; @@ -144,12 +144,12 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)  		err_str = "Invalid port mapper client";  		goto add_mapping_error;  	} -	if (!iwpm_registered_client(nl_client)) { +	if (!iwpm_valid_pid()) +		return 0; +	if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {  		err_str = "Unregistered port mapper client";  		goto add_mapping_error;  	} -	if (!iwpm_valid_pid()) -		return 0;  	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client);  	if (!skb) {  		err_str = "Unable to create a nlmsg"; @@ -214,12 +214,12 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)  		err_str = "Invalid port mapper client";  		goto query_mapping_error;  	} -	if (!iwpm_registered_client(nl_client)) { +	if (!iwpm_valid_pid()) +		return 0; +	if (!iwpm_check_registration(nl_client, IWPM_REG_VALID)) {  		err_str = "Unregistered port mapper client";  		goto query_mapping_error;  	} -	if (!iwpm_valid_pid()) -		return 0;  	ret = -ENOMEM;  	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client);  	if (!skb) { @@ -288,12 +288,12 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)  		err_str = "Invalid port mapper client";  		goto remove_mapping_error;  	} -	if (!iwpm_registered_client(nl_client)) { +	if (!iwpm_valid_pid()) +		return 0; +	if (iwpm_check_registration(nl_client, IWPM_REG_UNDEF)) {  		err_str = "Unregistered port mapper client";  		goto remove_mapping_error;  	} -	if (!iwpm_valid_pid()) -		return 0;  	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client);  	if (!skb) {  		ret = -ENOMEM; @@ -388,7 +388,7 @@ int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)  	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",  			__func__, iwpm_user_pid);  	if (iwpm_valid_client(nl_client)) -		iwpm_set_registered(nl_client, 1); +		iwpm_set_registration(nl_client, IWPM_REG_VALID);  register_pid_response_exit:  	nlmsg_request->request_done = 1;  	/* always for found nlmsg_request */ @@ -644,7 +644,6 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)  {  	struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX];  	const char *msg_type = "Mapping Info response"; -	int iwpm_pid;  	u8 nl_client;  	char *iwpm_name;  	u16 iwpm_version; @@ -669,14 +668,14 @@ int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)  				__func__, nl_client);  		return ret;  	} -	iwpm_set_registered(nl_client, 0); +	iwpm_set_registration(nl_client, IWPM_REG_INCOMPL);  	
atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq); +	iwpm_user_pid = cb->nlh->nlmsg_pid;  	if (!iwpm_mapinfo_available())  		return 0; -	iwpm_pid = cb->nlh->nlmsg_pid;  	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n", -		 __func__, iwpm_pid); -	ret = iwpm_send_mapinfo(nl_client, iwpm_pid); +		 __func__, iwpm_user_pid); +	ret = iwpm_send_mapinfo(nl_client, iwpm_user_pid);  	return ret;  }  EXPORT_SYMBOL(iwpm_mapping_info_cb); diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index a626795bf9c7..5fb089e91353 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -78,6 +78,7 @@ init_exit:  	mutex_unlock(&iwpm_admin_lock);  	if (!ret) {  		iwpm_set_valid(nl_client, 1); +		iwpm_set_registration(nl_client, IWPM_REG_UNDEF);  		pr_debug("%s: Mapinfo and reminfo tables are created\n",  				__func__);  	} @@ -106,6 +107,7 @@ int iwpm_exit(u8 nl_client)  	}  	mutex_unlock(&iwpm_admin_lock);  	iwpm_set_valid(nl_client, 0); +	iwpm_set_registration(nl_client, IWPM_REG_UNDEF);  	return 0;  }  EXPORT_SYMBOL(iwpm_exit); @@ -397,17 +399,23 @@ void iwpm_set_valid(u8 nl_client, int valid)  }  /* valid client */ -int iwpm_registered_client(u8 nl_client) +u32 iwpm_get_registration(u8 nl_client)  {  	return iwpm_admin.reg_list[nl_client];  }  /* valid client */ -void iwpm_set_registered(u8 nl_client, int reg) +void iwpm_set_registration(u8 nl_client, u32 reg)  {  	iwpm_admin.reg_list[nl_client] = reg;  } +/* valid client */ +u32 iwpm_check_registration(u8 nl_client, u32 reg) +{ +	return (iwpm_get_registration(nl_client) & reg); +} +  int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,  				struct sockaddr_storage *b_sockaddr)  { diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h index ee2d9ff095be..b7b9e194ce81 100644 --- a/drivers/infiniband/core/iwpm_util.h +++ b/drivers/infiniband/core/iwpm_util.h @@ -58,6 +58,10 @@  #define IWPM_PID_UNDEFINED     -1  #define IWPM_PID_UNAVAILABLE   -2 +#define IWPM_REG_UNDEF          0x01 +#define IWPM_REG_VALID          0x02 +#define IWPM_REG_INCOMPL        0x04 +  struct iwpm_nlmsg_request {  	struct list_head    inprocess_list;  	__u32               nlmsg_seq; @@ -88,7 +92,7 @@ struct iwpm_admin_data {  	atomic_t refcount;  	atomic_t nlmsg_seq;  	int      client_list[RDMA_NL_NUM_CLIENTS]; -	int      reg_list[RDMA_NL_NUM_CLIENTS]; +	u32      reg_list[RDMA_NL_NUM_CLIENTS];  };  /** @@ -159,19 +163,31 @@ int iwpm_valid_client(u8 nl_client);  void iwpm_set_valid(u8 nl_client, int valid);  /** - * iwpm_registered_client - Check if the port mapper client is registered + * iwpm_check_registration - Check if the client registration + *			      matches the given one   * @nl_client: The index of the netlink client + * @reg: The given registration type to compare with   *   * Call iwpm_register_pid() to register a client + * Returns true if the client registration matches reg, + * otherwise returns false + */ +u32 iwpm_check_registration(u8 nl_client, u32 reg); + +/** + * iwpm_set_registration - Set the client registration + * @nl_client: The index of the netlink client + * @reg: Registration type to set   */ -int iwpm_registered_client(u8 nl_client); +void iwpm_set_registration(u8 nl_client, u32 reg);  /** - * iwpm_set_registered - Set the port mapper client to registered or not + * iwpm_get_registration   * @nl_client: The index of the netlink client - * @reg: 1 if registered or 0 if not + * + * Returns the client registration type   */ -void 
iwpm_set_registered(u8 nl_client, int reg); +u32 iwpm_get_registration(u8 nl_client);  /**   * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index a4b1466c1bf6..4b5c72311deb 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,  		goto error1;  	} -	mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd, -						 IB_ACCESS_LOCAL_WRITE); -	if (IS_ERR(mad_agent_priv->agent.mr)) { -		ret = ERR_PTR(-ENOMEM); -		goto error2; -	} -  	if (mad_reg_req) {  		reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);  		if (!reg_req) { @@ -429,8 +422,6 @@ error4:  	spin_unlock_irqrestore(&port_priv->reg_lock, flags);  	kfree(reg_req);  error3: -	ib_dereg_mr(mad_agent_priv->agent.mr); -error2:  	kfree(mad_agent_priv);  error1:  	return ret; @@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)  	wait_for_completion(&mad_agent_priv->comp);  	kfree(mad_agent_priv->reg_req); -	ib_dereg_mr(mad_agent_priv->agent.mr);  	kfree(mad_agent_priv);  } @@ -769,7 +759,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,  	bool opa = rdma_cap_opa_mad(mad_agent_priv->qp_info->port_priv->device,  				    mad_agent_priv->qp_info->port_priv->port_num); -	if (device->node_type == RDMA_NODE_IB_SWITCH && +	if (rdma_cap_ib_switch(device) &&  	    smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)  		port_num = send_wr->wr.ud.port_num;  	else @@ -787,14 +777,15 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,  		if ((opa_get_smp_direction(opa_smp)  		     ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==  		     OPA_LID_PERMISSIVE && -		     opa_smi_handle_dr_smp_send(opa_smp, device->node_type, +		     opa_smi_handle_dr_smp_send(opa_smp, +						rdma_cap_ib_switch(device),  						port_num) == IB_SMI_DISCARD) {  			ret = -EINVAL;  			dev_err(&device->dev, "OPA Invalid directed route\n");  			goto out;  		}  		opa_drslid = be32_to_cpu(opa_smp->route.dr.dr_slid); -		if (opa_drslid != OPA_LID_PERMISSIVE && +		if (opa_drslid != be32_to_cpu(OPA_LID_PERMISSIVE) &&  		    opa_drslid & 0xffff0000) {  			ret = -EINVAL;  			dev_err(&device->dev, "OPA Invalid dr_slid 0x%x\n", @@ -810,7 +801,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,  	} else {  		if ((ib_get_smp_direction(smp) ? 
smp->dr_dlid : smp->dr_slid) ==  		     IB_LID_PERMISSIVE && -		     smi_handle_dr_smp_send(smp, device->node_type, port_num) == +		     smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==  		     IB_SMI_DISCARD) {  			ret = -EINVAL;  			dev_err(&device->dev, "Invalid directed route\n"); @@ -1037,7 +1028,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,  	mad_send_wr->mad_agent_priv = mad_agent_priv;  	mad_send_wr->sg_list[0].length = hdr_len; -	mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey; +	mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;  	/* OPA MADs don't have to be the full 2048 bytes */  	if (opa && base_version == OPA_MGMT_BASE_VERSION && @@ -1046,7 +1037,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,  	else  		mad_send_wr->sg_list[1].length = mad_size - hdr_len; -	mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey; +	mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;  	mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;  	mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list; @@ -2030,7 +2021,7 @@ static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv  	struct ib_smp *smp = (struct ib_smp *)recv->mad;  	if (smi_handle_dr_smp_recv(smp, -				   port_priv->device->node_type, +				   rdma_cap_ib_switch(port_priv->device),  				   port_num,  				   port_priv->device->phys_port_cnt) ==  				   IB_SMI_DISCARD) @@ -2042,13 +2033,13 @@ static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv  	if (retsmi == IB_SMI_SEND) { /* don't forward */  		if (smi_handle_dr_smp_send(smp, -					   port_priv->device->node_type, +					   rdma_cap_ib_switch(port_priv->device),  					   port_num) == IB_SMI_DISCARD)  			return IB_SMI_DISCARD;  		if (smi_check_local_smp(smp, port_priv->device) == IB_SMI_DISCARD)  			return IB_SMI_DISCARD; -	} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { +	} else if (rdma_cap_ib_switch(port_priv->device)) {  		/* forward case for switches */  		memcpy(response, recv, mad_priv_size(response));  		response->header.recv_wc.wc = &response->header.wc; @@ -2115,7 +2106,7 @@ handle_opa_smi(struct ib_mad_port_private *port_priv,  	struct opa_smp *smp = (struct opa_smp *)recv->mad;  	if (opa_smi_handle_dr_smp_recv(smp, -				   port_priv->device->node_type, +				   rdma_cap_ib_switch(port_priv->device),  				   port_num,  				   port_priv->device->phys_port_cnt) ==  				   IB_SMI_DISCARD) @@ -2127,7 +2118,7 @@ handle_opa_smi(struct ib_mad_port_private *port_priv,  	if (retsmi == IB_SMI_SEND) { /* don't forward */  		if (opa_smi_handle_dr_smp_send(smp, -					   port_priv->device->node_type, +					   rdma_cap_ib_switch(port_priv->device),  					   port_num) == IB_SMI_DISCARD)  			return IB_SMI_DISCARD; @@ -2135,7 +2126,7 @@ handle_opa_smi(struct ib_mad_port_private *port_priv,  		    IB_SMI_DISCARD)  			return IB_SMI_DISCARD; -	} else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) { +	} else if (rdma_cap_ib_switch(port_priv->device)) {  		/* forward case for switches */  		memcpy(response, recv, mad_priv_size(response));  		response->header.recv_wc.wc = &response->header.wc; @@ -2235,7 +2226,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,  		goto out;  	} -	if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) +	if (rdma_cap_ib_switch(port_priv->device))  		port_num = wc->port_num;  	else  		port_num = port_priv->port_num; @@ -2884,7 +2875,7 @@ static 
int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,  	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;  	/* Initialize common scatter list fields */ -	sg_list.lkey = (*qp_info->port_priv->mr).lkey; +	sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;  	/* Initialize common receive WR fields */  	recv_wr.next = NULL; @@ -3200,13 +3191,6 @@ static int ib_mad_port_open(struct ib_device *device,  		goto error4;  	} -	port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); -	if (IS_ERR(port_priv->mr)) { -		dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n"); -		ret = PTR_ERR(port_priv->mr); -		goto error5; -	} -  	if (has_smi) {  		ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);  		if (ret) @@ -3247,8 +3231,6 @@ error8:  error7:  	destroy_mad_qp(&port_priv->qp_info[0]);  error6: -	ib_dereg_mr(port_priv->mr); -error5:  	ib_dealloc_pd(port_priv->pd);  error4:  	ib_destroy_cq(port_priv->cq); @@ -3283,7 +3265,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)  	destroy_workqueue(port_priv->wq);  	destroy_mad_qp(&port_priv->qp_info[1]);  	destroy_mad_qp(&port_priv->qp_info[0]); -	ib_dereg_mr(port_priv->mr);  	ib_dealloc_pd(port_priv->pd);  	ib_destroy_cq(port_priv->cq);  	cleanup_recv_queue(&port_priv->qp_info[1]); @@ -3297,17 +3278,11 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)  static void ib_mad_init_device(struct ib_device *device)  { -	int start, end, i; +	int start, i; -	if (device->node_type == RDMA_NODE_IB_SWITCH) { -		start = 0; -		end   = 0; -	} else { -		start = 1; -		end   = device->phys_port_cnt; -	} +	start = rdma_start_port(device); -	for (i = start; i <= end; i++) { +	for (i = start; i <= rdma_end_port(device); i++) {  		if (!rdma_cap_ib_mad(device, i))  			continue; @@ -3340,19 +3315,11 @@ error:  	}  } -static void ib_mad_remove_device(struct ib_device *device) +static void ib_mad_remove_device(struct ib_device *device, void *client_data)  { -	int start, end, i; - -	if (device->node_type == RDMA_NODE_IB_SWITCH) { -		start = 0; -		end   = 0; -	} else { -		start = 1; -		end   = device->phys_port_cnt; -	} +	int i; -	for (i = start; i <= end; i++) { +	for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) {  		if (!rdma_cap_ib_mad(device, i))  			continue; diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 5be89f98928f..4a4f7aad0978 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -199,7 +199,6 @@ struct ib_mad_port_private {  	int port_num;  	struct ib_cq *cq;  	struct ib_pd *pd; -	struct ib_mr *mr;  	spinlock_t reg_lock;  	struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 1244f02a5c6d..d38d8b2b2979 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -43,7 +43,7 @@  #include "sa.h"  static void mcast_add_one(struct ib_device *device); -static void mcast_remove_one(struct ib_device *device); +static void mcast_remove_one(struct ib_device *device, void *client_data);  static struct ib_client mcast_client = {  	.name   = "ib_multicast", @@ -812,12 +812,8 @@ static void mcast_add_one(struct ib_device *device)  	if (!dev)  		return; -	if (device->node_type == RDMA_NODE_IB_SWITCH) -		dev->start_port = dev->end_port = 0; -	else { -		dev->start_port = 1; -		dev->end_port = device->phys_port_cnt; -	} +	dev->start_port = rdma_start_port(device); +	dev->end_port = 
rdma_end_port(device);  	for (i = 0; i <= dev->end_port - dev->start_port; i++) {  		if (!rdma_cap_ib_mcast(device, dev->start_port + i)) @@ -844,13 +840,12 @@ static void mcast_add_one(struct ib_device *device)  	ib_register_event_handler(&dev->event_handler);  } -static void mcast_remove_one(struct ib_device *device) +static void mcast_remove_one(struct ib_device *device, void *client_data)  { -	struct mcast_device *dev; +	struct mcast_device *dev = client_data;  	struct mcast_port *port;  	int i; -	dev = ib_get_client_data(device, &mcast_client);  	if (!dev)  		return; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 23dd5a5c7597..d47df9356779 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);  static struct sock *nls;  static LIST_HEAD(client_list); +int ibnl_chk_listeners(unsigned int group) +{ +	if (netlink_has_listeners(nls, group) == 0) +		return -1; +	return 0; +} +EXPORT_SYMBOL(ibnl_chk_listeners); +  int ibnl_add_client(int index, int nops,  		    const struct ibnl_client_cbs cb_table[])  { @@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  			    !client->cb_table[op].dump)  				return -EINVAL; +			/* +			 * For response or local service set_timeout request, +			 * there is no need to use netlink_dump_start. +			 */ +			if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || +			    (index == RDMA_NL_LS && +			     op == RDMA_NL_LS_OP_SET_TIMEOUT)) { +				struct netlink_callback cb = { +					.skb = skb, +					.nlh = nlh, +					.dump = client->cb_table[op].dump, +					.module = client->cb_table[op].module, +				}; + +				return cb.dump(skb, &cb); +			} +  			{  				struct netlink_dump_control c = {  					.dump = client->cb_table[op].dump, @@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	return -EINVAL;  } +static void ibnl_rcv_reply_skb(struct sk_buff *skb) +{ +	struct nlmsghdr *nlh; +	int msglen; + +	/* +	 * Process responses until there is no more message or the first +	 * request. Generally speaking, it is not recommended to mix responses +	 * with requests. 
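+	 * Everything handled here is pulled off the skb, so when a request
+	 * (NLM_F_REQUEST set) stops the loop it is left in place for the
+	 * regular netlink_rcv_skb() pass in ibnl_rcv() below.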
+	 */ +	while (skb->len >= nlmsg_total_size(0)) { +		nlh = nlmsg_hdr(skb); + +		if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) +			return; + +		/* Handle response only */ +		if (nlh->nlmsg_flags & NLM_F_REQUEST) +			return; + +		ibnl_rcv_msg(skb, nlh); + +		msglen = NLMSG_ALIGN(nlh->nlmsg_len); +		if (msglen > skb->len) +			msglen = skb->len; +		skb_pull(skb, msglen); +	} +} +  static void ibnl_rcv(struct sk_buff *skb)  {  	mutex_lock(&ibnl_mutex); +	ibnl_rcv_reply_skb(skb);  	netlink_rcv_skb(skb, &ibnl_rcv_msg);  	mutex_unlock(&ibnl_mutex);  } diff --git a/drivers/infiniband/core/opa_smi.h b/drivers/infiniband/core/opa_smi.h index 62d91bfa4cb7..3bfab3505a29 100644 --- a/drivers/infiniband/core/opa_smi.h +++ b/drivers/infiniband/core/opa_smi.h @@ -39,12 +39,12 @@  #include "smi.h" -enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, u8 node_type, +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,  				       int port_num, int phys_port_cnt);  int opa_smi_get_fwd_port(struct opa_smp *smp);  extern enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp);  extern enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, -					      u8 node_type, int port_num); +					      bool is_switch, int port_num);  /*   * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c new file mode 100644 index 000000000000..6b24cba1e474 --- /dev/null +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -0,0 +1,728 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses.  You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + *     Redistribution and use in source and binary forms, with or + *     without modification, are permitted provided that the following + *     conditions are met: + * + *      - Redistributions of source code must retain the above + *        copyright notice, this list of conditions and the following + *        disclaimer. + * + *      - Redistributions in binary form must reproduce the above + *        copyright notice, this list of conditions and the following + *        disclaimer in the documentation and/or other materials + *        provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "core_priv.h" + +#include <linux/in.h> +#include <linux/in6.h> + +/* For in6_dev_get/in6_dev_put */ +#include <net/addrconf.h> +#include <net/bonding.h> + +#include <rdma/ib_cache.h> +#include <rdma/ib_addr.h> + +enum gid_op_type { +	GID_DEL = 0, +	GID_ADD +}; + +struct update_gid_event_work { +	struct work_struct work; +	union ib_gid       gid; +	struct ib_gid_attr gid_attr; +	enum gid_op_type gid_op; +}; + +#define ROCE_NETDEV_CALLBACK_SZ		3 +struct netdev_event_work_cmd { +	roce_netdev_callback	cb; +	roce_netdev_filter	filter; +	struct net_device	*ndev; +	struct net_device	*filter_ndev; +}; + +struct netdev_event_work { +	struct work_struct		work; +	struct netdev_event_work_cmd	cmds[ROCE_NETDEV_CALLBACK_SZ]; +}; + +static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, +		       u8 port, union ib_gid *gid, +		       struct ib_gid_attr *gid_attr) +{ +	switch (gid_op) { +	case GID_ADD: +		ib_cache_gid_add(ib_dev, port, gid, gid_attr); +		break; +	case GID_DEL: +		ib_cache_gid_del(ib_dev, port, gid, gid_attr); +		break; +	} +} + +enum bonding_slave_state { +	BONDING_SLAVE_STATE_ACTIVE	= 1UL << 0, +	BONDING_SLAVE_STATE_INACTIVE	= 1UL << 1, +	/* No primary slave or the device isn't a slave in bonding */ +	BONDING_SLAVE_STATE_NA		= 1UL << 2, +}; + +static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, +								   struct net_device *upper) +{ +	if (upper && netif_is_bond_master(upper)) { +		struct net_device *pdev = +			bond_option_active_slave_get_rcu(netdev_priv(upper)); + +		if (pdev) +			return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : +				BONDING_SLAVE_STATE_INACTIVE; +	} + +	return BONDING_SLAVE_STATE_NA; +} + +static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) +{ +	struct net_device *_upper = NULL; +	struct list_head *iter; + +	netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) +		if (_upper == upper) +			break; + +	return _upper == upper; +} + +#define REQUIRED_BOND_STATES		(BONDING_SLAVE_STATE_ACTIVE |	\ +					 BONDING_SLAVE_STATE_NA) +static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, +				 struct net_device *rdma_ndev, void *cookie) +{ +	struct net_device *event_ndev = (struct net_device *)cookie; +	struct net_device *real_dev; +	int res; + +	if (!rdma_ndev) +		return 0; + +	rcu_read_lock(); +	real_dev = rdma_vlan_dev_real_dev(event_ndev); +	if (!real_dev) +		real_dev = event_ndev; + +	res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) && +	       (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & +		REQUIRED_BOND_STATES)) || +	       real_dev == rdma_ndev); + +	rcu_read_unlock(); +	return res; +} + +static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port, +				      struct net_device *rdma_ndev, void *cookie) +{ +	struct net_device *master_dev; +	int res; + +	if (!rdma_ndev) +		return 0; + +	rcu_read_lock(); +	master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); +	res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == +		BONDING_SLAVE_STATE_INACTIVE; +	rcu_read_unlock(); + +	return res; +} + +static int pass_all_filter(struct ib_device *ib_dev, u8 port, +			   struct net_device *rdma_ndev, void *cookie) +{ +	return 1; +} + +static int upper_device_filter(struct ib_device *ib_dev, u8 port, +			       struct net_device *rdma_ndev, void *cookie) +{ +	struct net_device *event_ndev = (struct net_device *)cookie; +	int res; + +	if (!rdma_ndev) +		return 0; + +	if (rdma_ndev == event_ndev) +		return 1; + +	rcu_read_lock(); +	res = 
is_upper_dev_rcu(rdma_ndev, event_ndev); +	rcu_read_unlock(); + +	return res; +} + +static void update_gid_ip(enum gid_op_type gid_op, +			  struct ib_device *ib_dev, +			  u8 port, struct net_device *ndev, +			  struct sockaddr *addr) +{ +	union ib_gid gid; +	struct ib_gid_attr gid_attr; + +	rdma_ip2gid(addr, &gid); +	memset(&gid_attr, 0, sizeof(gid_attr)); +	gid_attr.ndev = ndev; + +	update_gid(gid_op, ib_dev, port, &gid, &gid_attr); +} + +static void enum_netdev_default_gids(struct ib_device *ib_dev, +				     u8 port, struct net_device *event_ndev, +				     struct net_device *rdma_ndev) +{ +	rcu_read_lock(); +	if (!rdma_ndev || +	    ((rdma_ndev != event_ndev && +	      !is_upper_dev_rcu(rdma_ndev, event_ndev)) || +	     is_eth_active_slave_of_bonding_rcu(rdma_ndev, +						netdev_master_upper_dev_get_rcu(rdma_ndev)) == +	     BONDING_SLAVE_STATE_INACTIVE)) { +		rcu_read_unlock(); +		return; +	} +	rcu_read_unlock(); + +	ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, +				     IB_CACHE_GID_DEFAULT_MODE_SET); +} + +static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, +					    u8 port, +					    struct net_device *event_ndev, +					    struct net_device *rdma_ndev) +{ +	struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); + +	if (!rdma_ndev) +		return; + +	if (!real_dev) +		real_dev = event_ndev; + +	rcu_read_lock(); + +	if (is_upper_dev_rcu(rdma_ndev, event_ndev) && +	    is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == +	    BONDING_SLAVE_STATE_INACTIVE) { +		rcu_read_unlock(); + +		ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, +					     IB_CACHE_GID_DEFAULT_MODE_DELETE); +	} else { +		rcu_read_unlock(); +	} +} + +static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, +				 u8 port, struct net_device *ndev) +{ +	struct in_device *in_dev; + +	if (ndev->reg_state >= NETREG_UNREGISTERING) +		return; + +	in_dev = in_dev_get(ndev); +	if (!in_dev) +		return; + +	for_ifa(in_dev) { +		struct sockaddr_in ip; + +		ip.sin_family = AF_INET; +		ip.sin_addr.s_addr = ifa->ifa_address; +		update_gid_ip(GID_ADD, ib_dev, port, ndev, +			      (struct sockaddr *)&ip); +	} +	endfor_ifa(in_dev); + +	in_dev_put(in_dev); +} + +static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, +				 u8 port, struct net_device *ndev) +{ +	struct inet6_ifaddr *ifp; +	struct inet6_dev *in6_dev; +	struct sin6_list { +		struct list_head	list; +		struct sockaddr_in6	sin6; +	}; +	struct sin6_list *sin6_iter; +	struct sin6_list *sin6_temp; +	struct ib_gid_attr gid_attr = {.ndev = ndev}; +	LIST_HEAD(sin6_list); + +	if (ndev->reg_state >= NETREG_UNREGISTERING) +		return; + +	in6_dev = in6_dev_get(ndev); +	if (!in6_dev) +		return; + +	read_lock_bh(&in6_dev->lock); +	list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { +		struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + +		if (!entry) { +			pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n"); +			continue; +		} + +		entry->sin6.sin6_family = AF_INET6; +		entry->sin6.sin6_addr = ifp->addr; +		list_add_tail(&entry->list, &sin6_list); +	} +	read_unlock_bh(&in6_dev->lock); + +	in6_dev_put(in6_dev); + +	list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { +		union ib_gid	gid; + +		rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); +		update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); +		list_del(&sin6_iter->list); +		kfree(sin6_iter); +	} +} + +static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, +			    struct net_device *ndev) +{ +	
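/* GIDs are derived from every IP address configured on ndev; the IPv6
+	 * walk is skipped when CONFIG_IPV6 is disabled.
+	 */
+	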
enum_netdev_ipv4_ips(ib_dev, port, ndev); +	if (IS_ENABLED(CONFIG_IPV6)) +		enum_netdev_ipv6_ips(ib_dev, port, ndev); +} + +static void add_netdev_ips(struct ib_device *ib_dev, u8 port, +			   struct net_device *rdma_ndev, void *cookie) +{ +	struct net_device *event_ndev = (struct net_device *)cookie; + +	enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev); +	_add_netdev_ips(ib_dev, port, event_ndev); +} + +static void del_netdev_ips(struct ib_device *ib_dev, u8 port, +			   struct net_device *rdma_ndev, void *cookie) +{ +	struct net_device *event_ndev = (struct net_device *)cookie; + +	ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); +} + +static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, +				    u8 port, +				    struct net_device *rdma_ndev, +				    void *cookie) +{ +	struct net *net; +	struct net_device *ndev; + +	/* Lock the rtnl to make sure the netdevs does not move under +	 * our feet +	 */ +	rtnl_lock(); +	for_each_net(net) +		for_each_netdev(net, ndev) +			if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) +				add_netdev_ips(ib_dev, port, rdma_ndev, ndev); +	rtnl_unlock(); +} + +/* This function will rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices. */ +int roce_rescan_device(struct ib_device *ib_dev) +{ +	ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, +			    enum_all_gids_of_dev_cb, NULL); + +	return 0; +} + +static void callback_for_addr_gid_device_scan(struct ib_device *device, +					      u8 port, +					      struct net_device *rdma_ndev, +					      void *cookie) +{ +	struct update_gid_event_work *parsed = cookie; + +	return update_gid(parsed->gid_op, device, +			  port, &parsed->gid, +			  &parsed->gid_attr); +} + +static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, +				void *cookie, +				void (*handle_netdev)(struct ib_device *ib_dev, +						      u8 port, +						      struct net_device *ndev)) +{ +	struct net_device *ndev = (struct net_device *)cookie; +	struct upper_list { +		struct list_head list; +		struct net_device *upper; +	}; +	struct net_device *upper; +	struct list_head *iter; +	struct upper_list *upper_iter; +	struct upper_list *upper_temp; +	LIST_HEAD(upper_list); + +	rcu_read_lock(); +	netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) { +		struct upper_list *entry = kmalloc(sizeof(*entry), +						   GFP_ATOMIC); + +		if (!entry) { +			pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n"); +			continue; +		} + +		list_add_tail(&entry->list, &upper_list); +		dev_hold(upper); +		entry->upper = upper; +	} +	rcu_read_unlock(); + +	handle_netdev(ib_dev, port, ndev); +	list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, +				 list) { +		handle_netdev(ib_dev, port, upper_iter->upper); +		dev_put(upper_iter->upper); +		list_del(&upper_iter->list); +		kfree(upper_iter); +	} +} + +static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, +				      struct net_device *event_ndev) +{ +	ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); +} + +static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port, +				 struct net_device *rdma_ndev, void *cookie) +{ +	handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids); +} + +static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port, +				 struct net_device *rdma_ndev, void *cookie) +{ +	handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); +} + +static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port, +			
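/*
 * For illustration (not from this patch): handle_netdev_upper() above uses a
 * pattern worth spelling out. The upper-device list may only be walked under
 * rcu_read_lock(), but the per-device handler may need to sleep, so the walk
 * snapshots the upper devices (taking a reference with dev_hold()) and the
 * handler runs only after rcu_read_unlock(). Reduced to its skeleton, using
 * the same helpers:
 *
 *	LIST_HEAD(snapshot);
 *
 *	rcu_read_lock();
 *	netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
 *		entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
 *		if (entry) {
 *			dev_hold(upper);
 *			entry->upper = upper;
 *			list_add_tail(&entry->list, &snapshot);
 *		}
 *	}
 *	rcu_read_unlock();
 *
 *	list_for_each_entry_safe(entry, tmp, &snapshot, list) {
 *		handle_netdev(ib_dev, port, entry->upper);
 *		dev_put(entry->upper);
 *		list_del(&entry->list);
 *		kfree(entry);
 *	}
 */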
		struct net_device *rdma_ndev, +					void *cookie) +{ +	struct net_device *master_ndev; + +	rcu_read_lock(); +	master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); +	if (master_ndev) +		dev_hold(master_ndev); +	rcu_read_unlock(); + +	if (master_ndev) { +		bond_delete_netdev_default_gids(ib_dev, port, master_ndev, +						rdma_ndev); +		dev_put(master_ndev); +	} +} + +static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port, +				   struct net_device *rdma_ndev, void *cookie) +{ +	struct net_device *event_ndev = (struct net_device *)cookie; + +	bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev); +} + +/* The following functions operate on all IB devices. netdevice_event and + * addr_event execute ib_enum_all_roce_netdevs through a work. + * ib_enum_all_roce_netdevs iterates through all IB devices. + */ + +static void netdevice_event_work_handler(struct work_struct *_work) +{ +	struct netdev_event_work *work = +		container_of(_work, struct netdev_event_work, work); +	unsigned int i; + +	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { +		ib_enum_all_roce_netdevs(work->cmds[i].filter, +					 work->cmds[i].filter_ndev, +					 work->cmds[i].cb, +					 work->cmds[i].ndev); +		dev_put(work->cmds[i].ndev); +		dev_put(work->cmds[i].filter_ndev); +	} + +	kfree(work); +} + +static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, +				struct net_device *ndev) +{ +	unsigned int i; +	struct netdev_event_work *ndev_work = +		kmalloc(sizeof(*ndev_work), GFP_KERNEL); + +	if (!ndev_work) { +		pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n"); +		return NOTIFY_DONE; +	} + +	memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); +	for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { +		if (!ndev_work->cmds[i].ndev) +			ndev_work->cmds[i].ndev = ndev; +		if (!ndev_work->cmds[i].filter_ndev) +			ndev_work->cmds[i].filter_ndev = ndev; +		dev_hold(ndev_work->cmds[i].ndev); +		dev_hold(ndev_work->cmds[i].filter_ndev); +	} +	INIT_WORK(&ndev_work->work, netdevice_event_work_handler); + +	queue_work(ib_wq, &ndev_work->work); + +	return NOTIFY_DONE; +} + +static const struct netdev_event_work_cmd add_cmd = { +	.cb = add_netdev_ips, .filter = is_eth_port_of_netdev}; +static const struct netdev_event_work_cmd add_cmd_upper_ips = { +	.cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev}; + +static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info, +					struct netdev_event_work_cmd *cmds) +{ +	static const struct netdev_event_work_cmd upper_ips_del_cmd = { +		.cb = del_netdev_upper_ips, .filter = upper_device_filter}; +	static const struct netdev_event_work_cmd bonding_default_del_cmd = { +		.cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave}; + +	if (changeupper_info->linking == false) { +		cmds[0] = upper_ips_del_cmd; +		cmds[0].ndev = changeupper_info->upper_dev; +		cmds[1] = add_cmd; +	} else { +		cmds[0] = bonding_default_del_cmd; +		cmds[0].ndev = changeupper_info->upper_dev; +		cmds[1] = add_cmd_upper_ips; +		cmds[1].ndev = changeupper_info->upper_dev; +		cmds[1].filter_ndev = changeupper_info->upper_dev; +	} +} + +static int netdevice_event(struct notifier_block *this, unsigned long event, +			   void *ptr) +{ +	static const struct netdev_event_work_cmd del_cmd = { +		.cb = del_netdev_ips, .filter = pass_all_filter}; +	static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { +		.cb = del_netdev_default_ips_join, .filter = 
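/*
 * For illustration (not from this patch): netdevice_queue_work() above copies
 * up to ROCE_NETDEV_CALLBACK_SZ {filter, callback} pairs into a work item,
 * takes references on the netdevs involved and defers the actual GID table
 * updates to ib_wq, so the notifier callback itself never touches the GID
 * tables. The NETDEV_CHANGEADDR case below is the simplest user of the
 * scheme: it drops the default GIDs (presumably because they are derived from
 * the hardware address that just changed) and then re-adds GIDs for every
 * matching port, which boils down to
 *
 *	struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
 *
 *	cmds[0] = default_del_cmd;
 *	cmds[1] = add_cmd;
 *	return netdevice_queue_work(cmds, ndev);
 */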
is_eth_port_inactive_slave}; +	static const struct netdev_event_work_cmd default_del_cmd = { +		.cb = del_netdev_default_ips, .filter = pass_all_filter}; +	static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { +		.cb = del_netdev_upper_ips, .filter = upper_device_filter}; +	struct net_device *ndev = netdev_notifier_info_to_dev(ptr); +	struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; + +	if (ndev->type != ARPHRD_ETHER) +		return NOTIFY_DONE; + +	switch (event) { +	case NETDEV_REGISTER: +	case NETDEV_UP: +		cmds[0] = bonding_default_del_cmd_join; +		cmds[1] = add_cmd; +		break; + +	case NETDEV_UNREGISTER: +		if (ndev->reg_state < NETREG_UNREGISTERED) +			cmds[0] = del_cmd; +		else +			return NOTIFY_DONE; +		break; + +	case NETDEV_CHANGEADDR: +		cmds[0] = default_del_cmd; +		cmds[1] = add_cmd; +		break; + +	case NETDEV_CHANGEUPPER: +		netdevice_event_changeupper( +			container_of(ptr, struct netdev_notifier_changeupper_info, info), +			cmds); +		break; + +	case NETDEV_BONDING_FAILOVER: +		cmds[0] = bonding_event_ips_del_cmd; +		cmds[1] = bonding_default_del_cmd_join; +		cmds[2] = add_cmd_upper_ips; +		break; + +	default: +		return NOTIFY_DONE; +	} + +	return netdevice_queue_work(cmds, ndev); +} + +static void update_gid_event_work_handler(struct work_struct *_work) +{ +	struct update_gid_event_work *work = +		container_of(_work, struct update_gid_event_work, work); + +	ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev, +				 callback_for_addr_gid_device_scan, work); + +	dev_put(work->gid_attr.ndev); +	kfree(work); +} + +static int addr_event(struct notifier_block *this, unsigned long event, +		      struct sockaddr *sa, struct net_device *ndev) +{ +	struct update_gid_event_work *work; +	enum gid_op_type gid_op; + +	if (ndev->type != ARPHRD_ETHER) +		return NOTIFY_DONE; + +	switch (event) { +	case NETDEV_UP: +		gid_op = GID_ADD; +		break; + +	case NETDEV_DOWN: +		gid_op = GID_DEL; +		break; + +	default: +		return NOTIFY_DONE; +	} + +	work = kmalloc(sizeof(*work), GFP_ATOMIC); +	if (!work) { +		pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n"); +		return NOTIFY_DONE; +	} + +	INIT_WORK(&work->work, update_gid_event_work_handler); + +	rdma_ip2gid(sa, &work->gid); +	work->gid_op = gid_op; + +	memset(&work->gid_attr, 0, sizeof(work->gid_attr)); +	dev_hold(ndev); +	work->gid_attr.ndev   = ndev; + +	queue_work(ib_wq, &work->work); + +	return NOTIFY_DONE; +} + +static int inetaddr_event(struct notifier_block *this, unsigned long event, +			  void *ptr) +{ +	struct sockaddr_in	in; +	struct net_device	*ndev; +	struct in_ifaddr	*ifa = ptr; + +	in.sin_family = AF_INET; +	in.sin_addr.s_addr = ifa->ifa_address; +	ndev = ifa->ifa_dev->dev; + +	return addr_event(this, event, (struct sockaddr *)&in, ndev); +} + +static int inet6addr_event(struct notifier_block *this, unsigned long event, +			   void *ptr) +{ +	struct sockaddr_in6	in6; +	struct net_device	*ndev; +	struct inet6_ifaddr	*ifa6 = ptr; + +	in6.sin6_family = AF_INET6; +	in6.sin6_addr = ifa6->addr; +	ndev = ifa6->idev->dev; + +	return addr_event(this, event, (struct sockaddr *)&in6, ndev); +} + +static struct notifier_block nb_netdevice = { +	.notifier_call = netdevice_event +}; + +static struct notifier_block nb_inetaddr = { +	.notifier_call = inetaddr_event +}; + +static struct notifier_block nb_inet6addr = { +	.notifier_call = inet6addr_event +}; + +int __init roce_gid_mgmt_init(void) +{ +	register_inetaddr_notifier(&nb_inetaddr); +	if (IS_ENABLED(CONFIG_IPV6)) +		
register_inet6addr_notifier(&nb_inet6addr); +	/* We relay on the netdevice notifier to enumerate all +	 * existing devices in the system. Register to this notifier +	 * last to make sure we will not miss any IP add/del +	 * callbacks. +	 */ +	register_netdevice_notifier(&nb_netdevice); + +	return 0; +} + +void __exit roce_gid_mgmt_cleanup(void) +{ +	if (IS_ENABLED(CONFIG_IPV6)) +		unregister_inet6addr_notifier(&nb_inet6addr); +	unregister_inetaddr_notifier(&nb_inetaddr); +	unregister_netdevice_notifier(&nb_netdevice); +	/* Ensure all gid deletion tasks complete before we go down, +	 * to avoid any reference to free'd memory. By the time +	 * ib-core is removed, all physical devices have been removed, +	 * so no issue with remaining hardware contexts. +	 */ +} diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 0fae85062a65..8c014b33d8e0 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -45,12 +45,21 @@  #include <uapi/linux/if_ether.h>  #include <rdma/ib_pack.h>  #include <rdma/ib_cache.h> +#include <rdma/rdma_netlink.h> +#include <net/netlink.h> +#include <uapi/rdma/ib_user_sa.h> +#include <rdma/ib_marshall.h>  #include "sa.h"  MODULE_AUTHOR("Roland Dreier");  MODULE_DESCRIPTION("InfiniBand subnet administration query support");  MODULE_LICENSE("Dual BSD/GPL"); +#define IB_SA_LOCAL_SVC_TIMEOUT_MIN		100 +#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT		2000 +#define IB_SA_LOCAL_SVC_TIMEOUT_MAX		200000 +static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT; +  struct ib_sa_sm_ah {  	struct ib_ah        *ah;  	struct kref          ref; @@ -80,8 +89,16 @@ struct ib_sa_query {  	struct ib_mad_send_buf *mad_buf;  	struct ib_sa_sm_ah     *sm_ah;  	int			id; +	u32			flags; +	struct list_head	list; /* Local svc request list */ +	u32			seq; /* Local svc request sequence number */ +	unsigned long		timeout; /* Local svc timeout */ +	u8			path_use; /* How will the pathrecord be used */  }; +#define IB_SA_ENABLE_LOCAL_SERVICE	0x00000001 +#define IB_SA_CANCEL			0x00000002 +  struct ib_sa_service_query {  	void (*callback)(int, struct ib_sa_service_rec *, void *);  	void *context; @@ -106,8 +123,28 @@ struct ib_sa_mcmember_query {  	struct ib_sa_query sa_query;  }; +static LIST_HEAD(ib_nl_request_list); +static DEFINE_SPINLOCK(ib_nl_request_lock); +static atomic_t ib_nl_sa_request_seq; +static struct workqueue_struct *ib_nl_wq; +static struct delayed_work ib_nl_timed_work; +static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = { +	[LS_NLA_TYPE_PATH_RECORD]	= {.type = NLA_BINARY, +		.len = sizeof(struct ib_path_rec_data)}, +	[LS_NLA_TYPE_TIMEOUT]		= {.type = NLA_U32}, +	[LS_NLA_TYPE_SERVICE_ID]	= {.type = NLA_U64}, +	[LS_NLA_TYPE_DGID]		= {.type = NLA_BINARY, +		.len = sizeof(struct rdma_nla_ls_gid)}, +	[LS_NLA_TYPE_SGID]		= {.type = NLA_BINARY, +		.len = sizeof(struct rdma_nla_ls_gid)}, +	[LS_NLA_TYPE_TCLASS]		= {.type = NLA_U8}, +	[LS_NLA_TYPE_PKEY]		= {.type = NLA_U16}, +	[LS_NLA_TYPE_QOS_CLASS]		= {.type = NLA_U16}, +}; + +  static void ib_sa_add_one(struct ib_device *device); -static void ib_sa_remove_one(struct ib_device *device); +static void ib_sa_remove_one(struct ib_device *device, void *client_data);  static struct ib_client sa_client = {  	.name   = "sa", @@ -381,6 +418,427 @@ static const struct ib_field guidinfo_rec_table[] = {  	  .size_bits    = 512 },  }; +static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) +{ +	query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; +} + +static inline 
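/*
 * For illustration (not from this patch): the local service timeout above is
 * a single module-wide value, sa_local_svc_timeout_ms, which the
 * RDMA_NL_LS_OP_SET_TIMEOUT handler further below clamps into the
 * [IB_SA_LOCAL_SVC_TIMEOUT_MIN, IB_SA_LOCAL_SVC_TIMEOUT_MAX] range, i.e.
 * 100 ms to 200000 ms, with a 2000 ms default. The clamp is equivalent to
 *
 *	timeout = clamp(timeout, IB_SA_LOCAL_SVC_TIMEOUT_MIN,
 *			IB_SA_LOCAL_SVC_TIMEOUT_MAX);
 *
 * although ib_nl_handle_set_timeout() open-codes the two comparisons.
 */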
int ib_sa_query_cancelled(struct ib_sa_query *query) +{ +	return (query->flags & IB_SA_CANCEL); +} + +static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, +				     struct ib_sa_query *query) +{ +	struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1]; +	struct ib_sa_mad *mad = query->mad_buf->mad; +	ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask; +	u16 val16; +	u64 val64; +	struct rdma_ls_resolve_header *header; + +	query->mad_buf->context[1] = NULL; + +	/* Construct the family header first */ +	header = (struct rdma_ls_resolve_header *) +		skb_put(skb, NLMSG_ALIGN(sizeof(*header))); +	memcpy(header->device_name, query->port->agent->device->name, +	       LS_DEVICE_NAME_MAX); +	header->port_num = query->port->port_num; + +	if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && +	    sa_rec->reversible != 0) +		query->path_use = LS_RESOLVE_PATH_USE_GMP; +	else +		query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL; +	header->path_use = query->path_use; + +	/* Now build the attributes */ +	if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) { +		val64 = be64_to_cpu(sa_rec->service_id); +		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID, +			sizeof(val64), &val64); +	} +	if (comp_mask & IB_SA_PATH_REC_DGID) +		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID, +			sizeof(sa_rec->dgid), &sa_rec->dgid); +	if (comp_mask & IB_SA_PATH_REC_SGID) +		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID, +			sizeof(sa_rec->sgid), &sa_rec->sgid); +	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) +		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS, +			sizeof(sa_rec->traffic_class), &sa_rec->traffic_class); + +	if (comp_mask & IB_SA_PATH_REC_PKEY) { +		val16 = be16_to_cpu(sa_rec->pkey); +		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY, +			sizeof(val16), &val16); +	} +	if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) { +		val16 = be16_to_cpu(sa_rec->qos_class); +		nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS, +			sizeof(val16), &val16); +	} +} + +static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask) +{ +	int len = 0; + +	if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) +		len += nla_total_size(sizeof(u64)); +	if (comp_mask & IB_SA_PATH_REC_DGID) +		len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); +	if (comp_mask & IB_SA_PATH_REC_SGID) +		len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); +	if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) +		len += nla_total_size(sizeof(u8)); +	if (comp_mask & IB_SA_PATH_REC_PKEY) +		len += nla_total_size(sizeof(u16)); +	if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) +		len += nla_total_size(sizeof(u16)); + +	/* +	 * Make sure that at least some of the required comp_mask bits are +	 * set. 
+	 */ +	if (WARN_ON(len == 0)) +		return len; + +	/* Add the family header */ +	len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header)); + +	return len; +} + +static int ib_nl_send_msg(struct ib_sa_query *query) +{ +	struct sk_buff *skb = NULL; +	struct nlmsghdr *nlh; +	void *data; +	int ret = 0; +	struct ib_sa_mad *mad; +	int len; + +	mad = query->mad_buf->mad; +	len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask); +	if (len <= 0) +		return -EMSGSIZE; + +	skb = nlmsg_new(len, GFP_KERNEL); +	if (!skb) +		return -ENOMEM; + +	/* Put nlmsg header only for now */ +	data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, +			    RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); +	if (!data) { +		kfree_skb(skb); +		return -EMSGSIZE; +	} + +	/* Add attributes */ +	ib_nl_set_path_rec_attrs(skb, query); + +	/* Repair the nlmsg header length */ +	nlmsg_end(skb, nlh); + +	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); +	if (!ret) +		ret = len; +	else +		ret = 0; + +	return ret; +} + +static int ib_nl_make_request(struct ib_sa_query *query) +{ +	unsigned long flags; +	unsigned long delay; +	int ret; + +	INIT_LIST_HEAD(&query->list); +	query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); + +	spin_lock_irqsave(&ib_nl_request_lock, flags); +	ret = ib_nl_send_msg(query); +	if (ret <= 0) { +		ret = -EIO; +		goto request_out; +	} else { +		ret = 0; +	} + +	delay = msecs_to_jiffies(sa_local_svc_timeout_ms); +	query->timeout = delay + jiffies; +	list_add_tail(&query->list, &ib_nl_request_list); +	/* Start the timeout if this is the only request */ +	if (ib_nl_request_list.next == &query->list) +		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); + +request_out: +	spin_unlock_irqrestore(&ib_nl_request_lock, flags); + +	return ret; +} + +static int ib_nl_cancel_request(struct ib_sa_query *query) +{ +	unsigned long flags; +	struct ib_sa_query *wait_query; +	int found = 0; + +	spin_lock_irqsave(&ib_nl_request_lock, flags); +	list_for_each_entry(wait_query, &ib_nl_request_list, list) { +		/* Let the timeout to take care of the callback */ +		if (query == wait_query) { +			query->flags |= IB_SA_CANCEL; +			query->timeout = jiffies; +			list_move(&query->list, &ib_nl_request_list); +			found = 1; +			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1); +			break; +		} +	} +	spin_unlock_irqrestore(&ib_nl_request_lock, flags); + +	return found; +} + +static void send_handler(struct ib_mad_agent *agent, +			 struct ib_mad_send_wc *mad_send_wc); + +static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, +					   const struct nlmsghdr *nlh) +{ +	struct ib_mad_send_wc mad_send_wc; +	struct ib_sa_mad *mad = NULL; +	const struct nlattr *head, *curr; +	struct ib_path_rec_data  *rec; +	int len, rem; +	u32 mask = 0; +	int status = -EIO; + +	if (query->callback) { +		head = (const struct nlattr *) nlmsg_data(nlh); +		len = nlmsg_len(nlh); +		switch (query->path_use) { +		case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: +			mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; +			break; + +		case LS_RESOLVE_PATH_USE_ALL: +		case LS_RESOLVE_PATH_USE_GMP: +		default: +			mask = IB_PATH_PRIMARY | IB_PATH_GMP | +				IB_PATH_BIDIRECTIONAL; +			break; +		} +		nla_for_each_attr(curr, head, len, rem) { +			if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) { +				rec = nla_data(curr); +				/* +				 * Get the first one. In the future, we may +				 * need to get up to 6 pathrecords. 
+				 */ +				if ((rec->flags & mask) == mask) { +					mad = query->mad_buf->mad; +					mad->mad_hdr.method |= +						IB_MGMT_METHOD_RESP; +					memcpy(mad->data, rec->path_rec, +					       sizeof(rec->path_rec)); +					status = 0; +					break; +				} +			} +		} +		query->callback(query, status, mad); +	} + +	mad_send_wc.send_buf = query->mad_buf; +	mad_send_wc.status = IB_WC_SUCCESS; +	send_handler(query->mad_buf->mad_agent, &mad_send_wc); +} + +static void ib_nl_request_timeout(struct work_struct *work) +{ +	unsigned long flags; +	struct ib_sa_query *query; +	unsigned long delay; +	struct ib_mad_send_wc mad_send_wc; +	int ret; + +	spin_lock_irqsave(&ib_nl_request_lock, flags); +	while (!list_empty(&ib_nl_request_list)) { +		query = list_entry(ib_nl_request_list.next, +				   struct ib_sa_query, list); + +		if (time_after(query->timeout, jiffies)) { +			delay = query->timeout - jiffies; +			if ((long)delay <= 0) +				delay = 1; +			queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); +			break; +		} + +		list_del(&query->list); +		ib_sa_disable_local_svc(query); +		/* Hold the lock to protect against query cancellation */ +		if (ib_sa_query_cancelled(query)) +			ret = -1; +		else +			ret = ib_post_send_mad(query->mad_buf, NULL); +		if (ret) { +			mad_send_wc.send_buf = query->mad_buf; +			mad_send_wc.status = IB_WC_WR_FLUSH_ERR; +			spin_unlock_irqrestore(&ib_nl_request_lock, flags); +			send_handler(query->port->agent, &mad_send_wc); +			spin_lock_irqsave(&ib_nl_request_lock, flags); +		} +	} +	spin_unlock_irqrestore(&ib_nl_request_lock, flags); +} + +static int ib_nl_handle_set_timeout(struct sk_buff *skb, +				    struct netlink_callback *cb) +{ +	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; +	int timeout, delta, abs_delta; +	const struct nlattr *attr; +	unsigned long flags; +	struct ib_sa_query *query; +	long delay = 0; +	struct nlattr *tb[LS_NLA_TYPE_MAX]; +	int ret; + +	if (!netlink_capable(skb, CAP_NET_ADMIN)) +		return -EPERM; + +	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), +			nlmsg_len(nlh), ib_nl_policy); +	attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT]; +	if (ret || !attr) +		goto settimeout_out; + +	timeout = *(int *) nla_data(attr); +	if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN) +		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN; +	if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX) +		timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX; + +	delta = timeout - sa_local_svc_timeout_ms; +	if (delta < 0) +		abs_delta = -delta; +	else +		abs_delta = delta; + +	if (delta != 0) { +		spin_lock_irqsave(&ib_nl_request_lock, flags); +		sa_local_svc_timeout_ms = timeout; +		list_for_each_entry(query, &ib_nl_request_list, list) { +			if (delta < 0 && abs_delta > query->timeout) +				query->timeout = 0; +			else +				query->timeout += delta; + +			/* Get the new delay from the first entry */ +			if (!delay) { +				delay = query->timeout - jiffies; +				if (delay <= 0) +					delay = 1; +			} +		} +		if (delay) +			mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, +					 (unsigned long)delay); +		spin_unlock_irqrestore(&ib_nl_request_lock, flags); +	} + +settimeout_out: +	return skb->len; +} + +static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) +{ +	struct nlattr *tb[LS_NLA_TYPE_MAX]; +	int ret; + +	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) +		return 0; + +	ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), +			nlmsg_len(nlh), ib_nl_policy); +	if (ret) +		return 0; + +	return 1; +} + +static int ib_nl_handle_resolve_resp(struct sk_buff *skb, +				     struct 
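/*
 * For illustration (not from this patch): all outstanding netlink requests
 * share the single delayed work ib_nl_timed_work. Because every request uses
 * the same timeout, ib_nl_request_list stays ordered by deadline simply by
 * appending; the work is armed only when the first entry goes onto the list,
 * and ib_nl_request_timeout() above re-arms it for whichever entry is at the
 * head once the expired ones have been dealt with. The arming side, as done
 * in ib_nl_make_request():
 *
 *	spin_lock_irqsave(&ib_nl_request_lock, flags);
 *	...
 *	list_add_tail(&query->list, &ib_nl_request_list);
 *	if (ib_nl_request_list.next == &query->list)
 *		queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
 *	spin_unlock_irqrestore(&ib_nl_request_lock, flags);
 */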
netlink_callback *cb) +{ +	const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; +	unsigned long flags; +	struct ib_sa_query *query; +	struct ib_mad_send_buf *send_buf; +	struct ib_mad_send_wc mad_send_wc; +	int found = 0; +	int ret; + +	if (!netlink_capable(skb, CAP_NET_ADMIN)) +		return -EPERM; + +	spin_lock_irqsave(&ib_nl_request_lock, flags); +	list_for_each_entry(query, &ib_nl_request_list, list) { +		/* +		 * If the query is cancelled, let the timeout routine +		 * take care of it. +		 */ +		if (nlh->nlmsg_seq == query->seq) { +			found = !ib_sa_query_cancelled(query); +			if (found) +				list_del(&query->list); +			break; +		} +	} + +	if (!found) { +		spin_unlock_irqrestore(&ib_nl_request_lock, flags); +		goto resp_out; +	} + +	send_buf = query->mad_buf; + +	if (!ib_nl_is_good_resolve_resp(nlh)) { +		/* if the result is a failure, send out the packet via IB */ +		ib_sa_disable_local_svc(query); +		ret = ib_post_send_mad(query->mad_buf, NULL); +		spin_unlock_irqrestore(&ib_nl_request_lock, flags); +		if (ret) { +			mad_send_wc.send_buf = send_buf; +			mad_send_wc.status = IB_WC_GENERAL_ERR; +			send_handler(query->port->agent, &mad_send_wc); +		} +	} else { +		spin_unlock_irqrestore(&ib_nl_request_lock, flags); +		ib_nl_process_good_resolve_rsp(query, nlh); +	} + +resp_out: +	return skb->len; +} + +static struct ibnl_client_cbs ib_sa_cb_table[] = { +	[RDMA_NL_LS_OP_RESOLVE] = { +		.dump = ib_nl_handle_resolve_resp, +		.module = THIS_MODULE }, +	[RDMA_NL_LS_OP_SET_TIMEOUT] = { +		.dump = ib_nl_handle_set_timeout, +		.module = THIS_MODULE }, +}; +  static void free_sm_ah(struct kref *kref)  {  	struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -502,7 +960,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)  	mad_buf = query->mad_buf;  	spin_unlock_irqrestore(&idr_lock, flags); -	ib_cancel_mad(agent, mad_buf); +	/* +	 * If the query is still on the netlink request list, schedule +	 * it to be cancelled by the timeout routine. Otherwise, it has been +	 * sent to the MAD layer and has to be cancelled from there. 
+	 */ +	if (!ib_nl_cancel_request(query)) +		ib_cancel_mad(agent, mad_buf);  }  EXPORT_SYMBOL(ib_sa_cancel_query); @@ -639,6 +1103,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)  	query->mad_buf->context[0] = query;  	query->id = id; +	if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) { +		if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { +			if (!ib_nl_make_request(query)) +				return id; +		} +		ib_sa_disable_local_svc(query); +	} +  	ret = ib_post_send_mad(query->mad_buf, NULL);  	if (ret) {  		spin_lock_irqsave(&idr_lock, flags); @@ -740,7 +1212,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,  	port  = &sa_dev->port[port_num - sa_dev->start_port];  	agent = port->agent; -	query = kmalloc(sizeof *query, gfp_mask); +	query = kzalloc(sizeof(*query), gfp_mask);  	if (!query)  		return -ENOMEM; @@ -767,6 +1239,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,  	*sa_query = &query->sa_query; +	query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; +	query->sa_query.mad_buf->context[1] = rec; +  	ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);  	if (ret < 0)  		goto err2; @@ -862,7 +1337,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,  	    method != IB_SA_METHOD_DELETE)  		return -EINVAL; -	query = kmalloc(sizeof *query, gfp_mask); +	query = kzalloc(sizeof(*query), gfp_mask);  	if (!query)  		return -ENOMEM; @@ -954,7 +1429,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,  	port  = &sa_dev->port[port_num - sa_dev->start_port];  	agent = port->agent; -	query = kmalloc(sizeof *query, gfp_mask); +	query = kzalloc(sizeof(*query), gfp_mask);  	if (!query)  		return -ENOMEM; @@ -1051,7 +1526,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,  	port  = &sa_dev->port[port_num - sa_dev->start_port];  	agent = port->agent; -	query = kmalloc(sizeof *query, gfp_mask); +	query = kzalloc(sizeof(*query), gfp_mask);  	if (!query)  		return -ENOMEM; @@ -1156,12 +1631,8 @@ static void ib_sa_add_one(struct ib_device *device)  	int s, e, i;  	int count = 0; -	if (device->node_type == RDMA_NODE_IB_SWITCH) -		s = e = 0; -	else { -		s = 1; -		e = device->phys_port_cnt; -	} +	s = rdma_start_port(device); +	e = rdma_end_port(device);  	sa_dev = kzalloc(sizeof *sa_dev +  			 (e - s + 1) * sizeof (struct ib_sa_port), @@ -1225,9 +1696,9 @@ free:  	return;  } -static void ib_sa_remove_one(struct ib_device *device) +static void ib_sa_remove_one(struct ib_device *device, void *client_data)  { -	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); +	struct ib_sa_device *sa_dev = client_data;  	int i;  	if (!sa_dev) @@ -1255,6 +1726,8 @@ static int __init ib_sa_init(void)  	get_random_bytes(&tid, sizeof tid); +	atomic_set(&ib_nl_sa_request_seq, 0); +  	ret = ib_register_client(&sa_client);  	if (ret) {  		printk(KERN_ERR "Couldn't register ib_sa client\n"); @@ -1267,7 +1740,25 @@ static int __init ib_sa_init(void)  		goto err2;  	} +	ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq"); +	if (!ib_nl_wq) { +		ret = -ENOMEM; +		goto err3; +	} + +	if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS, +			    ib_sa_cb_table)) { +		pr_err("Failed to add netlink callback\n"); +		ret = -EINVAL; +		goto err4; +	} +	INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout); +  	return 0; +err4: +	destroy_workqueue(ib_nl_wq); +err3: +	mcast_cleanup();  err2:  	ib_unregister_client(&sa_client);  err1: @@ -1276,6 +1767,10 @@ err1:  static void __exit ib_sa_cleanup(void)  { +	ibnl_remove_client(RDMA_NL_LS); +	
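	/*
	 * For illustration (not from this patch): the send path added above is
	 * "netlink first, MAD second". If the query opted in with
	 * IB_SA_ENABLE_LOCAL_SERVICE and a userspace resolver is listening on
	 * RDMA_NL_GROUP_LS (ibnl_chk_listeners() returns 0 in that case, hence
	 * the negation in send_mad()), the request is handed to userspace;
	 * otherwise local service is disabled and the MAD is posted as before.
	 * The logic in send_mad() is equivalent to:
	 *
	 *	if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
	 *		if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS) &&
	 *		    !ib_nl_make_request(query))
	 *			return id;
	 *		ib_sa_disable_local_svc(query);
	 *	}
	 *	ret = ib_post_send_mad(query->mad_buf, NULL);
	 */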
cancel_delayed_work(&ib_nl_timed_work); +	flush_workqueue(ib_nl_wq); +	destroy_workqueue(ib_nl_wq);  	mcast_cleanup();  	ib_unregister_client(&sa_client);  	idr_destroy(&query_idr); diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c index 368a561d1a5d..f19b23817c2b 100644 --- a/drivers/infiniband/core/smi.c +++ b/drivers/infiniband/core/smi.c @@ -41,7 +41,7 @@  #include "smi.h"  #include "opa_smi.h" -static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num, +static enum smi_action __smi_handle_dr_smp_send(bool is_switch, int port_num,  						u8 *hop_ptr, u8 hop_cnt,  						const u8 *initial_path,  						const u8 *return_path, @@ -64,7 +64,7 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num,  		/* C14-9:2 */  		if (*hop_ptr && *hop_ptr < hop_cnt) { -			if (node_type != RDMA_NODE_IB_SWITCH) +			if (!is_switch)  				return IB_SMI_DISCARD;  			/* return_path set when received */ @@ -77,7 +77,7 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num,  		if (*hop_ptr == hop_cnt) {  			/* return_path set when received */  			(*hop_ptr)++; -			return (node_type == RDMA_NODE_IB_SWITCH || +			return (is_switch ||  				dr_dlid_is_permissive ?  				IB_SMI_HANDLE : IB_SMI_DISCARD);  		} @@ -96,7 +96,7 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num,  		/* C14-13:2 */  		if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { -			if (node_type != RDMA_NODE_IB_SWITCH) +			if (!is_switch)  				return IB_SMI_DISCARD;  			(*hop_ptr)--; @@ -108,7 +108,7 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num,  		if (*hop_ptr == 1) {  			(*hop_ptr)--;  			/* C14-13:3 -- SMPs destined for SM shouldn't be here */ -			return (node_type == RDMA_NODE_IB_SWITCH || +			return (is_switch ||  				dr_slid_is_permissive ?  				
IB_SMI_HANDLE : IB_SMI_DISCARD);  		} @@ -127,9 +127,9 @@ static enum smi_action __smi_handle_dr_smp_send(u8 node_type, int port_num,   * Return IB_SMI_DISCARD if the SMP should be discarded   */  enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, -				       u8 node_type, int port_num) +				       bool is_switch, int port_num)  { -	return __smi_handle_dr_smp_send(node_type, port_num, +	return __smi_handle_dr_smp_send(is_switch, port_num,  					&smp->hop_ptr, smp->hop_cnt,  					smp->initial_path,  					smp->return_path, @@ -139,9 +139,9 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,  }  enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp, -				       u8 node_type, int port_num) +				       bool is_switch, int port_num)  { -	return __smi_handle_dr_smp_send(node_type, port_num, +	return __smi_handle_dr_smp_send(is_switch, port_num,  					&smp->hop_ptr, smp->hop_cnt,  					smp->route.dr.initial_path,  					smp->route.dr.return_path, @@ -152,7 +152,7 @@ enum smi_action opa_smi_handle_dr_smp_send(struct opa_smp *smp,  					OPA_LID_PERMISSIVE);  } -static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num, +static enum smi_action __smi_handle_dr_smp_recv(bool is_switch, int port_num,  						int phys_port_cnt,  						u8 *hop_ptr, u8 hop_cnt,  						const u8 *initial_path, @@ -173,7 +173,7 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num,  		/* C14-9:2 -- intermediate hop */  		if (*hop_ptr && *hop_ptr < hop_cnt) { -			if (node_type != RDMA_NODE_IB_SWITCH) +			if (!is_switch)  				return IB_SMI_DISCARD;  			return_path[*hop_ptr] = port_num; @@ -188,7 +188,7 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num,  				return_path[*hop_ptr] = port_num;  			/* hop_ptr updated when sending */ -			return (node_type == RDMA_NODE_IB_SWITCH || +			return (is_switch ||  				dr_dlid_is_permissive ?  				IB_SMI_HANDLE : IB_SMI_DISCARD);  		} @@ -208,7 +208,7 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num,  		/* C14-13:2 */  		if (2 <= *hop_ptr && *hop_ptr <= hop_cnt) { -			if (node_type != RDMA_NODE_IB_SWITCH) +			if (!is_switch)  				return IB_SMI_DISCARD;  			/* hop_ptr updated when sending */ @@ -224,8 +224,7 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num,  				return IB_SMI_HANDLE;  			}  			/* hop_ptr updated when sending */ -			return (node_type == RDMA_NODE_IB_SWITCH ? -				IB_SMI_HANDLE : IB_SMI_DISCARD); +			return (is_switch ? 
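/*
 * For illustration (not from this patch hunk): these SMI helpers are being
 * converted from taking "u8 node_type" to "bool is_switch". Instead of
 * comparing node_type against RDMA_NODE_IB_SWITCH at every C14-* check, a
 * caller now evaluates rdma_cap_ib_switch() once and passes the result down,
 * roughly
 *
 *	bool is_switch = rdma_cap_ib_switch(device);
 *
 *	if (smi_handle_dr_smp_recv(smp, is_switch, port_num,
 *				   device->phys_port_cnt) == IB_SMI_DISCARD)
 *		return;
 *
 * (a sketch of a caller; the real call sites are updated in mad.c, outside
 * the hunks shown here.)
 */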
IB_SMI_HANDLE : IB_SMI_DISCARD);  		}  		/* C14-13:4 -- hop_ptr = 0 -> give to SM */ @@ -238,10 +237,10 @@ static enum smi_action __smi_handle_dr_smp_recv(u8 node_type, int port_num,   * Adjust information for a received SMP   * Return IB_SMI_DISCARD if the SMP should be dropped   */ -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,  				       int port_num, int phys_port_cnt)  { -	return __smi_handle_dr_smp_recv(node_type, port_num, phys_port_cnt, +	return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,  					&smp->hop_ptr, smp->hop_cnt,  					smp->initial_path,  					smp->return_path, @@ -254,10 +253,10 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,   * Adjust information for a received SMP   * Return IB_SMI_DISCARD if the SMP should be dropped   */ -enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, u8 node_type, +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, bool is_switch,  					   int port_num, int phys_port_cnt)  { -	return __smi_handle_dr_smp_recv(node_type, port_num, phys_port_cnt, +	return __smi_handle_dr_smp_recv(is_switch, port_num, phys_port_cnt,  					&smp->hop_ptr, smp->hop_cnt,  					smp->route.dr.initial_path,  					smp->route.dr.return_path, diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h index aff96bac49b4..33c91c8a16e9 100644 --- a/drivers/infiniband/core/smi.h +++ b/drivers/infiniband/core/smi.h @@ -51,12 +51,12 @@ enum smi_forward_action {  	IB_SMI_FORWARD	/* SMP should be forwarded (for switches only) */  }; -enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, +enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, bool is_switch,  				       int port_num, int phys_port_cnt);  int smi_get_fwd_port(struct ib_smp *smp);  extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);  extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, -					      u8 node_type, int port_num); +					      bool is_switch, int port_num);  /*   * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index ed6b6c85c334..34cdd74b0a17 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -457,29 +457,6 @@ static struct kobj_type port_type = {  	.default_attrs = port_default_attrs  }; -static void ib_device_release(struct device *device) -{ -	struct ib_device *dev = container_of(device, struct ib_device, dev); - -	kfree(dev->port_immutable); -	kfree(dev); -} - -static int ib_device_uevent(struct device *device, -			    struct kobj_uevent_env *env) -{ -	struct ib_device *dev = container_of(device, struct ib_device, dev); - -	if (add_uevent_var(env, "NAME=%s", dev->name)) -		return -ENOMEM; - -	/* -	 * It would be nice to pass the node GUID with the event... 
-	 */ - -	return 0; -} -  static struct attribute **  alloc_group_attrs(ssize_t (*show)(struct ib_port *,  				  struct port_attribute *, char *buf), @@ -702,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = {  	&dev_attr_node_desc  }; -static struct class ib_class = { -	.name    = "infiniband", -	.dev_release = ib_device_release, -	.dev_uevent = ib_device_uevent, -}; -  /* Show a given an attribute in the statistics group */  static ssize_t show_protocol_stat(const struct device *device,  			    struct device_attribute *attr, char *buf, @@ -846,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device,  	int ret;  	int i; -	class_dev->class      = &ib_class; -	class_dev->parent     = device->dma_device; -	dev_set_name(class_dev, "%s", device->name); -	dev_set_drvdata(class_dev, device); - -	INIT_LIST_HEAD(&device->port_list); +	device->dev.parent = device->dma_device; +	ret = dev_set_name(class_dev, "%s", device->name); +	if (ret) +		return ret; -	ret = device_register(class_dev); +	ret = device_add(class_dev);  	if (ret)  		goto err; @@ -870,7 +839,7 @@ int ib_device_register_sysfs(struct ib_device *device,  		goto err_put;  	} -	if (device->node_type == RDMA_NODE_IB_SWITCH) { +	if (rdma_cap_ib_switch(device)) {  		ret = add_port(device, 0, port_callback);  		if (ret)  			goto err_put; @@ -916,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device)  	device_unregister(&device->dev);  } - -int ib_sysfs_setup(void) -{ -	return class_register(&ib_class); -} - -void ib_sysfs_cleanup(void) -{ -	class_unregister(&ib_class); -} diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 62c24b1452b8..6b4e8a008bc0 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -109,7 +109,7 @@ enum {  #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)  static void ib_ucm_add_one(struct ib_device *device); -static void ib_ucm_remove_one(struct ib_device *device); +static void ib_ucm_remove_one(struct ib_device *device, void *client_data);  static struct ib_client ucm_client = {  	.name   = "ucm", @@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,  	if (result)  		goto out; -	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask, -			      NULL); +	result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);  out:  	ib_ucm_ctx_put(ctx);  	return result; @@ -1193,6 +1192,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)  	return 0;  } +static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);  static void ib_ucm_release_dev(struct device *dev)  {  	struct ib_ucm_device *ucm_dev; @@ -1202,7 +1202,7 @@ static void ib_ucm_release_dev(struct device *dev)  	if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)  		clear_bit(ucm_dev->devnum, dev_map);  	else -		clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); +		clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map);  	kfree(ucm_dev);  } @@ -1226,7 +1226,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,  static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);  static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);  static int find_overflow_devnum(void)  {  	int ret; @@ -1310,9 +1309,9 @@ err:  	return;  } -static void ib_ucm_remove_one(struct ib_device *device) +static void ib_ucm_remove_one(struct ib_device *device, void *client_data)  { -	struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client); +	struct ib_ucm_device 
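/*
 * For illustration (not from this patch): the ib_client ->remove() callback
 * now takes the per-device client data directly, so handlers such as
 * ib_ucm_remove_one() below, ib_sa_remove_one() and ib_umad_remove_one()
 * start from that pointer instead of calling ib_get_client_data() again. The
 * shape of such a handler after the change:
 *
 *	static void example_remove_one(struct ib_device *device,
 *				       void *client_data)
 *	{
 *		struct example_dev *edev = client_data;
 *
 *		if (!edev)
 *			return;
 *		example_teardown(edev);
 *	}
 *
 * (example_remove_one(), example_dev and example_teardown() are hypothetical
 * names used only to show the new signature.)
 */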
*ucm_dev = client_data;  	if (!ucm_dev)  		return; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ad45469f7582..a53fc9b01c69 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -74,6 +74,7 @@ struct ucma_file {  	struct list_head	ctx_list;  	struct list_head	event_list;  	wait_queue_head_t	poll_wait; +	struct workqueue_struct	*close_wq;  };  struct ucma_context { @@ -89,6 +90,13 @@ struct ucma_context {  	struct list_head	list;  	struct list_head	mc_list; +	/* mark that device is in process of destroying the internal HW +	 * resources, protected by the global mut +	 */ +	int			closing; +	/* sync between removal event and id destroy, protected by file mut */ +	int			destroying; +	struct work_struct	close_work;  };  struct ucma_multicast { @@ -107,6 +115,7 @@ struct ucma_event {  	struct list_head	list;  	struct rdma_cm_id	*cm_id;  	struct rdma_ucm_event_resp resp; +	struct work_struct	close_work;  };  static DEFINE_MUTEX(mut); @@ -132,8 +141,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)  	mutex_lock(&mut);  	ctx = _ucma_find_context(id, file); -	if (!IS_ERR(ctx)) -		atomic_inc(&ctx->ref); +	if (!IS_ERR(ctx)) { +		if (ctx->closing) +			ctx = ERR_PTR(-EIO); +		else +			atomic_inc(&ctx->ref); +	}  	mutex_unlock(&mut);  	return ctx;  } @@ -144,6 +157,28 @@ static void ucma_put_ctx(struct ucma_context *ctx)  		complete(&ctx->comp);  } +static void ucma_close_event_id(struct work_struct *work) +{ +	struct ucma_event *uevent_close =  container_of(work, struct ucma_event, close_work); + +	rdma_destroy_id(uevent_close->cm_id); +	kfree(uevent_close); +} + +static void ucma_close_id(struct work_struct *work) +{ +	struct ucma_context *ctx =  container_of(work, struct ucma_context, close_work); + +	/* once all inflight tasks are finished, we close all underlying +	 * resources. The context is still alive till its explicit destryoing +	 * by its creator. +	 */ +	ucma_put_ctx(ctx); +	wait_for_completion(&ctx->comp); +	/* No new events will be generated after destroying the id. */ +	rdma_destroy_id(ctx->cm_id); +} +  static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)  {  	struct ucma_context *ctx; @@ -152,6 +187,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)  	if (!ctx)  		return NULL; +	INIT_WORK(&ctx->close_work, ucma_close_id);  	atomic_set(&ctx->ref, 1);  	init_completion(&ctx->comp);  	INIT_LIST_HEAD(&ctx->mc_list); @@ -242,6 +278,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,  	}  } +/* Called with file->mut locked for the relevant context. */ +static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) +{ +	struct ucma_context *ctx = cm_id->context; +	struct ucma_event *con_req_eve; +	int event_found = 0; + +	if (ctx->destroying) +		return; + +	/* only if context is pointing to cm_id that it owns it and can be +	 * queued to be closed, otherwise that cm_id is an inflight one that +	 * is part of that context event list pending to be detached and +	 * reattached to its new context as part of ucma_get_event, +	 * handled separately below. 
+	 */ +	if (ctx->cm_id == cm_id) { +		mutex_lock(&mut); +		ctx->closing = 1; +		mutex_unlock(&mut); +		queue_work(ctx->file->close_wq, &ctx->close_work); +		return; +	} + +	list_for_each_entry(con_req_eve, &ctx->file->event_list, list) { +		if (con_req_eve->cm_id == cm_id && +		    con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) { +			list_del(&con_req_eve->list); +			INIT_WORK(&con_req_eve->close_work, ucma_close_event_id); +			queue_work(ctx->file->close_wq, &con_req_eve->close_work); +			event_found = 1; +			break; +		} +	} +	if (!event_found) +		printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n"); +} +  static int ucma_event_handler(struct rdma_cm_id *cm_id,  			      struct rdma_cm_event *event)  { @@ -276,14 +350,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,  		 * We ignore events for new connections until userspace has set  		 * their context.  This can only happen if an error occurs on a  		 * new connection before the user accepts it.  This is okay, -		 * since the accept will just fail later. +		 * since the accept will just fail later. However, we do need +		 * to release the underlying HW resources in case of a device +		 * removal event.  		 */ +		if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) +			ucma_removal_event_handler(cm_id); +  		kfree(uevent);  		goto out;  	}  	list_add_tail(&uevent->list, &ctx->file->event_list);  	wake_up_interruptible(&ctx->file->poll_wait); +	if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) +		ucma_removal_event_handler(cm_id);  out:  	mutex_unlock(&ctx->file->mut);  	return ret; @@ -442,9 +523,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)  }  /* - * We cannot hold file->mut when calling rdma_destroy_id() or we can - * deadlock.  We also acquire file->mut in ucma_event_handler(), and - * rdma_destroy_id() will wait until all callbacks have completed. + * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At + * this point, no new events will be reported from the hardware. However, we + * still need to cleanup the UCMA context for this ID. Specifically, there + * might be events that have not yet been consumed by the user space software. + * These might include pending connect requests which we have not completed + * processing.  We cannot call rdma_destroy_id while holding the lock of the + * context (file->mut), as it might cause a deadlock. We therefore extract all + * relevant events from the context pending events list while holding the + * mutex. After that we release them as needed.   */  static int ucma_free_ctx(struct ucma_context *ctx)  { @@ -452,8 +539,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)  	struct ucma_event *uevent, *tmp;  	LIST_HEAD(list); -	/* No new events will be generated after destroying the id. 
*/ -	rdma_destroy_id(ctx->cm_id);  	ucma_cleanup_multicast(ctx); @@ -501,10 +586,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,  	if (IS_ERR(ctx))  		return PTR_ERR(ctx); -	ucma_put_ctx(ctx); -	wait_for_completion(&ctx->comp); -	resp.events_reported = ucma_free_ctx(ctx); +	mutex_lock(&ctx->file->mut); +	ctx->destroying = 1; +	mutex_unlock(&ctx->file->mut); +	flush_workqueue(ctx->file->close_wq); +	/* At this point it's guaranteed that there is no inflight +	 * closing task */ +	mutex_lock(&mut); +	if (!ctx->closing) { +		mutex_unlock(&mut); +		ucma_put_ctx(ctx); +		wait_for_completion(&ctx->comp); +		rdma_destroy_id(ctx->cm_id); +	} else { +		mutex_unlock(&mut); +	} + +	resp.events_reported = ucma_free_ctx(ctx);  	if (copy_to_user((void __user *)(unsigned long)cmd.response,  			 &resp, sizeof(resp)))  		ret = -EFAULT; @@ -1321,10 +1420,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,  		mc = ERR_PTR(-ENOENT);  	else if (mc->ctx->file != file)  		mc = ERR_PTR(-EINVAL); -	else { +	else if (!atomic_inc_not_zero(&mc->ctx->ref)) +		mc = ERR_PTR(-ENXIO); +	else  		idr_remove(&multicast_idr, mc->id); -		atomic_inc(&mc->ctx->ref); -	}  	mutex_unlock(&mut);  	if (IS_ERR(mc)) { @@ -1354,10 +1453,10 @@ static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)  	/* Acquire mutex's based on pointer comparison to prevent deadlock. */  	if (file1 < file2) {  		mutex_lock(&file1->mut); -		mutex_lock(&file2->mut); +		mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING);  	} else {  		mutex_lock(&file2->mut); -		mutex_lock(&file1->mut); +		mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING);  	}  } @@ -1529,6 +1628,7 @@ static int ucma_open(struct inode *inode, struct file *filp)  	INIT_LIST_HEAD(&file->ctx_list);  	init_waitqueue_head(&file->poll_wait);  	mutex_init(&file->mut); +	file->close_wq = create_singlethread_workqueue("ucma_close_id");  	filp->private_data = file;  	file->filp = filp; @@ -1543,16 +1643,34 @@ static int ucma_close(struct inode *inode, struct file *filp)  	mutex_lock(&file->mut);  	list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) { +		ctx->destroying = 1;  		mutex_unlock(&file->mut);  		mutex_lock(&mut);  		idr_remove(&ctx_idr, ctx->id);  		mutex_unlock(&mut); +		flush_workqueue(file->close_wq); +		/* At that step once ctx was marked as destroying and workqueue +		 * was flushed we are safe from any inflights handlers that +		 * might put other closing task. +		 */ +		mutex_lock(&mut); +		if (!ctx->closing) { +			mutex_unlock(&mut); +			/* rdma_destroy_id ensures that no event handlers are +			 * inflight for that id before releasing it. 
+			 */ +			rdma_destroy_id(ctx->cm_id); +		} else { +			mutex_unlock(&mut); +		} +  		ucma_free_ctx(ctx);  		mutex_lock(&file->mut);  	}  	mutex_unlock(&file->mut); +	destroy_workqueue(file->close_wq);  	kfree(file);  	return 0;  } @@ -1616,6 +1734,7 @@ static void __exit ucma_cleanup(void)  	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);  	misc_deregister(&ucma_misc);  	idr_destroy(&ctx_idr); +	idr_destroy(&multicast_idr);  }  module_init(ucma_init); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 35567fffaa4e..57f281f8d686 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -133,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock);  static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);  static void ib_umad_add_one(struct ib_device *device); -static void ib_umad_remove_one(struct ib_device *device); +static void ib_umad_remove_one(struct ib_device *device, void *client_data);  static void ib_umad_release_dev(struct kobject *kobj)  { @@ -1322,9 +1322,9 @@ free:  	kobject_put(&umad_dev->kobj);  } -static void ib_umad_remove_one(struct ib_device *device) +static void ib_umad_remove_one(struct ib_device *device, void *client_data)  { -	struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client); +	struct ib_umad_device *umad_dev = client_data;  	int i;  	if (!umad_dev) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index ba365b6d1e8d..3863d33c243d 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -85,15 +85,20 @@   */  struct ib_uverbs_device { -	struct kref				ref; +	atomic_t				refcount;  	int					num_comp_vectors;  	struct completion			comp;  	struct device			       *dev; -	struct ib_device		       *ib_dev; +	struct ib_device	__rcu	       *ib_dev;  	int					devnum;  	struct cdev			        cdev;  	struct rb_root				xrcd_tree;  	struct mutex				xrcd_tree_mutex; +	struct kobject				kobj; +	struct srcu_struct			disassociate_srcu; +	struct mutex				lists_mutex; /* protect lists */ +	struct list_head			uverbs_file_list; +	struct list_head			uverbs_events_file_list;  };  struct ib_uverbs_event_file { @@ -105,6 +110,7 @@ struct ib_uverbs_event_file {  	wait_queue_head_t			poll_wait;  	struct fasync_struct		       *async_queue;  	struct list_head			event_list; +	struct list_head			list;  };  struct ib_uverbs_file { @@ -114,6 +120,8 @@ struct ib_uverbs_file {  	struct ib_ucontext		       *ucontext;  	struct ib_event_handler			event_handler;  	struct ib_uverbs_event_file	       *async_file; +	struct list_head			list; +	int					is_closed;  };  struct ib_uverbs_event { @@ -177,7 +185,9 @@ extern struct idr ib_uverbs_rule_idr;  void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);  struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, +					struct ib_device *ib_dev,  					int is_async); +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);  struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);  void ib_uverbs_release_ucq(struct ib_uverbs_file *file, @@ -212,6 +222,7 @@ struct ib_uverbs_flow_spec {  #define IB_UVERBS_DECLARE_CMD(name)					\  	ssize_t ib_uverbs_##name(struct ib_uverbs_file *file,		\ +				 struct ib_device *ib_dev,              \  				 const char __user *buf, int in_len,	\  				 int out_len) @@ -253,6 +264,7 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);  #define IB_UVERBS_DECLARE_EX_CMD(name)				\  	int ib_uverbs_ex_##name(struct ib_uverbs_file *file,	\ +				
struct ib_device *ib_dev,		\  				struct ib_udata *ucore,		\  				struct ib_udata *uhw) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index bbb02ffe87df..be4cb9f04be3 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -282,13 +282,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)  }  ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, +			      struct ib_device *ib_dev,  			      const char __user *buf,  			      int in_len, int out_len)  {  	struct ib_uverbs_get_context      cmd;  	struct ib_uverbs_get_context_resp resp;  	struct ib_udata                   udata; -	struct ib_device                 *ibdev = file->device->ib_dev;  #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING  	struct ib_device_attr		  dev_attr;  #endif @@ -313,13 +313,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,  		   (unsigned long) cmd.response + sizeof resp,  		   in_len - sizeof cmd, out_len - sizeof resp); -	ucontext = ibdev->alloc_ucontext(ibdev, &udata); +	ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);  	if (IS_ERR(ucontext)) {  		ret = PTR_ERR(ucontext);  		goto err;  	} -	ucontext->device = ibdev; +	ucontext->device = ib_dev;  	INIT_LIST_HEAD(&ucontext->pd_list);  	INIT_LIST_HEAD(&ucontext->mr_list);  	INIT_LIST_HEAD(&ucontext->mw_list); @@ -340,7 +340,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,  	ucontext->odp_mrs_count = 0;  	INIT_LIST_HEAD(&ucontext->no_private_counters); -	ret = ib_query_device(ibdev, &dev_attr); +	ret = ib_query_device(ib_dev, &dev_attr);  	if (ret)  		goto err_free;  	if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) @@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,  		goto err_free;  	resp.async_fd = ret; -	filp = ib_uverbs_alloc_event_file(file, 1); +	filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);  	if (IS_ERR(filp)) {  		ret = PTR_ERR(filp);  		goto err_fd; @@ -367,16 +367,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,  		goto err_file;  	} -	file->async_file = filp->private_data; - -	INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev, -			      ib_uverbs_event_handler); -	ret = ib_register_event_handler(&file->event_handler); -	if (ret) -		goto err_file; - -	kref_get(&file->async_file->ref); -	kref_get(&file->ref);  	file->ucontext = ucontext;  	fd_install(resp.async_fd, filp); @@ -386,6 +376,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,  	return in_len;  err_file: +	ib_uverbs_free_async_event_file(file);  	fput(filp);  err_fd: @@ -393,7 +384,7 @@ err_fd:  err_free:  	put_pid(ucontext->tgid); -	ibdev->dealloc_ucontext(ucontext); +	ib_dev->dealloc_ucontext(ucontext);  err:  	mutex_unlock(&file->mutex); @@ -401,11 +392,12 @@ err:  }  static void copy_query_dev_fields(struct ib_uverbs_file *file, +				  struct ib_device *ib_dev,  				  struct ib_uverbs_query_device_resp *resp,  				  struct ib_device_attr *attr)  {  	resp->fw_ver		= attr->fw_ver; -	resp->node_guid		= file->device->ib_dev->node_guid; +	resp->node_guid		= ib_dev->node_guid;  	resp->sys_image_guid	= attr->sys_image_guid;  	resp->max_mr_size	= attr->max_mr_size;  	resp->page_size_cap	= attr->page_size_cap; @@ -443,10 +435,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file,  	resp->max_srq_sge		= attr->max_srq_sge;  	resp->max_pkeys			= attr->max_pkeys;  	resp->local_ca_ack_delay	= attr->local_ca_ack_delay; -	resp->phys_port_cnt		= file->device->ib_dev->phys_port_cnt; +	
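	/*
	 * For illustration (not from this patch): every uverbs handler in this
	 * file gains an explicit "struct ib_device *ib_dev" argument instead
	 * of chasing file->device->ib_dev itself. That matches the uverbs.h
	 * change earlier in this diff, where ib_uverbs_device::ib_dev became
	 * an __rcu pointer guarded by disassociate_srcu, which suggests the
	 * dispatcher now resolves the device once and hands handlers a pointer
	 * that stays valid for the duration of the call. After the change a
	 * handler has the shape
	 *
	 *	ssize_t ib_uverbs_foo(struct ib_uverbs_file *file,
	 *			      struct ib_device *ib_dev,
	 *			      const char __user *buf,
	 *			      int in_len, int out_len);
	 *
	 * (ib_uverbs_foo() is a placeholder; the real prototypes come from
	 * IB_UVERBS_DECLARE_CMD() and IB_UVERBS_DECLARE_EX_CMD() in uverbs.h.)
	 */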
resp->phys_port_cnt		= ib_dev->phys_port_cnt;  }  ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, +			       struct ib_device *ib_dev,  			       const char __user *buf,  			       int in_len, int out_len)  { @@ -461,12 +454,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,  	if (copy_from_user(&cmd, buf, sizeof cmd))  		return -EFAULT; -	ret = ib_query_device(file->device->ib_dev, &attr); +	ret = ib_query_device(ib_dev, &attr);  	if (ret)  		return ret;  	memset(&resp, 0, sizeof resp); -	copy_query_dev_fields(file, &resp, &attr); +	copy_query_dev_fields(file, ib_dev, &resp, &attr);  	if (copy_to_user((void __user *) (unsigned long) cmd.response,  			 &resp, sizeof resp)) @@ -476,6 +469,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     const char __user *buf,  			     int in_len, int out_len)  { @@ -490,7 +484,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,  	if (copy_from_user(&cmd, buf, sizeof cmd))  		return -EFAULT; -	ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr); +	ret = ib_query_port(ib_dev, cmd.port_num, &attr);  	if (ret)  		return ret; @@ -515,7 +509,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,  	resp.active_width    = attr.active_width;  	resp.active_speed    = attr.active_speed;  	resp.phys_state      = attr.phys_state; -	resp.link_layer      = rdma_port_get_link_layer(file->device->ib_dev, +	resp.link_layer      = rdma_port_get_link_layer(ib_dev,  							cmd.port_num);  	if (copy_to_user((void __user *) (unsigned long) cmd.response, @@ -526,6 +520,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, +			   struct ib_device *ib_dev,  			   const char __user *buf,  			   int in_len, int out_len)  { @@ -553,15 +548,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,  	init_uobj(uobj, 0, file->ucontext, &pd_lock_class);  	down_write(&uobj->mutex); -	pd = file->device->ib_dev->alloc_pd(file->device->ib_dev, -					    file->ucontext, &udata); +	pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);  	if (IS_ERR(pd)) {  		ret = PTR_ERR(pd);  		goto err;  	} -	pd->device  = file->device->ib_dev; +	pd->device  = ib_dev;  	pd->uobject = uobj; +	pd->local_mr = NULL;  	atomic_set(&pd->usecnt, 0);  	uobj->object = pd; @@ -600,11 +595,13 @@ err:  }  ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     const char __user *buf,  			     int in_len, int out_len)  {  	struct ib_uverbs_dealloc_pd cmd;  	struct ib_uobject          *uobj; +	struct ib_pd		   *pd;  	int                         ret;  	if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -613,15 +610,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,  	uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);  	if (!uobj)  		return -EINVAL; +	pd = uobj->object; -	ret = ib_dealloc_pd(uobj->object); -	if (!ret) -		uobj->live = 0; - -	put_uobj_write(uobj); +	if (atomic_read(&pd->usecnt)) { +		ret = -EBUSY; +		goto err_put; +	} +	ret = pd->device->dealloc_pd(uobj->object); +	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");  	if (ret) -		return ret; +		goto err_put; + +	uobj->live = 0; +	put_uobj_write(uobj);  	idr_remove_uobj(&ib_uverbs_pd_idr, uobj); @@ -632,6 +634,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,  	put_uobj(uobj);  	return in_len; + +err_put: +	
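	/*
	 * For illustration (not from this patch): the reworked dealloc_pd path
	 * above no longer goes through ib_dealloc_pd(). It refuses with -EBUSY
	 * while pd->usecnt is non-zero, calls the driver's dealloc_pd() method
	 * directly (warning once if the driver fails), and only marks the
	 * uobject dead on success, so the err_put label here merely drops the
	 * write lock on a still-live uobject. The decision boils down to:
	 *
	 *	if (atomic_read(&pd->usecnt))
	 *		ret = -EBUSY;
	 *	else
	 *		ret = pd->device->dealloc_pd(pd);
	 *	if (ret)
	 *		goto err_put;
	 */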
put_uobj_write(uobj); +	return ret;  }  struct xrcd_table_entry { @@ -720,6 +726,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,  }  ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf, int in_len,  			    int out_len)  { @@ -778,15 +785,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,  	down_write(&obj->uobject.mutex);  	if (!xrcd) { -		xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, -							file->ucontext, &udata); +		xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);  		if (IS_ERR(xrcd)) {  			ret = PTR_ERR(xrcd);  			goto err;  		}  		xrcd->inode   = inode; -		xrcd->device  = file->device->ib_dev; +		xrcd->device  = ib_dev;  		atomic_set(&xrcd->usecnt, 0);  		mutex_init(&xrcd->tgt_qp_mutex);  		INIT_LIST_HEAD(&xrcd->tgt_qp_list); @@ -857,6 +863,7 @@ err_tree_mutex_unlock:  }  ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     const char __user *buf, int in_len,  			     int out_len)  { @@ -934,6 +941,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,  }  ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, +			 struct ib_device *ib_dev,  			 const char __user *buf, int in_len,  			 int out_len)  { @@ -1043,6 +1051,7 @@ err_free:  }  ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, +			   struct ib_device *ib_dev,  			   const char __user *buf, int in_len,  			   int out_len)  { @@ -1136,6 +1145,7 @@ put_uobjs:  }  ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, +			   struct ib_device *ib_dev,  			   const char __user *buf, int in_len,  			   int out_len)  { @@ -1174,8 +1184,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, -			 const char __user *buf, int in_len, -			 int out_len) +			   struct ib_device *ib_dev, +			   const char __user *buf, int in_len, +			   int out_len)  {  	struct ib_uverbs_alloc_mw      cmd;  	struct ib_uverbs_alloc_mw_resp resp; @@ -1256,8 +1267,9 @@ err_free:  }  ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, -			   const char __user *buf, int in_len, -			   int out_len) +			     struct ib_device *ib_dev, +			     const char __user *buf, int in_len, +			     int out_len)  {  	struct ib_uverbs_dealloc_mw cmd;  	struct ib_mw               *mw; @@ -1294,6 +1306,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, +				      struct ib_device *ib_dev,  				      const char __user *buf, int in_len,  				      int out_len)  { @@ -1313,7 +1326,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,  		return ret;  	resp.fd = ret; -	filp = ib_uverbs_alloc_event_file(file, 0); +	filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);  	if (IS_ERR(filp)) {  		put_unused_fd(resp.fd);  		return PTR_ERR(filp); @@ -1331,6 +1344,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,  }  static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, +					struct ib_device *ib_dev,  				       struct ib_udata *ucore,  				       struct ib_udata *uhw,  				       struct ib_uverbs_ex_create_cq *cmd, @@ -1379,14 +1393,14 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,  	if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))  		attr.flags = cmd->flags; -	cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr, +	cq = ib_dev->create_cq(ib_dev, 
&attr,  					     file->ucontext, uhw);  	if (IS_ERR(cq)) {  		ret = PTR_ERR(cq);  		goto err_file;  	} -	cq->device        = file->device->ib_dev; +	cq->device        = ib_dev;  	cq->uobject       = &obj->uobject;  	cq->comp_handler  = ib_uverbs_comp_handler;  	cq->event_handler = ib_uverbs_cq_event_handler; @@ -1447,6 +1461,7 @@ static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf, int in_len,  			    int out_len)  { @@ -1475,7 +1490,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,  	cmd_ex.comp_vector = cmd.comp_vector;  	cmd_ex.comp_channel = cmd.comp_channel; -	obj = create_cq(file, &ucore, &uhw, &cmd_ex, +	obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,  			offsetof(typeof(cmd_ex), comp_channel) +  			sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,  			NULL); @@ -1498,6 +1513,7 @@ static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,  }  int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, +			 struct ib_device *ib_dev,  			   struct ib_udata *ucore,  			   struct ib_udata *uhw)  { @@ -1523,7 +1539,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,  			     sizeof(resp.response_length)))  		return -ENOSPC; -	obj = create_cq(file, ucore, uhw, &cmd, +	obj = create_cq(file, ib_dev, ucore, uhw, &cmd,  			min(ucore->inlen, sizeof(cmd)),  			ib_uverbs_ex_create_cq_cb, NULL); @@ -1534,6 +1550,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf, int in_len,  			    int out_len)  { @@ -1597,6 +1614,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)  }  ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, +			  struct ib_device *ib_dev,  			  const char __user *buf, int in_len,  			  int out_len)  { @@ -1648,6 +1666,7 @@ out_put:  }  ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, +				struct ib_device *ib_dev,  				const char __user *buf, int in_len,  				int out_len)  { @@ -1670,6 +1689,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     const char __user *buf, int in_len,  			     int out_len)  { @@ -1722,6 +1742,7 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf, int in_len,  			    int out_len)  { @@ -1917,6 +1938,7 @@ err_put:  }  ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, +			  struct ib_device *ib_dev,  			  const char __user *buf, int in_len, int out_len)  {  	struct ib_uverbs_open_qp        cmd; @@ -2011,6 +2033,7 @@ err_put:  }  ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, +			   struct ib_device *ib_dev,  			   const char __user *buf, int in_len,  			   int out_len)  { @@ -2125,6 +2148,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)  }  ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf, int in_len,  			    int out_len)  { @@ -2221,6 +2245,7 @@ out:  }  ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     const char __user *buf, int in_len,  			     int out_len)  { @@ -2279,6 +2304,7 @@ ssize_t ib_uverbs_destroy_qp(struct 
ib_uverbs_file *file,  }  ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf, int in_len,  			    int out_len)  { @@ -2346,6 +2372,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,  		next->send_flags = user_wr->send_flags;  		if (is_ud) { +			if (next->opcode != IB_WR_SEND && +			    next->opcode != IB_WR_SEND_WITH_IMM) { +				ret = -EINVAL; +				goto out_put; +			} +  			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,  						     file->ucontext);  			if (!next->wr.ud.ah) { @@ -2385,9 +2417,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,  					user_wr->wr.atomic.compare_add;  				next->wr.atomic.swap = user_wr->wr.atomic.swap;  				next->wr.atomic.rkey = user_wr->wr.atomic.rkey; +			case IB_WR_SEND:  				break;  			default: -				break; +				ret = -EINVAL; +				goto out_put;  			}  		} @@ -2523,6 +2557,7 @@ err:  }  ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf, int in_len,  			    int out_len)  { @@ -2572,6 +2607,7 @@ out:  }  ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, +				struct ib_device *ib_dev,  				const char __user *buf, int in_len,  				int out_len)  { @@ -2621,6 +2657,7 @@ out:  }  ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf, int in_len,  			    int out_len)  { @@ -2713,6 +2750,7 @@ err:  }  ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     const char __user *buf, int in_len, int out_len)  {  	struct ib_uverbs_destroy_ah cmd; @@ -2749,6 +2787,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, +			       struct ib_device *ib_dev,  			       const char __user *buf, int in_len,  			       int out_len)  { @@ -2796,6 +2835,7 @@ out_put:  }  ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, +			       struct ib_device *ib_dev,  			       const char __user *buf, int in_len,  			       int out_len)  { @@ -2876,6 +2916,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,  }  int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     struct ib_udata *ucore,  			     struct ib_udata *uhw)  { @@ -3036,6 +3077,7 @@ err_free_attr:  }  int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, +			      struct ib_device *ib_dev,  			      struct ib_udata *ucore,  			      struct ib_udata *uhw)  { @@ -3078,6 +3120,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,  }  static int __uverbs_create_xsrq(struct ib_uverbs_file *file, +				struct ib_device *ib_dev,  				struct ib_uverbs_create_xsrq *cmd,  				struct ib_udata *udata)  { @@ -3211,6 +3254,7 @@ err:  }  ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     const char __user *buf, int in_len,  			     int out_len)  { @@ -3238,7 +3282,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,  		   (unsigned long) cmd.response + sizeof resp,  		   in_len - sizeof cmd, out_len - sizeof resp); -	ret = __uverbs_create_xsrq(file, &xcmd, &udata); +	ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);  	if (ret)  		return ret; @@ -3246,6 +3290,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, +			      struct ib_device *ib_dev,  			      
const char __user *buf, int in_len, int out_len)  {  	struct ib_uverbs_create_xsrq     cmd; @@ -3263,7 +3308,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,  		   (unsigned long) cmd.response + sizeof resp,  		   in_len - sizeof cmd, out_len - sizeof resp); -	ret = __uverbs_create_xsrq(file, &cmd, &udata); +	ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);  	if (ret)  		return ret; @@ -3271,6 +3316,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, +			     struct ib_device *ib_dev,  			     const char __user *buf, int in_len,  			     int out_len)  { @@ -3301,6 +3347,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, +			    struct ib_device *ib_dev,  			    const char __user *buf,  			    int in_len, int out_len)  { @@ -3341,6 +3388,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,  }  ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, +			      struct ib_device *ib_dev,  			      const char __user *buf, int in_len,  			      int out_len)  { @@ -3398,16 +3446,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,  }  int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, +			      struct ib_device *ib_dev,  			      struct ib_udata *ucore,  			      struct ib_udata *uhw)  {  	struct ib_uverbs_ex_query_device_resp resp;  	struct ib_uverbs_ex_query_device  cmd;  	struct ib_device_attr attr; -	struct ib_device *device;  	int err; -	device = file->device->ib_dev;  	if (ucore->inlen < sizeof(cmd))  		return -EINVAL; @@ -3428,11 +3475,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,  	memset(&attr, 0, sizeof(attr)); -	err = device->query_device(device, &attr, uhw); +	err = ib_dev->query_device(ib_dev, &attr, uhw);  	if (err)  		return err; -	copy_query_dev_fields(file, &resp.base, &attr); +	copy_query_dev_fields(file, ib_dev, &resp.base, &attr);  	resp.comp_mask = 0;  	if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index f6eef2da7097..c29a660c72fe 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock);  static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);  static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, +				     struct ib_device *ib_dev,  				     const char __user *buf, int in_len,  				     int out_len) = {  	[IB_USER_VERBS_CMD_GET_CONTEXT]		= ib_uverbs_get_context, @@ -119,6 +120,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,  };  static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, +				    struct ib_device *ib_dev,  				    struct ib_udata *ucore,  				    struct ib_udata *uhw) = {  	[IB_USER_VERBS_EX_CMD_CREATE_FLOW]	= ib_uverbs_ex_create_flow, @@ -128,16 +130,21 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,  };  static void ib_uverbs_add_one(struct ib_device *device); -static void ib_uverbs_remove_one(struct ib_device *device); +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); -static void ib_uverbs_release_dev(struct kref *ref) +static void ib_uverbs_release_dev(struct kobject *kobj)  {  	struct ib_uverbs_device *dev = -		container_of(ref, struct ib_uverbs_device, ref); +		container_of(kobj, struct ib_uverbs_device, kobj); -	complete(&dev->comp); +	
cleanup_srcu_struct(&dev->disassociate_srcu); +	kfree(dev);  } +static struct kobj_type ib_uverbs_dev_ktype = { +	.release = ib_uverbs_release_dev, +}; +  static void ib_uverbs_release_event_file(struct kref *ref)  {  	struct ib_uverbs_event_file *file = @@ -201,9 +208,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,  {  	struct ib_uobject *uobj, *tmp; -	if (!context) -		return 0; -  	context->closing = 1;  	list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) { @@ -303,13 +307,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,  	return context->device->dealloc_ucontext(context);  } +static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) +{ +	complete(&dev->comp); +} +  static void ib_uverbs_release_file(struct kref *ref)  {  	struct ib_uverbs_file *file =  		container_of(ref, struct ib_uverbs_file, ref); +	struct ib_device *ib_dev; +	int srcu_key; + +	srcu_key = srcu_read_lock(&file->device->disassociate_srcu); +	ib_dev = srcu_dereference(file->device->ib_dev, +				  &file->device->disassociate_srcu); +	if (ib_dev && !ib_dev->disassociate_ucontext) +		module_put(ib_dev->owner); +	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); -	module_put(file->device->ib_dev->owner); -	kref_put(&file->device->ref, ib_uverbs_release_dev); +	if (atomic_dec_and_test(&file->device->refcount)) +		ib_uverbs_comp_dev(file->device);  	kfree(file);  } @@ -331,9 +349,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,  			return -EAGAIN;  		if (wait_event_interruptible(file->poll_wait, -					     !list_empty(&file->event_list))) +					     (!list_empty(&file->event_list) || +			/* The barriers built into wait_event_interruptible() +			 * and wake_up() guarantee this will see the null set +			 * without using RCU +			 */ +					     !file->uverbs_file->device->ib_dev)))  			return -ERESTARTSYS; +		/* If device was disassociated and no event exists, set an error */ +		if (list_empty(&file->event_list) && +		    !file->uverbs_file->device->ib_dev) +			return -EIO; +  		spin_lock_irq(&file->lock);  	} @@ -396,8 +424,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)  {  	struct ib_uverbs_event_file *file = filp->private_data;  	struct ib_uverbs_event *entry, *tmp; +	int closed_already = 0; +	mutex_lock(&file->uverbs_file->device->lists_mutex);  	spin_lock_irq(&file->lock); +	closed_already = file->is_closed;  	file->is_closed = 1;  	list_for_each_entry_safe(entry, tmp, &file->event_list, list) {  		if (entry->counter) @@ -405,11 +436,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)  		kfree(entry);  	}  	spin_unlock_irq(&file->lock); - -	if (file->is_async) { -		ib_unregister_event_handler(&file->uverbs_file->event_handler); -		kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); +	if (!closed_already) { +		list_del(&file->list); +		if (file->is_async) +			ib_unregister_event_handler(&file->uverbs_file-> +				event_handler);  	} +	mutex_unlock(&file->uverbs_file->device->lists_mutex); + +	kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);  	kref_put(&file->ref, ib_uverbs_release_event_file);  	return 0; @@ -541,13 +576,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,  				NULL, NULL);  } +void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file) +{ +	kref_put(&file->async_file->ref, ib_uverbs_release_event_file); +	file->async_file = NULL; +} +  struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, +		
			struct ib_device	*ib_dev,  					int is_async)  {  	struct ib_uverbs_event_file *ev_file;  	struct file *filp; +	int ret; -	ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL); +	ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);  	if (!ev_file)  		return ERR_PTR(-ENOMEM); @@ -556,16 +599,47 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,  	INIT_LIST_HEAD(&ev_file->event_list);  	init_waitqueue_head(&ev_file->poll_wait);  	ev_file->uverbs_file = uverbs_file; +	kref_get(&ev_file->uverbs_file->ref);  	ev_file->async_queue = NULL; -	ev_file->is_async    = is_async;  	ev_file->is_closed   = 0;  	filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,  				  ev_file, O_RDONLY);  	if (IS_ERR(filp)) -		kfree(ev_file); +		goto err_put_refs; + +	mutex_lock(&uverbs_file->device->lists_mutex); +	list_add_tail(&ev_file->list, +		      &uverbs_file->device->uverbs_events_file_list); +	mutex_unlock(&uverbs_file->device->lists_mutex); + +	if (is_async) { +		WARN_ON(uverbs_file->async_file); +		uverbs_file->async_file = ev_file; +		kref_get(&uverbs_file->async_file->ref); +		INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, +				      ib_dev, +				      ib_uverbs_event_handler); +		ret = ib_register_event_handler(&uverbs_file->event_handler); +		if (ret) +			goto err_put_file; + +		/* At that point async file stuff was fully set */ +		ev_file->is_async = 1; +	}  	return filp; + +err_put_file: +	fput(filp); +	kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file); +	uverbs_file->async_file = NULL; +	return ERR_PTR(ret); + +err_put_refs: +	kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); +	kref_put(&ev_file->ref, ib_uverbs_release_event_file); +	return filp;  }  /* @@ -601,8 +675,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,  			     size_t count, loff_t *pos)  {  	struct ib_uverbs_file *file = filp->private_data; +	struct ib_device *ib_dev;  	struct ib_uverbs_cmd_hdr hdr;  	__u32 flags; +	int srcu_key; +	ssize_t ret;  	if (count < sizeof hdr)  		return -EINVAL; @@ -610,6 +687,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,  	if (copy_from_user(&hdr, buf, sizeof hdr))  		return -EFAULT; +	srcu_key = srcu_read_lock(&file->device->disassociate_srcu); +	ib_dev = srcu_dereference(file->device->ib_dev, +				  &file->device->disassociate_srcu); +	if (!ib_dev) { +		ret = -EIO; +		goto out; +	} +  	flags = (hdr.command &  		 IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; @@ -617,26 +702,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,  		__u32 command;  		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | -					   IB_USER_VERBS_CMD_COMMAND_MASK)) -			return -EINVAL; +					   IB_USER_VERBS_CMD_COMMAND_MASK)) { +			ret = -EINVAL; +			goto out; +		}  		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;  		if (command >= ARRAY_SIZE(uverbs_cmd_table) || -		    !uverbs_cmd_table[command]) -			return -EINVAL; +		    !uverbs_cmd_table[command]) { +			ret = -EINVAL; +			goto out; +		}  		if (!file->ucontext && -		    command != IB_USER_VERBS_CMD_GET_CONTEXT) -			return -EINVAL; +		    command != IB_USER_VERBS_CMD_GET_CONTEXT) { +			ret = -EINVAL; +			goto out; +		} -		if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) -			return -ENOSYS; +		if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) { +			ret = -ENOSYS; +			goto out; +		} -		if (hdr.in_words * 4 != count) -			return -EINVAL; +		if (hdr.in_words * 4 != count) { +	
		ret = -EINVAL; +			goto out; +		} -		return uverbs_cmd_table[command](file, +		ret = uverbs_cmd_table[command](file, ib_dev,  						 buf + sizeof(hdr),  						 hdr.in_words * 4,  						 hdr.out_words * 4); @@ -647,51 +742,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,  		struct ib_uverbs_ex_cmd_hdr ex_hdr;  		struct ib_udata ucore;  		struct ib_udata uhw; -		int err;  		size_t written_count = count;  		if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | -					   IB_USER_VERBS_CMD_COMMAND_MASK)) -			return -EINVAL; +					   IB_USER_VERBS_CMD_COMMAND_MASK)) { +			ret = -EINVAL; +			goto out; +		}  		command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;  		if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || -		    !uverbs_ex_cmd_table[command]) -			return -ENOSYS; +		    !uverbs_ex_cmd_table[command]) { +			ret = -ENOSYS; +			goto out; +		} -		if (!file->ucontext) -			return -EINVAL; +		if (!file->ucontext) { +			ret = -EINVAL; +			goto out; +		} -		if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) -			return -ENOSYS; +		if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) { +			ret = -ENOSYS; +			goto out; +		} -		if (count < (sizeof(hdr) + sizeof(ex_hdr))) -			return -EINVAL; +		if (count < (sizeof(hdr) + sizeof(ex_hdr))) { +			ret = -EINVAL; +			goto out; +		} -		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) -			return -EFAULT; +		if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) { +			ret = -EFAULT; +			goto out; +		}  		count -= sizeof(hdr) + sizeof(ex_hdr);  		buf += sizeof(hdr) + sizeof(ex_hdr); -		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) -			return -EINVAL; +		if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) { +			ret = -EINVAL; +			goto out; +		} -		if (ex_hdr.cmd_hdr_reserved) -			return -EINVAL; +		if (ex_hdr.cmd_hdr_reserved) { +			ret = -EINVAL; +			goto out; +		}  		if (ex_hdr.response) { -			if (!hdr.out_words && !ex_hdr.provider_out_words) -				return -EINVAL; +			if (!hdr.out_words && !ex_hdr.provider_out_words) { +				ret = -EINVAL; +				goto out; +			}  			if (!access_ok(VERIFY_WRITE,  				       (void __user *) (unsigned long) ex_hdr.response, -				       (hdr.out_words + ex_hdr.provider_out_words) * 8)) -				return -EFAULT; +				       (hdr.out_words + ex_hdr.provider_out_words) * 8)) { +				ret = -EFAULT; +				goto out; +			}  		} else { -			if (hdr.out_words || ex_hdr.provider_out_words) -				return -EINVAL; +			if (hdr.out_words || ex_hdr.provider_out_words) { +				ret = -EINVAL; +				goto out; +			}  		}  		INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response, @@ -703,27 +819,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,  				       ex_hdr.provider_in_words * 8,  				       ex_hdr.provider_out_words * 8); -		err = uverbs_ex_cmd_table[command](file, +		ret = uverbs_ex_cmd_table[command](file, +						   ib_dev,  						   &ucore,  						   &uhw); - -		if (err) -			return err; - -		return written_count; +		if (!ret) +			ret = written_count; +	} else { +		ret = -ENOSYS;  	} -	return -ENOSYS; +out: +	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); +	return ret;  }  static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)  {  	struct ib_uverbs_file *file = filp->private_data; +	struct ib_device *ib_dev; +	int ret = 0; +	int srcu_key; + +	srcu_key = srcu_read_lock(&file->device->disassociate_srcu); +	ib_dev = srcu_dereference(file->device->ib_dev, +				  
&file->device->disassociate_srcu); +	if (!ib_dev) { +		ret = -EIO; +		goto out; +	}  	if (!file->ucontext) -		return -ENODEV; +		ret = -ENODEV;  	else -		return file->device->ib_dev->mmap(file->ucontext, vma); +		ret = ib_dev->mmap(file->ucontext, vma); +out: +	srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); +	return ret;  }  /* @@ -740,23 +872,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)  {  	struct ib_uverbs_device *dev;  	struct ib_uverbs_file *file; +	struct ib_device *ib_dev;  	int ret; +	int module_dependent; +	int srcu_key;  	dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); -	if (dev) -		kref_get(&dev->ref); -	else +	if (!atomic_inc_not_zero(&dev->refcount))  		return -ENXIO; -	if (!try_module_get(dev->ib_dev->owner)) { -		ret = -ENODEV; +	srcu_key = srcu_read_lock(&dev->disassociate_srcu); +	mutex_lock(&dev->lists_mutex); +	ib_dev = srcu_dereference(dev->ib_dev, +				  &dev->disassociate_srcu); +	if (!ib_dev) { +		ret = -EIO;  		goto err;  	} -	file = kmalloc(sizeof *file, GFP_KERNEL); +	/* In case IB device supports disassociate ucontext, there is no hard +	 * dependency between uverbs device and its low level device. +	 */ +	module_dependent = !(ib_dev->disassociate_ucontext); + +	if (module_dependent) { +		if (!try_module_get(ib_dev->owner)) { +			ret = -ENODEV; +			goto err; +		} +	} + +	file = kzalloc(sizeof(*file), GFP_KERNEL);  	if (!file) {  		ret = -ENOMEM; -		goto err_module; +		if (module_dependent) +			goto err_module; + +		goto err;  	}  	file->device	 = dev; @@ -766,27 +918,47 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)  	mutex_init(&file->mutex);  	filp->private_data = file; +	kobject_get(&dev->kobj); +	list_add_tail(&file->list, &dev->uverbs_file_list); +	mutex_unlock(&dev->lists_mutex); +	srcu_read_unlock(&dev->disassociate_srcu, srcu_key);  	return nonseekable_open(inode, filp);  err_module: -	module_put(dev->ib_dev->owner); +	module_put(ib_dev->owner);  err: -	kref_put(&dev->ref, ib_uverbs_release_dev); +	mutex_unlock(&dev->lists_mutex); +	srcu_read_unlock(&dev->disassociate_srcu, srcu_key); +	if (atomic_dec_and_test(&dev->refcount)) +		ib_uverbs_comp_dev(dev); +  	return ret;  }  static int ib_uverbs_close(struct inode *inode, struct file *filp)  {  	struct ib_uverbs_file *file = filp->private_data; - -	ib_uverbs_cleanup_ucontext(file, file->ucontext); +	struct ib_uverbs_device *dev = file->device; +	struct ib_ucontext *ucontext = NULL; + +	mutex_lock(&file->device->lists_mutex); +	ucontext = file->ucontext; +	file->ucontext = NULL; +	if (!file->is_closed) { +		list_del(&file->list); +		file->is_closed = 1; +	} +	mutex_unlock(&file->device->lists_mutex); +	if (ucontext) +		ib_uverbs_cleanup_ucontext(file, ucontext);  	if (file->async_file)  		kref_put(&file->async_file->ref, ib_uverbs_release_event_file);  	kref_put(&file->ref, ib_uverbs_release_file); +	kobject_put(&dev->kobj);  	return 0;  } @@ -817,12 +989,21 @@ static struct ib_client uverbs_client = {  static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,  			  char *buf)  { +	int ret = -ENODEV; +	int srcu_key;  	struct ib_uverbs_device *dev = dev_get_drvdata(device); +	struct ib_device *ib_dev;  	if (!dev)  		return -ENODEV; -	return sprintf(buf, "%s\n", dev->ib_dev->name); +	srcu_key = srcu_read_lock(&dev->disassociate_srcu); +	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); +	if (ib_dev) +		ret = sprintf(buf, "%s\n", ib_dev->name); +	srcu_read_unlock(&dev->disassociate_srcu, 
srcu_key); + +	return ret;  }  static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); @@ -830,11 +1011,19 @@ static ssize_t show_dev_abi_version(struct device *device,  				    struct device_attribute *attr, char *buf)  {  	struct ib_uverbs_device *dev = dev_get_drvdata(device); +	int ret = -ENODEV; +	int srcu_key; +	struct ib_device *ib_dev;  	if (!dev)  		return -ENODEV; +	srcu_key = srcu_read_lock(&dev->disassociate_srcu); +	ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); +	if (ib_dev) +		ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); +	srcu_read_unlock(&dev->disassociate_srcu, srcu_key); -	return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver); +	return ret;  }  static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); @@ -874,6 +1063,7 @@ static void ib_uverbs_add_one(struct ib_device *device)  	int devnum;  	dev_t base;  	struct ib_uverbs_device *uverbs_dev; +	int ret;  	if (!device->alloc_ucontext)  		return; @@ -882,10 +1072,20 @@ static void ib_uverbs_add_one(struct ib_device *device)  	if (!uverbs_dev)  		return; -	kref_init(&uverbs_dev->ref); +	ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); +	if (ret) { +		kfree(uverbs_dev); +		return; +	} + +	atomic_set(&uverbs_dev->refcount, 1);  	init_completion(&uverbs_dev->comp);  	uverbs_dev->xrcd_tree = RB_ROOT;  	mutex_init(&uverbs_dev->xrcd_tree_mutex); +	kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); +	mutex_init(&uverbs_dev->lists_mutex); +	INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); +	INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);  	spin_lock(&map_lock);  	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); @@ -906,12 +1106,13 @@ static void ib_uverbs_add_one(struct ib_device *device)  	}  	spin_unlock(&map_lock); -	uverbs_dev->ib_dev           = device; +	rcu_assign_pointer(uverbs_dev->ib_dev, device);  	uverbs_dev->num_comp_vectors = device->num_comp_vectors;  	cdev_init(&uverbs_dev->cdev, NULL);  	uverbs_dev->cdev.owner = THIS_MODULE;  	uverbs_dev->cdev.ops = device->mmap ? 
&uverbs_mmap_fops : &uverbs_fops; +	uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;  kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);  if (cdev_add(&uverbs_dev->cdev, base, 1))  		goto err_cdev; @@ -942,15 +1143,79 @@ err_cdev:  		clear_bit(devnum, overflow_map);  err: -	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); +	if (atomic_dec_and_test(&uverbs_dev->refcount)) +		ib_uverbs_comp_dev(uverbs_dev);  	wait_for_completion(&uverbs_dev->comp); -	kfree(uverbs_dev); +	kobject_put(&uverbs_dev->kobj);  	return;  } -static void ib_uverbs_remove_one(struct ib_device *device) +static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, +					struct ib_device *ib_dev)  { -	struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); +	struct ib_uverbs_file *file; +	struct ib_uverbs_event_file *event_file; +	struct ib_event event; + +	/* Wait for pending running commands to terminate */ +	synchronize_srcu(&uverbs_dev->disassociate_srcu); +	event.event = IB_EVENT_DEVICE_FATAL; +	event.element.port_num = 0; +	event.device = ib_dev; + +	mutex_lock(&uverbs_dev->lists_mutex); +	while (!list_empty(&uverbs_dev->uverbs_file_list)) { +		struct ib_ucontext *ucontext; + +		file = list_first_entry(&uverbs_dev->uverbs_file_list, +					struct ib_uverbs_file, list); +		file->is_closed = 1; +		ucontext = file->ucontext; +		list_del(&file->list); +		file->ucontext = NULL; +		kref_get(&file->ref); +		mutex_unlock(&uverbs_dev->lists_mutex); +		/* We must release the mutex before going ahead and calling +		 * disassociate_ucontext. disassociate_ucontext might end up +		 * indirectly calling uverbs_close, for example due to freeing +		 * the resources (e.g. mmput). +		 */ +		ib_uverbs_event_handler(&file->event_handler, &event); +		if (ucontext) { +			ib_dev->disassociate_ucontext(ucontext); +			ib_uverbs_cleanup_ucontext(file, ucontext); +		} + +		mutex_lock(&uverbs_dev->lists_mutex); +		kref_put(&file->ref, ib_uverbs_release_file); +	} + +	while (!list_empty(&uverbs_dev->uverbs_events_file_list)) { +		event_file = list_first_entry(&uverbs_dev-> +					      uverbs_events_file_list, +					      struct ib_uverbs_event_file, +					      list); +		spin_lock_irq(&event_file->lock); +		event_file->is_closed = 1; +		spin_unlock_irq(&event_file->lock); + +		list_del(&event_file->list); +		if (event_file->is_async) { +			ib_unregister_event_handler(&event_file->uverbs_file-> +						    event_handler); +			event_file->uverbs_file->event_handler.device = NULL; +		} + +		wake_up_interruptible(&event_file->poll_wait); +		kill_fasync(&event_file->async_queue, SIGIO, POLL_IN); +	} +	mutex_unlock(&uverbs_dev->lists_mutex); +} + +static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) +{ +	struct ib_uverbs_device *uverbs_dev = client_data; +	int wait_clients = 1;  	if (!uverbs_dev)  		return; @@ -964,9 +1229,28 @@ static void ib_uverbs_remove_one(struct ib_device *device)  	else  		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); -	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); -	wait_for_completion(&uverbs_dev->comp); -	kfree(uverbs_dev); +	if (device->disassociate_ucontext) { +		/* We disassociate HW resources and immediately return. +		 * Userspace will see an EIO errno for all future access. +		 * Upon returning, ib_device may be freed internally and is not +		 * valid any more. 
+		 * uverbs_device is still available until all clients close +		 * their files, then the uverbs device ref count will be zero +		 * and its resources will be freed. +		 * Note: At this point no more files can be opened since the +		 * cdev was deleted, however active clients can still issue +		 * commands and close their open files. +		 */ +		rcu_assign_pointer(uverbs_dev->ib_dev, NULL); +		ib_uverbs_free_hw_resources(uverbs_dev, device); +		wait_clients = 0; +	} + +	if (atomic_dec_and_test(&uverbs_dev->refcount)) +		ib_uverbs_comp_dev(uverbs_dev); +	if (wait_clients) +		wait_for_completion(&uverbs_dev->comp); +	kobject_put(&uverbs_dev->kobj);  }  static char *uverbs_devnode(struct device *dev, umode_t *mode) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index bac3fb406a74..e1f2c9887f3f 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -213,28 +213,79 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);  /* Protection domains */ +/** + * ib_alloc_pd - Allocates an unused protection domain. + * @device: The device on which to allocate the protection domain. + * + * A protection domain object provides an association between QPs, shared + * receive queues, address handles, memory regions, and memory windows. + * + * Every PD has a local_dma_lkey which can be used as the lkey value for local + * memory operations. + */  struct ib_pd *ib_alloc_pd(struct ib_device *device)  {  	struct ib_pd *pd; +	struct ib_device_attr devattr; +	int rc; + +	rc = ib_query_device(device, &devattr); +	if (rc) +		return ERR_PTR(rc);  	pd = device->alloc_pd(device, NULL, NULL); +	if (IS_ERR(pd)) +		return pd; + +	pd->device = device; +	pd->uobject = NULL; +	pd->local_mr = NULL; +	atomic_set(&pd->usecnt, 0); + +	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) +		pd->local_dma_lkey = device->local_dma_lkey; +	else { +		struct ib_mr *mr; + +		mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); +		if (IS_ERR(mr)) { +			ib_dealloc_pd(pd); +			return (struct ib_pd *)mr; +		} -	if (!IS_ERR(pd)) { -		pd->device  = device; -		pd->uobject = NULL; -		atomic_set(&pd->usecnt, 0); +		pd->local_mr = mr; +		pd->local_dma_lkey = pd->local_mr->lkey;  	} -  	return pd;  }  EXPORT_SYMBOL(ib_alloc_pd); -int ib_dealloc_pd(struct ib_pd *pd) +/** + * ib_dealloc_pd - Deallocates a protection domain. + * @pd: The protection domain to deallocate. + * + * It is an error to call this function while any resources in the pd still + * exist.  The caller is responsible to synchronously destroy them and + * guarantee no new allocations will happen. + */ +void ib_dealloc_pd(struct ib_pd *pd)  { -	if (atomic_read(&pd->usecnt)) -		return -EBUSY; +	int ret; + +	if (pd->local_mr) { +		ret = ib_dereg_mr(pd->local_mr); +		WARN_ON(ret); +		pd->local_mr = NULL; +	} + +	/* uverbs manipulates usecnt with proper locking, while the kabi +	   requires the caller to guarantee we can't race here. */ +	WARN_ON(atomic_read(&pd->usecnt)); -	return pd->device->dealloc_pd(pd); +	/* Making dealloc_pd a void return is a WIP, no driver should return +	   an error here. 
*/ +	ret = pd->device->dealloc_pd(pd); +	WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");  }  EXPORT_SYMBOL(ib_dealloc_pd); @@ -1144,73 +1195,6 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)  }  EXPORT_SYMBOL(ib_get_dma_mr); -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, -			     struct ib_phys_buf *phys_buf_array, -			     int num_phys_buf, -			     int mr_access_flags, -			     u64 *iova_start) -{ -	struct ib_mr *mr; -	int err; - -	err = ib_check_mr_access(mr_access_flags); -	if (err) -		return ERR_PTR(err); - -	if (!pd->device->reg_phys_mr) -		return ERR_PTR(-ENOSYS); - -	mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf, -				     mr_access_flags, iova_start); - -	if (!IS_ERR(mr)) { -		mr->device  = pd->device; -		mr->pd      = pd; -		mr->uobject = NULL; -		atomic_inc(&pd->usecnt); -		atomic_set(&mr->usecnt, 0); -	} - -	return mr; -} -EXPORT_SYMBOL(ib_reg_phys_mr); - -int ib_rereg_phys_mr(struct ib_mr *mr, -		     int mr_rereg_mask, -		     struct ib_pd *pd, -		     struct ib_phys_buf *phys_buf_array, -		     int num_phys_buf, -		     int mr_access_flags, -		     u64 *iova_start) -{ -	struct ib_pd *old_pd; -	int ret; - -	ret = ib_check_mr_access(mr_access_flags); -	if (ret) -		return ret; - -	if (!mr->device->rereg_phys_mr) -		return -ENOSYS; - -	if (atomic_read(&mr->usecnt)) -		return -EBUSY; - -	old_pd = mr->pd; - -	ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd, -					phys_buf_array, num_phys_buf, -					mr_access_flags, iova_start); - -	if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) { -		atomic_dec(&old_pd->usecnt); -		atomic_inc(&pd->usecnt); -	} - -	return ret; -} -EXPORT_SYMBOL(ib_rereg_phys_mr); -  int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)  {  	return mr->device->query_mr ? @@ -1235,54 +1219,28 @@ int ib_dereg_mr(struct ib_mr *mr)  }  EXPORT_SYMBOL(ib_dereg_mr); -struct ib_mr *ib_create_mr(struct ib_pd *pd, -			   struct ib_mr_init_attr *mr_init_attr) -{ -	struct ib_mr *mr; - -	if (!pd->device->create_mr) -		return ERR_PTR(-ENOSYS); - -	mr = pd->device->create_mr(pd, mr_init_attr); - -	if (!IS_ERR(mr)) { -		mr->device  = pd->device; -		mr->pd      = pd; -		mr->uobject = NULL; -		atomic_inc(&pd->usecnt); -		atomic_set(&mr->usecnt, 0); -	} - -	return mr; -} -EXPORT_SYMBOL(ib_create_mr); - -int ib_destroy_mr(struct ib_mr *mr) -{ -	struct ib_pd *pd; -	int ret; - -	if (atomic_read(&mr->usecnt)) -		return -EBUSY; - -	pd = mr->pd; -	ret = mr->device->destroy_mr(mr); -	if (!ret) -		atomic_dec(&pd->usecnt); - -	return ret; -} -EXPORT_SYMBOL(ib_destroy_mr); - -struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) +/** + * ib_alloc_mr() - Allocates a memory region + * @pd:            protection domain associated with the region + * @mr_type:       memory region type + * @max_num_sg:    maximum sg entries available for registration. + * + * Notes: + * Memory registration page/sg lists must not exceed max_num_sg. + * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed + * max_num_sg * used_page_size. 
+ * + */ +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, +			  enum ib_mr_type mr_type, +			  u32 max_num_sg)  {  	struct ib_mr *mr; -	if (!pd->device->alloc_fast_reg_mr) +	if (!pd->device->alloc_mr)  		return ERR_PTR(-ENOSYS); -	mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len); - +	mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);  	if (!IS_ERR(mr)) {  		mr->device  = pd->device;  		mr->pd      = pd; @@ -1293,7 +1251,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)  	return mr;  } -EXPORT_SYMBOL(ib_alloc_fast_reg_mr); +EXPORT_SYMBOL(ib_alloc_mr);  struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,  							  int max_page_list_len)  |
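The verbs.c hunks above rework the PD and MR allocation API: ib_alloc_pd() now guarantees a usable pd->local_dma_lkey (falling back to an internal ib_get_dma_mr() when the device lacks IB_DEVICE_LOCAL_DMA_LKEY), ib_dealloc_pd() becomes void, and ib_alloc_mr() replaces ib_alloc_fast_reg_mr(). A minimal caller-side sketch of the new flow, not part of the patch itself (the function name and the max_num_sg value of 16 are purely illustrative):

#include <linux/err.h>
#include <linux/printk.h>
#include <rdma/ib_verbs.h>

/* Illustrative sketch only: how a kernel ULP would consume the reworked
 * PD/MR verbs from this series. Error handling is kept minimal. */
static int example_pd_mr_usage(struct ib_device *device)
{
	struct ib_pd *pd;
	struct ib_mr *mr;

	pd = ib_alloc_pd(device);
	if (IS_ERR(pd))
		return PTR_ERR(pd);

	/* pd->local_dma_lkey is always valid after this series: either the
	 * device-wide lkey or one backed by the PD's internal DMA MR. */
	pr_info("usable local lkey: 0x%x\n", pd->local_dma_lkey);

	/* Registration-capable MR bounded by 16 sg entries/pages. */
	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 16);
	if (IS_ERR(mr)) {
		ib_dealloc_pd(pd);	/* void return since this series */
		return PTR_ERR(mr);
	}

	ib_dereg_mr(mr);
	ib_dealloc_pd(pd);
	return 0;
}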