diff options
Diffstat (limited to 'drivers/infiniband/core')
35 files changed, 4572 insertions, 2191 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index edaae9f9853c..e3cdafff8ece 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -10,9 +10,11 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ - multicast.o mad.o smi.o agent.o mad_rmpp.o + multicast.o mad.o smi.o agent.o mad_rmpp.o \ + security.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o +ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o ib_cm-y := cm.o @@ -28,4 +30,5 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o -ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o +ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ + rdma_core.o uverbs_std_types.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 0f58f46dbad7..01236cef7bfb 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -88,7 +88,7 @@ static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) return false; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), - nlmsg_len(nlh), ib_nl_addr_policy); + nlmsg_len(nlh), ib_nl_addr_policy, NULL); if (ret) return false; @@ -179,8 +179,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, } /* Construct the family header first */ - header = (struct rdma_ls_ip_resolve_header *) - skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); header->ifindex = dev_addr->bound_dev_if; nla_put(skb, attrtype, size, daddr); @@ -269,6 +268,7 @@ int rdma_translate_ip(const struct sockaddr *addr, return ret; ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_addr->bound_dev_if = dev->ifindex; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); @@ -281,6 +281,7 @@ int rdma_translate_ip(const struct sockaddr *addr, &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); + dev_addr->bound_dev_if = dev->ifindex; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); break; @@ -406,10 +407,10 @@ static int addr4_resolve(struct sockaddr_in *src_in, fl4.saddr = src_ip; fl4.flowi4_oif = addr->bound_dev_if; rt = ip_route_output_key(addr->net, &fl4); - if (IS_ERR(rt)) { - ret = PTR_ERR(rt); - goto out; - } + ret = PTR_ERR_OR_ZERO(rt); + if (ret) + return ret; + src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; @@ -424,8 +425,6 @@ static int addr4_resolve(struct sockaddr_in *src_in, *prt = rt; return 0; -out: - return ret; } #if IS_ENABLED(CONFIG_IPV6) @@ -444,17 +443,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, fl6.saddr = src_in->sin6_addr; fl6.flowi6_oif = addr->bound_dev_if; - dst = ip6_route_output(addr->net, NULL, &fl6); - if ((ret = dst->error)) - goto put; + ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6); + if (ret < 0) + return ret; rt = (struct rt6_info *)dst; - if (ipv6_addr_any(&fl6.saddr)) { - ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev, - &fl6.daddr, 0, &fl6.saddr); - if (ret) - goto put; - + if (ipv6_addr_any(&src_in->sin6_addr)) { src_in->sin6_family = AF_INET6; src_in->sin6_addr = fl6.saddr; } @@ -471,9 +465,6 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, *pdst = dst; return 0; -put: - dst_release(dst); - return ret; } #else static int addr6_resolve(struct sockaddr_in6 *src_in, @@ -518,6 +509,11 @@ static int addr_resolve(struct sockaddr *src_in, struct dst_entry *dst; int ret; + if (!addr->net) { + pr_warn_ratelimited("%s: missing namespace\n", __func__); + return -EINVAL; + } + if (src_in->sa_family == AF_INET) { struct rtable *rt = NULL; const struct sockaddr_in *dst_in4 = @@ -531,8 +527,12 @@ static int addr_resolve(struct sockaddr *src_in, if (resolve_neigh) ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); - ndev = rt->dst.dev; - dev_hold(ndev); + if (addr->bound_dev_if) { + ndev = dev_get_by_index(addr->net, addr->bound_dev_if); + } else { + ndev = rt->dst.dev; + dev_hold(ndev); + } ip_rt_put(rt); } else { @@ -548,14 +548,27 @@ static int addr_resolve(struct sockaddr *src_in, if (resolve_neigh) ret = addr_resolve_neigh(dst, dst_in, addr, seq); - ndev = dst->dev; - dev_hold(ndev); + if (addr->bound_dev_if) { + ndev = dev_get_by_index(addr->net, addr->bound_dev_if); + } else { + ndev = dst->dev; + dev_hold(ndev); + } dst_release(dst); } - addr->bound_dev_if = ndev->ifindex; - addr->net = dev_net(ndev); + if (ndev->flags & IFF_LOOPBACK) { + ret = rdma_translate_ip(dst_in, addr, NULL); + /* + * Put the loopback device and get the translated + * device instead. + */ + dev_put(ndev); + ndev = dev_get_by_index(addr->net, addr->bound_dev_if); + } else { + addr->bound_dev_if = ndev->ifindex; + } dev_put(ndev); return ret; diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index 11dacd97a667..324ef85a13b6 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -137,13 +137,13 @@ void agent_send_response(const struct ib_mad_hdr *mad_hdr, const struct ib_grh * err2: ib_free_send_mad(send_buf); err1: - ib_destroy_ah(ah); + rdma_destroy_ah(ah); } static void agent_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc) { - ib_destroy_ah(mad_send_wc->send_buf->ah); + rdma_destroy_ah(mad_send_wc->send_buf->ah); ib_free_send_mad(mad_send_wc->send_buf); } diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index ae04826e82fc..efc94304dee3 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -53,6 +53,7 @@ struct ib_update_work { struct work_struct work; struct ib_device *device; u8 port_num; + bool enforce_security; }; union ib_gid zgid; @@ -314,14 +315,13 @@ static void make_default_gid(struct net_device *dev, union ib_gid *gid) int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *attr) { - struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; int ret = 0; struct net_device *idev; int empty; - table = ports_table[port - rdma_start_port(ib_dev)]; + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; if (!memcmp(gid, &zgid, sizeof(*gid))) return -EINVAL; @@ -369,11 +369,10 @@ out_unlock: int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *attr) { - struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; - table = ports_table[port - rdma_start_port(ib_dev)]; + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; mutex_lock(&table->lock); write_lock_irq(&table->rwlock); @@ -399,12 +398,11 @@ out_unlock: int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, struct net_device *ndev) { - struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; bool deleted = false; - table = ports_table[port - rdma_start_port(ib_dev)]; + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; mutex_lock(&table->lock); write_lock_irq(&table->rwlock); @@ -428,10 +426,9 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, union ib_gid *gid, struct ib_gid_attr *attr) { - struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - table = ports_table[port - rdma_start_port(ib_dev)]; + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; if (index < 0 || index >= table->sz) return -EINVAL; @@ -455,14 +452,13 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, unsigned long mask, u8 *port, u16 *index) { - struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; u8 p; int local_index; unsigned long flags; for (p = 0; p < ib_dev->phys_port_cnt; p++) { - table = ports_table[p]; + table = ib_dev->cache.ports[p].gid; read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, val, false, mask, NULL); if (local_index >= 0) { @@ -503,18 +499,16 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, u16 *index) { int local_index; - struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; unsigned long flags; - if (port < rdma_start_port(ib_dev) || - port > rdma_end_port(ib_dev)) + if (!rdma_is_port_valid(ib_dev, port)) return -ENOENT; - table = ports_table[port - rdma_start_port(ib_dev)]; + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; @@ -562,21 +556,17 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, void *context, u16 *index) { - struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned int i; unsigned long flags; bool found = false; - if (!ports_table) - return -EOPNOTSUPP; - if (port < rdma_start_port(ib_dev) || - port > rdma_end_port(ib_dev) || + if (!rdma_is_port_valid(ib_dev, port) || !rdma_protocol_roce(ib_dev, port)) return -EPROTONOSUPPORT; - table = ports_table[port - rdma_start_port(ib_dev)]; + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { @@ -668,14 +658,13 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { - struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; union ib_gid gid; struct ib_gid_attr gid_attr; struct ib_gid_attr zattr_type = zattr; struct ib_gid_table *table; unsigned int gid_type; - table = ports_table[port - rdma_start_port(ib_dev)]; + table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; make_default_gid(ndev, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); @@ -766,71 +755,64 @@ static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, static int _gid_table_setup_one(struct ib_device *ib_dev) { u8 port; - struct ib_gid_table **table; + struct ib_gid_table *table; int err = 0; - table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL); - if (!table) - return -ENOMEM; - for (port = 0; port < ib_dev->phys_port_cnt; port++) { u8 rdma_port = port + rdma_start_port(ib_dev); - table[port] = + table = alloc_gid_table( ib_dev->port_immutable[rdma_port].gid_tbl_len); - if (!table[port]) { + if (!table) { err = -ENOMEM; goto rollback_table_setup; } err = gid_table_reserve_default(ib_dev, port + rdma_start_port(ib_dev), - table[port]); + table); if (err) goto rollback_table_setup; + ib_dev->cache.ports[port].gid = table; } - ib_dev->cache.gid_cache = table; return 0; rollback_table_setup: for (port = 0; port < ib_dev->phys_port_cnt; port++) { + table = ib_dev->cache.ports[port].gid; + cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), - table[port]); - release_gid_table(table[port]); + table); + release_gid_table(table); } - kfree(table); return err; } static void gid_table_release_one(struct ib_device *ib_dev) { - struct ib_gid_table **table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; u8 port; - if (!table) - return; - - for (port = 0; port < ib_dev->phys_port_cnt; port++) - release_gid_table(table[port]); - - kfree(table); - ib_dev->cache.gid_cache = NULL; + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + table = ib_dev->cache.ports[port].gid; + release_gid_table(table); + ib_dev->cache.ports[port].gid = NULL; + } } static void gid_table_cleanup_one(struct ib_device *ib_dev) { - struct ib_gid_table **table = ib_dev->cache.gid_cache; + struct ib_gid_table *table; u8 port; - if (!table) - return; - - for (port = 0; port < ib_dev->phys_port_cnt; port++) + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + table = ib_dev->cache.ports[port].gid; cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), - table[port]); + table); + } } static int gid_table_setup_one(struct ib_device *ib_dev) @@ -860,12 +842,12 @@ int ib_get_cached_gid(struct ib_device *device, { int res; unsigned long flags; - struct ib_gid_table **ports_table = device->cache.gid_cache; - struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; + struct ib_gid_table *table; - if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + if (!rdma_is_port_valid(device, port_num)) return -EINVAL; + table = device->cache.ports[port_num - rdma_start_port(device)].gid; read_lock_irqsave(&table->rwlock, flags); res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); read_unlock_irqrestore(&table->rwlock, flags); @@ -912,12 +894,12 @@ int ib_get_cached_pkey(struct ib_device *device, unsigned long flags; int ret = 0; - if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; if (index < 0 || index >= cache->table_len) ret = -EINVAL; @@ -930,6 +912,26 @@ int ib_get_cached_pkey(struct ib_device *device, } EXPORT_SYMBOL(ib_get_cached_pkey); +int ib_get_cached_subnet_prefix(struct ib_device *device, + u8 port_num, + u64 *sn_pfx) +{ + unsigned long flags; + int p; + + if (port_num < rdma_start_port(device) || + port_num > rdma_end_port(device)) + return -EINVAL; + + p = port_num - rdma_start_port(device); + read_lock_irqsave(&device->cache.lock, flags); + *sn_pfx = device->cache.ports[p].subnet_prefix; + read_unlock_irqrestore(&device->cache.lock, flags); + + return 0; +} +EXPORT_SYMBOL(ib_get_cached_subnet_prefix); + int ib_find_cached_pkey(struct ib_device *device, u8 port_num, u16 pkey, @@ -941,12 +943,12 @@ int ib_find_cached_pkey(struct ib_device *device, int ret = -ENOENT; int partial_ix = -1; - if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; *index = -1; @@ -981,12 +983,12 @@ int ib_find_exact_cached_pkey(struct ib_device *device, int i; int ret = -ENOENT; - if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - cache = device->cache.pkey_cache[port_num - rdma_start_port(device)]; + cache = device->cache.ports[port_num - rdma_start_port(device)].pkey; *index = -1; @@ -1010,19 +1012,39 @@ int ib_get_cached_lmc(struct ib_device *device, unsigned long flags; int ret = 0; - if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + if (!rdma_is_port_valid(device, port_num)) return -EINVAL; read_lock_irqsave(&device->cache.lock, flags); - *lmc = device->cache.lmc_cache[port_num - rdma_start_port(device)]; + *lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc; read_unlock_irqrestore(&device->cache.lock, flags); return ret; } EXPORT_SYMBOL(ib_get_cached_lmc); +int ib_get_cached_port_state(struct ib_device *device, + u8 port_num, + enum ib_port_state *port_state) +{ + unsigned long flags; + int ret = 0; + + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + *port_state = device->cache.ports[port_num + - rdma_start_port(device)].port_state; + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_get_cached_port_state); + static void ib_cache_update(struct ib_device *device, - u8 port) + u8 port, + bool enforce_security) { struct ib_port_attr *tprops = NULL; struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; @@ -1033,14 +1055,13 @@ static void ib_cache_update(struct ib_device *device, int i; int ret; struct ib_gid_table *table; - struct ib_gid_table **ports_table = device->cache.gid_cache; bool use_roce_gid_table = rdma_cap_roce_gid_table(device, port); - if (port < rdma_start_port(device) || port > rdma_end_port(device)) + if (!rdma_is_port_valid(device, port)) return; - table = ports_table[port - rdma_start_port(device)]; + table = device->cache.ports[port - rdma_start_port(device)].gid; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) @@ -1092,9 +1113,10 @@ static void ib_cache_update(struct ib_device *device, write_lock_irq(&device->cache.lock); - old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)]; + old_pkey_cache = device->cache.ports[port - + rdma_start_port(device)].pkey; - device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; + device->cache.ports[port - rdma_start_port(device)].pkey = pkey_cache; if (!use_roce_gid_table) { write_lock(&table->rwlock); for (i = 0; i < gid_cache->table_len; i++) { @@ -1104,10 +1126,19 @@ static void ib_cache_update(struct ib_device *device, write_unlock(&table->rwlock); } - device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; + device->cache.ports[port - rdma_start_port(device)].lmc = tprops->lmc; + device->cache.ports[port - rdma_start_port(device)].port_state = + tprops->state; + device->cache.ports[port - rdma_start_port(device)].subnet_prefix = + tprops->subnet_prefix; write_unlock_irq(&device->cache.lock); + if (enforce_security) + ib_security_cache_change(device, + port, + tprops->subnet_prefix); + kfree(gid_cache); kfree(old_pkey_cache); kfree(tprops); @@ -1124,7 +1155,9 @@ static void ib_cache_task(struct work_struct *_work) struct ib_update_work *work = container_of(_work, struct ib_update_work, work); - ib_cache_update(work->device, work->port_num); + ib_cache_update(work->device, + work->port_num, + work->enforce_security); kfree(work); } @@ -1145,6 +1178,12 @@ static void ib_cache_event(struct ib_event_handler *handler, INIT_WORK(&work->work, ib_cache_task); work->device = event->device; work->port_num = event->element.port_num; + if (event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_GID_CHANGE) + work->enforce_security = true; + else + work->enforce_security = false; + queue_work(ib_wq, &work->work); } } @@ -1157,25 +1196,20 @@ int ib_cache_setup_one(struct ib_device *device) rwlock_init(&device->cache.lock); - device->cache.pkey_cache = - kzalloc(sizeof *device->cache.pkey_cache * + device->cache.ports = + kzalloc(sizeof(*device->cache.ports) * (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); - device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache * - (rdma_end_port(device) - - rdma_start_port(device) + 1), - GFP_KERNEL); - if (!device->cache.pkey_cache || - !device->cache.lmc_cache) { + if (!device->cache.ports) { err = -ENOMEM; - goto free; + goto out; } err = gid_table_setup_one(device); if (err) - goto free; + goto out; for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) - ib_cache_update(device, p + rdma_start_port(device)); + ib_cache_update(device, p + rdma_start_port(device), true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); @@ -1187,9 +1221,7 @@ int ib_cache_setup_one(struct ib_device *device) err: gid_table_cleanup_one(device); -free: - kfree(device->cache.pkey_cache); - kfree(device->cache.lmc_cache); +out: return err; } @@ -1203,14 +1235,11 @@ void ib_cache_release_one(struct ib_device *device) * all the device's resources when the cache could no * longer be accessed. */ - if (device->cache.pkey_cache) - for (p = 0; - p <= rdma_end_port(device) - rdma_start_port(device); ++p) - kfree(device->cache.pkey_cache[p]); + for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) + kfree(device->cache.ports[p].pkey); gid_table_release_one(device); - kfree(device->cache.pkey_cache); - kfree(device->cache.lmc_cache); + kfree(device->cache.ports); } void ib_cache_cleanup_one(struct ib_device *device) diff --git a/drivers/infiniband/core/cgroup.c b/drivers/infiniband/core/cgroup.c new file mode 100644 index 000000000000..126ac5f99db7 --- /dev/null +++ b/drivers/infiniband/core/cgroup.c @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2016 Parav Pandit <[email protected]> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include "core_priv.h" + +/** + * ib_device_register_rdmacg - register with rdma cgroup. + * @device: device to register to participate in resource + * accounting by rdma cgroup. + * + * Register with the rdma cgroup. Should be called before + * exposing rdma device to user space applications to avoid + * resource accounting leak. + * Returns 0 on success or otherwise failure code. + */ +int ib_device_register_rdmacg(struct ib_device *device) +{ + device->cg_device.name = device->name; + return rdmacg_register_device(&device->cg_device); +} + +/** + * ib_device_unregister_rdmacg - unregister with rdma cgroup. + * @device: device to unregister. + * + * Unregister with the rdma cgroup. Should be called after + * all the resources are deallocated, and after a stage when any + * other resource allocation by user application cannot be done + * for this device to avoid any leak in accounting. + */ +void ib_device_unregister_rdmacg(struct ib_device *device) +{ + rdmacg_unregister_device(&device->cg_device); +} + +int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + return rdmacg_try_charge(&cg_obj->cg, &device->cg_device, + resource_index); +} +EXPORT_SYMBOL(ib_rdmacg_try_charge); + +void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ + rdmacg_uncharge(cg_obj->cg, &device->cg_device, + resource_index); +} +EXPORT_SYMBOL(ib_rdmacg_uncharge); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index cf1edfa1cbac..2b4d613a3474 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -228,7 +228,7 @@ struct cm_device { struct cm_av { struct cm_port *port; union ib_gid dgid; - struct ib_ah_attr ah_attr; + struct rdma_ah_attr ah_attr; u16 pkey_index; u8 timeout; }; @@ -241,7 +241,7 @@ struct cm_work { __be32 local_id; /* Established / timewait */ __be32 remote_id; struct ib_cm_event cm_event; - struct ib_sa_path_rec path[0]; + struct sa_path_rec path[0]; }; struct cm_timewait_info { @@ -343,7 +343,7 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, ret = -ENODEV; goto out; } - ah = ib_create_ah(mad_agent->qp->pd, &av->ah_attr); + ah = rdma_create_ah(mad_agent->qp->pd, &av->ah_attr); if (IS_ERR(ah)) { ret = PTR_ERR(ah); goto out; @@ -355,7 +355,7 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { - ib_destroy_ah(ah); + rdma_destroy_ah(ah); ret = PTR_ERR(m); goto out; } @@ -390,7 +390,7 @@ static int cm_alloc_response_msg(struct cm_port *port, GFP_ATOMIC, IB_MGMT_BASE_VERSION); if (IS_ERR(m)) { - ib_destroy_ah(ah); + rdma_destroy_ah(ah); return PTR_ERR(m); } m->ah = ah; @@ -400,7 +400,7 @@ static int cm_alloc_response_msg(struct cm_port *port, static void cm_free_msg(struct ib_mad_send_buf *msg) { - ib_destroy_ah(msg->ah); + rdma_destroy_ah(msg->ah); if (msg->context[0]) cm_deref_id(msg->context[0]); ib_free_send_mad(msg); @@ -440,7 +440,7 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, grh, &av->ah_attr); } -static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av, +static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, struct cm_id_private *cm_id_priv) { struct cm_device *cm_dev; @@ -453,7 +453,8 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av, read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - path->gid_type, ndev, &p, NULL)) { + sa_conv_pathrec_to_gid_type(path), + ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } @@ -1172,8 +1173,8 @@ static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) { - struct ib_sa_path_rec *pri_path = param->primary_path; - struct ib_sa_path_rec *alt_path = param->alternate_path; + struct sa_path_rec *pri_path = param->primary_path; + struct sa_path_rec *alt_path = param->alternate_path; cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -1202,8 +1203,10 @@ static void cm_format_req(struct cm_req_msg *req_msg, } if (pri_path->hop_limit <= 1) { - req_msg->primary_local_lid = pri_path->slid; - req_msg->primary_remote_lid = pri_path->dlid; + req_msg->primary_local_lid = + htons(ntohl(sa_path_get_slid(pri_path))); + req_msg->primary_remote_lid = + htons(ntohl(sa_path_get_dlid(pri_path))); } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; @@ -1223,8 +1226,10 @@ static void cm_format_req(struct cm_req_msg *req_msg, if (alt_path) { if (alt_path->hop_limit <= 1) { - req_msg->alt_local_lid = alt_path->slid; - req_msg->alt_remote_lid = alt_path->dlid; + req_msg->alt_local_lid = + htons(ntohl(sa_path_get_slid(alt_path))); + req_msg->alt_remote_lid = + htons(ntohl(sa_path_get_dlid(alt_path))); } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; @@ -1401,14 +1406,15 @@ static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid, } static void cm_format_paths_from_req(struct cm_req_msg *req_msg, - struct ib_sa_path_rec *primary_path, - struct ib_sa_path_rec *alt_path) + struct sa_path_rec *primary_path, + struct sa_path_rec *alt_path) { - memset(primary_path, 0, sizeof *primary_path); primary_path->dgid = req_msg->primary_local_gid; primary_path->sgid = req_msg->primary_remote_gid; - primary_path->dlid = req_msg->primary_local_lid; - primary_path->slid = req_msg->primary_remote_lid; + sa_path_set_dlid(primary_path, + htonl(ntohs(req_msg->primary_local_lid))); + sa_path_set_slid(primary_path, + htonl(ntohs(req_msg->primary_remote_lid))); primary_path->flow_label = cm_req_get_primary_flow_label(req_msg); primary_path->hop_limit = req_msg->primary_hop_limit; primary_path->traffic_class = req_msg->primary_traffic_class; @@ -1426,11 +1432,12 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg, primary_path->service_id = req_msg->service_id; if (req_msg->alt_local_lid) { - memset(alt_path, 0, sizeof *alt_path); alt_path->dgid = req_msg->alt_local_gid; alt_path->sgid = req_msg->alt_remote_gid; - alt_path->dlid = req_msg->alt_local_lid; - alt_path->slid = req_msg->alt_remote_lid; + sa_path_set_dlid(alt_path, + htonl(ntohs(req_msg->alt_local_lid))); + sa_path_set_slid(alt_path, + htonl(ntohs(req_msg->alt_remote_lid))); alt_path->flow_label = cm_req_get_alt_flow_label(req_msg); alt_path->hop_limit = req_msg->alt_hop_limit; alt_path->traffic_class = req_msg->alt_traffic_class; @@ -1722,6 +1729,7 @@ static int cm_req_handler(struct cm_work *work) struct cm_req_msg *req_msg; union ib_gid gid; struct ib_gid_attr gid_attr; + const struct ib_global_route *grh; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1758,21 +1766,34 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv->id.service_mask = ~cpu_to_be64(0); cm_process_routed_req(req_msg, work->mad_recv_wc->wc); - cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); - memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); - work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit; + memset(&work->path[0], 0, sizeof(work->path[0])); + memset(&work->path[1], 0, sizeof(work->path[1])); + grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr); ret = ib_get_cached_gid(work->port->cm_dev->ib_device, work->port->port_num, - cm_id_priv->av.ah_attr.grh.sgid_index, + grh->sgid_index, &gid, &gid_attr); if (!ret) { if (gid_attr.ndev) { - work->path[0].ifindex = gid_attr.ndev->ifindex; - work->path[0].net = dev_net(gid_attr.ndev); + work->path[0].rec_type = + sa_conv_gid_to_pathrec_type(gid_attr.gid_type); + sa_path_set_ifindex(&work->path[0], + gid_attr.ndev->ifindex); + sa_path_set_ndev(&work->path[0], + dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); + } else { + work->path[0].rec_type = SA_PATH_REC_TYPE_IB; } - work->path[0].gid_type = gid_attr.gid_type; + if (req_msg->alt_local_lid) + work->path[1].rec_type = work->path[0].rec_type; + cm_format_paths_from_req(req_msg, &work->path[0], + &work->path[1]); + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + sa_path_set_dmac(&work->path[0], + cm_id_priv->av.ah_attr.roce.dmac); + work->path[0].hop_limit = grh->hop_limit; ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, cm_id_priv); } @@ -1782,11 +1803,18 @@ static int cm_req_handler(struct cm_work *work) &work->path[0].sgid, &gid_attr); if (!err && gid_attr.ndev) { - work->path[0].ifindex = gid_attr.ndev->ifindex; - work->path[0].net = dev_net(gid_attr.ndev); + work->path[0].rec_type = + sa_conv_gid_to_pathrec_type(gid_attr.gid_type); + sa_path_set_ifindex(&work->path[0], + gid_attr.ndev->ifindex); + sa_path_set_ndev(&work->path[0], + dev_net(gid_attr.ndev)); dev_put(gid_attr.ndev); + } else { + work->path[0].rec_type = SA_PATH_REC_TYPE_IB; } - work->path[0].gid_type = gid_attr.gid_type; + if (req_msg->alt_local_lid) + work->path[1].rec_type = work->path[0].rec_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); @@ -2811,7 +2839,7 @@ out: static void cm_format_lap(struct cm_lap_msg *lap_msg, struct cm_id_private *cm_id_priv, - struct ib_sa_path_rec *alternate_path, + struct sa_path_rec *alternate_path, const void *private_data, u8 private_data_len) { @@ -2822,8 +2850,10 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn); /* todo: need remote CM response timeout */ cm_lap_set_remote_resp_timeout(lap_msg, 0x1F); - lap_msg->alt_local_lid = alternate_path->slid; - lap_msg->alt_remote_lid = alternate_path->dlid; + lap_msg->alt_local_lid = + htons(ntohl(sa_path_get_slid(alternate_path))); + lap_msg->alt_remote_lid = + htons(ntohl(sa_path_get_dlid(alternate_path))); lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); @@ -2841,7 +2871,7 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, } int ib_send_cm_lap(struct ib_cm_id *cm_id, - struct ib_sa_path_rec *alternate_path, + struct sa_path_rec *alternate_path, const void *private_data, u8 private_data_len) { @@ -2895,14 +2925,15 @@ out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); EXPORT_SYMBOL(ib_send_cm_lap); static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, - struct ib_sa_path_rec *path, + struct sa_path_rec *path, struct cm_lap_msg *lap_msg) { memset(path, 0, sizeof *path); + path->rec_type = SA_PATH_REC_TYPE_IB; path->dgid = lap_msg->alt_local_gid; path->sgid = lap_msg->alt_remote_gid; - path->dlid = lap_msg->alt_local_lid; - path->slid = lap_msg->alt_remote_lid; + sa_path_set_dlid(path, htonl(ntohs(lap_msg->alt_local_lid))); + sa_path_set_slid(path, htonl(ntohs(lap_msg->alt_remote_lid))); path->flow_label = cm_lap_get_flow_label(lap_msg); path->hop_limit = lap_msg->alt_hop_limit; path->traffic_class = cm_lap_get_traffic_class(lap_msg); @@ -3409,6 +3440,8 @@ static void cm_process_send_error(struct ib_mad_send_buf *msg, if (msg != cm_id_priv->msg || state != cm_id_priv->id.state) goto discard; + pr_debug_ratelimited("CM: failed sending MAD in state %d. (%s)\n", + state, ib_wc_status_msg(wc_status)); switch (state) { case IB_CM_REQ_SENT: case IB_CM_MRA_REQ_RCVD: @@ -3706,7 +3739,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, atomic_long_inc(&port->counter_group[CM_RECV]. counter[attr_id - CM_ATTR_ID_OFFSET]); - work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths, + work = kmalloc(sizeof(*work) + sizeof(struct sa_path_rec) * paths, GFP_KERNEL); if (!work) { ib_free_recv_mad(mad_recv_wc); @@ -3798,7 +3831,7 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, cm_id_priv->responder_resources; qp_attr->min_rnr_timer = 0; } - if (cm_id_priv->alt_av.ah_attr.dlid) { + if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { *qp_attr_mask |= IB_QP_ALT_PATH; qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num; qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index; @@ -3852,7 +3885,7 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, default: break; } - if (cm_id_priv->alt_av.ah_attr.dlid) { + if (rdma_ah_get_dlid(&cm_id_priv->alt_av.ah_attr)) { *qp_attr_mask |= IB_QP_PATH_MIG_STATE; qp_attr->path_mig_state = IB_MIG_REARM; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 3e70a9c5d79d..0eb393237ba2 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -198,6 +198,7 @@ struct cma_device { atomic_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; + u8 *default_roce_tos; }; struct rdma_bind_list { @@ -269,8 +270,7 @@ struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, int cma_get_default_gid_type(struct cma_device *cma_dev, unsigned int port) { - if (port < rdma_start_port(cma_dev->device) || - port > rdma_end_port(cma_dev->device)) + if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; @@ -282,8 +282,7 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, { unsigned long supported_gids; - if (port < rdma_start_port(cma_dev->device) || - port > rdma_end_port(cma_dev->device)) + if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); @@ -297,6 +296,25 @@ int cma_set_default_gid_type(struct cma_device *cma_dev, return 0; } +int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port) +{ + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_roce_tos(struct cma_device *cma_dev, unsigned int port, + u8 default_roce_tos) +{ + if (!rdma_is_port_valid(cma_dev->device, port)) + return -EINVAL; + + cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = + default_roce_tos; + + return 0; +} struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; @@ -343,6 +361,7 @@ struct rdma_id_private { u32 options; u8 srq; u8 tos; + bool tos_set; u8 reuseaddr; u8 afonly; enum ib_gid_type gid_type; @@ -604,22 +623,11 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) ndev = dev_get_by_index(&init_net, bound_if_index); - if (ndev && ndev->flags & IFF_LOOPBACK) { - pr_info("detected loopback device\n"); - dev_put(ndev); - - if (!device->get_netdev) - return -EOPNOTSUPP; - - ndev = device->get_netdev(device, port); - if (!ndev) - return -ENODEV; - } - } else { + else gid_type = IB_GID_TYPE_IB; - } + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, NULL); @@ -709,6 +717,7 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) union ib_gid gid, sgid, *dgid; u16 pkey, index; u8 p; + enum ib_port_state port_state; int i; cma_dev = NULL; @@ -724,6 +733,8 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; + if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) + continue; for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid, NULL); i++) { @@ -735,7 +746,8 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) } if (!cma_dev && (gid.global.subnet_prefix == - dgid->global.subnet_prefix)) { + dgid->global.subnet_prefix) && + port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; @@ -778,6 +790,7 @@ struct rdma_cm_id *rdma_create_id(struct net *net, id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; + id_priv->tos_set = false; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); @@ -905,7 +918,8 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, goto out; ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num, - qp_attr.ah_attr.grh.sgid_index, &sgid, NULL); + rdma_ah_read_grh(&qp_attr.ah_attr)->sgid_index, + &sgid, NULL); if (ret) goto out; @@ -1019,6 +1033,8 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); + qp_attr->port_num = id_priv->id.port_num; + *qp_attr_mask |= IB_QP_PORT; } else ret = -ENOSYS; @@ -1103,7 +1119,7 @@ static inline int cma_any_port(struct sockaddr *addr) static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_cm_id *listen_id, - struct ib_sa_path_rec *path) + struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; @@ -1249,7 +1265,7 @@ static int cma_save_req_info(const struct ib_cm_event *ib_event, memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; - req->service_id = req_param->primary_path->service_id; + req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" @@ -1689,6 +1705,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) return 0; reject: + pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); cma_modify_qp_err(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); @@ -1730,6 +1747,9 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: + if (cma_comp(id_priv, RDMA_CM_CONNECT) && + (id_priv->id.qp_type != IB_QPT_UD)) + ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : @@ -1760,6 +1780,8 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: + pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, + ib_event->param.rej_rcvd.reason)); cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; @@ -1794,8 +1816,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; + struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; const __be64 service_id = - ib_event->param.req_rcvd.primary_path->service_id; + ib_event->param.req_rcvd.primary_path->service_id; int ret; id = rdma_create_id(listen_id->route.addr.dev_addr.net, @@ -1817,7 +1840,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, if (!rt->path_rec) goto err; - rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path; + rt->path_rec[0] = *path; if (rt->num_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; @@ -2266,10 +2289,11 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) id_priv = container_of(id, struct rdma_id_private, id); id_priv->tos = (u8) tos; + id_priv->tos_set = true; } EXPORT_SYMBOL(rdma_set_service_type); -static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, +static void cma_query_handler(int status, struct sa_path_rec *path_rec, void *context) { struct cma_work *work = context; @@ -2285,6 +2309,8 @@ static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; + pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", + status); } queue_work(cma_wq, &work->work); @@ -2294,18 +2320,24 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - struct ib_sa_path_rec path_rec; + struct sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); + + if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) + path_rec.rec_type = SA_PATH_REC_TYPE_OPA; + else + path_rec.rec_type = SA_PATH_REC_TYPE_IB; rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; - path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); + path_rec.service_id = rdma_get_service_id(&id_priv->id, + cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | @@ -2419,7 +2451,7 @@ err1: } int rdma_set_ib_paths(struct rdma_cm_id *id, - struct ib_sa_path_rec *path_rec, int num_paths) + struct sa_path_rec *path_rec, int num_paths) { struct rdma_id_private *id_priv; int ret; @@ -2467,14 +2499,12 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) struct net_device *dev; prio = rt_tos2priority(tos); - dev = ndev->priv_flags & IFF_802_1Q_VLAN ? - vlan_dev_real_dev(ndev) : ndev; - + dev = is_vlan_dev(ndev) ? vlan_dev_real_dev(ndev) : ndev; if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); #if IS_ENABLED(CONFIG_VLAN_8021Q) - if (ndev->priv_flags & IFF_802_1Q_VLAN) + if (is_vlan_dev(ndev)) return (vlan_dev_get_egress_qos_mask(ndev, prio) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; #endif @@ -2500,6 +2530,10 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct cma_work *work; int ret; struct net_device *ndev = NULL; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; work = kzalloc(sizeof *work, GFP_KERNEL); @@ -2526,36 +2560,22 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - if (ndev->flags & IFF_LOOPBACK) { - dev_put(ndev); - if (!id_priv->id.device->get_netdev) { - ret = -EOPNOTSUPP; - goto err2; - } - - ndev = id_priv->id.device->get_netdev(id_priv->id.device, - id_priv->id.port_num); - if (!ndev) { - ret = -ENODEV; - goto err2; - } - } - - route->path_rec->net = &init_net; - route->path_rec->ifindex = ndev->ifindex; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); - route->path_rec->gid_type = - cma_route_gid_type(addr->dev_addr.network, - supported_gids, - id_priv->gid_type); + gid_type = cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + route->path_rec->rec_type = + sa_conv_gid_to_pathrec_type(gid_type); + sa_path_set_ndev(route->path_rec, &init_net); + sa_path_set_ifindex(route->path_rec, ndev->ifindex); } if (!ndev) { ret = -ENODEV; goto err2; } - memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN); + sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); @@ -2563,8 +2583,10 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) &route->path_rec->dgid); /* Use the hint from IP Stack to select GID Type */ - if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network)) - route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network); + if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + gid_type = ib_network_to_gid_type(addr->dev_addr.network); + route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; @@ -2573,7 +2595,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; - route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos); + route->path_rec->sl = iboe_tos_to_sl(ndev, tos); + route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); @@ -2652,8 +2675,8 @@ static void cma_set_loopback(struct sockaddr *addr) static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; - struct ib_port_attr port_attr; union ib_gid gid; + enum ib_port_state port_state; u16 pkey; int ret; u8 p; @@ -2669,8 +2692,8 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) cma_dev = cur_dev; for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) { - if (!ib_query_port(cur_dev->device, p, &port_attr) && - port_attr.state == IB_PORT_ACTIVE) { + if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && + port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } @@ -2720,8 +2743,14 @@ static void addr_handler(int status, struct sockaddr *src_addr, goto out; memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); - if (!status && !id_priv->cma_dev) + if (!status && !id_priv->cma_dev) { status = cma_acquire_dev(id_priv, NULL); + if (status) + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", + status); + } else { + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); + } if (status) { if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, @@ -2833,20 +2862,26 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, int ret; id_priv = container_of(id, struct rdma_id_private, id); + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); if (id_priv->state == RDMA_CM_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); - if (ret) + if (ret) { + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); return ret; + } } - if (cma_family(id_priv) != dst_addr->sa_family) + if (cma_family(id_priv) != dst_addr->sa_family) { + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); return -EINVAL; + } - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { + memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); return -EINVAL; + } atomic_inc(&id_priv->refcount); - memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { @@ -2962,6 +2997,43 @@ err: return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } +static int cma_port_is_unique(struct rdma_bind_list *bind_list, + struct rdma_id_private *id_priv) +{ + struct rdma_id_private *cur_id; + struct sockaddr *daddr = cma_dst_addr(id_priv); + struct sockaddr *saddr = cma_src_addr(id_priv); + __be16 dport = cma_port(daddr); + + hlist_for_each_entry(cur_id, &bind_list->owners, node) { + struct sockaddr *cur_daddr = cma_dst_addr(cur_id); + struct sockaddr *cur_saddr = cma_src_addr(cur_id); + __be16 cur_dport = cma_port(cur_daddr); + + if (id_priv == cur_id) + continue; + + /* different dest port -> unique */ + if (!cma_any_port(cur_daddr) && + (dport != cur_dport)) + continue; + + /* different src address -> unique */ + if (!cma_any_addr(saddr) && + !cma_any_addr(cur_saddr) && + cma_addr_cmp(saddr, cur_saddr)) + continue; + + /* different dst address -> unique */ + if (!cma_any_addr(cur_daddr) && + cma_addr_cmp(daddr, cur_daddr)) + continue; + + return -EADDRNOTAVAIL; + } + return 0; +} + static int cma_alloc_any_port(enum rdma_port_space ps, struct rdma_id_private *id_priv) { @@ -2974,9 +3046,19 @@ static int cma_alloc_any_port(enum rdma_port_space ps, remaining = (high - low) + 1; rover = prandom_u32() % remaining + low; retry: - if (last_used_port != rover && - !cma_ps_find(net, ps, (unsigned short)rover)) { - int ret = cma_alloc_port(ps, id_priv, rover); + if (last_used_port != rover) { + struct rdma_bind_list *bind_list; + int ret; + + bind_list = cma_ps_find(net, ps, (unsigned short)rover); + + if (!bind_list) { + ret = cma_alloc_port(ps, id_priv, rover); + } else { + ret = cma_port_is_unique(bind_list, id_priv); + if (!ret) + cma_bind_port(bind_list, id_priv); + } /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. @@ -3205,6 +3287,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; int ret; + struct sockaddr *daddr; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) @@ -3244,6 +3327,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err2; + daddr = cma_dst_addr(id_priv); + daddr->sa_family = addr->sa_family; + return 0; err2: if (id_priv->cma_dev) @@ -3308,10 +3394,13 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; + pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", + event.status); break; } ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { + pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; @@ -3583,6 +3672,9 @@ static int cma_accept_iw(struct rdma_id_private *id_priv, struct iw_cm_conn_param iw_param; int ret; + if (!conn_param) + return -EINVAL; + ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; @@ -3760,10 +3852,17 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) if (!status) status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); + else + pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", + status); mutex_lock(&id_priv->qp_mutex); - if (!status && id_priv->id.qp) + if (!status && id_priv->id.qp) { status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, be16_to_cpu(multicast->rec.mlid)); + if (status) + pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to attach QP. status %d\n", + status); + } mutex_unlock(&id_priv->qp_mutex); memset(&event, 0, sizeof event); @@ -3833,63 +3932,10 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, } } -static void cma_query_sa_classport_info_cb(int status, - struct ib_class_port_info *rec, - void *context) -{ - struct class_port_info_context *cb_ctx = context; - - WARN_ON(!context); - - if (status || !rec) { - pr_debug("RDMA CM: %s port %u failed query ClassPortInfo status: %d\n", - cb_ctx->device->name, cb_ctx->port_num, status); - goto out; - } - - memcpy(cb_ctx->class_port_info, rec, sizeof(struct ib_class_port_info)); - -out: - complete(&cb_ctx->done); -} - -static int cma_query_sa_classport_info(struct ib_device *device, u8 port_num, - struct ib_class_port_info *class_port_info) -{ - struct class_port_info_context *cb_ctx; - int ret; - - cb_ctx = kmalloc(sizeof(*cb_ctx), GFP_KERNEL); - if (!cb_ctx) - return -ENOMEM; - - cb_ctx->device = device; - cb_ctx->class_port_info = class_port_info; - cb_ctx->port_num = port_num; - init_completion(&cb_ctx->done); - - ret = ib_sa_classport_info_rec_query(&sa_client, device, port_num, - CMA_QUERY_CLASSPORT_INFO_TIMEOUT, - GFP_KERNEL, cma_query_sa_classport_info_cb, - cb_ctx, &cb_ctx->sa_query); - if (ret < 0) { - pr_err("RDMA CM: %s port %u failed to send ClassPortInfo query, ret: %d\n", - device->name, port_num, ret); - goto out; - } - - wait_for_completion(&cb_ctx->done); - -out: - kfree(cb_ctx); - return ret; -} - static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; - struct ib_class_port_info class_port_info; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; @@ -3910,21 +3956,14 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; - if (rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) { - ret = cma_query_sa_classport_info(id_priv->id.device, - id_priv->id.port_num, - &class_port_info); - - if (ret) - return ret; - - if (!(ib_get_cpi_capmask2(&class_port_info) & - IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT)) { - pr_warn("RDMA CM: %s port %u Unable to multicast join\n" - "RDMA CM: SM doesn't support Send Only Full Member option\n", - id_priv->id.device->name, id_priv->id.port_num); - return -EOPNOTSUPP; - } + if ((rec.join_state == BIT(SENDONLY_FULLMEMBER_JOIN)) && + (!ib_sa_sendonly_fullmem_support(&sa_client, + id_priv->id.device, + id_priv->id.port_num))) { + pr_warn("RDMA CM: %s port %u Unable to multicast join\n" + "RDMA CM: SM doesn't support Send Only Full Member option\n", + id_priv->id.device->name, id_priv->id.port_num); + return -EOPNOTSUPP; } comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | @@ -4229,15 +4268,21 @@ static void cma_add_one(struct ib_device *device) cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); - if (!cma_dev->default_gid_type) { - kfree(cma_dev); - return; - } + if (!cma_dev->default_gid_type) + goto free_cma_dev; + + cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_roce_tos), + GFP_KERNEL); + if (!cma_dev->default_roce_tos) + goto free_gid_type; + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); cma_dev->default_gid_type[i - rdma_start_port(device)] = find_first_bit(&supported_gids, BITS_PER_LONG); + cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } init_completion(&cma_dev->comp); @@ -4250,6 +4295,16 @@ static void cma_add_one(struct ib_device *device) list_for_each_entry(id_priv, &listen_any_list, list) cma_listen_on_dev(id_priv, cma_dev); mutex_unlock(&lock); + + return; + +free_gid_type: + kfree(cma_dev->default_gid_type); + +free_cma_dev: + kfree(cma_dev); + + return; } static int cma_remove_id_dev(struct rdma_id_private *id_priv) @@ -4318,6 +4373,7 @@ static void cma_remove_one(struct ib_device *device, void *client_data) mutex_unlock(&lock); cma_process_remove(cma_dev); + kfree(cma_dev->default_roce_tos); kfree(cma_dev->default_gid_type); kfree(cma_dev); } diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 41573df1d9fc..54076a3e8007 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -139,8 +139,50 @@ static ssize_t default_roce_mode_store(struct config_item *item, CONFIGFS_ATTR(, default_roce_mode); +static ssize_t default_roce_tos_show(struct config_item *item, char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + ssize_t ret; + u8 tos; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + tos = cma_get_default_roce_tos(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + return sprintf(buf, "%u\n", tos); +} + +static ssize_t default_roce_tos_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + ssize_t ret; + u8 tos; + + ret = kstrtou8(buf, 0, &tos); + if (ret) + return ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + ret = cma_set_default_roce_tos(cma_dev, group->port_num, tos); + cma_configfs_params_put(cma_dev); + + return ret ? ret : strnlen(buf, count); +} + +CONFIGFS_ATTR(, default_roce_tos); + static struct configfs_attribute *cma_configfs_attributes[] = { &attr_default_roce_mode, + &attr_default_roce_tos, NULL, }; diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index d29372624f3a..11ae67514e13 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -35,8 +35,19 @@ #include <linux/list.h> #include <linux/spinlock.h> +#include <linux/cgroup_rdma.h> #include <rdma/ib_verbs.h> +#include <rdma/ib_mad.h> +#include "mad_priv.h" + +struct pkey_index_qp_list { + struct list_head pkey_index_list; + u16 pkey_index; + /* Lock to hold while iterating the qp_list. */ + spinlock_t qp_list_lock; + struct list_head qp_list; +}; #if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) int cma_configfs_init(void); @@ -62,6 +73,9 @@ int cma_get_default_gid_type(struct cma_device *cma_dev, int cma_set_default_gid_type(struct cma_device *cma_dev, unsigned int port, enum ib_gid_type default_gid_type); +int cma_get_default_roce_tos(struct cma_device *cma_dev, unsigned int port); +int cma_set_default_roce_tos(struct cma_device *a_dev, unsigned int port, + u8 default_roce_tos); struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); int ib_device_register_sysfs(struct ib_device *device, @@ -121,6 +135,35 @@ int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); +#ifdef CONFIG_CGROUP_RDMA +int ib_device_register_rdmacg(struct ib_device *device); +void ib_device_unregister_rdmacg(struct ib_device *device); + +int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index); + +void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index); +#else +static inline int ib_device_register_rdmacg(struct ib_device *device) +{ return 0; } + +static inline void ib_device_unregister_rdmacg(struct ib_device *device) +{ } + +static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ return 0; } + +static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, + struct ib_device *device, + enum rdmacg_resource_type resource_index) +{ } +#endif + static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) { @@ -136,6 +179,16 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); +int ibnl_init(void); +void ibnl_cleanup(void); + +/** + * Check if there are any listeners to the netlink group + * @group: the netlink group ID + * Returns 0 on success or a negative for no listeners. + */ +int ibnl_chk_listeners(unsigned int group); + int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct netlink_callback *cb); int ib_nl_handle_set_timeout(struct sk_buff *skb, @@ -143,4 +196,109 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, int ib_nl_handle_ip_res_resp(struct sk_buff *skb, struct netlink_callback *cb); +int ib_get_cached_subnet_prefix(struct ib_device *device, + u8 port_num, + u64 *sn_pfx); + +#ifdef CONFIG_SECURITY_INFINIBAND +int ib_security_pkey_access(struct ib_device *dev, + u8 port_num, + u16 pkey_index, + void *sec); + +void ib_security_destroy_port_pkey_list(struct ib_device *device); + +void ib_security_cache_change(struct ib_device *device, + u8 port_num, + u64 subnet_prefix); + +int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata); + +int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev); +void ib_destroy_qp_security_begin(struct ib_qp_security *sec); +void ib_destroy_qp_security_abort(struct ib_qp_security *sec); +void ib_destroy_qp_security_end(struct ib_qp_security *sec); +int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev); +void ib_close_shared_qp_security(struct ib_qp_security *sec); +int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type); +void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); +int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); +#else +static inline int ib_security_pkey_access(struct ib_device *dev, + u8 port_num, + u16 pkey_index, + void *sec) +{ + return 0; +} + +static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) +{ +} + +static inline void ib_security_cache_change(struct ib_device *device, + u8 port_num, + u64 subnet_prefix) +{ +} + +static inline int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata) +{ + return qp->device->modify_qp(qp->real_qp, + qp_attr, + qp_attr_mask, + udata); +} + +static inline int ib_create_qp_security(struct ib_qp *qp, + struct ib_device *dev) +{ + return 0; +} + +static inline void ib_destroy_qp_security_begin(struct ib_qp_security *sec) +{ +} + +static inline void ib_destroy_qp_security_abort(struct ib_qp_security *sec) +{ +} + +static inline void ib_destroy_qp_security_end(struct ib_qp_security *sec) +{ +} + +static inline int ib_open_shared_qp_security(struct ib_qp *qp, + struct ib_device *dev) +{ + return 0; +} + +static inline void ib_close_shared_qp_security(struct ib_qp_security *sec) +{ +} + +static inline int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type) +{ + return 0; +} + +static inline void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) +{ +} + +static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, + u16 pkey_index) +{ + return 0; +} +#endif #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index a754fc727de5..f2ae75fa3128 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -29,7 +29,13 @@ static int __ib_process_cq(struct ib_cq *cq, int budget) { int i, n, completed = 0; - while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) { + /* + * budget might be (-1) if the caller does not + * want to bound this call, thus we need unsigned + * minimum here. + */ + while ((n = ib_poll_cq(cq, min_t(u32, IB_POLL_BATCH, + budget - completed), cq->wc)) > 0) { for (i = 0; i < n; i++) { struct ib_wc *wc = &cq->wc[i]; @@ -58,8 +64,8 @@ static int __ib_process_cq(struct ib_cq *cq, int budget) * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different * context and does not ask for completion interrupts from the HCA. * - * Note: for compatibility reasons -1 can be passed in %budget for unlimited - * polling. Do not use this feature in new code, it will be removed soon. + * Note: do not pass -1 as %budget unless it is guaranteed that the number + * of completions that will be processed is small. */ int ib_process_cq_direct(struct ib_cq *cq, int budget) { @@ -120,7 +126,7 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the - * specified context. The ULP needs must use wr->wr_cqe instead of wr->wr_id + * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. */ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, @@ -196,7 +202,7 @@ void ib_free_cq(struct ib_cq *cq) irq_poll_disable(&cq->iop); break; case IB_POLL_WORKQUEUE: - flush_work(&cq->work); + cancel_work_sync(&cq->work); break; default: WARN_ON_ONCE(1); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 571974cd3919..a5dfab6adf49 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -39,6 +39,8 @@ #include <linux/init.h> #include <linux/mutex.h> #include <linux/netdevice.h> +#include <linux/security.h> +#include <linux/notifier.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> @@ -82,6 +84,14 @@ static LIST_HEAD(client_list); static DEFINE_MUTEX(device_mutex); static DECLARE_RWSEM(lists_rwsem); +static int ib_security_change(struct notifier_block *nb, unsigned long event, + void *lsm_data); +static void ib_policy_change_task(struct work_struct *work); +static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); + +static struct notifier_block ibdev_lsm_nb = { + .notifier_call = ib_security_change, +}; static int ib_device_check_mandatory(struct ib_device *device) { @@ -172,8 +182,16 @@ static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); - ib_cache_release_one(dev); - kfree(dev->port_immutable); + WARN_ON(dev->reg_state == IB_DEV_REGISTERED); + if (dev->reg_state == IB_DEV_UNREGISTERED) { + /* + * In IB_DEV_UNINITIALIZED state, cache or port table + * is not even created. Free cache and port table only when + * device reaches UNREGISTERED state. + */ + ib_cache_release_one(dev); + kfree(dev->port_immutable); + } kfree(dev); } @@ -317,6 +335,65 @@ void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) } EXPORT_SYMBOL(ib_get_device_fw_str); +static int setup_port_pkey_list(struct ib_device *device) +{ + int i; + + /** + * device->port_pkey_list is indexed directly by the port number, + * Therefore it is declared as a 1 based array with potential empty + * slots at the beginning. + */ + device->port_pkey_list = kcalloc(rdma_end_port(device) + 1, + sizeof(*device->port_pkey_list), + GFP_KERNEL); + + if (!device->port_pkey_list) + return -ENOMEM; + + for (i = 0; i < (rdma_end_port(device) + 1); i++) { + spin_lock_init(&device->port_pkey_list[i].list_lock); + INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list); + } + + return 0; +} + +static void ib_policy_change_task(struct work_struct *work) +{ + struct ib_device *dev; + + down_read(&lists_rwsem); + list_for_each_entry(dev, &device_list, core_list) { + int i; + + for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) { + u64 sp; + int ret = ib_get_cached_subnet_prefix(dev, + i, + &sp); + + WARN_ONCE(ret, + "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", + ret); + if (!ret) + ib_security_cache_change(dev, i, sp); + } + } + up_read(&lists_rwsem); +} + +static int ib_security_change(struct notifier_block *nb, unsigned long event, + void *lsm_data) +{ + if (event != LSM_POLICY_CHANGE) + return NOTIFY_DONE; + + schedule_work(&ib_policy_change_work); + + return NOTIFY_OK; +} + /** * ib_register_device - Register an IB device with IB core * @device:Device to register @@ -333,6 +410,29 @@ int ib_register_device(struct ib_device *device, int ret; struct ib_client *client; struct ib_udata uhw = {.outlen = 0, .inlen = 0}; + struct device *parent = device->dev.parent; + + WARN_ON_ONCE(!parent); + WARN_ON_ONCE(device->dma_device); + if (device->dev.dma_ops) { + /* + * The caller provided custom DMA operations. Copy the + * DMA-related fields that are used by e.g. dma_alloc_coherent() + * into device->dev. + */ + device->dma_device = &device->dev; + if (!device->dev.dma_mask) + device->dev.dma_mask = parent->dma_mask; + if (!device->dev.coherent_dma_mask) + device->dev.coherent_dma_mask = + parent->coherent_dma_mask; + } else { + /* + * The caller did not provide custom DMA operations. Use the + * DMA mapping operations of the parent device. + */ + device->dma_device = parent; + } mutex_lock(&device_mutex); @@ -354,26 +454,36 @@ int ib_register_device(struct ib_device *device, goto out; } + ret = setup_port_pkey_list(device); + if (ret) { + pr_warn("Couldn't create per port_pkey_list\n"); + goto out; + } + ret = ib_cache_setup_one(device); if (ret) { pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); - goto out; + goto port_cleanup; + } + + ret = ib_device_register_rdmacg(device); + if (ret) { + pr_warn("Couldn't register device with rdma cgroup\n"); + goto cache_cleanup; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->query_device(device, &device->attrs, &uhw); if (ret) { pr_warn("Couldn't query the device attributes\n"); - ib_cache_cleanup_one(device); - goto out; + goto cache_cleanup; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { pr_warn("Couldn't register device %s with driver model\n", device->name); - ib_cache_cleanup_one(device); - goto out; + goto cache_cleanup; } device->reg_state = IB_DEV_REGISTERED; @@ -385,6 +495,14 @@ int ib_register_device(struct ib_device *device, down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); + mutex_unlock(&device_mutex); + return 0; + +cache_cleanup: + ib_cache_cleanup_one(device); + ib_cache_release_one(device); +port_cleanup: + kfree(device->port_immutable); out: mutex_unlock(&device_mutex); return ret; @@ -421,9 +539,13 @@ void ib_unregister_device(struct ib_device *device) mutex_unlock(&device_mutex); + ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); ib_cache_cleanup_one(device); + ib_security_destroy_port_pkey_list(device); + kfree(device->port_pkey_list); + down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) @@ -659,7 +781,7 @@ int ib_query_port(struct ib_device *device, union ib_gid gid; int err; - if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + if (!rdma_is_port_valid(device, port_num)) return -EINVAL; memset(port_attr, 0, sizeof(*port_attr)); @@ -825,7 +947,7 @@ int ib_modify_port(struct ib_device *device, if (!device->modify_port) return -ENOSYS; - if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) + if (!rdma_is_port_valid(device, port_num)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, @@ -996,8 +1118,7 @@ static int __init ib_core_init(void) return -ENOMEM; ib_comp_wq = alloc_workqueue("ib-comp-wq", - WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, - WQ_UNBOUND_MAX_ACTIVE); + WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!ib_comp_wq) { ret = -ENOMEM; goto err; @@ -1039,10 +1160,18 @@ static int __init ib_core_init(void) goto err_sa; } + ret = register_lsm_notifier(&ibdev_lsm_nb); + if (ret) { + pr_warn("Couldn't register LSM notifier. ret %d\n", ret); + goto err_ibnl_clients; + } + ib_cache_setup(); return 0; +err_ibnl_clients: + ib_remove_ibnl_clients(); err_sa: ib_sa_cleanup(); err_mad: @@ -1062,6 +1191,7 @@ err: static void __exit ib_core_cleanup(void) { + unregister_lsm_notifier(&ibdev_lsm_nb); ib_cache_cleanup(); ib_remove_ibnl_clients(); ib_sa_cleanup(); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index cdfad5f26212..84d2615b5d4b 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -96,7 +96,8 @@ struct ib_fmr_pool { void * arg); void *flush_arg; - struct task_struct *thread; + struct kthread_worker *worker; + struct kthread_work work; atomic_t req_ser; atomic_t flush_ser; @@ -174,29 +175,19 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) spin_unlock_irq(&pool->pool_lock); } -static int ib_fmr_cleanup_thread(void *pool_ptr) +static void ib_fmr_cleanup_func(struct kthread_work *work) { - struct ib_fmr_pool *pool = pool_ptr; + struct ib_fmr_pool *pool = container_of(work, struct ib_fmr_pool, work); - do { - if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) { - ib_fmr_batch_release(pool); - - atomic_inc(&pool->flush_ser); - wake_up_interruptible(&pool->force_wait); - - if (pool->flush_function) - pool->flush_function(pool, pool->flush_arg); - } + ib_fmr_batch_release(pool); + atomic_inc(&pool->flush_ser); + wake_up_interruptible(&pool->force_wait); - set_current_state(TASK_INTERRUPTIBLE); - if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 && - !kthread_should_stop()) - schedule(); - __set_current_state(TASK_RUNNING); - } while (!kthread_should_stop()); + if (pool->flush_function) + pool->flush_function(pool, pool->flush_arg); - return 0; + if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) + kthread_queue_work(pool->worker, &pool->work); } /** @@ -265,15 +256,13 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, atomic_set(&pool->flush_ser, 0); init_waitqueue_head(&pool->force_wait); - pool->thread = kthread_run(ib_fmr_cleanup_thread, - pool, - "ib_fmr(%s)", - device->name); - if (IS_ERR(pool->thread)) { - pr_warn(PFX "couldn't start cleanup thread\n"); - ret = PTR_ERR(pool->thread); + pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name); + if (IS_ERR(pool->worker)) { + pr_warn(PFX "couldn't start cleanup kthread worker\n"); + ret = PTR_ERR(pool->worker); goto out_free_pool; } + kthread_init_work(&pool->work, ib_fmr_cleanup_func); { struct ib_pool_fmr *fmr; @@ -338,7 +327,7 @@ void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) LIST_HEAD(fmr_list); int i; - kthread_stop(pool->thread); + kthread_destroy_worker(pool->worker); ib_fmr_batch_release(pool); i = 0; @@ -388,7 +377,7 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool) spin_unlock_irq(&pool->pool_lock); serial = atomic_inc_return(&pool->req_ser); - wake_up_process(pool->thread); + kthread_queue_work(pool->worker, &pool->work); if (wait_event_interruptible(pool->force_wait, atomic_read(&pool->flush_ser) - serial >= 0)) @@ -502,7 +491,7 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) list_add_tail(&fmr->list, &pool->dirty_list); if (++pool->dirty_len >= pool->dirty_watermark) { atomic_inc(&pool->req_ser); - wake_up_process(pool->thread); + kthread_queue_work(pool->worker, &pool->work); } } } diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 3ef51a96bbf1..f13870e69ccd 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -472,12 +472,14 @@ int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, int ret; const char *err_str = ""; - ret = nlmsg_validate(cb->nlh, nlh_len, policy_max-1, nlmsg_policy); + ret = nlmsg_validate(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, + NULL); if (ret) { err_str = "Invalid attribute"; goto parse_nlmsg_error; } - ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max-1, nlmsg_policy); + ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max - 1, + nlmsg_policy, NULL); if (ret) { err_str = "Unable to parse the nlmsg"; goto parse_nlmsg_error; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index a009f7132c73..f8f53bb90837 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -40,9 +40,11 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/security.h> #include <rdma/ib_cache.h> #include "mad_priv.h" +#include "core_priv.h" #include "mad_rmpp.h" #include "smi.h" #include "opa_smi.h" @@ -316,7 +318,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { - dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n"); + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid port %d\n", + port_num); ret = ERR_PTR(-ENODEV); goto error1; } @@ -367,6 +371,12 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, atomic_set(&mad_agent_priv->refcount, 1); init_completion(&mad_agent_priv->comp); + ret2 = ib_mad_agent_security_setup(&mad_agent_priv->agent, qp_type); + if (ret2) { + ret = ERR_PTR(ret2); + goto error4; + } + spin_lock_irqsave(&port_priv->reg_lock, flags); mad_agent_priv->agent.hi_tid = ++ib_mad_client_id; @@ -384,7 +394,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, if (method) { if (method_in_use(&method, mad_reg_req)) - goto error4; + goto error5; } } ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv, @@ -400,14 +410,14 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, if (is_vendor_method_in_use( vendor_class, mad_reg_req)) - goto error4; + goto error5; } } ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv); } if (ret2) { ret = ERR_PTR(ret2); - goto error4; + goto error5; } } @@ -416,9 +426,10 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, spin_unlock_irqrestore(&port_priv->reg_lock, flags); return &mad_agent_priv->agent; - -error4: +error5: spin_unlock_irqrestore(&port_priv->reg_lock, flags); + ib_mad_agent_security_cleanup(&mad_agent_priv->agent); +error4: kfree(reg_req); error3: kfree(mad_agent_priv); @@ -489,6 +500,7 @@ struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, struct ib_mad_agent *ret; struct ib_mad_snoop_private *mad_snoop_priv; int qpn; + int err; /* Validate parameters */ if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) || @@ -523,17 +535,25 @@ struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, mad_snoop_priv->agent.port_num = port_num; mad_snoop_priv->mad_snoop_flags = mad_snoop_flags; init_completion(&mad_snoop_priv->comp); + + err = ib_mad_agent_security_setup(&mad_snoop_priv->agent, qp_type); + if (err) { + ret = ERR_PTR(err); + goto error2; + } + mad_snoop_priv->snoop_index = register_snoop_agent( &port_priv->qp_info[qpn], mad_snoop_priv); if (mad_snoop_priv->snoop_index < 0) { ret = ERR_PTR(mad_snoop_priv->snoop_index); - goto error2; + goto error3; } atomic_set(&mad_snoop_priv->refcount, 1); return &mad_snoop_priv->agent; - +error3: + ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); error2: kfree(mad_snoop_priv); error1: @@ -579,6 +599,8 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) deref_mad_agent(mad_agent_priv); wait_for_completion(&mad_agent_priv->comp); + ib_mad_agent_security_cleanup(&mad_agent_priv->agent); + kfree(mad_agent_priv->reg_req); kfree(mad_agent_priv); } @@ -597,13 +619,15 @@ static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) deref_snoop_agent(mad_snoop_priv); wait_for_completion(&mad_snoop_priv->comp); + ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); + kfree(mad_snoop_priv); } /* * ib_unregister_mad_agent - Unregisters a client from using MAD services */ -int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) +void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_snoop_private *mad_snoop_priv; @@ -620,7 +644,6 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) agent); unregister_mad_snoop(mad_snoop_priv); } - return 0; } EXPORT_SYMBOL(ib_unregister_mad_agent); @@ -1214,12 +1237,16 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, /* Walk list of send WRs and post each on send list */ for (; send_buf; send_buf = next_send_buf) { - mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, send_buf); mad_agent_priv = mad_send_wr->mad_agent_priv; + ret = ib_mad_enforce_security(mad_agent_priv, + mad_send_wr->send_wr.pkey_index); + if (ret) + goto error; + if (!send_buf->mad_agent->send_handler || (send_buf->timeout_ms && !send_buf->mad_agent->recv_handler)) { @@ -1832,12 +1859,13 @@ static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_ const struct ib_mad_send_wr_private *wr, const struct ib_mad_recv_wc *rwc ) { - struct ib_ah_attr attr; + struct rdma_ah_attr attr; u8 send_resp, rcv_resp; union ib_gid sgid; struct ib_device *device = mad_agent_priv->agent.device; u8 port_num = mad_agent_priv->agent.port_num; u8 lmc; + bool has_grh; send_resp = ib_response_mad((struct ib_mad_hdr *)wr->send_buf.mad); rcv_resp = ib_response_mad(&rwc->recv_buf.mad->mad_hdr); @@ -1846,36 +1874,40 @@ static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_ /* both requests, or both responses. GIDs different */ return 0; - if (ib_query_ah(wr->send_buf.ah, &attr)) + if (rdma_query_ah(wr->send_buf.ah, &attr)) /* Assume not equal, to avoid false positives. */ return 0; - if (!!(attr.ah_flags & IB_AH_GRH) != - !!(rwc->wc->wc_flags & IB_WC_GRH)) + has_grh = !!(rdma_ah_get_ah_flags(&attr) & IB_AH_GRH); + if (has_grh != !!(rwc->wc->wc_flags & IB_WC_GRH)) /* one has GID, other does not. Assume different */ return 0; if (!send_resp && rcv_resp) { /* is request/response. */ - if (!(attr.ah_flags & IB_AH_GRH)) { + if (!has_grh) { if (ib_get_cached_lmc(device, port_num, &lmc)) return 0; - return (!lmc || !((attr.src_path_bits ^ + return (!lmc || !((rdma_ah_get_path_bits(&attr) ^ rwc->wc->dlid_path_bits) & ((1 << lmc) - 1))); } else { + const struct ib_global_route *grh = + rdma_ah_read_grh(&attr); + if (ib_get_cached_gid(device, port_num, - attr.grh.sgid_index, &sgid, NULL)) + grh->sgid_index, &sgid, NULL)) return 0; return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, 16); } } - if (!(attr.ah_flags & IB_AH_GRH)) - return attr.dlid == rwc->wc->slid; + if (!has_grh) + return rdma_ah_get_dlid(&attr) == rwc->wc->slid; else - return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw, + return !memcmp(rdma_ah_read_grh(&attr)->dgid.raw, + rwc->recv_buf.grh->sgid.raw, 16); } @@ -1940,6 +1972,14 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; + int ret; + + ret = ib_mad_enforce_security(mad_agent_priv, + mad_recv_wc->wc->pkey_index); + if (ret) { + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + } INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); @@ -1997,6 +2037,8 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, mad_recv_wc); deref_mad_agent(mad_agent_priv); } + + return; } static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv, diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 382941b46e43..0d3cca0a8890 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -81,7 +81,7 @@ static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv) { deref_rmpp_recv(rmpp_recv); wait_for_completion(&rmpp_recv->comp); - ib_destroy_ah(rmpp_recv->ah); + rdma_destroy_ah(rmpp_recv->ah); kfree(rmpp_recv); } @@ -171,7 +171,7 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, hdr_len, 0, GFP_KERNEL, IB_MGMT_BASE_VERSION); if (IS_ERR(msg)) - ib_destroy_ah(ah); + rdma_destroy_ah(ah); else { msg->ah = ah; msg->context[0] = ah; @@ -201,7 +201,7 @@ static void ack_ds_ack(struct ib_mad_agent_private *agent, ret = ib_post_send_mad(msg, NULL); if (ret) { - ib_destroy_ah(msg->ah); + rdma_destroy_ah(msg->ah); ib_free_send_mad(msg); } } @@ -209,7 +209,7 @@ static void ack_ds_ack(struct ib_mad_agent_private *agent, void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc) { if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah) - ib_destroy_ah(mad_send_wc->send_buf->ah); + rdma_destroy_ah(mad_send_wc->send_buf->ah); ib_free_send_mad(mad_send_wc->send_buf); } @@ -237,7 +237,7 @@ static void nack_recv(struct ib_mad_agent_private *agent, ret = ib_post_send_mad(msg, NULL); if (ret) { - ib_destroy_ah(msg->ah); + rdma_destroy_ah(msg->ah); ib_free_send_mad(msg); } } @@ -852,7 +852,7 @@ static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr) struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv; struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad; struct mad_rmpp_recv *rmpp_recv; - struct ib_ah_attr ah_attr; + struct rdma_ah_attr ah_attr; unsigned long flags; int newwin = 1; @@ -867,10 +867,10 @@ static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr) (rmpp_recv->method & IB_MGMT_METHOD_RESP)) continue; - if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr)) + if (rdma_query_ah(mad_send_wr->send_buf.ah, &ah_attr)) continue; - if (rmpp_recv->slid == ah_attr.dlid) { + if (rmpp_recv->slid == rdma_ah_get_dlid(&ah_attr)) { newwin = rmpp_recv->repwin; break; } diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 322cb67b07a9..45f2f095f793 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -720,7 +720,7 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, struct net_device *ndev, enum ib_gid_type gid_type, - struct ib_ah_attr *ah_attr) + struct rdma_ah_attr *ah_attr) { int ret; u16 gid_index; @@ -743,19 +743,18 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, return ret; memset(ah_attr, 0, sizeof *ah_attr); - ah_attr->dlid = be16_to_cpu(rec->mlid); - ah_attr->sl = rec->sl; - ah_attr->port_num = port_num; - ah_attr->static_rate = rec->rate; - - ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = rec->mgid; - - ah_attr->grh.sgid_index = (u8) gid_index; - ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); - ah_attr->grh.hop_limit = rec->hop_limit; - ah_attr->grh.traffic_class = rec->traffic_class; - + ah_attr->type = rdma_ah_find_type(device, port_num); + + rdma_ah_set_dlid(ah_attr, be16_to_cpu(rec->mlid)); + rdma_ah_set_sl(ah_attr, rec->sl); + rdma_ah_set_port_num(ah_attr, port_num); + rdma_ah_set_static_rate(ah_attr, rec->rate); + + rdma_ah_set_grh(ah_attr, &rec->mgid, + be32_to_cpu(rec->flow_label), + (u8)gid_index, + rec->hop_limit, + rec->traffic_class); return 0; } EXPORT_SYMBOL(ib_init_ah_from_mcmember); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 10469b0088b5..94931c474d41 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -37,6 +37,7 @@ #include <net/net_namespace.h> #include <net/sock.h> #include <rdma/rdma_netlink.h> +#include "core_priv.h" struct ibnl_client { struct list_head list; @@ -55,7 +56,6 @@ int ibnl_chk_listeners(unsigned int group) return -1; return 0; } -EXPORT_SYMBOL(ibnl_chk_listeners); int ibnl_add_client(int index, int nops, const struct ibnl_client_cbs cb_table[]) @@ -146,7 +146,8 @@ nla_put_failure: } EXPORT_SYMBOL(ibnl_put_attr); -static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct ibnl_client *client; int type = nlh->nlmsg_type; @@ -209,7 +210,7 @@ static void ibnl_rcv_reply_skb(struct sk_buff *skb) if (nlh->nlmsg_flags & NLM_F_REQUEST) return; - ibnl_rcv_msg(skb, nlh); + ibnl_rcv_msg(skb, nlh, NULL); msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c new file mode 100644 index 000000000000..41c31a2bf093 --- /dev/null +++ b/drivers/infiniband/core/rdma_core.c @@ -0,0 +1,627 @@ +/* + * Copyright (c) 2016, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <rdma/ib_verbs.h> +#include <rdma/uverbs_types.h> +#include <linux/rcupdate.h> +#include "uverbs.h" +#include "core_priv.h" +#include "rdma_core.h" + +void uverbs_uobject_get(struct ib_uobject *uobject) +{ + kref_get(&uobject->ref); +} + +static void uverbs_uobject_free(struct kref *ref) +{ + struct ib_uobject *uobj = + container_of(ref, struct ib_uobject, ref); + + if (uobj->type->type_class->needs_kfree_rcu) + kfree_rcu(uobj, rcu); + else + kfree(uobj); +} + +void uverbs_uobject_put(struct ib_uobject *uobject) +{ + kref_put(&uobject->ref, uverbs_uobject_free); +} + +static int uverbs_try_lock_object(struct ib_uobject *uobj, bool exclusive) +{ + /* + * When a shared access is required, we use a positive counter. Each + * shared access request checks that the value != -1 and increment it. + * Exclusive access is required for operations like write or destroy. + * In exclusive access mode, we check that the counter is zero (nobody + * claimed this object) and we set it to -1. Releasing a shared access + * lock is done simply by decreasing the counter. As for exclusive + * access locks, since only a single one of them is is allowed + * concurrently, setting the counter to zero is enough for releasing + * this lock. + */ + if (!exclusive) + return __atomic_add_unless(&uobj->usecnt, 1, -1) == -1 ? + -EBUSY : 0; + + /* lock is either WRITE or DESTROY - should be exclusive */ + return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY; +} + +static struct ib_uobject *alloc_uobj(struct ib_ucontext *context, + const struct uverbs_obj_type *type) +{ + struct ib_uobject *uobj = kzalloc(type->obj_size, GFP_KERNEL); + + if (!uobj) + return ERR_PTR(-ENOMEM); + /* + * user_handle should be filled by the handler, + * The object is added to the list in the commit stage. + */ + uobj->context = context; + uobj->type = type; + atomic_set(&uobj->usecnt, 0); + kref_init(&uobj->ref); + + return uobj; +} + +static int idr_add_uobj(struct ib_uobject *uobj) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock(&uobj->context->ufile->idr_lock); + + /* + * We start with allocating an idr pointing to NULL. This represents an + * object which isn't initialized yet. We'll replace it later on with + * the real object once we commit. + */ + ret = idr_alloc(&uobj->context->ufile->idr, NULL, 0, + min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT); + if (ret >= 0) + uobj->id = ret; + + spin_unlock(&uobj->context->ufile->idr_lock); + idr_preload_end(); + + return ret < 0 ? ret : 0; +} + +/* + * It only removes it from the uobjects list, uverbs_uobject_put() is still + * required. + */ +static void uverbs_idr_remove_uobj(struct ib_uobject *uobj) +{ + spin_lock(&uobj->context->ufile->idr_lock); + idr_remove(&uobj->context->ufile->idr, uobj->id); + spin_unlock(&uobj->context->ufile->idr_lock); +} + +/* Returns the ib_uobject or an error. The caller should check for IS_ERR. */ +static struct ib_uobject *lookup_get_idr_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, + int id, bool exclusive) +{ + struct ib_uobject *uobj; + + rcu_read_lock(); + /* object won't be released as we're protected in rcu */ + uobj = idr_find(&ucontext->ufile->idr, id); + if (!uobj) { + uobj = ERR_PTR(-ENOENT); + goto free; + } + + uverbs_uobject_get(uobj); +free: + rcu_read_unlock(); + return uobj; +} + +static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, + int id, bool exclusive) +{ + struct file *f; + struct ib_uobject *uobject; + const struct uverbs_obj_fd_type *fd_type = + container_of(type, struct uverbs_obj_fd_type, type); + + if (exclusive) + return ERR_PTR(-EOPNOTSUPP); + + f = fget(id); + if (!f) + return ERR_PTR(-EBADF); + + uobject = f->private_data; + /* + * fget(id) ensures we are not currently running uverbs_close_fd, + * and the caller is expected to ensure that uverbs_close_fd is never + * done while a call top lookup is possible. + */ + if (f->f_op != fd_type->fops) { + fput(f); + return ERR_PTR(-EBADF); + } + + uverbs_uobject_get(uobject); + return uobject; +} + +struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, + int id, bool exclusive) +{ + struct ib_uobject *uobj; + int ret; + + uobj = type->type_class->lookup_get(type, ucontext, id, exclusive); + if (IS_ERR(uobj)) + return uobj; + + if (uobj->type != type) { + ret = -EINVAL; + goto free; + } + + ret = uverbs_try_lock_object(uobj, exclusive); + if (ret) { + WARN(ucontext->cleanup_reason, + "ib_uverbs: Trying to lookup_get while cleanup context\n"); + goto free; + } + + return uobj; +free: + uobj->type->type_class->lookup_put(uobj, exclusive); + uverbs_uobject_put(uobj); + return ERR_PTR(ret); +} + +static struct ib_uobject *alloc_begin_idr_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext) +{ + int ret; + struct ib_uobject *uobj; + + uobj = alloc_uobj(ucontext, type); + if (IS_ERR(uobj)) + return uobj; + + ret = idr_add_uobj(uobj); + if (ret) + goto uobj_put; + + ret = ib_rdmacg_try_charge(&uobj->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_OBJECT); + if (ret) + goto idr_remove; + + return uobj; + +idr_remove: + uverbs_idr_remove_uobj(uobj); +uobj_put: + uverbs_uobject_put(uobj); + return ERR_PTR(ret); +} + +static struct ib_uobject *alloc_begin_fd_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext) +{ + const struct uverbs_obj_fd_type *fd_type = + container_of(type, struct uverbs_obj_fd_type, type); + int new_fd; + struct ib_uobject *uobj; + struct ib_uobject_file *uobj_file; + struct file *filp; + + new_fd = get_unused_fd_flags(O_CLOEXEC); + if (new_fd < 0) + return ERR_PTR(new_fd); + + uobj = alloc_uobj(ucontext, type); + if (IS_ERR(uobj)) { + put_unused_fd(new_fd); + return uobj; + } + + uobj_file = container_of(uobj, struct ib_uobject_file, uobj); + filp = anon_inode_getfile(fd_type->name, + fd_type->fops, + uobj_file, + fd_type->flags); + if (IS_ERR(filp)) { + put_unused_fd(new_fd); + uverbs_uobject_put(uobj); + return (void *)filp; + } + + uobj_file->uobj.id = new_fd; + uobj_file->uobj.object = filp; + uobj_file->ufile = ucontext->ufile; + INIT_LIST_HEAD(&uobj->list); + kref_get(&uobj_file->ufile->ref); + + return uobj; +} + +struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext) +{ + return type->type_class->alloc_begin(type, ucontext); +} + +static void uverbs_uobject_add(struct ib_uobject *uobject) +{ + mutex_lock(&uobject->context->uobjects_lock); + list_add(&uobject->list, &uobject->context->uobjects); + mutex_unlock(&uobject->context->uobjects_lock); +} + +static int __must_check remove_commit_idr_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + const struct uverbs_obj_idr_type *idr_type = + container_of(uobj->type, struct uverbs_obj_idr_type, + type); + int ret = idr_type->destroy_object(uobj, why); + + /* + * We can only fail gracefully if the user requested to destroy the + * object. In the rest of the cases, just remove whatever you can. + */ + if (why == RDMA_REMOVE_DESTROY && ret) + return ret; + + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + uverbs_idr_remove_uobj(uobj); + + return ret; +} + +static void alloc_abort_fd_uobject(struct ib_uobject *uobj) +{ + struct ib_uobject_file *uobj_file = + container_of(uobj, struct ib_uobject_file, uobj); + struct file *filp = uobj->object; + int id = uobj_file->uobj.id; + + /* Unsuccessful NEW */ + fput(filp); + put_unused_fd(id); +} + +static int __must_check remove_commit_fd_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + const struct uverbs_obj_fd_type *fd_type = + container_of(uobj->type, struct uverbs_obj_fd_type, type); + struct ib_uobject_file *uobj_file = + container_of(uobj, struct ib_uobject_file, uobj); + int ret = fd_type->context_closed(uobj_file, why); + + if (why == RDMA_REMOVE_DESTROY && ret) + return ret; + + if (why == RDMA_REMOVE_DURING_CLEANUP) { + alloc_abort_fd_uobject(uobj); + return ret; + } + + uobj_file->uobj.context = NULL; + return ret; +} + +static void lockdep_check(struct ib_uobject *uobj, bool exclusive) +{ +#ifdef CONFIG_LOCKDEP + if (exclusive) + WARN_ON(atomic_read(&uobj->usecnt) > 0); + else + WARN_ON(atomic_read(&uobj->usecnt) == -1); +#endif +} + +static int __must_check _rdma_remove_commit_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + int ret; + struct ib_ucontext *ucontext = uobj->context; + + ret = uobj->type->type_class->remove_commit(uobj, why); + if (ret && why == RDMA_REMOVE_DESTROY) { + /* We couldn't remove the object, so just unlock the uobject */ + atomic_set(&uobj->usecnt, 0); + uobj->type->type_class->lookup_put(uobj, true); + } else { + mutex_lock(&ucontext->uobjects_lock); + list_del(&uobj->list); + mutex_unlock(&ucontext->uobjects_lock); + /* put the ref we took when we created the object */ + uverbs_uobject_put(uobj); + } + + return ret; +} + +/* This is called only for user requested DESTROY reasons */ +int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj) +{ + int ret; + struct ib_ucontext *ucontext = uobj->context; + + /* put the ref count we took at lookup_get */ + uverbs_uobject_put(uobj); + /* Cleanup is running. Calling this should have been impossible */ + if (!down_read_trylock(&ucontext->cleanup_rwsem)) { + WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); + return 0; + } + lockdep_check(uobj, true); + ret = _rdma_remove_commit_uobject(uobj, RDMA_REMOVE_DESTROY); + + up_read(&ucontext->cleanup_rwsem); + return ret; +} + +static void alloc_commit_idr_uobject(struct ib_uobject *uobj) +{ + uverbs_uobject_add(uobj); + spin_lock(&uobj->context->ufile->idr_lock); + /* + * We already allocated this IDR with a NULL object, so + * this shouldn't fail. + */ + WARN_ON(idr_replace(&uobj->context->ufile->idr, + uobj, uobj->id)); + spin_unlock(&uobj->context->ufile->idr_lock); +} + +static void alloc_commit_fd_uobject(struct ib_uobject *uobj) +{ + struct ib_uobject_file *uobj_file = + container_of(uobj, struct ib_uobject_file, uobj); + + uverbs_uobject_add(&uobj_file->uobj); + fd_install(uobj_file->uobj.id, uobj->object); + /* This shouldn't be used anymore. Use the file object instead */ + uobj_file->uobj.id = 0; + /* Get another reference as we export this to the fops */ + uverbs_uobject_get(&uobj_file->uobj); +} + +int rdma_alloc_commit_uobject(struct ib_uobject *uobj) +{ + /* Cleanup is running. Calling this should have been impossible */ + if (!down_read_trylock(&uobj->context->cleanup_rwsem)) { + int ret; + + WARN(true, "ib_uverbs: Cleanup is running while allocating an uobject\n"); + ret = uobj->type->type_class->remove_commit(uobj, + RDMA_REMOVE_DURING_CLEANUP); + if (ret) + pr_warn("ib_uverbs: cleanup of idr object %d failed\n", + uobj->id); + return ret; + } + + uobj->type->type_class->alloc_commit(uobj); + up_read(&uobj->context->cleanup_rwsem); + + return 0; +} + +static void alloc_abort_idr_uobject(struct ib_uobject *uobj) +{ + uverbs_idr_remove_uobj(uobj); + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + uverbs_uobject_put(uobj); +} + +void rdma_alloc_abort_uobject(struct ib_uobject *uobj) +{ + uobj->type->type_class->alloc_abort(uobj); +} + +static void lookup_put_idr_uobject(struct ib_uobject *uobj, bool exclusive) +{ +} + +static void lookup_put_fd_uobject(struct ib_uobject *uobj, bool exclusive) +{ + struct file *filp = uobj->object; + + WARN_ON(exclusive); + /* This indirectly calls uverbs_close_fd and free the object */ + fput(filp); +} + +void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool exclusive) +{ + lockdep_check(uobj, exclusive); + uobj->type->type_class->lookup_put(uobj, exclusive); + /* + * In order to unlock an object, either decrease its usecnt for + * read access or zero it in case of exclusive access. See + * uverbs_try_lock_object for locking schema information. + */ + if (!exclusive) + atomic_dec(&uobj->usecnt); + else + atomic_set(&uobj->usecnt, 0); + + uverbs_uobject_put(uobj); +} + +const struct uverbs_obj_type_class uverbs_idr_class = { + .alloc_begin = alloc_begin_idr_uobject, + .lookup_get = lookup_get_idr_uobject, + .alloc_commit = alloc_commit_idr_uobject, + .alloc_abort = alloc_abort_idr_uobject, + .lookup_put = lookup_put_idr_uobject, + .remove_commit = remove_commit_idr_uobject, + /* + * When we destroy an object, we first just lock it for WRITE and + * actually DESTROY it in the finalize stage. So, the problematic + * scenario is when we just started the finalize stage of the + * destruction (nothing was executed yet). Now, the other thread + * fetched the object for READ access, but it didn't lock it yet. + * The DESTROY thread continues and starts destroying the object. + * When the other thread continue - without the RCU, it would + * access freed memory. However, the rcu_read_lock delays the free + * until the rcu_read_lock of the READ operation quits. Since the + * exclusive lock of the object is still taken by the DESTROY flow, the + * READ operation will get -EBUSY and it'll just bail out. + */ + .needs_kfree_rcu = true, +}; + +static void _uverbs_close_fd(struct ib_uobject_file *uobj_file) +{ + struct ib_ucontext *ucontext; + struct ib_uverbs_file *ufile = uobj_file->ufile; + int ret; + + mutex_lock(&uobj_file->ufile->cleanup_mutex); + + /* uobject was either already cleaned up or is cleaned up right now anyway */ + if (!uobj_file->uobj.context || + !down_read_trylock(&uobj_file->uobj.context->cleanup_rwsem)) + goto unlock; + + ucontext = uobj_file->uobj.context; + ret = _rdma_remove_commit_uobject(&uobj_file->uobj, RDMA_REMOVE_CLOSE); + up_read(&ucontext->cleanup_rwsem); + if (ret) + pr_warn("uverbs: unable to clean up uobject file in uverbs_close_fd.\n"); +unlock: + mutex_unlock(&ufile->cleanup_mutex); +} + +void uverbs_close_fd(struct file *f) +{ + struct ib_uobject_file *uobj_file = f->private_data; + struct kref *uverbs_file_ref = &uobj_file->ufile->ref; + + _uverbs_close_fd(uobj_file); + uverbs_uobject_put(&uobj_file->uobj); + kref_put(uverbs_file_ref, ib_uverbs_release_file); +} + +void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed) +{ + enum rdma_remove_reason reason = device_removed ? + RDMA_REMOVE_DRIVER_REMOVE : RDMA_REMOVE_CLOSE; + unsigned int cur_order = 0; + + ucontext->cleanup_reason = reason; + /* + * Waits for all remove_commit and alloc_commit to finish. Logically, We + * want to hold this forever as the context is going to be destroyed, + * but we'll release it since it causes a "held lock freed" BUG message. + */ + down_write(&ucontext->cleanup_rwsem); + + while (!list_empty(&ucontext->uobjects)) { + struct ib_uobject *obj, *next_obj; + unsigned int next_order = UINT_MAX; + + /* + * This shouldn't run while executing other commands on this + * context. Thus, the only thing we should take care of is + * releasing a FD while traversing this list. The FD could be + * closed and released from the _release fop of this FD. + * In order to mitigate this, we add a lock. + * We take and release the lock per order traversal in order + * to let other threads (which might still use the FDs) chance + * to run. + */ + mutex_lock(&ucontext->uobjects_lock); + list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects, + list) { + if (obj->type->destroy_order == cur_order) { + int ret; + + /* + * if we hit this WARN_ON, that means we are + * racing with a lookup_get. + */ + WARN_ON(uverbs_try_lock_object(obj, true)); + ret = obj->type->type_class->remove_commit(obj, + reason); + list_del(&obj->list); + if (ret) + pr_warn("ib_uverbs: failed to remove uobject id %d order %u\n", + obj->id, cur_order); + /* put the ref we took when we created the object */ + uverbs_uobject_put(obj); + } else { + next_order = min(next_order, + obj->type->destroy_order); + } + } + mutex_unlock(&ucontext->uobjects_lock); + cur_order = next_order; + } + up_write(&ucontext->cleanup_rwsem); +} + +void uverbs_initialize_ucontext(struct ib_ucontext *ucontext) +{ + ucontext->cleanup_reason = 0; + mutex_init(&ucontext->uobjects_lock); + INIT_LIST_HEAD(&ucontext->uobjects); + init_rwsem(&ucontext->cleanup_rwsem); +} + +const struct uverbs_obj_type_class uverbs_fd_class = { + .alloc_begin = alloc_begin_fd_uobject, + .lookup_get = lookup_get_fd_uobject, + .alloc_commit = alloc_commit_fd_uobject, + .alloc_abort = alloc_abort_fd_uobject, + .lookup_put = lookup_put_fd_uobject, + .remove_commit = remove_commit_fd_uobject, + .needs_kfree_rcu = false, +}; + diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h new file mode 100644 index 000000000000..1b82e7ff7fe8 --- /dev/null +++ b/drivers/infiniband/core/rdma_core.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. + * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RDMA_CORE_H +#define RDMA_CORE_H + +#include <linux/idr.h> +#include <rdma/uverbs_types.h> +#include <rdma/ib_verbs.h> +#include <linux/mutex.h> + +/* + * These functions initialize the context and cleanups its uobjects. + * The context has a list of objects which is protected by a mutex + * on the context. initialize_ucontext should be called when we create + * a context. + * cleanup_ucontext removes all uobjects from the context and puts them. + */ +void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed); +void uverbs_initialize_ucontext(struct ib_ucontext *ucontext); + +/* + * uverbs_uobject_get is called in order to increase the reference count on + * an uobject. This is useful when a handler wants to keep the uobject's memory + * alive, regardless if this uobject is still alive in the context's objects + * repository. Objects are put via uverbs_uobject_put. + */ +void uverbs_uobject_get(struct ib_uobject *uobject); + +/* + * In order to indicate we no longer needs this uobject, uverbs_uobject_put + * is called. When the reference count is decreased, the uobject is freed. + * For example, this is used when attaching a completion channel to a CQ. + */ +void uverbs_uobject_put(struct ib_uobject *uobject); + +/* Indicate this fd is no longer used by this consumer, but its memory isn't + * necessarily released yet. When the last reference is put, we release the + * memory. After this call is executed, calling uverbs_uobject_get isn't + * allowed. + * This must be called from the release file_operations of the file! + */ +void uverbs_close_fd(struct file *f); + +#endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 0621f4455732..94a9eefb3cfc 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -42,6 +42,8 @@ #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> +static struct workqueue_struct *gid_cache_wq; + enum gid_op_type { GID_DEL = 0, GID_ADD @@ -144,7 +146,6 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, struct net_device *rdma_ndev, void *cookie) { - struct net_device *event_ndev = (struct net_device *)cookie; struct net_device *real_dev; int res; @@ -152,11 +153,11 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, return 0; rcu_read_lock(); - real_dev = rdma_vlan_dev_real_dev(event_ndev); + real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) - real_dev = event_ndev; + real_dev = cookie; - res = ((rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && + res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); @@ -192,17 +193,16 @@ static int pass_all_filter(struct ib_device *ib_dev, u8 port, static int upper_device_filter(struct ib_device *ib_dev, u8 port, struct net_device *rdma_ndev, void *cookie) { - struct net_device *event_ndev = (struct net_device *)cookie; int res; if (!rdma_ndev) return 0; - if (rdma_ndev == event_ndev) + if (rdma_ndev == cookie) return 1; rcu_read_lock(); - res = rdma_is_upper_dev_rcu(rdma_ndev, event_ndev); + res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; @@ -379,18 +379,14 @@ static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, static void add_netdev_ips(struct ib_device *ib_dev, u8 port, struct net_device *rdma_ndev, void *cookie) { - struct net_device *event_ndev = (struct net_device *)cookie; - - enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev); - _add_netdev_ips(ib_dev, port, event_ndev); + enum_netdev_default_gids(ib_dev, port, cookie, rdma_ndev); + _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u8 port, struct net_device *rdma_ndev, void *cookie) { - struct net_device *event_ndev = (struct net_device *)cookie; - - ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev); + ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, @@ -460,7 +456,7 @@ static void handle_netdev_upper(struct ib_device *ib_dev, u8 port, u8 port, struct net_device *ndev)) { - struct net_device *ndev = (struct net_device *)cookie; + struct net_device *ndev = cookie; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); @@ -519,9 +515,7 @@ static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port, static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port, struct net_device *rdma_ndev, void *cookie) { - struct net_device *event_ndev = (struct net_device *)cookie; - - bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev); + bond_delete_netdev_default_gids(ib_dev, port, cookie, rdma_ndev); } /* The following functions operate on all IB devices. netdevice_event and @@ -568,7 +562,7 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); - queue_work(ib_wq, &ndev_work->work); + queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } @@ -701,7 +695,7 @@ static int addr_event(struct notifier_block *this, unsigned long event, dev_hold(ndev); work->gid_attr.ndev = ndev; - queue_work(ib_wq, &work->work); + queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } @@ -748,6 +742,10 @@ static struct notifier_block nb_inet6addr = { int __init roce_gid_mgmt_init(void) { + gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); + if (!gid_cache_wq) + return -ENOMEM; + register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); @@ -772,4 +770,5 @@ void __exit roce_gid_mgmt_cleanup(void) * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ + destroy_workqueue(gid_cache_wq); } diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 81b742ca1639..70fa4cabe48e 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -56,6 +56,8 @@ #define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 #define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 +#define IB_SA_CPI_MAX_RETRY_CNT 3 +#define IB_SA_CPI_RETRY_WAIT 1000 /*msecs */ static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT; struct ib_sa_sm_ah { @@ -65,9 +67,23 @@ struct ib_sa_sm_ah { u8 src_path_mask; }; +enum rdma_class_port_info_type { + RDMA_CLASS_PORT_INFO_IB, + RDMA_CLASS_PORT_INFO_OPA +}; + +struct rdma_class_port_info { + enum rdma_class_port_info_type type; + union { + struct ib_class_port_info ib; + struct opa_class_port_info opa; + }; +}; + struct ib_sa_classport_cache { bool valid; - struct ib_class_port_info data; + int retry_cnt; + struct rdma_class_port_info data; }; struct ib_sa_port { @@ -75,6 +91,7 @@ struct ib_sa_port { struct ib_sa_sm_ah *sm_ah; struct work_struct update_task; struct ib_sa_classport_cache classport_info; + struct delayed_work ib_cpi_work; spinlock_t classport_lock; /* protects class port info set */ spinlock_t ah_lock; u8 port_num; @@ -103,6 +120,7 @@ struct ib_sa_query { #define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 #define IB_SA_CANCEL 0x00000002 +#define IB_SA_QUERY_OPA 0x00000004 struct ib_sa_service_query { void (*callback)(int, struct ib_sa_service_rec *, void *); @@ -111,9 +129,10 @@ struct ib_sa_service_query { }; struct ib_sa_path_query { - void (*callback)(int, struct ib_sa_path_rec *, void *); + void (*callback)(int, struct sa_path_rec *, void *); void *context; struct ib_sa_query sa_query; + struct sa_path_rec *conv_pr; }; struct ib_sa_guidinfo_query { @@ -123,7 +142,7 @@ struct ib_sa_guidinfo_query { }; struct ib_sa_classport_info_query { - void (*callback)(int, struct ib_class_port_info *, void *); + void (*callback)(void *); void *context; struct ib_sa_query sa_query; }; @@ -170,8 +189,8 @@ static DEFINE_SPINLOCK(tid_lock); static u32 tid; #define PATH_REC_FIELD(field) \ - .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \ - .struct_size_bytes = sizeof ((struct ib_sa_path_rec *) 0)->field, \ + .struct_offset_bytes = offsetof(struct sa_path_rec, field), \ + .struct_size_bytes = sizeof((struct sa_path_rec *)0)->field, \ .field_name = "sa_path_rec:" #field static const struct ib_field path_rec_table[] = { @@ -187,15 +206,15 @@ static const struct ib_field path_rec_table[] = { .offset_words = 6, .offset_bits = 0, .size_bits = 128 }, - { PATH_REC_FIELD(dlid), + { PATH_REC_FIELD(ib.dlid), .offset_words = 10, .offset_bits = 0, .size_bits = 16 }, - { PATH_REC_FIELD(slid), + { PATH_REC_FIELD(ib.slid), .offset_words = 10, .offset_bits = 16, .size_bits = 16 }, - { PATH_REC_FIELD(raw_traffic), + { PATH_REC_FIELD(ib.raw_traffic), .offset_words = 11, .offset_bits = 0, .size_bits = 1 }, @@ -269,6 +288,136 @@ static const struct ib_field path_rec_table[] = { .size_bits = 48 }, }; +#define OPA_PATH_REC_FIELD(field) \ + .struct_offset_bytes = \ + offsetof(struct sa_path_rec, field), \ + .struct_size_bytes = \ + sizeof((struct sa_path_rec *)0)->field, \ + .field_name = "sa_path_rec:" #field + +static const struct ib_field opa_path_rec_table[] = { + { OPA_PATH_REC_FIELD(service_id), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 64 }, + { OPA_PATH_REC_FIELD(dgid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_PATH_REC_FIELD(sgid), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_PATH_REC_FIELD(opa.dlid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_PATH_REC_FIELD(opa.slid), + .offset_words = 11, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_PATH_REC_FIELD(opa.raw_traffic), + .offset_words = 12, + .offset_bits = 0, + .size_bits = 1 }, + { RESERVED, + .offset_words = 12, + .offset_bits = 1, + .size_bits = 3 }, + { OPA_PATH_REC_FIELD(flow_label), + .offset_words = 12, + .offset_bits = 4, + .size_bits = 20 }, + { OPA_PATH_REC_FIELD(hop_limit), + .offset_words = 12, + .offset_bits = 24, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(traffic_class), + .offset_words = 13, + .offset_bits = 0, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(reversible), + .offset_words = 13, + .offset_bits = 8, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(numb_path), + .offset_words = 13, + .offset_bits = 9, + .size_bits = 7 }, + { OPA_PATH_REC_FIELD(pkey), + .offset_words = 13, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_PATH_REC_FIELD(opa.l2_8B), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_10B), + .offset_words = 14, + .offset_bits = 1, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_9B), + .offset_words = 14, + .offset_bits = 2, + .size_bits = 1 }, + { OPA_PATH_REC_FIELD(opa.l2_16B), + .offset_words = 14, + .offset_bits = 3, + .size_bits = 1 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 4, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(opa.qos_type), + .offset_words = 14, + .offset_bits = 6, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(opa.qos_priority), + .offset_words = 14, + .offset_bits = 8, + .size_bits = 8 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 16, + .size_bits = 3 }, + { OPA_PATH_REC_FIELD(sl), + .offset_words = 14, + .offset_bits = 19, + .size_bits = 5 }, + { RESERVED, + .offset_words = 14, + .offset_bits = 24, + .size_bits = 8 }, + { OPA_PATH_REC_FIELD(mtu_selector), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(mtu), + .offset_words = 15, + .offset_bits = 2, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(rate_selector), + .offset_words = 15, + .offset_bits = 8, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(rate), + .offset_words = 15, + .offset_bits = 10, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(packet_life_time_selector), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 2 }, + { OPA_PATH_REC_FIELD(packet_life_time), + .offset_words = 15, + .offset_bits = 18, + .size_bits = 6 }, + { OPA_PATH_REC_FIELD(preference), + .offset_words = 15, + .offset_bits = 24, + .size_bits = 8 }, +}; + #define MCMEMBER_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \ @@ -406,7 +555,7 @@ static const struct ib_field service_rec_table[] = { .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ .field_name = "ib_class_port_info:" #field -static const struct ib_field classport_info_rec_table[] = { +static const struct ib_field ib_classport_info_rec_table[] = { { CLASSPORTINFO_REC_FIELD(base_version), .offset_words = 0, .offset_bits = 0, @@ -477,6 +626,88 @@ static const struct ib_field classport_info_rec_table[] = { .size_bits = 32 }, }; +#define OPA_CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes =\ + offsetof(struct opa_class_port_info, field), \ + .struct_size_bytes = \ + sizeof((struct opa_class_port_info *)0)->field, \ + .field_name = "opa_class_port_info:" #field + +static const struct ib_field opa_classport_info_rec_table[] = { + { OPA_CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { OPA_CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { OPA_CLASSPORTINFO_REC_FIELD(cap_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_tc_fl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_sl_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_tc_fl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_hl_qp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 18, + .offset_bits = 0, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 18, + .offset_bits = 16, + .size_bits = 16 }, + { OPA_CLASSPORTINFO_REC_FIELD(trap_sl_rsvd), + .offset_words = 19, + .offset_bits = 0, + .size_bits = 8 }, + { RESERVED, + .offset_words = 19, + .offset_bits = 8, + .size_bits = 24 }, +}; + #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ @@ -518,7 +749,7 @@ static inline int ib_sa_query_cancelled(struct ib_sa_query *query) static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, struct ib_sa_query *query) { - struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1]; + struct sa_path_rec *sa_rec = query->mad_buf->context[1]; struct ib_sa_mad *mad = query->mad_buf->mad; ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask; u16 val16; @@ -528,8 +759,7 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, query->mad_buf->context[1] = NULL; /* Construct the family header first */ - header = (struct rdma_ls_resolve_header *) - skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); memcpy(header->device_name, query->port->agent->device->name, LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; @@ -808,7 +1038,7 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), - nlmsg_len(nlh), ib_nl_policy); + nlmsg_len(nlh), ib_nl_policy, NULL); attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT]; if (ret || !attr) goto settimeout_out; @@ -860,7 +1090,7 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) return 0; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), - nlmsg_len(nlh), ib_nl_policy); + nlmsg_len(nlh), ib_nl_policy, NULL); if (ret) return 0; @@ -927,96 +1157,10 @@ static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); - ib_destroy_ah(sm_ah->ah); + rdma_destroy_ah(sm_ah->ah); kfree(sm_ah); } -static void update_sm_ah(struct work_struct *work) -{ - struct ib_sa_port *port = - container_of(work, struct ib_sa_port, update_task); - struct ib_sa_sm_ah *new_ah; - struct ib_port_attr port_attr; - struct ib_ah_attr ah_attr; - - if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { - pr_warn("Couldn't query port\n"); - return; - } - - new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL); - if (!new_ah) { - return; - } - - kref_init(&new_ah->ref); - new_ah->src_path_mask = (1 << port_attr.lmc) - 1; - - new_ah->pkey_index = 0; - if (ib_find_pkey(port->agent->device, port->port_num, - IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) - pr_err("Couldn't find index for default PKey\n"); - - memset(&ah_attr, 0, sizeof ah_attr); - ah_attr.dlid = port_attr.sm_lid; - ah_attr.sl = port_attr.sm_sl; - ah_attr.port_num = port->port_num; - if (port_attr.grh_required) { - ah_attr.ah_flags = IB_AH_GRH; - ah_attr.grh.dgid.global.subnet_prefix = cpu_to_be64(port_attr.subnet_prefix); - ah_attr.grh.dgid.global.interface_id = cpu_to_be64(IB_SA_WELL_KNOWN_GUID); - } - - new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); - if (IS_ERR(new_ah->ah)) { - pr_warn("Couldn't create new SM AH\n"); - kfree(new_ah); - return; - } - - spin_lock_irq(&port->ah_lock); - if (port->sm_ah) - kref_put(&port->sm_ah->ref, free_sm_ah); - port->sm_ah = new_ah; - spin_unlock_irq(&port->ah_lock); - -} - -static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event) -{ - if (event->event == IB_EVENT_PORT_ERR || - event->event == IB_EVENT_PORT_ACTIVE || - event->event == IB_EVENT_LID_CHANGE || - event->event == IB_EVENT_PKEY_CHANGE || - event->event == IB_EVENT_SM_CHANGE || - event->event == IB_EVENT_CLIENT_REREGISTER) { - unsigned long flags; - struct ib_sa_device *sa_dev = - container_of(handler, typeof(*sa_dev), event_handler); - struct ib_sa_port *port = - &sa_dev->port[event->element.port_num - sa_dev->start_port]; - - if (!rdma_cap_ib_sa(handler->device, port->port_num)) - return; - - spin_lock_irqsave(&port->ah_lock, flags); - if (port->sm_ah) - kref_put(&port->sm_ah->ref, free_sm_ah); - port->sm_ah = NULL; - spin_unlock_irqrestore(&port->ah_lock, flags); - - if (event->event == IB_EVENT_SM_CHANGE || - event->event == IB_EVENT_CLIENT_REREGISTER || - event->event == IB_EVENT_LID_CHANGE) { - spin_lock_irqsave(&port->classport_lock, flags); - port->classport_info.valid = false; - spin_unlock_irqrestore(&port->classport_lock, flags); - } - queue_work(ib_wq, &sa_dev->port[event->element.port_num - - sa_dev->start_port].update_task); - } -} - void ib_sa_register_client(struct ib_sa_client *client) { atomic_set(&client->users, 1); @@ -1085,7 +1229,8 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) } int ib_init_ah_from_path(struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr) + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr) { int ret; u16 gid_index; @@ -1093,21 +1238,26 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, struct net_device *ndev = NULL; memset(ah_attr, 0, sizeof *ah_attr); - ah_attr->dlid = be16_to_cpu(rec->dlid); - ah_attr->sl = rec->sl; - ah_attr->src_path_bits = be16_to_cpu(rec->slid) & - get_src_path_mask(device, port_num); - ah_attr->port_num = port_num; - ah_attr->static_rate = rec->rate; - + ah_attr->type = rdma_ah_find_type(device, port_num); + + rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + rdma_ah_set_sl(ah_attr, rec->sl); + rdma_ah_set_path_bits(ah_attr, be32_to_cpu(sa_path_get_slid(rec)) & + get_src_path_mask(device, port_num)); + rdma_ah_set_port_num(ah_attr, port_num); + rdma_ah_set_static_rate(ah_attr, rec->rate); use_roce = rdma_cap_eth_ah(device, port_num); if (use_roce) { struct net_device *idev; struct net_device *resolved_dev; - struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex, - .net = rec->net ? rec->net : - &init_net}; + struct rdma_dev_addr dev_addr = { + .bound_dev_if = ((sa_path_get_ifindex(rec) >= 0) ? + sa_path_get_ifindex(rec) : 0), + .net = sa_path_get_ndev(rec) ? + sa_path_get_ndev(rec) : + &init_net + }; union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; @@ -1128,7 +1278,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, if ((dev_addr.network == RDMA_NETWORK_IPV4 || dev_addr.network == RDMA_NETWORK_IPV6) && - rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) return -EINVAL; idev = device->get_netdev(device, port_num); @@ -1159,28 +1309,31 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, } if (rec->hop_limit > 0 || use_roce) { - ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = rec->dgid; + enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec); - ret = ib_find_cached_gid_by_port(device, &rec->sgid, - rec->gid_type, port_num, ndev, - &gid_index); + ret = ib_find_cached_gid_by_port(device, &rec->sgid, type, + port_num, ndev, &gid_index); if (ret) { if (ndev) dev_put(ndev); return ret; } - ah_attr->grh.sgid_index = gid_index; - ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label); - ah_attr->grh.hop_limit = rec->hop_limit; - ah_attr->grh.traffic_class = rec->traffic_class; + rdma_ah_set_grh(ah_attr, &rec->dgid, + be32_to_cpu(rec->flow_label), + gid_index, rec->hop_limit, + rec->traffic_class); if (ndev) dev_put(ndev); } - if (use_roce) - memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); + if (use_roce) { + u8 *dmac = sa_path_get_dmac(rec); + + if (!dmac) + return -EINVAL; + memcpy(ah_attr->roce.dmac, dmac, ETH_ALEN); + } return 0; } @@ -1203,7 +1356,9 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, gfp_mask, - IB_MGMT_BASE_VERSION); + ((query->flags & IB_SA_QUERY_OPA) ? + OPA_MGMT_BASE_VERSION : + IB_MGMT_BASE_VERSION)); if (IS_ERR(query->mad_buf)) { kref_put(&query->sm_ah->ref, free_sm_ah); return -ENOMEM; @@ -1220,16 +1375,21 @@ static void free_mad(struct ib_sa_query *query) kref_put(&query->sm_ah->ref, free_sm_ah); } -static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent) +static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) { + struct ib_sa_mad *mad = query->mad_buf->mad; unsigned long flags; memset(mad, 0, sizeof *mad); - mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; + if (query->flags & IB_SA_QUERY_OPA) { + mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION; + mad->mad_hdr.class_version = OPA_SA_CLASS_VERSION; + } else { + mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; + mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; + } mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; - mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; - spin_lock_irqsave(&tid_lock, flags); mad->mad_hdr.tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++); @@ -1258,7 +1418,8 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) query->mad_buf->context[0] = query; query->id = id; - if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) { + if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && + (!(query->flags & IB_SA_QUERY_OPA))) { if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; @@ -1281,18 +1442,75 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) return ret ? ret : id; } -void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec) +void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec) { ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); } EXPORT_SYMBOL(ib_sa_unpack_path); -void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute) +void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) { ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); } EXPORT_SYMBOL(ib_sa_pack_path); +static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, + struct ib_device *device, + u8 port_num) +{ + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + unsigned long flags; + bool ret = false; + + if (!sa_dev) + return ret; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + spin_lock_irqsave(&port->classport_lock, flags); + if (!port->classport_info.valid) + goto ret; + + if (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_OPA) + ret = opa_get_cpi_capmask2(&port->classport_info.data.opa) & + OPA_CLASS_PORT_INFO_PR_SUPPORT; +ret: + spin_unlock_irqrestore(&port->classport_lock, flags); + return ret; +} + +enum opa_pr_supported { + PR_NOT_SUPPORTED, + PR_OPA_SUPPORTED, + PR_IB_SUPPORTED +}; + +/** + * Check if current PR query can be an OPA query. + * Retuns PR_NOT_SUPPORTED if a path record query is not + * possible, PR_OPA_SUPPORTED if an OPA path record query + * is possible and PR_IB_SUPPORTED if an IB path record + * query is possible. + */ +static int opa_pr_query_possible(struct ib_sa_client *client, + struct ib_device *device, + u8 port_num, + struct sa_path_rec *rec) +{ + struct ib_port_attr port_attr; + + if (ib_query_port(device, port_num, &port_attr)) + return PR_NOT_SUPPORTED; + + if (ib_sa_opa_pathrecord_support(client, device, port_num)) + return PR_OPA_SUPPORTED; + + if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) + return PR_NOT_SUPPORTED; + else + return PR_IB_SUPPORTED; +} + static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) @@ -1301,22 +1519,44 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, container_of(sa_query, struct ib_sa_path_query, sa_query); if (mad) { - struct ib_sa_path_rec rec; - - ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), - mad->data, &rec); - rec.net = NULL; - rec.ifindex = 0; - rec.gid_type = IB_GID_TYPE_IB; - eth_zero_addr(rec.dmac); - query->callback(status, &rec, query->context); + struct sa_path_rec rec; + + if (sa_query->flags & IB_SA_QUERY_OPA) { + ib_unpack(opa_path_rec_table, + ARRAY_SIZE(opa_path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_OPA; + query->callback(status, &rec, query->context); + } else { + ib_unpack(path_rec_table, + ARRAY_SIZE(path_rec_table), + mad->data, &rec); + rec.rec_type = SA_PATH_REC_TYPE_IB; + sa_path_set_ndev(&rec, NULL); + sa_path_set_ifindex(&rec, 0); + sa_path_set_dmac_zero(&rec); + + if (query->conv_pr) { + struct sa_path_rec opa; + + memset(&opa, 0, sizeof(struct sa_path_rec)); + sa_convert_path_ib_to_opa(&opa, &rec); + query->callback(status, &opa, query->context); + } else { + query->callback(status, &rec, query->context); + } + } } else query->callback(status, NULL, query->context); } static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) { - kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); + struct ib_sa_path_query *query = + container_of(sa_query, struct ib_sa_path_query, sa_query); + + kfree(query->conv_pr); + kfree(query); } /** @@ -1346,11 +1586,11 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) */ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, + struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, - struct ib_sa_path_rec *resp, + struct sa_path_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) @@ -1360,11 +1600,16 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; + enum opa_pr_supported status; int ret; if (!sa_dev) return -ENODEV; + if ((rec->rec_type != SA_PATH_REC_TYPE_IB) && + (rec->rec_type != SA_PATH_REC_TYPE_OPA)) + return -EINVAL; + port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; @@ -1373,9 +1618,26 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, return -ENOMEM; query->sa_query.port = port; + if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { + status = opa_pr_query_possible(client, device, port_num, rec); + if (status == PR_NOT_SUPPORTED) { + ret = -EINVAL; + goto err1; + } else if (status == PR_OPA_SUPPORTED) { + query->sa_query.flags |= IB_SA_QUERY_OPA; + } else { + query->conv_pr = + kmalloc(sizeof(*query->conv_pr), gfp_mask); + if (!query->conv_pr) { + ret = -ENOMEM; + goto err1; + } + } + } + ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) - goto err1; + goto err2; ib_sa_client_get(client); query->sa_query.client = client; @@ -1383,7 +1645,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, query->context = context; mad = query->sa_query.mad_buf->mad; - init_mad(mad, agent); + init_mad(&query->sa_query, agent); query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL; query->sa_query.release = ib_sa_path_rec_release; @@ -1391,24 +1653,36 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); mad->sa_hdr.comp_mask = comp_mask; - ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data); + if (query->sa_query.flags & IB_SA_QUERY_OPA) { + ib_pack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), + rec, mad->data); + } else if (query->conv_pr) { + sa_convert_path_opa_to_ib(query->conv_pr, rec); + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), + query->conv_pr, mad->data); + } else { + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), + rec, mad->data); + } *sa_query = &query->sa_query; query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; - query->sa_query.mad_buf->context[1] = rec; + query->sa_query.mad_buf->context[1] = (query->conv_pr) ? + query->conv_pr : rec; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) - goto err2; + goto err3; return ret; -err2: +err3: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); - +err2: + kfree(query->conv_pr); err1: kfree(query); return ret; @@ -1508,7 +1782,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, query->context = context; mad = query->sa_query.mad_buf->mad; - init_mad(mad, agent); + init_mad(&query->sa_query, agent); query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL; query->sa_query.release = ib_sa_service_rec_release; @@ -1600,7 +1874,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, query->context = context; mad = query->sa_query.mad_buf->mad; - init_mad(mad, agent); + init_mad(&query->sa_query, agent); query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL; query->sa_query.release = ib_sa_mcmember_rec_release; @@ -1697,7 +1971,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, query->context = context; mad = query->sa_query.mad_buf->mad; - init_mad(mad, agent); + init_mad(&query->sa_query, agent); query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL; query->sa_query.release = ib_sa_guidinfo_rec_release; @@ -1728,7 +2002,42 @@ err1: } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); -/* Support get SA ClassPortInfo */ +bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, + struct ib_device *device, + u8 port_num) +{ + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + bool ret = false; + unsigned long flags; + + if (!sa_dev) + return ret; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + + spin_lock_irqsave(&port->classport_lock, flags); + if ((port->classport_info.valid) && + (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_IB)) + ret = ib_get_cpi_capmask2(&port->classport_info.data.ib) + & IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT; + spin_unlock_irqrestore(&port->classport_lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_sa_sendonly_fullmem_support); + +struct ib_classport_info_context { + struct completion done; + struct ib_sa_query *sa_query; +}; + +static void ib_classportinfo_cb(void *context) +{ + struct ib_classport_info_context *cb_ctx = context; + + complete(&cb_ctx->done); +} + static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) @@ -1736,91 +2045,91 @@ static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, unsigned long flags; struct ib_sa_classport_info_query *query = container_of(sa_query, struct ib_sa_classport_info_query, sa_query); + struct ib_sa_classport_cache *info = &sa_query->port->classport_info; if (mad) { - struct ib_class_port_info rec; + if (sa_query->flags & IB_SA_QUERY_OPA) { + struct opa_class_port_info rec; - ib_unpack(classport_info_rec_table, - ARRAY_SIZE(classport_info_rec_table), - mad->data, &rec); + ib_unpack(opa_classport_info_rec_table, + ARRAY_SIZE(opa_classport_info_rec_table), + mad->data, &rec); - spin_lock_irqsave(&sa_query->port->classport_lock, flags); - if (!status && !sa_query->port->classport_info.valid) { - memcpy(&sa_query->port->classport_info.data, &rec, - sizeof(sa_query->port->classport_info.data)); + spin_lock_irqsave(&sa_query->port->classport_lock, + flags); + if (!status && !info->valid) { + memcpy(&info->data.opa, &rec, + sizeof(info->data.opa)); - sa_query->port->classport_info.valid = true; - } - spin_unlock_irqrestore(&sa_query->port->classport_lock, flags); + info->valid = true; + info->data.type = RDMA_CLASS_PORT_INFO_OPA; + } + spin_unlock_irqrestore(&sa_query->port->classport_lock, + flags); - query->callback(status, &rec, query->context); - } else { - query->callback(status, NULL, query->context); + } else { + struct ib_class_port_info rec; + + ib_unpack(ib_classport_info_rec_table, + ARRAY_SIZE(ib_classport_info_rec_table), + mad->data, &rec); + + spin_lock_irqsave(&sa_query->port->classport_lock, + flags); + if (!status && !info->valid) { + memcpy(&info->data.ib, &rec, + sizeof(info->data.ib)); + + info->valid = true; + info->data.type = RDMA_CLASS_PORT_INFO_IB; + } + spin_unlock_irqrestore(&sa_query->port->classport_lock, + flags); + } } + query->callback(query->context); } -static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query) +static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_classport_info_query, sa_query)); } -int ib_sa_classport_info_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_class_port_info *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) +static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, + int timeout_ms, + void (*callback)(void *context), + void *context, + struct ib_sa_query **sa_query) { - struct ib_sa_classport_info_query *query; - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); - struct ib_sa_port *port; struct ib_mad_agent *agent; + struct ib_sa_classport_info_query *query; struct ib_sa_mad *mad; - struct ib_class_port_info cached_class_port_info; + gfp_t gfp_mask = GFP_KERNEL; int ret; - unsigned long flags; - if (!sa_dev) - return -ENODEV; - - port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; - /* Use cached ClassPortInfo attribute if valid instead of sending mad */ - spin_lock_irqsave(&port->classport_lock, flags); - if (port->classport_info.valid && callback) { - memcpy(&cached_class_port_info, &port->classport_info.data, - sizeof(cached_class_port_info)); - spin_unlock_irqrestore(&port->classport_lock, flags); - callback(0, &cached_class_port_info, context); - return 0; - } - spin_unlock_irqrestore(&port->classport_lock, flags); - query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; + query->sa_query.flags |= rdma_cap_opa_ah(port->agent->device, + port->port_num) ? + IB_SA_QUERY_OPA : 0; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) - goto err1; + goto err_free; - ib_sa_client_get(client); - query->sa_query.client = client; - query->callback = callback; - query->context = context; + query->callback = callback; + query->context = context; mad = query->sa_query.mad_buf->mad; - init_mad(mad, agent); + init_mad(&query->sa_query, agent); - query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL; - - query->sa_query.release = ib_sa_portclass_info_rec_release; - /* support GET only */ + query->sa_query.callback = ib_sa_classport_info_rec_callback; + query->sa_query.release = ib_sa_classport_info_rec_release; mad->mad_hdr.method = IB_MGMT_METHOD_GET; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); mad->sa_hdr.comp_mask = 0; @@ -1828,20 +2137,71 @@ int ib_sa_classport_info_rec_query(struct ib_sa_client *client, ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) - goto err2; + goto err_free_mad; return ret; -err2: +err_free_mad: *sa_query = NULL; - ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); -err1: +err_free: kfree(query); return ret; } -EXPORT_SYMBOL(ib_sa_classport_info_rec_query); + +static void update_ib_cpi(struct work_struct *work) +{ + struct ib_sa_port *port = + container_of(work, struct ib_sa_port, ib_cpi_work.work); + struct ib_classport_info_context *cb_context; + unsigned long flags; + int ret; + + /* If the classport info is valid, nothing + * to do here. + */ + spin_lock_irqsave(&port->classport_lock, flags); + if (port->classport_info.valid) { + spin_unlock_irqrestore(&port->classport_lock, flags); + return; + } + spin_unlock_irqrestore(&port->classport_lock, flags); + + cb_context = kmalloc(sizeof(*cb_context), GFP_KERNEL); + if (!cb_context) + goto err_nomem; + + init_completion(&cb_context->done); + + ret = ib_sa_classport_info_rec_query(port, 3000, + ib_classportinfo_cb, cb_context, + &cb_context->sa_query); + if (ret < 0) + goto free_cb_err; + wait_for_completion(&cb_context->done); +free_cb_err: + kfree(cb_context); + spin_lock_irqsave(&port->classport_lock, flags); + + /* If the classport info is still not valid, the query should have + * failed for some reason. Retry issuing the query + */ + if (!port->classport_info.valid) { + port->classport_info.retry_cnt++; + if (port->classport_info.retry_cnt <= + IB_SA_CPI_MAX_RETRY_CNT) { + unsigned long delay = + msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT); + + queue_delayed_work(ib_wq, &port->ib_cpi_work, delay); + } + } + spin_unlock_irqrestore(&port->classport_lock, flags); + +err_nomem: + return; +} static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) @@ -1870,7 +2230,8 @@ static void send_handler(struct ib_mad_agent *agent, spin_unlock_irqrestore(&idr_lock, flags); free_mad(query); - ib_sa_client_put(query->client); + if (query->client) + ib_sa_client_put(query->client); query->release(query); } @@ -1897,6 +2258,102 @@ static void recv_handler(struct ib_mad_agent *mad_agent, ib_free_recv_mad(mad_recv_wc); } +static void update_sm_ah(struct work_struct *work) +{ + struct ib_sa_port *port = + container_of(work, struct ib_sa_port, update_task); + struct ib_sa_sm_ah *new_ah; + struct ib_port_attr port_attr; + struct rdma_ah_attr ah_attr; + + if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { + pr_warn("Couldn't query port\n"); + return; + } + + new_ah = kmalloc(sizeof(*new_ah), GFP_KERNEL); + if (!new_ah) + return; + + kref_init(&new_ah->ref); + new_ah->src_path_mask = (1 << port_attr.lmc) - 1; + + new_ah->pkey_index = 0; + if (ib_find_pkey(port->agent->device, port->port_num, + IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) + pr_err("Couldn't find index for default PKey\n"); + + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.type = rdma_ah_find_type(port->agent->device, + port->port_num); + rdma_ah_set_dlid(&ah_attr, port_attr.sm_lid); + rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); + rdma_ah_set_port_num(&ah_attr, port->port_num); + if (port_attr.grh_required) { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + + rdma_ah_set_subnet_prefix(&ah_attr, + cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + } + + new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr); + if (IS_ERR(new_ah->ah)) { + pr_warn("Couldn't create new SM AH\n"); + kfree(new_ah); + return; + } + + spin_lock_irq(&port->ah_lock); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = new_ah; + spin_unlock_irq(&port->ah_lock); +} + +static void ib_sa_event(struct ib_event_handler *handler, + struct ib_event *event) +{ + if (event->event == IB_EVENT_PORT_ERR || + event->event == IB_EVENT_PORT_ACTIVE || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PKEY_CHANGE || + event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER) { + unsigned long flags; + struct ib_sa_device *sa_dev = + container_of(handler, typeof(*sa_dev), event_handler); + u8 port_num = event->element.port_num - sa_dev->start_port; + struct ib_sa_port *port = &sa_dev->port[port_num]; + + if (!rdma_cap_ib_sa(handler->device, port->port_num)) + return; + + spin_lock_irqsave(&port->ah_lock, flags); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = NULL; + spin_unlock_irqrestore(&port->ah_lock, flags); + + if (event->event == IB_EVENT_SM_CHANGE || + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_LID_CHANGE || + event->event == IB_EVENT_PORT_ACTIVE) { + unsigned long delay = + msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT); + + spin_lock_irqsave(&port->classport_lock, flags); + port->classport_info.valid = false; + port->classport_info.retry_cnt = 0; + spin_unlock_irqrestore(&port->classport_lock, flags); + queue_delayed_work(ib_wq, + &port->ib_cpi_work, delay); + } + queue_work(ib_wq, &sa_dev->port[port_num].update_task); + } +} + static void ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; @@ -1934,6 +2391,8 @@ static void ib_sa_add_one(struct ib_device *device) goto err; INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); + INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work, + update_ib_cpi); count++; } @@ -1980,11 +2439,11 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data) return; ib_unregister_event_handler(&sa_dev->event_handler); - flush_workqueue(ib_wq); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { if (rdma_cap_ib_sa(device, i + 1)) { + cancel_delayed_work_sync(&sa_dev->port[i].ib_cpi_work); ib_unregister_mad_agent(sa_dev->port[i].agent); if (sa_dev->port[i].sm_ah) kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c new file mode 100644 index 000000000000..70ad19c4c73e --- /dev/null +++ b/drivers/infiniband/core/security.c @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef CONFIG_SECURITY_INFINIBAND + +#include <linux/security.h> +#include <linux/completion.h> +#include <linux/list.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_cache.h> +#include "core_priv.h" +#include "mad_priv.h" + +static struct pkey_index_qp_list *get_pkey_idx_qp_list(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *pkey = NULL; + struct pkey_index_qp_list *tmp_pkey; + struct ib_device *dev = pp->sec->dev; + + spin_lock(&dev->port_pkey_list[pp->port_num].list_lock); + list_for_each_entry(tmp_pkey, + &dev->port_pkey_list[pp->port_num].pkey_list, + pkey_index_list) { + if (tmp_pkey->pkey_index == pp->pkey_index) { + pkey = tmp_pkey; + break; + } + } + spin_unlock(&dev->port_pkey_list[pp->port_num].list_lock); + return pkey; +} + +static int get_pkey_and_subnet_prefix(struct ib_port_pkey *pp, + u16 *pkey, + u64 *subnet_prefix) +{ + struct ib_device *dev = pp->sec->dev; + int ret; + + ret = ib_get_cached_pkey(dev, pp->port_num, pp->pkey_index, pkey); + if (ret) + return ret; + + ret = ib_get_cached_subnet_prefix(dev, pp->port_num, subnet_prefix); + + return ret; +} + +static int enforce_qp_pkey_security(u16 pkey, + u64 subnet_prefix, + struct ib_qp_security *qp_sec) +{ + struct ib_qp_security *shared_qp_sec; + int ret; + + ret = security_ib_pkey_access(qp_sec->security, subnet_prefix, pkey); + if (ret) + return ret; + + if (qp_sec->qp == qp_sec->qp->real_qp) { + list_for_each_entry(shared_qp_sec, + &qp_sec->shared_qp_list, + shared_qp_list) { + ret = security_ib_pkey_access(shared_qp_sec->security, + subnet_prefix, + pkey); + if (ret) + return ret; + } + } + return 0; +} + +/* The caller of this function must hold the QP security + * mutex of the QP of the security structure in *pps. + * + * It takes separate ports_pkeys and security structure + * because in some cases the pps will be for a new settings + * or the pps will be for the real QP and security structure + * will be for a shared QP. + */ +static int check_qp_port_pkey_settings(struct ib_ports_pkeys *pps, + struct ib_qp_security *sec) +{ + u64 subnet_prefix; + u16 pkey; + int ret = 0; + + if (!pps) + return 0; + + if (pps->main.state != IB_PORT_PKEY_NOT_VALID) { + ret = get_pkey_and_subnet_prefix(&pps->main, + &pkey, + &subnet_prefix); + if (ret) + return ret; + + ret = enforce_qp_pkey_security(pkey, + subnet_prefix, + sec); + if (ret) + return ret; + } + + if (pps->alt.state != IB_PORT_PKEY_NOT_VALID) { + ret = get_pkey_and_subnet_prefix(&pps->alt, + &pkey, + &subnet_prefix); + if (ret) + return ret; + + ret = enforce_qp_pkey_security(pkey, + subnet_prefix, + sec); + } + + return ret; +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static void qp_to_error(struct ib_qp_security *sec) +{ + struct ib_qp_security *shared_qp_sec; + struct ib_qp_attr attr = { + .qp_state = IB_QPS_ERR + }; + struct ib_event event = { + .event = IB_EVENT_QP_FATAL + }; + + /* If the QP is in the process of being destroyed + * the qp pointer in the security structure is + * undefined. It cannot be modified now. + */ + if (sec->destroying) + return; + + ib_modify_qp(sec->qp, + &attr, + IB_QP_STATE); + + if (sec->qp->event_handler && sec->qp->qp_context) { + event.element.qp = sec->qp; + sec->qp->event_handler(&event, + sec->qp->qp_context); + } + + list_for_each_entry(shared_qp_sec, + &sec->shared_qp_list, + shared_qp_list) { + struct ib_qp *qp = shared_qp_sec->qp; + + if (qp->event_handler && qp->qp_context) { + event.element.qp = qp; + event.device = qp->device; + qp->event_handler(&event, + qp->qp_context); + } + } +} + +static inline void check_pkey_qps(struct pkey_index_qp_list *pkey, + struct ib_device *device, + u8 port_num, + u64 subnet_prefix) +{ + struct ib_port_pkey *pp, *tmp_pp; + bool comp; + LIST_HEAD(to_error_list); + u16 pkey_val; + + if (!ib_get_cached_pkey(device, + port_num, + pkey->pkey_index, + &pkey_val)) { + spin_lock(&pkey->qp_list_lock); + list_for_each_entry(pp, &pkey->qp_list, qp_list) { + if (atomic_read(&pp->sec->error_list_count)) + continue; + + if (enforce_qp_pkey_security(pkey_val, + subnet_prefix, + pp->sec)) { + atomic_inc(&pp->sec->error_list_count); + list_add(&pp->to_error_list, + &to_error_list); + } + } + spin_unlock(&pkey->qp_list_lock); + } + + list_for_each_entry_safe(pp, + tmp_pp, + &to_error_list, + to_error_list) { + mutex_lock(&pp->sec->mutex); + qp_to_error(pp->sec); + list_del(&pp->to_error_list); + atomic_dec(&pp->sec->error_list_count); + comp = pp->sec->destroying; + mutex_unlock(&pp->sec->mutex); + + if (comp) + complete(&pp->sec->error_complete); + } +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static int port_pkey_list_insert(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *tmp_pkey; + struct pkey_index_qp_list *pkey; + struct ib_device *dev; + u8 port_num = pp->port_num; + int ret = 0; + + if (pp->state != IB_PORT_PKEY_VALID) + return 0; + + dev = pp->sec->dev; + + pkey = get_pkey_idx_qp_list(pp); + + if (!pkey) { + bool found = false; + + pkey = kzalloc(sizeof(*pkey), GFP_KERNEL); + if (!pkey) + return -ENOMEM; + + spin_lock(&dev->port_pkey_list[port_num].list_lock); + /* Check for the PKey again. A racing process may + * have created it. + */ + list_for_each_entry(tmp_pkey, + &dev->port_pkey_list[port_num].pkey_list, + pkey_index_list) { + if (tmp_pkey->pkey_index == pp->pkey_index) { + kfree(pkey); + pkey = tmp_pkey; + found = true; + break; + } + } + + if (!found) { + pkey->pkey_index = pp->pkey_index; + spin_lock_init(&pkey->qp_list_lock); + INIT_LIST_HEAD(&pkey->qp_list); + list_add(&pkey->pkey_index_list, + &dev->port_pkey_list[port_num].pkey_list); + } + spin_unlock(&dev->port_pkey_list[port_num].list_lock); + } + + spin_lock(&pkey->qp_list_lock); + list_add(&pp->qp_list, &pkey->qp_list); + spin_unlock(&pkey->qp_list_lock); + + pp->state = IB_PORT_PKEY_LISTED; + + return ret; +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static void port_pkey_list_remove(struct ib_port_pkey *pp) +{ + struct pkey_index_qp_list *pkey; + + if (pp->state != IB_PORT_PKEY_LISTED) + return; + + pkey = get_pkey_idx_qp_list(pp); + + spin_lock(&pkey->qp_list_lock); + list_del(&pp->qp_list); + spin_unlock(&pkey->qp_list_lock); + + /* The setting may still be valid, i.e. after + * a destroy has failed for example. + */ + pp->state = IB_PORT_PKEY_VALID; +} + +static void destroy_qp_security(struct ib_qp_security *sec) +{ + security_ib_free_security(sec->security); + kfree(sec->ports_pkeys); + kfree(sec); +} + +/* The caller of this function must hold the QP security + * mutex. + */ +static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp, + const struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + struct ib_ports_pkeys *new_pps; + struct ib_ports_pkeys *qp_pps = qp->qp_sec->ports_pkeys; + + new_pps = kzalloc(sizeof(*new_pps), GFP_KERNEL); + if (!new_pps) + return NULL; + + if (qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) { + if (!qp_pps) { + new_pps->main.port_num = qp_attr->port_num; + new_pps->main.pkey_index = qp_attr->pkey_index; + } else { + new_pps->main.port_num = (qp_attr_mask & IB_QP_PORT) ? + qp_attr->port_num : + qp_pps->main.port_num; + + new_pps->main.pkey_index = + (qp_attr_mask & IB_QP_PKEY_INDEX) ? + qp_attr->pkey_index : + qp_pps->main.pkey_index; + } + new_pps->main.state = IB_PORT_PKEY_VALID; + } else if (qp_pps) { + new_pps->main.port_num = qp_pps->main.port_num; + new_pps->main.pkey_index = qp_pps->main.pkey_index; + if (qp_pps->main.state != IB_PORT_PKEY_NOT_VALID) + new_pps->main.state = IB_PORT_PKEY_VALID; + } + + if (qp_attr_mask & IB_QP_ALT_PATH) { + new_pps->alt.port_num = qp_attr->alt_port_num; + new_pps->alt.pkey_index = qp_attr->alt_pkey_index; + new_pps->alt.state = IB_PORT_PKEY_VALID; + } else if (qp_pps) { + new_pps->alt.port_num = qp_pps->alt.port_num; + new_pps->alt.pkey_index = qp_pps->alt.pkey_index; + if (qp_pps->alt.state != IB_PORT_PKEY_NOT_VALID) + new_pps->alt.state = IB_PORT_PKEY_VALID; + } + + new_pps->main.sec = qp->qp_sec; + new_pps->alt.sec = qp->qp_sec; + return new_pps; +} + +int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev) +{ + struct ib_qp *real_qp = qp->real_qp; + int ret; + + ret = ib_create_qp_security(qp, dev); + + if (ret) + return ret; + + mutex_lock(&real_qp->qp_sec->mutex); + ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys, + qp->qp_sec); + + if (ret) + goto ret; + + if (qp != real_qp) + list_add(&qp->qp_sec->shared_qp_list, + &real_qp->qp_sec->shared_qp_list); +ret: + mutex_unlock(&real_qp->qp_sec->mutex); + if (ret) + destroy_qp_security(qp->qp_sec); + + return ret; +} + +void ib_close_shared_qp_security(struct ib_qp_security *sec) +{ + struct ib_qp *real_qp = sec->qp->real_qp; + + mutex_lock(&real_qp->qp_sec->mutex); + list_del(&sec->shared_qp_list); + mutex_unlock(&real_qp->qp_sec->mutex); + + destroy_qp_security(sec); +} + +int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) +{ + int ret; + + qp->qp_sec = kzalloc(sizeof(*qp->qp_sec), GFP_KERNEL); + if (!qp->qp_sec) + return -ENOMEM; + + qp->qp_sec->qp = qp; + qp->qp_sec->dev = dev; + mutex_init(&qp->qp_sec->mutex); + INIT_LIST_HEAD(&qp->qp_sec->shared_qp_list); + atomic_set(&qp->qp_sec->error_list_count, 0); + init_completion(&qp->qp_sec->error_complete); + ret = security_ib_alloc_security(&qp->qp_sec->security); + if (ret) + kfree(qp->qp_sec); + + return ret; +} +EXPORT_SYMBOL(ib_create_qp_security); + +void ib_destroy_qp_security_begin(struct ib_qp_security *sec) +{ + mutex_lock(&sec->mutex); + + /* Remove the QP from the lists so it won't get added to + * a to_error_list during the destroy process. + */ + if (sec->ports_pkeys) { + port_pkey_list_remove(&sec->ports_pkeys->main); + port_pkey_list_remove(&sec->ports_pkeys->alt); + } + + /* If the QP is already in one or more of those lists + * the destroying flag will ensure the to error flow + * doesn't operate on an undefined QP. + */ + sec->destroying = true; + + /* Record the error list count to know how many completions + * to wait for. + */ + sec->error_comps_pending = atomic_read(&sec->error_list_count); + + mutex_unlock(&sec->mutex); +} + +void ib_destroy_qp_security_abort(struct ib_qp_security *sec) +{ + int ret; + int i; + + /* If a concurrent cache update is in progress this + * QP security could be marked for an error state + * transition. Wait for this to complete. + */ + for (i = 0; i < sec->error_comps_pending; i++) + wait_for_completion(&sec->error_complete); + + mutex_lock(&sec->mutex); + sec->destroying = false; + + /* Restore the position in the lists and verify + * access is still allowed in case a cache update + * occurred while attempting to destroy. + * + * Because these setting were listed already + * and removed during ib_destroy_qp_security_begin + * we know the pkey_index_qp_list for the PKey + * already exists so port_pkey_list_insert won't fail. + */ + if (sec->ports_pkeys) { + port_pkey_list_insert(&sec->ports_pkeys->main); + port_pkey_list_insert(&sec->ports_pkeys->alt); + } + + ret = check_qp_port_pkey_settings(sec->ports_pkeys, sec); + if (ret) + qp_to_error(sec); + + mutex_unlock(&sec->mutex); +} + +void ib_destroy_qp_security_end(struct ib_qp_security *sec) +{ + int i; + + /* If a concurrent cache update is occurring we must + * wait until this QP security structure is processed + * in the QP to error flow before destroying it because + * the to_error_list is in use. + */ + for (i = 0; i < sec->error_comps_pending; i++) + wait_for_completion(&sec->error_complete); + + destroy_qp_security(sec); +} + +void ib_security_cache_change(struct ib_device *device, + u8 port_num, + u64 subnet_prefix) +{ + struct pkey_index_qp_list *pkey; + + list_for_each_entry(pkey, + &device->port_pkey_list[port_num].pkey_list, + pkey_index_list) { + check_pkey_qps(pkey, + device, + port_num, + subnet_prefix); + } +} + +void ib_security_destroy_port_pkey_list(struct ib_device *device) +{ + struct pkey_index_qp_list *pkey, *tmp_pkey; + int i; + + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + spin_lock(&device->port_pkey_list[i].list_lock); + list_for_each_entry_safe(pkey, + tmp_pkey, + &device->port_pkey_list[i].pkey_list, + pkey_index_list) { + list_del(&pkey->pkey_index_list); + kfree(pkey); + } + spin_unlock(&device->port_pkey_list[i].list_lock); + } +} + +int ib_security_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_udata *udata) +{ + int ret = 0; + struct ib_ports_pkeys *tmp_pps; + struct ib_ports_pkeys *new_pps; + bool special_qp = (qp->qp_type == IB_QPT_SMI || + qp->qp_type == IB_QPT_GSI || + qp->qp_type >= IB_QPT_RESERVED1); + bool pps_change = ((qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) || + (qp_attr_mask & IB_QP_ALT_PATH)); + + if (pps_change && !special_qp) { + mutex_lock(&qp->qp_sec->mutex); + new_pps = get_new_pps(qp, + qp_attr, + qp_attr_mask); + + /* Add this QP to the lists for the new port + * and pkey settings before checking for permission + * in case there is a concurrent cache update + * occurring. Walking the list for a cache change + * doesn't acquire the security mutex unless it's + * sending the QP to error. + */ + ret = port_pkey_list_insert(&new_pps->main); + + if (!ret) + ret = port_pkey_list_insert(&new_pps->alt); + + if (!ret) + ret = check_qp_port_pkey_settings(new_pps, + qp->qp_sec); + } + + if (!ret) + ret = qp->device->modify_qp(qp->real_qp, + qp_attr, + qp_attr_mask, + udata); + + if (pps_change && !special_qp) { + /* Clean up the lists and free the appropriate + * ports_pkeys structure. + */ + if (ret) { + tmp_pps = new_pps; + } else { + tmp_pps = qp->qp_sec->ports_pkeys; + qp->qp_sec->ports_pkeys = new_pps; + } + + if (tmp_pps) { + port_pkey_list_remove(&tmp_pps->main); + port_pkey_list_remove(&tmp_pps->alt); + } + kfree(tmp_pps); + mutex_unlock(&qp->qp_sec->mutex); + } + return ret; +} +EXPORT_SYMBOL(ib_security_modify_qp); + +int ib_security_pkey_access(struct ib_device *dev, + u8 port_num, + u16 pkey_index, + void *sec) +{ + u64 subnet_prefix; + u16 pkey; + int ret; + + ret = ib_get_cached_pkey(dev, port_num, pkey_index, &pkey); + if (ret) + return ret; + + ret = ib_get_cached_subnet_prefix(dev, port_num, &subnet_prefix); + + if (ret) + return ret; + + return security_ib_pkey_access(sec, subnet_prefix, pkey); +} +EXPORT_SYMBOL(ib_security_pkey_access); + +static int ib_mad_agent_security_change(struct notifier_block *nb, + unsigned long event, + void *data) +{ + struct ib_mad_agent *ag = container_of(nb, struct ib_mad_agent, lsm_nb); + + if (event != LSM_POLICY_CHANGE) + return NOTIFY_DONE; + + ag->smp_allowed = !security_ib_endport_manage_subnet(ag->security, + ag->device->name, + ag->port_num); + + return NOTIFY_OK; +} + +int ib_mad_agent_security_setup(struct ib_mad_agent *agent, + enum ib_qp_type qp_type) +{ + int ret; + + ret = security_ib_alloc_security(&agent->security); + if (ret) + return ret; + + if (qp_type != IB_QPT_SMI) + return 0; + + ret = security_ib_endport_manage_subnet(agent->security, + agent->device->name, + agent->port_num); + if (ret) + return ret; + + agent->lsm_nb.notifier_call = ib_mad_agent_security_change; + ret = register_lsm_notifier(&agent->lsm_nb); + if (ret) + return ret; + + agent->smp_allowed = true; + agent->lsm_nb_reg = true; + return 0; +} + +void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) +{ + security_ib_free_security(agent->security); + if (agent->lsm_nb_reg) + unregister_lsm_notifier(&agent->lsm_nb); +} + +int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) +{ + int ret; + + if (map->agent.qp->qp_type == IB_QPT_SMI && !map->agent.smp_allowed) + return -EACCES; + + ret = ib_security_pkey_access(map->agent.device, + map->agent.port_num, + pkey_index, + map->agent.security); + + if (ret) + return ret; + + return 0; +} + +#endif /* CONFIG_SECURITY_INFINIBAND */ diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index c1fb545e8d78..7ebe1ef23652 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -253,6 +253,10 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, speed = " EDR"; rate = 250; break; + case IB_SPEED_HDR: + speed = " HDR"; + rate = 500; + break; case IB_SPEED_SDR: default: /* default to SDR for invalid rates */ rate = 25; @@ -1258,7 +1262,7 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - device->dev.parent = device->dma_device; + WARN_ON_ONCE(!device->dev.parent); ret = dev_set_name(class_dev, "%s", device->name); if (ret) return ret; @@ -1301,7 +1305,7 @@ err_put: free_port_list_attributes(device); err_unregister: - device_unregister(class_dev); + device_del(class_dev); err: return ret; diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index e0a995b85a2d..112099c86a19 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -702,10 +702,10 @@ static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) return 0; } -static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src) +static int ib_ucm_path_get(struct sa_path_rec **path, u64 src) { struct ib_user_path_rec upath; - struct ib_sa_path_rec *sa_path; + struct sa_path_rec *sa_path; *path = NULL; @@ -962,7 +962,7 @@ static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file, int in_len, int out_len) { struct ib_ucm_context *ctx; - struct ib_sa_path_rec *path = NULL; + struct sa_path_rec *path = NULL; struct ib_ucm_lap cmd; const void *data = NULL; int result; @@ -1205,12 +1205,15 @@ static void ib_ucm_release_dev(struct device *dev) struct ib_ucm_device *ucm_dev; ucm_dev = container_of(dev, struct ib_ucm_device, dev); - cdev_del(&ucm_dev->cdev); + kfree(ucm_dev); +} + +static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev) +{ if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) clear_bit(ucm_dev->devnum, dev_map); else clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); - kfree(ucm_dev); } static const struct file_operations ucm_fops = { @@ -1266,7 +1269,9 @@ static void ib_ucm_add_one(struct ib_device *device) if (!ucm_dev) return; + device_initialize(&ucm_dev->dev); ucm_dev->ib_dev = device; + ucm_dev->dev.release = ib_ucm_release_dev; devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); if (devnum >= IB_UCM_MAX_DEVICES) { @@ -1286,16 +1291,14 @@ static void ib_ucm_add_one(struct ib_device *device) cdev_init(&ucm_dev->cdev, &ucm_fops); ucm_dev->cdev.owner = THIS_MODULE; kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); - if (cdev_add(&ucm_dev->cdev, base, 1)) - goto err; ucm_dev->dev.class = &cm_class; - ucm_dev->dev.parent = device->dma_device; - ucm_dev->dev.devt = ucm_dev->cdev.dev; - ucm_dev->dev.release = ib_ucm_release_dev; + ucm_dev->dev.parent = device->dev.parent; + ucm_dev->dev.devt = base; + dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum); - if (device_register(&ucm_dev->dev)) - goto err_cdev; + if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev)) + goto err_devnum; if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev)) goto err_dev; @@ -1304,15 +1307,11 @@ static void ib_ucm_add_one(struct ib_device *device) return; err_dev: - device_unregister(&ucm_dev->dev); -err_cdev: - cdev_del(&ucm_dev->cdev); - if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); +err_devnum: + ib_ucm_free_dev(ucm_dev); err: - kfree(ucm_dev); + put_device(&ucm_dev->dev); return; } @@ -1323,7 +1322,9 @@ static void ib_ucm_remove_one(struct ib_device *device, void *client_data) if (!ucm_dev) return; - device_unregister(&ucm_dev->dev); + cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); + ib_ucm_free_dev(ucm_dev); + put_device(&ucm_dev->dev); } static CLASS_ATTR_STRING(abi_version, S_IRUGO, diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index e12f8faf8c23..276f0ef835bd 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -898,11 +898,18 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, for (i = 0, out_len -= sizeof(*resp); i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); i++, out_len -= sizeof(struct ib_path_rec_data)) { + struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i]; resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; - ib_sa_pack_path(&ctx->cm_id->route.path_rec[i], - &resp->path_data[i].path_rec); + if (rec->rec_type == SA_PATH_REC_TYPE_IB) { + ib_sa_pack_path(rec, &resp->path_data[i].path_rec); + } else { + struct sa_path_rec ib; + + sa_convert_path_opa_to_ib(&ib, rec); + ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); + } } if (copy_to_user(response, resp, @@ -1197,7 +1204,7 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, static int ucma_set_ib_path(struct ucma_context *ctx, struct ib_path_rec_data *path_data, size_t optlen) { - struct ib_sa_path_rec sa_path; + struct sa_path_rec sa_path; struct rdma_cm_event event; int ret; @@ -1215,8 +1222,17 @@ static int ucma_set_ib_path(struct ucma_context *ctx, memset(&sa_path, 0, sizeof(sa_path)); + sa_path.rec_type = SA_PATH_REC_TYPE_IB; ib_sa_unpack_path(path_data->path_rec, &sa_path); - ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); + + if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) { + struct sa_path_rec opa; + + sa_convert_path_ib_to_opa(&opa, &sa_path); + ret = rdma_set_ib_paths(ctx->cm_id, &opa, 1); + } else { + ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); + } if (ret) return ret; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 4609b921f899..21e60b1e2ff4 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -34,7 +34,8 @@ #include <linux/mm.h> #include <linux/dma-mapping.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/sched/mm.h> #include <linux/export.h> #include <linux/hugetlb.h> #include <linux/slab.h> @@ -57,7 +58,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { page = sg_page(sg); - if (umem->writable && dirty) + if (!PageDirty(page) && umem->writable && dirty) set_page_dirty_lock(page); put_page(page); } @@ -99,9 +100,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (dmasync) dma_attrs |= DMA_ATTR_WRITE_BARRIER; - if (!size) - return ERR_PTR(-EINVAL); - /* * If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. @@ -117,11 +115,11 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!umem) return ERR_PTR(-ENOMEM); - umem->context = context; - umem->length = size; - umem->address = addr; - umem->page_size = PAGE_SIZE; - umem->pid = get_task_pid(current, PIDTYPE_PID); + umem->context = context; + umem->length = size; + umem->address = addr; + umem->page_shift = PAGE_SHIFT; + umem->pid = get_task_pid(current, PIDTYPE_PID); /* * We ask for writable memory if any of the following * access flags are set. "Local write" and "remote write" @@ -135,7 +133,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (access & IB_ACCESS_ON_DEMAND) { put_pid(umem->pid); - ret = ib_umem_odp_get(context, umem); + ret = ib_umem_odp_get(context, umem, access); if (ret) { kfree(umem); return ERR_PTR(ret); @@ -317,7 +315,6 @@ EXPORT_SYMBOL(ib_umem_release); int ib_umem_page_count(struct ib_umem *umem) { - int shift; int i; int n; struct scatterlist *sg; @@ -325,11 +322,9 @@ int ib_umem_page_count(struct ib_umem *umem) if (umem->odp_data) return ib_umem_num_pages(umem); - shift = ilog2(umem->page_size); - n = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) - n += sg_dma_len(sg) >> shift; + n += sg_dma_len(sg) >> umem->page_shift; return n; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6b079a31dced..8c4ec564e495 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -32,10 +32,13 @@ #include <linux/types.h> #include <linux/sched.h> +#include <linux/sched/mm.h> +#include <linux/sched/task.h> #include <linux/pid.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/vmalloc.h> +#include <linux/hugetlb.h> #include <rdma/ib_verbs.h> #include <rdma/ib_umem.h> @@ -239,7 +242,73 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) +struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, + size_t size) +{ + struct ib_umem *umem; + struct ib_umem_odp *odp_data; + int pages = size >> PAGE_SHIFT; + int ret; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + umem->context = context; + umem->length = size; + umem->address = addr; + umem->page_shift = PAGE_SHIFT; + umem->writable = 1; + + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); + if (!odp_data) { + ret = -ENOMEM; + goto out_umem; + } + odp_data->umem = umem; + + mutex_init(&odp_data->umem_mutex); + init_completion(&odp_data->notifier_completion); + + odp_data->page_list = vzalloc(pages * sizeof(*odp_data->page_list)); + if (!odp_data->page_list) { + ret = -ENOMEM; + goto out_odp_data; + } + + odp_data->dma_list = vzalloc(pages * sizeof(*odp_data->dma_list)); + if (!odp_data->dma_list) { + ret = -ENOMEM; + goto out_page_list; + } + + down_write(&context->umem_rwsem); + context->odp_mrs_count++; + rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree); + if (likely(!atomic_read(&context->notifier_count))) + odp_data->mn_counters_active = true; + else + list_add(&odp_data->no_private_counters, + &context->no_private_counters); + up_write(&context->umem_rwsem); + + umem->odp_data = odp_data; + + return umem; + +out_page_list: + vfree(odp_data->page_list); +out_odp_data: + kfree(odp_data); +out_umem: + kfree(umem); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_alloc_odp_umem); + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, + int access) { int ret_val; struct pid *our_pid; @@ -248,6 +317,24 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) if (!mm) return -EINVAL; + if (access & IB_ACCESS_HUGETLB) { + struct vm_area_struct *vma; + struct hstate *h; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ib_umem_start(umem)); + if (!vma || !is_vm_hugetlb_page(vma)) { + up_read(&mm->mmap_sem); + return -EINVAL; + } + h = hstate_vma(vma); + umem->page_shift = huge_page_shift(h); + up_read(&mm->mmap_sem); + umem->hugetlb = 1; + } else { + umem->hugetlb = 0; + } + /* Prevent creating ODP MRs in child processes */ rcu_read_lock(); our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); @@ -258,7 +345,6 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) goto out_mm; } - umem->hugetlb = 0; umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); if (!umem->odp_data) { ret_val = -ENOMEM; @@ -270,18 +356,20 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) init_completion(&umem->odp_data->notifier_completion); - umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * + if (ib_umem_num_pages(umem)) { + umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * sizeof(*umem->odp_data->page_list)); - if (!umem->odp_data->page_list) { - ret_val = -ENOMEM; - goto out_odp_data; - } + if (!umem->odp_data->page_list) { + ret_val = -ENOMEM; + goto out_odp_data; + } - umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * + umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * sizeof(*umem->odp_data->dma_list)); - if (!umem->odp_data->dma_list) { - ret_val = -ENOMEM; - goto out_page_list; + if (!umem->odp_data->dma_list) { + ret_val = -ENOMEM; + goto out_page_list; + } } /* @@ -435,7 +523,6 @@ out: static int ib_umem_odp_map_dma_single_page( struct ib_umem *umem, int page_index, - u64 base_virt_addr, struct page *page, u64 access_mask, unsigned long current_seq) @@ -458,7 +545,7 @@ static int ib_umem_odp_map_dma_single_page( if (!(umem->odp_data->dma_list[page_index])) { dma_addr = ib_dma_map_page(dev, page, - 0, PAGE_SIZE, + 0, BIT(umem->page_shift), DMA_BIDIRECTIONAL); if (ib_dma_mapping_error(dev, dma_addr)) { ret = -EFAULT; @@ -466,6 +553,7 @@ static int ib_umem_odp_map_dma_single_page( } umem->odp_data->dma_list[page_index] = dma_addr | access_mask; umem->odp_data->page_list[page_index] = page; + umem->npages++; stored_page = 1; } else if (umem->odp_data->page_list[page_index] == page) { umem->odp_data->dma_list[page_index] |= access_mask; @@ -485,8 +573,9 @@ out: if (remove_existing_mapping && umem->context->invalidate_range) { invalidate_page_trampoline( umem, - base_virt_addr + (page_index * PAGE_SIZE), - base_virt_addr + ((page_index+1)*PAGE_SIZE), + ib_umem_start(umem) + (page_index >> umem->page_shift), + ib_umem_start(umem) + ((page_index + 1) >> + umem->page_shift), NULL); ret = -EAGAIN; } @@ -505,7 +594,8 @@ out: * for failure. * An -EAGAIN error code is returned when a concurrent mmu notifier prevents * the function from completing its task. - * + * An -ENOENT error code indicates that userspace process is being terminated + * and mm was already destroyed. * @umem: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be @@ -524,10 +614,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; struct page **local_page_list = NULL; - u64 off; - int j, k, ret = 0, start_idx, npages = 0; - u64 base_virt_addr; + u64 page_mask, off; + int j, k, ret = 0, start_idx, npages = 0, page_shift; unsigned int flags = 0; + phys_addr_t p = 0; if (access_mask == 0) return -EINVAL; @@ -540,9 +630,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, if (!local_page_list) return -ENOMEM; - off = user_virt & (~PAGE_MASK); - user_virt = user_virt & PAGE_MASK; - base_virt_addr = user_virt; + page_shift = umem->page_shift; + page_mask = ~(BIT(page_shift) - 1); + off = user_virt & (~page_mask); + user_virt = user_virt & page_mask; bcnt += off; /* Charge for the first page offset as well. */ owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); @@ -553,20 +644,20 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, owning_mm = get_task_mm(owning_process); if (owning_mm == NULL) { - ret = -EINVAL; + ret = -ENOENT; goto out_put_task; } if (access_mask & ODP_WRITE_ALLOWED_BIT) flags |= FOLL_WRITE; - start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; + start_idx = (user_virt - ib_umem_start(umem)) >> page_shift; k = start_idx; while (bcnt > 0) { - const size_t gup_num_pages = - min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, - PAGE_SIZE / sizeof(struct page *)); + const size_t gup_num_pages = min_t(size_t, + (bcnt + BIT(page_shift) - 1) >> page_shift, + PAGE_SIZE / sizeof(struct page *)); down_read(&owning_mm->mmap_sem); /* @@ -585,14 +676,25 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, break; bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - user_virt += npages << PAGE_SHIFT; mutex_lock(&umem->odp_data->umem_mutex); - for (j = 0; j < npages; ++j) { + for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { + if (user_virt & ~page_mask) { + p += PAGE_SIZE; + if (page_to_phys(local_page_list[j]) != p) { + ret = -EFAULT; + break; + } + put_page(local_page_list[j]); + continue; + } + ret = ib_umem_odp_map_dma_single_page( - umem, k, base_virt_addr, local_page_list[j], - access_mask, current_seq); + umem, k, local_page_list[j], + access_mask, current_seq); if (ret < 0) break; + + p = page_to_phys(local_page_list[j]); k++; } mutex_unlock(&umem->odp_data->umem_mutex); @@ -636,8 +738,8 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, * invalidations, so we must make sure we free each page only * once. */ mutex_lock(&umem->odp_data->umem_mutex); - for (addr = virt; addr < bound; addr += (u64)umem->page_size) { - idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; + for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { + idx = (addr - ib_umem_start(umem)) >> umem->page_shift; if (umem->odp_data->page_list[idx]) { struct page *page = umem->odp_data->page_list[idx]; dma_addr_t dma = umem->odp_data->dma_list[idx]; @@ -665,6 +767,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, put_page(page); umem->odp_data->page_list[idx] = NULL; umem->odp_data->dma_list[idx] = 0; + umem->npages--; } } mutex_unlock(&umem->odp_data->umem_mutex); diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c index 727d788448f5..d176597b4d78 100644 --- a/drivers/infiniband/core/umem_rbtree.c +++ b/drivers/infiniband/core/umem_rbtree.c @@ -78,17 +78,32 @@ int rbt_ib_umem_for_each_in_range(struct rb_root *root, void *cookie) { int ret_val = 0; - struct umem_odp_node *node; + struct umem_odp_node *node, *next; struct ib_umem_odp *umem; if (unlikely(start == last)) return ret_val; - for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; - node = rbt_ib_umem_iter_next(node, start, last - 1)) { + for (node = rbt_ib_umem_iter_first(root, start, last - 1); + node; node = next) { + next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); ret_val = cb(umem->umem, start, last, cookie) || ret_val; } return ret_val; } +EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); + +struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root *root, + u64 addr, u64 length) +{ + struct umem_odp_node *node; + + node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); + if (node) + return container_of(node, struct ib_umem_odp, interval_tree); + return NULL; + +} +EXPORT_SYMBOL(rbt_ib_umem_lookup); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 249b403b43a4..36a6f5c8914c 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -197,7 +197,7 @@ static void send_handler(struct ib_mad_agent *agent, struct ib_umad_packet *packet = send_wc->send_buf->context[0]; dequeue_send(file, packet); - ib_destroy_ah(packet->msg->ah); + rdma_destroy_ah(packet->msg->ah); ib_free_send_mad(packet->msg); if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { @@ -235,17 +235,19 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH); if (packet->mad.hdr.grh_present) { - struct ib_ah_attr ah_attr; + struct rdma_ah_attr ah_attr; + const struct ib_global_route *grh; ib_init_ah_from_wc(agent->device, agent->port_num, mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, &ah_attr); - packet->mad.hdr.gid_index = ah_attr.grh.sgid_index; - packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit; - packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class; - memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16); - packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label); + grh = rdma_ah_read_grh(&ah_attr); + packet->mad.hdr.gid_index = grh->sgid_index; + packet->mad.hdr.hop_limit = grh->hop_limit; + packet->mad.hdr.traffic_class = grh->traffic_class; + memcpy(packet->mad.hdr.gid, &grh->dgid, 16); + packet->mad.hdr.flow_label = cpu_to_be32(grh->flow_label); } if (queue_packet(file, agent, packet)) @@ -449,7 +451,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, struct ib_umad_file *file = filp->private_data; struct ib_umad_packet *packet; struct ib_mad_agent *agent; - struct ib_ah_attr ah_attr; + struct rdma_ah_attr ah_attr; struct ib_ah *ah; struct ib_rmpp_mad *rmpp_mad; __be64 *tid; @@ -489,20 +491,22 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, } memset(&ah_attr, 0, sizeof ah_attr); - ah_attr.dlid = be16_to_cpu(packet->mad.hdr.lid); - ah_attr.sl = packet->mad.hdr.sl; - ah_attr.src_path_bits = packet->mad.hdr.path_bits; - ah_attr.port_num = file->port->port_num; + ah_attr.type = rdma_ah_find_type(file->port->ib_dev, + file->port->port_num); + rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid)); + rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl); + rdma_ah_set_path_bits(&ah_attr, packet->mad.hdr.path_bits); + rdma_ah_set_port_num(&ah_attr, file->port->port_num); if (packet->mad.hdr.grh_present) { - ah_attr.ah_flags = IB_AH_GRH; - memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16); - ah_attr.grh.sgid_index = packet->mad.hdr.gid_index; - ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); - ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; - ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class; + rdma_ah_set_grh(&ah_attr, NULL, + be32_to_cpu(packet->mad.hdr.flow_label), + packet->mad.hdr.gid_index, + packet->mad.hdr.hop_limit, + packet->mad.hdr.traffic_class); + rdma_ah_set_dgid_raw(&ah_attr, packet->mad.hdr.gid); } - ah = ib_create_ah(agent->qp->pd, &ah_attr); + ah = rdma_create_ah(agent->qp->pd, &ah_attr); if (IS_ERR(ah)) { ret = PTR_ERR(ah); goto err_up; @@ -596,7 +600,7 @@ err_send: err_msg: ib_free_send_mad(packet->msg); err_ah: - ib_destroy_ah(ah); + rdma_destroy_ah(ah); err_up: mutex_unlock(&file->mutex); err: @@ -1183,12 +1187,12 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, cdev_init(&port->cdev, &umad_fops); port->cdev.owner = THIS_MODULE; - port->cdev.kobj.parent = &umad_dev->kobj; + cdev_set_parent(&port->cdev, &umad_dev->kobj); kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); if (cdev_add(&port->cdev, base, 1)) goto err_cdev; - port->dev = device_create(umad_class, device->dma_device, + port->dev = device_create(umad_class, device->dev.parent, port->cdev.dev, port, "umad%d", port->dev_num); if (IS_ERR(port->dev)) @@ -1202,12 +1206,12 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, base += IB_UMAD_MAX_PORTS; cdev_init(&port->sm_cdev, &umad_sm_fops); port->sm_cdev.owner = THIS_MODULE; - port->sm_cdev.kobj.parent = &umad_dev->kobj; + cdev_set_parent(&port->sm_cdev, &umad_dev->kobj); kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); if (cdev_add(&port->sm_cdev, base, 1)) goto err_sm_cdev; - port->sm_dev = device_create(umad_class, device->dma_device, + port->sm_dev = device_create(umad_class, device->dev.parent, port->sm_cdev.dev, port, "issm%d", port->dev_num); if (IS_ERR(port->sm_dev)) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 455034ac994e..64d494a64daf 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -76,12 +76,13 @@ * an asynchronous event queue file is created and released when the * event file is closed. * - * struct ib_uverbs_event_file: One reference is held by the VFS and - * released when the file is closed. For asynchronous event files, - * another reference is held by the corresponding main context file - * and released when that file is closed. For completion event files, - * a reference is taken when a CQ is created that uses the file, and - * released when the CQ is destroyed. + * struct ib_uverbs_event_queue: Base structure for + * struct ib_uverbs_async_event_file and struct ib_uverbs_completion_event_file. + * One reference is held by the VFS and released when the file is closed. + * For asynchronous event files, another reference is held by the corresponding + * main context file and released when that file is closed. For completion + * event files, a reference is taken when a CQ is created that uses the file, + * and released when the CQ is destroyed. */ struct ib_uverbs_device { @@ -101,18 +102,26 @@ struct ib_uverbs_device { struct list_head uverbs_events_file_list; }; -struct ib_uverbs_event_file { - struct kref ref; - int is_async; - struct ib_uverbs_file *uverbs_file; +struct ib_uverbs_event_queue { spinlock_t lock; int is_closed; wait_queue_head_t poll_wait; struct fasync_struct *async_queue; struct list_head event_list; +}; + +struct ib_uverbs_async_event_file { + struct ib_uverbs_event_queue ev_queue; + struct ib_uverbs_file *uverbs_file; + struct kref ref; struct list_head list; }; +struct ib_uverbs_completion_event_file { + struct ib_uobject_file uobj_file; + struct ib_uverbs_event_queue ev_queue; +}; + struct ib_uverbs_file { struct kref ref; struct mutex mutex; @@ -120,9 +129,13 @@ struct ib_uverbs_file { struct ib_uverbs_device *device; struct ib_ucontext *ucontext; struct ib_event_handler event_handler; - struct ib_uverbs_event_file *async_file; + struct ib_uverbs_async_event_file *async_file; struct list_head list; int is_closed; + + struct idr idr; + /* spinlock protects write access to idr */ + spinlock_t idr_lock; }; struct ib_uverbs_event { @@ -159,6 +172,8 @@ struct ib_usrq_object { struct ib_uqp_object { struct ib_uevent_object uevent; + /* lock for mcast list */ + struct mutex mcast_lock; struct list_head mcast_list; struct ib_uxrcd_object *uxrcd; }; @@ -176,32 +191,18 @@ struct ib_ucq_object { u32 async_events_reported; }; -extern spinlock_t ib_uverbs_idr_lock; -extern struct idr ib_uverbs_pd_idr; -extern struct idr ib_uverbs_mr_idr; -extern struct idr ib_uverbs_mw_idr; -extern struct idr ib_uverbs_ah_idr; -extern struct idr ib_uverbs_cq_idr; -extern struct idr ib_uverbs_qp_idr; -extern struct idr ib_uverbs_srq_idr; -extern struct idr ib_uverbs_xrcd_idr; -extern struct idr ib_uverbs_rule_idr; -extern struct idr ib_uverbs_wq_idr; -extern struct idr ib_uverbs_rwq_ind_tbl_idr; - -void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); - -struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, - struct ib_device *ib_dev, - int is_async); +extern const struct file_operations uverbs_event_fops; +void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); +struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev); void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file); -struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd); void ib_uverbs_release_ucq(struct ib_uverbs_file *file, - struct ib_uverbs_event_file *ev_file, + struct ib_uverbs_completion_event_file *ev_file, struct ib_ucq_object *uobj); void ib_uverbs_release_uevent(struct ib_uverbs_file *file, struct ib_uevent_object *uobj); +void ib_uverbs_release_file(struct kref *ref); void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context); void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr); @@ -210,9 +211,12 @@ void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); -void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +int ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd, + enum rdma_remove_reason why); int uverbs_dealloc_mw(struct ib_mw *mw); +void ib_uverbs_detach_umcast(struct ib_qp *qp, + struct ib_uqp_object *uobj); struct ib_uverbs_flow_spec { union { @@ -228,6 +232,8 @@ struct ib_uverbs_flow_spec { struct ib_uverbs_flow_spec_ipv4 ipv4; struct ib_uverbs_flow_spec_tcp_udp tcp_udp; struct ib_uverbs_flow_spec_ipv6 ipv6; + struct ib_uverbs_flow_spec_action_tag flow_tag; + struct ib_uverbs_flow_spec_action_drop drop; }; }; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 700782203483..2c98533a0203 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -40,270 +40,29 @@ #include <linux/uaccess.h> +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_std_types.h> +#include "rdma_core.h" + #include "uverbs.h" #include "core_priv.h" -struct uverbs_lock_class { - struct lock_class_key key; - char name[16]; -}; - -static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" }; -static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" }; -static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" }; -static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" }; -static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; -static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; -static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; -static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; -static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; -static struct uverbs_lock_class wq_lock_class = { .name = "WQ-uobj" }; -static struct uverbs_lock_class rwq_ind_table_lock_class = { .name = "IND_TBL-uobj" }; - -/* - * The ib_uobject locking scheme is as follows: - * - * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it - * needs to be held during all idr write operations. When an object is - * looked up, a reference must be taken on the object's kref before - * dropping this lock. For read operations, the rcu_read_lock() - * and rcu_write_lock() but similarly the kref reference is grabbed - * before the rcu_read_unlock(). - * - * - Each object also has an rwsem. This rwsem must be held for - * reading while an operation that uses the object is performed. - * For example, while registering an MR, the associated PD's - * uobject.mutex must be held for reading. The rwsem must be held - * for writing while initializing or destroying an object. - * - * - In addition, each object has a "live" flag. If this flag is not - * set, then lookups of the object will fail even if it is found in - * the idr. This handles a reader that blocks and does not acquire - * the rwsem until after the object is destroyed. The destroy - * operation will set the live flag to 0 and then drop the rwsem; - * this will allow the reader to acquire the rwsem, see that the - * live flag is 0, and then drop the rwsem and its reference to - * object. The underlying storage will not be freed until the last - * reference to the object is dropped. - */ - -static void init_uobj(struct ib_uobject *uobj, u64 user_handle, - struct ib_ucontext *context, struct uverbs_lock_class *c) -{ - uobj->user_handle = user_handle; - uobj->context = context; - kref_init(&uobj->ref); - init_rwsem(&uobj->mutex); - lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name); - uobj->live = 0; -} - -static void release_uobj(struct kref *kref) -{ - kfree_rcu(container_of(kref, struct ib_uobject, ref), rcu); -} - -static void put_uobj(struct ib_uobject *uobj) -{ - kref_put(&uobj->ref, release_uobj); -} - -static void put_uobj_read(struct ib_uobject *uobj) -{ - up_read(&uobj->mutex); - put_uobj(uobj); -} - -static void put_uobj_write(struct ib_uobject *uobj) -{ - up_write(&uobj->mutex); - put_uobj(uobj); -} - -static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj) -{ - int ret; - - idr_preload(GFP_KERNEL); - spin_lock(&ib_uverbs_idr_lock); - - ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT); - if (ret >= 0) - uobj->id = ret; - - spin_unlock(&ib_uverbs_idr_lock); - idr_preload_end(); - - return ret < 0 ? ret : 0; -} - -void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj) +static struct ib_uverbs_completion_event_file * +ib_uverbs_lookup_comp_file(int fd, struct ib_ucontext *context) { - spin_lock(&ib_uverbs_idr_lock); - idr_remove(idr, uobj->id); - spin_unlock(&ib_uverbs_idr_lock); -} + struct ib_uobject *uobj = uobj_get_read(uobj_get_type(comp_channel), + fd, context); + struct ib_uobject_file *uobj_file; -static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, - struct ib_ucontext *context) -{ - struct ib_uobject *uobj; + if (IS_ERR(uobj)) + return (void *)uobj; - rcu_read_lock(); - uobj = idr_find(idr, id); - if (uobj) { - if (uobj->context == context) - kref_get(&uobj->ref); - else - uobj = NULL; - } - rcu_read_unlock(); + uverbs_uobject_get(uobj); + uobj_put_read(uobj); - return uobj; -} - -static struct ib_uobject *idr_read_uobj(struct idr *idr, int id, - struct ib_ucontext *context, int nested) -{ - struct ib_uobject *uobj; - - uobj = __idr_get_uobj(idr, id, context); - if (!uobj) - return NULL; - - if (nested) - down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING); - else - down_read(&uobj->mutex); - if (!uobj->live) { - put_uobj_read(uobj); - return NULL; - } - - return uobj; -} - -static struct ib_uobject *idr_write_uobj(struct idr *idr, int id, - struct ib_ucontext *context) -{ - struct ib_uobject *uobj; - - uobj = __idr_get_uobj(idr, id, context); - if (!uobj) - return NULL; - - down_write(&uobj->mutex); - if (!uobj->live) { - put_uobj_write(uobj); - return NULL; - } - - return uobj; -} - -static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context, - int nested) -{ - struct ib_uobject *uobj; - - uobj = idr_read_uobj(idr, id, context, nested); - return uobj ? uobj->object : NULL; -} - -static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context) -{ - return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0); -} - -static void put_pd_read(struct ib_pd *pd) -{ - put_uobj_read(pd->uobject); -} - -static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested) -{ - return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested); -} - -static void put_cq_read(struct ib_cq *cq) -{ - put_uobj_read(cq->uobject); -} - -static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context) -{ - return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0); -} - -static void put_ah_read(struct ib_ah *ah) -{ - put_uobj_read(ah->uobject); -} - -static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context) -{ - return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0); -} - -static struct ib_wq *idr_read_wq(int wq_handle, struct ib_ucontext *context) -{ - return idr_read_obj(&ib_uverbs_wq_idr, wq_handle, context, 0); -} - -static void put_wq_read(struct ib_wq *wq) -{ - put_uobj_read(wq->uobject); -} - -static struct ib_rwq_ind_table *idr_read_rwq_indirection_table(int ind_table_handle, - struct ib_ucontext *context) -{ - return idr_read_obj(&ib_uverbs_rwq_ind_tbl_idr, ind_table_handle, context, 0); -} - -static void put_rwq_indirection_table_read(struct ib_rwq_ind_table *ind_table) -{ - put_uobj_read(ind_table->uobject); -} - -static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context) -{ - struct ib_uobject *uobj; - - uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context); - return uobj ? uobj->object : NULL; -} - -static void put_qp_read(struct ib_qp *qp) -{ - put_uobj_read(qp->uobject); -} - -static void put_qp_write(struct ib_qp *qp) -{ - put_uobj_write(qp->uobject); -} - -static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context) -{ - return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0); -} - -static void put_srq_read(struct ib_srq *srq) -{ - put_uobj_read(srq->uobject); -} - -static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context, - struct ib_uobject **uobj) -{ - *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0); - return *uobj ? (*uobj)->object : NULL; -} - -static void put_xrcd_read(struct ib_uobject *uobj) -{ - put_uobj_read(uobj); + uobj_file = container_of(uobj, struct ib_uobject_file, uobj); + return container_of(uobj_file, struct ib_uverbs_completion_event_file, + uobj_file); } ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, @@ -316,6 +75,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_udata udata; struct ib_ucontext *ucontext; struct file *filp; + struct ib_rdmacg_object cg_obj; int ret; if (out_len < sizeof resp) @@ -335,24 +95,22 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); + ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); + if (ret) + goto err; + ucontext = ib_dev->alloc_ucontext(ib_dev, &udata); if (IS_ERR(ucontext)) { ret = PTR_ERR(ucontext); - goto err; + goto err_alloc; } ucontext->device = ib_dev; - INIT_LIST_HEAD(&ucontext->pd_list); - INIT_LIST_HEAD(&ucontext->mr_list); - INIT_LIST_HEAD(&ucontext->mw_list); - INIT_LIST_HEAD(&ucontext->cq_list); - INIT_LIST_HEAD(&ucontext->qp_list); - INIT_LIST_HEAD(&ucontext->srq_list); - INIT_LIST_HEAD(&ucontext->ah_list); - INIT_LIST_HEAD(&ucontext->wq_list); - INIT_LIST_HEAD(&ucontext->rwq_ind_tbl_list); - INIT_LIST_HEAD(&ucontext->xrcd_list); - INIT_LIST_HEAD(&ucontext->rule_list); + ucontext->cg_obj = cg_obj; + /* ufile is required when some objects are released */ + ucontext->ufile = file; + uverbs_initialize_ucontext(ucontext); + rcu_read_lock(); ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); @@ -376,7 +134,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, goto err_free; resp.async_fd = ret; - filp = ib_uverbs_alloc_event_file(file, ib_dev, 1); + filp = ib_uverbs_alloc_async_event_file(file, ib_dev); if (IS_ERR(filp)) { ret = PTR_ERR(filp); goto err_fd; @@ -407,6 +165,9 @@ err_free: put_pid(ucontext->tgid); ib_dev->dealloc_ucontext(ucontext); +err_alloc: + ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); + err: mutex_unlock(&file->mutex); return ret; @@ -556,12 +317,9 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - uobj = kmalloc(sizeof *uobj, GFP_KERNEL); - if (!uobj) - return -ENOMEM; - - init_uobj(uobj, 0, file->ucontext, &pd_lock_class); - down_write(&uobj->mutex); + uobj = uobj_alloc(uobj_get_type(pd), file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); if (IS_ERR(pd)) { @@ -575,10 +333,6 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, atomic_set(&pd->usecnt, 0); uobj->object = pd; - ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj); - if (ret) - goto err_idr; - memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; @@ -588,24 +342,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, goto err_copy; } - mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->pd_list); - mutex_unlock(&file->mutex); - - uobj->live = 1; - - up_write(&uobj->mutex); + uobj_alloc_commit(uobj); return in_len; err_copy: - idr_remove_uobj(&ib_uverbs_pd_idr, uobj); - -err_idr: ib_dealloc_pd(pd); err: - put_uobj_write(uobj); + uobj_alloc_abort(uobj); return ret; } @@ -616,43 +361,19 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, { struct ib_uverbs_dealloc_pd cmd; struct ib_uobject *uobj; - struct ib_pd *pd; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); - if (!uobj) - return -EINVAL; - pd = uobj->object; - - if (atomic_read(&pd->usecnt)) { - ret = -EBUSY; - goto err_put; - } - - ret = pd->device->dealloc_pd(uobj->object); - WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd"); - if (ret) - goto err_put; - - uobj->live = 0; - put_uobj_write(uobj); + uobj = uobj_get_write(uobj_get_type(pd), cmd.pd_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - idr_remove_uobj(&ib_uverbs_pd_idr, uobj); + ret = uobj_remove_commit(uobj); - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - put_uobj(uobj); - - return in_len; - -err_put: - put_uobj_write(uobj); - return ret; + return ret ?: in_len; } struct xrcd_table_entry { @@ -789,16 +510,13 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, } } - obj = kmalloc(sizeof *obj, GFP_KERNEL); - if (!obj) { - ret = -ENOMEM; + obj = (struct ib_uxrcd_object *)uobj_alloc(uobj_get_type(xrcd), + file->ucontext); + if (IS_ERR(obj)) { + ret = PTR_ERR(obj); goto err_tree_mutex_unlock; } - init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class); - - down_write(&obj->uobject.mutex); - if (!xrcd) { xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata); if (IS_ERR(xrcd)) { @@ -816,10 +534,6 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, atomic_set(&obj->refcnt, 0); obj->uobject.object = xrcd; - ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); - if (ret) - goto err_idr; - memset(&resp, 0, sizeof resp); resp.xrcd_handle = obj->uobject.id; @@ -828,7 +542,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, /* create new inode/xrcd table entry */ ret = xrcd_table_insert(file->device, inode, xrcd); if (ret) - goto err_insert_xrcd; + goto err_dealloc_xrcd; } atomic_inc(&xrcd->usecnt); } @@ -842,12 +556,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, if (f.file) fdput(f); - mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list); - mutex_unlock(&file->mutex); - - obj->uobject.live = 1; - up_write(&obj->uobject.mutex); + uobj_alloc_commit(&obj->uobject); mutex_unlock(&file->device->xrcd_tree_mutex); return in_len; @@ -859,14 +568,11 @@ err_copy: atomic_dec(&xrcd->usecnt); } -err_insert_xrcd: - idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject); - -err_idr: +err_dealloc_xrcd: ib_dealloc_xrcd(xrcd); err: - put_uobj_write(&obj->uobject); + uobj_alloc_abort(&obj->uobject); err_tree_mutex_unlock: if (f.file) @@ -884,75 +590,41 @@ ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, { struct ib_uverbs_close_xrcd cmd; struct ib_uobject *uobj; - struct ib_xrcd *xrcd = NULL; - struct inode *inode = NULL; - struct ib_uxrcd_object *obj; - int live; int ret = 0; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - mutex_lock(&file->device->xrcd_tree_mutex); - uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext); - if (!uobj) { - ret = -EINVAL; - goto out; - } - - xrcd = uobj->object; - inode = xrcd->inode; - obj = container_of(uobj, struct ib_uxrcd_object, uobject); - if (atomic_read(&obj->refcnt)) { - put_uobj_write(uobj); - ret = -EBUSY; - goto out; - } - - if (!inode || atomic_dec_and_test(&xrcd->usecnt)) { - ret = ib_dealloc_xrcd(uobj->object); - if (!ret) - uobj->live = 0; + uobj = uobj_get_write(uobj_get_type(xrcd), cmd.xrcd_handle, + file->ucontext); + if (IS_ERR(uobj)) { + mutex_unlock(&file->device->xrcd_tree_mutex); + return PTR_ERR(uobj); } - live = uobj->live; - if (inode && ret) - atomic_inc(&xrcd->usecnt); - - put_uobj_write(uobj); - - if (ret) - goto out; - - if (inode && !live) - xrcd_table_delete(file->device, inode); - - idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - put_uobj(uobj); - ret = in_len; - -out: - mutex_unlock(&file->device->xrcd_tree_mutex); - return ret; + ret = uobj_remove_commit(uobj); + return ret ?: in_len; } -void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, - struct ib_xrcd *xrcd) +int ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, + struct ib_xrcd *xrcd, + enum rdma_remove_reason why) { struct inode *inode; + int ret; inode = xrcd->inode; if (inode && !atomic_dec_and_test(&xrcd->usecnt)) - return; + return 0; - ib_dealloc_xrcd(xrcd); + ret = ib_dealloc_xrcd(xrcd); - if (inode) + if (why == RDMA_REMOVE_DESTROY && ret) + atomic_inc(&xrcd->usecnt); + else if (inode) xrcd_table_delete(dev, inode); + + return ret; } ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, @@ -985,14 +657,11 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (ret) return ret; - uobj = kmalloc(sizeof *uobj, GFP_KERNEL); - if (!uobj) - return -ENOMEM; - - init_uobj(uobj, 0, file->ucontext, &mr_lock_class); - down_write(&uobj->mutex); + uobj = uobj_alloc(uobj_get_type(mr), file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_free; @@ -1020,9 +689,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, atomic_inc(&pd->usecnt); uobj->object = mr; - ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); - if (ret) - goto err_unreg; memset(&resp, 0, sizeof resp); resp.lkey = mr->lkey; @@ -1035,29 +701,20 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, goto err_copy; } - put_pd_read(pd); - - mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->mr_list); - mutex_unlock(&file->mutex); - - uobj->live = 1; + uobj_put_obj_read(pd); - up_write(&uobj->mutex); + uobj_alloc_commit(uobj); return in_len; err_copy: - idr_remove_uobj(&ib_uverbs_mr_idr, uobj); - -err_unreg: ib_dereg_mr(mr); err_put: - put_pd_read(pd); + uobj_put_obj_read(pd); err_free: - put_uobj_write(uobj); + uobj_alloc_abort(uobj); return ret; } @@ -1093,11 +750,10 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) return -EINVAL; - uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, - file->ucontext); - - if (!uobj) - return -EINVAL; + uobj = uobj_get_write(uobj_get_type(mr), cmd.mr_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); mr = uobj->object; @@ -1108,7 +764,7 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, } if (cmd.flags & IB_MR_REREG_PD) { - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto put_uobjs; @@ -1141,11 +797,10 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, put_uobj_pd: if (cmd.flags & IB_MR_REREG_PD) - put_pd_read(pd); + uobj_put_obj_read(pd); put_uobjs: - - put_uobj_write(mr->uobject); + uobj_put_write(uobj); return ret; } @@ -1156,37 +811,20 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, int out_len) { struct ib_uverbs_dereg_mr cmd; - struct ib_mr *mr; struct ib_uobject *uobj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext); - if (!uobj) - return -EINVAL; + uobj = uobj_get_write(uobj_get_type(mr), cmd.mr_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - mr = uobj->object; + ret = uobj_remove_commit(uobj); - ret = ib_dereg_mr(mr); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); - - if (ret) - return ret; - - idr_remove_uobj(&ib_uverbs_mr_idr, uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - put_uobj(uobj); - - return in_len; + return ret ?: in_len; } ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, @@ -1208,14 +846,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); - if (!uobj) - return -ENOMEM; + uobj = uobj_alloc(uobj_get_type(mw), file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - init_uobj(uobj, 0, file->ucontext, &mw_lock_class); - down_write(&uobj->mutex); - - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_free; @@ -1238,9 +873,6 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, atomic_inc(&pd->usecnt); uobj->object = mw; - ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj); - if (ret) - goto err_unalloc; memset(&resp, 0, sizeof(resp)); resp.rkey = mw->rkey; @@ -1252,29 +884,17 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, goto err_copy; } - put_pd_read(pd); - - mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->mw_list); - mutex_unlock(&file->mutex); - - uobj->live = 1; - - up_write(&uobj->mutex); + uobj_put_obj_read(pd); + uobj_alloc_commit(uobj); return in_len; err_copy: - idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - -err_unalloc: uverbs_dealloc_mw(mw); - err_put: - put_pd_read(pd); - + uobj_put_obj_read(pd); err_free: - put_uobj_write(uobj); + uobj_alloc_abort(uobj); return ret; } @@ -1284,37 +904,19 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, int out_len) { struct ib_uverbs_dealloc_mw cmd; - struct ib_mw *mw; struct ib_uobject *uobj; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext); - if (!uobj) - return -EINVAL; - - mw = uobj->object; + uobj = uobj_get_write(uobj_get_type(mw), cmd.mw_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - ret = uverbs_dealloc_mw(mw); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); - - if (ret) - return ret; - - idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - put_uobj(uobj); - - return in_len; + ret = uobj_remove_commit(uobj); + return ret ?: in_len; } ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, @@ -1324,8 +926,8 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, { struct ib_uverbs_create_comp_channel cmd; struct ib_uverbs_create_comp_channel_resp resp; - struct file *filp; - int ret; + struct ib_uobject *uobj; + struct ib_uverbs_completion_event_file *ev_file; if (out_len < sizeof resp) return -ENOSPC; @@ -1333,25 +935,23 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) - return ret; - resp.fd = ret; + uobj = uobj_alloc(uobj_get_type(comp_channel), file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - filp = ib_uverbs_alloc_event_file(file, ib_dev, 0); - if (IS_ERR(filp)) { - put_unused_fd(resp.fd); - return PTR_ERR(filp); - } + resp.fd = uobj->id; + + ev_file = container_of(uobj, struct ib_uverbs_completion_event_file, + uobj_file.uobj); + ib_uverbs_init_event_queue(&ev_file->ev_queue); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { - put_unused_fd(resp.fd); - fput(filp); + uobj_alloc_abort(uobj); return -EFAULT; } - fd_install(resp.fd, filp); + uobj_alloc_commit(uobj); return in_len; } @@ -1369,7 +969,7 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, void *context) { struct ib_ucq_object *obj; - struct ib_uverbs_event_file *ev_file = NULL; + struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_cq *cq; int ret; struct ib_uverbs_ex_create_cq_resp resp; @@ -1378,21 +978,21 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, if (cmd->comp_vector >= file->device->num_comp_vectors) return ERR_PTR(-EINVAL); - obj = kmalloc(sizeof *obj, GFP_KERNEL); - if (!obj) - return ERR_PTR(-ENOMEM); - - init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, &cq_lock_class); - down_write(&obj->uobject.mutex); + obj = (struct ib_ucq_object *)uobj_alloc(uobj_get_type(cq), + file->ucontext); + if (IS_ERR(obj)) + return obj; if (cmd->comp_channel >= 0) { - ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel); - if (!ev_file) { - ret = -EINVAL; + ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, + file->ucontext); + if (IS_ERR(ev_file)) { + ret = PTR_ERR(ev_file); goto err; } } + obj->uobject.user_handle = cmd->user_handle; obj->uverbs_file = file; obj->comp_events_reported = 0; obj->async_events_reported = 0; @@ -1405,8 +1005,7 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) attr.flags = cmd->flags; - cq = ib_dev->create_cq(ib_dev, &attr, - file->ucontext, uhw); + cq = ib_dev->create_cq(ib_dev, &attr, file->ucontext, uhw); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_file; @@ -1416,14 +1015,10 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; - cq->cq_context = ev_file; + cq->cq_context = &ev_file->ev_queue; atomic_set(&cq->usecnt, 0); obj->uobject.object = cq; - ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject); - if (ret) - goto err_free; - memset(&resp, 0, sizeof resp); resp.base.cq_handle = obj->uobject.id; resp.base.cqe = cq->cqe; @@ -1435,20 +1030,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, if (ret) goto err_cb; - mutex_lock(&file->mutex); - list_add_tail(&obj->uobject.list, &file->ucontext->cq_list); - mutex_unlock(&file->mutex); - - obj->uobject.live = 1; - - up_write(&obj->uobject.mutex); + uobj_alloc_commit(&obj->uobject); return obj; err_cb: - idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject); - -err_free: ib_destroy_cq(cq); err_file: @@ -1456,7 +1042,7 @@ err_file: ib_uverbs_release_ucq(file, ev_file, obj); err: - put_uobj_write(&obj->uobject); + uobj_alloc_abort(&obj->uobject); return ERR_PTR(ret); } @@ -1579,7 +1165,7 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); if (!cq) return -EINVAL; @@ -1594,7 +1180,7 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, ret = -EFAULT; out: - put_cq_read(cq); + uobj_put_obj_read(cq); return ret ? ret : in_len; } @@ -1641,7 +1227,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); if (!cq) return -EINVAL; @@ -1673,7 +1259,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, ret = in_len; out_put: - put_cq_read(cq); + uobj_put_obj_read(cq); return ret; } @@ -1688,14 +1274,14 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); if (!cq) return -EINVAL; ib_req_notify_cq(cq, cmd.solicited_only ? IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); - put_cq_read(cq); + uobj_put_obj_read(cq); return in_len; } @@ -1710,42 +1296,36 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, struct ib_uobject *uobj; struct ib_cq *cq; struct ib_ucq_object *obj; - struct ib_uverbs_event_file *ev_file; int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext); - if (!uobj) - return -EINVAL; + uobj = uobj_get_write(uobj_get_type(cq), cmd.cq_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + /* + * Make sure we don't free the memory in remove_commit as we still + * needs the uobject memory to create the response. + */ + uverbs_uobject_get(uobj); cq = uobj->object; - ev_file = cq->cq_context; obj = container_of(cq->uobject, struct ib_ucq_object, uobject); - ret = ib_destroy_cq(cq); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); + memset(&resp, 0, sizeof(resp)); - if (ret) + ret = uobj_remove_commit(uobj); + if (ret) { + uverbs_uobject_put(uobj); return ret; + } - idr_remove_uobj(&ib_uverbs_cq_idr, uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - ib_uverbs_release_ucq(file, ev_file, obj); - - memset(&resp, 0, sizeof resp); resp.comp_events_reported = obj->comp_events_reported; resp.async_events_reported = obj->async_events_reported; - put_uobj(uobj); - + uverbs_uobject_put(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) return -EFAULT; @@ -1767,7 +1347,7 @@ static int create_qp(struct ib_uverbs_file *file, struct ib_device *device; struct ib_pd *pd = NULL; struct ib_xrcd *xrcd = NULL; - struct ib_uobject *uninitialized_var(xrcd_uobj); + struct ib_uobject *xrcd_uobj = ERR_PTR(-ENOENT); struct ib_cq *scq = NULL, *rcq = NULL; struct ib_srq *srq = NULL; struct ib_qp *qp; @@ -1781,18 +1361,20 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) return -EPERM; - obj = kzalloc(sizeof *obj, GFP_KERNEL); - if (!obj) - return -ENOMEM; + obj = (struct ib_uqp_object *)uobj_alloc(uobj_get_type(qp), + file->ucontext); + if (IS_ERR(obj)) + return PTR_ERR(obj); + obj->uxrcd = NULL; + obj->uevent.uobject.user_handle = cmd->user_handle; + mutex_init(&obj->mcast_lock); - init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, - &qp_lock_class); - down_write(&obj->uevent.uobject.mutex); if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) + sizeof(cmd->rwq_ind_tbl_handle) && (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { - ind_tbl = idr_read_rwq_indirection_table(cmd->rwq_ind_tbl_handle, - file->ucontext); + ind_tbl = uobj_get_obj_read(rwq_ind_table, + cmd->rwq_ind_tbl_handle, + file->ucontext); if (!ind_tbl) { ret = -EINVAL; goto err_put; @@ -1816,8 +1398,15 @@ static int create_qp(struct ib_uverbs_file *file, has_sq = false; if (cmd->qp_type == IB_QPT_XRC_TGT) { - xrcd = idr_read_xrcd(cmd->pd_handle, file->ucontext, - &xrcd_uobj); + xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd->pd_handle, + file->ucontext); + + if (IS_ERR(xrcd_uobj)) { + ret = -EINVAL; + goto err_put; + } + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; if (!xrcd) { ret = -EINVAL; goto err_put; @@ -1829,8 +1418,8 @@ static int create_qp(struct ib_uverbs_file *file, cmd->max_recv_sge = 0; } else { if (cmd->is_srq) { - srq = idr_read_srq(cmd->srq_handle, - file->ucontext); + srq = uobj_get_obj_read(srq, cmd->srq_handle, + file->ucontext); if (!srq || srq->srq_type != IB_SRQT_BASIC) { ret = -EINVAL; goto err_put; @@ -1839,8 +1428,8 @@ static int create_qp(struct ib_uverbs_file *file, if (!ind_tbl) { if (cmd->recv_cq_handle != cmd->send_cq_handle) { - rcq = idr_read_cq(cmd->recv_cq_handle, - file->ucontext, 0); + rcq = uobj_get_obj_read(cq, cmd->recv_cq_handle, + file->ucontext); if (!rcq) { ret = -EINVAL; goto err_put; @@ -1850,10 +1439,11 @@ static int create_qp(struct ib_uverbs_file *file, } if (has_sq) - scq = idr_read_cq(cmd->send_cq_handle, file->ucontext, !!rcq); + scq = uobj_get_obj_read(cq, cmd->send_cq_handle, + file->ucontext); if (!ind_tbl) rcq = rcq ?: scq; - pd = idr_read_pd(cmd->pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, cmd->pd_handle, file->ucontext); if (!pd || (!scq && has_sq)) { ret = -EINVAL; goto err_put; @@ -1891,7 +1481,8 @@ static int create_qp(struct ib_uverbs_file *file, IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV | - IB_QP_CREATE_SCATTER_FCS)) { + IB_QP_CREATE_SCATTER_FCS | + IB_QP_CREATE_CVLAN_STRIPPING)) { ret = -EINVAL; goto err_put; } @@ -1915,6 +1506,10 @@ static int create_qp(struct ib_uverbs_file *file, } if (cmd->qp_type != IB_QPT_XRC_TGT) { + ret = ib_create_qp_security(qp, device); + if (ret) + goto err_cb; + qp->real_qp = qp; qp->device = device; qp->pd = pd; @@ -1939,9 +1534,6 @@ static int create_qp(struct ib_uverbs_file *file, qp->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = qp; - ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); - if (ret) - goto err_destroy; memset(&resp, 0, sizeof resp); resp.base.qpn = qp->qp_num; @@ -1963,50 +1555,41 @@ static int create_qp(struct ib_uverbs_file *file, obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); - put_xrcd_read(xrcd_uobj); + uobj_put_read(xrcd_uobj); } if (pd) - put_pd_read(pd); + uobj_put_obj_read(pd); if (scq) - put_cq_read(scq); + uobj_put_obj_read(scq); if (rcq && rcq != scq) - put_cq_read(rcq); + uobj_put_obj_read(rcq); if (srq) - put_srq_read(srq); + uobj_put_obj_read(srq); if (ind_tbl) - put_rwq_indirection_table_read(ind_tbl); + uobj_put_obj_read(ind_tbl); - mutex_lock(&file->mutex); - list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); - mutex_unlock(&file->mutex); - - obj->uevent.uobject.live = 1; - - up_write(&obj->uevent.uobject.mutex); + uobj_alloc_commit(&obj->uevent.uobject); return 0; err_cb: - idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); - -err_destroy: ib_destroy_qp(qp); err_put: - if (xrcd) - put_xrcd_read(xrcd_uobj); + if (!IS_ERR(xrcd_uobj)) + uobj_put_read(xrcd_uobj); if (pd) - put_pd_read(pd); + uobj_put_obj_read(pd); if (scq) - put_cq_read(scq); + uobj_put_obj_read(scq); if (rcq && rcq != scq) - put_cq_read(rcq); + uobj_put_obj_read(rcq); if (srq) - put_srq_read(srq); + uobj_put_obj_read(srq); if (ind_tbl) - put_rwq_indirection_table_read(ind_tbl); + uobj_put_obj_read(ind_tbl); - put_uobj_write(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject); return ret; } @@ -2142,17 +1725,22 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - obj = kmalloc(sizeof *obj, GFP_KERNEL); - if (!obj) - return -ENOMEM; + obj = (struct ib_uqp_object *)uobj_alloc(uobj_get_type(qp), + file->ucontext); + if (IS_ERR(obj)) + return PTR_ERR(obj); - init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class); - down_write(&obj->uevent.uobject.mutex); + xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd.pd_handle, + file->ucontext); + if (IS_ERR(xrcd_uobj)) { + ret = -EINVAL; + goto err_put; + } - xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj); + xrcd = (struct ib_xrcd *)xrcd_uobj->object; if (!xrcd) { ret = -EINVAL; - goto err_put; + goto err_xrcd; } attr.event_handler = ib_uverbs_qp_event_handler; @@ -2167,15 +1755,11 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, qp = ib_open_qp(xrcd, &attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); - goto err_put; + goto err_xrcd; } - qp->uobject = &obj->uevent.uobject; - obj->uevent.uobject.object = qp; - ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); - if (ret) - goto err_destroy; + obj->uevent.uobject.user_handle = cmd.user_handle; memset(&resp, 0, sizeof resp); resp.qpn = qp->qp_num; @@ -2184,32 +1768,25 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) { ret = -EFAULT; - goto err_remove; + goto err_destroy; } obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); - put_xrcd_read(xrcd_uobj); - - mutex_lock(&file->mutex); - list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); - mutex_unlock(&file->mutex); + qp->uobject = &obj->uevent.uobject; + uobj_put_read(xrcd_uobj); - obj->uevent.uobject.live = 1; - up_write(&obj->uevent.uobject.mutex); + uobj_alloc_commit(&obj->uevent.uobject); return in_len; -err_remove: - idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); - err_destroy: ib_destroy_qp(qp); - +err_xrcd: + uobj_put_read(xrcd_uobj); err_put: - put_xrcd_read(xrcd_uobj); - put_uobj_write(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject); return ret; } @@ -2223,6 +1800,7 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, struct ib_qp *qp; struct ib_qp_attr *attr; struct ib_qp_init_attr *init_attr; + const struct ib_global_route *grh; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) @@ -2235,7 +1813,7 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, goto out; } - qp = idr_read_qp(cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); if (!qp) { ret = -EINVAL; goto out; @@ -2243,7 +1821,7 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr); - put_qp_read(qp); + uobj_put_obj_read(qp); if (ret) goto out; @@ -2272,29 +1850,39 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, resp.alt_port_num = attr->alt_port_num; resp.alt_timeout = attr->alt_timeout; - memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16); - resp.dest.flow_label = attr->ah_attr.grh.flow_label; - resp.dest.sgid_index = attr->ah_attr.grh.sgid_index; - resp.dest.hop_limit = attr->ah_attr.grh.hop_limit; - resp.dest.traffic_class = attr->ah_attr.grh.traffic_class; - resp.dest.dlid = attr->ah_attr.dlid; - resp.dest.sl = attr->ah_attr.sl; - resp.dest.src_path_bits = attr->ah_attr.src_path_bits; - resp.dest.static_rate = attr->ah_attr.static_rate; - resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH); - resp.dest.port_num = attr->ah_attr.port_num; - - memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); - resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; - resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; - resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; - resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; - resp.alt_dest.dlid = attr->alt_ah_attr.dlid; - resp.alt_dest.sl = attr->alt_ah_attr.sl; - resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; - resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate; - resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH); - resp.alt_dest.port_num = attr->alt_ah_attr.port_num; + resp.dest.dlid = rdma_ah_get_dlid(&attr->ah_attr); + resp.dest.sl = rdma_ah_get_sl(&attr->ah_attr); + resp.dest.src_path_bits = rdma_ah_get_path_bits(&attr->ah_attr); + resp.dest.static_rate = rdma_ah_get_static_rate(&attr->ah_attr); + resp.dest.is_global = !!(rdma_ah_get_ah_flags(&attr->ah_attr) & + IB_AH_GRH); + if (resp.dest.is_global) { + grh = rdma_ah_read_grh(&attr->ah_attr); + memcpy(resp.dest.dgid, grh->dgid.raw, 16); + resp.dest.flow_label = grh->flow_label; + resp.dest.sgid_index = grh->sgid_index; + resp.dest.hop_limit = grh->hop_limit; + resp.dest.traffic_class = grh->traffic_class; + } + resp.dest.port_num = rdma_ah_get_port_num(&attr->ah_attr); + + resp.alt_dest.dlid = rdma_ah_get_dlid(&attr->alt_ah_attr); + resp.alt_dest.sl = rdma_ah_get_sl(&attr->alt_ah_attr); + resp.alt_dest.src_path_bits = rdma_ah_get_path_bits(&attr->alt_ah_attr); + resp.alt_dest.static_rate + = rdma_ah_get_static_rate(&attr->alt_ah_attr); + resp.alt_dest.is_global + = !!(rdma_ah_get_ah_flags(&attr->alt_ah_attr) & + IB_AH_GRH); + if (resp.alt_dest.is_global) { + grh = rdma_ah_read_grh(&attr->alt_ah_attr); + memcpy(resp.alt_dest.dgid, grh->dgid.raw, 16); + resp.alt_dest.flow_label = grh->flow_label; + resp.alt_dest.sgid_index = grh->sgid_index; + resp.alt_dest.hop_limit = grh->hop_limit; + resp.alt_dest.traffic_class = grh->traffic_class; + } + resp.alt_dest.port_num = rdma_ah_get_port_num(&attr->alt_ah_attr); resp.max_send_wr = init_attr->cap.max_send_wr; resp.max_recv_wr = init_attr->cap.max_recv_wr; @@ -2339,12 +1927,18 @@ static int modify_qp(struct ib_uverbs_file *file, if (!attr) return -ENOMEM; - qp = idr_read_qp(cmd->base.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, cmd->base.qp_handle, file->ucontext); if (!qp) { ret = -EINVAL; goto out; } + if ((cmd->base.attr_mask & IB_QP_PORT) && + !rdma_is_port_valid(qp->device, cmd->base.port_num)) { + ret = -EINVAL; + goto release_qp; + } + attr->qp_state = cmd->base.qp_state; attr->cur_qp_state = cmd->base.cur_qp_state; attr->path_mtu = cmd->base.path_mtu; @@ -2368,51 +1962,55 @@ static int modify_qp(struct ib_uverbs_file *file, attr->alt_timeout = cmd->base.alt_timeout; attr->rate_limit = cmd->rate_limit; - memcpy(attr->ah_attr.grh.dgid.raw, cmd->base.dest.dgid, 16); - attr->ah_attr.grh.flow_label = cmd->base.dest.flow_label; - attr->ah_attr.grh.sgid_index = cmd->base.dest.sgid_index; - attr->ah_attr.grh.hop_limit = cmd->base.dest.hop_limit; - attr->ah_attr.grh.traffic_class = cmd->base.dest.traffic_class; - attr->ah_attr.dlid = cmd->base.dest.dlid; - attr->ah_attr.sl = cmd->base.dest.sl; - attr->ah_attr.src_path_bits = cmd->base.dest.src_path_bits; - attr->ah_attr.static_rate = cmd->base.dest.static_rate; - attr->ah_attr.ah_flags = cmd->base.dest.is_global ? - IB_AH_GRH : 0; - attr->ah_attr.port_num = cmd->base.dest.port_num; - - memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd->base.alt_dest.dgid, 16); - attr->alt_ah_attr.grh.flow_label = cmd->base.alt_dest.flow_label; - attr->alt_ah_attr.grh.sgid_index = cmd->base.alt_dest.sgid_index; - attr->alt_ah_attr.grh.hop_limit = cmd->base.alt_dest.hop_limit; - attr->alt_ah_attr.grh.traffic_class = cmd->base.alt_dest.traffic_class; - attr->alt_ah_attr.dlid = cmd->base.alt_dest.dlid; - attr->alt_ah_attr.sl = cmd->base.alt_dest.sl; - attr->alt_ah_attr.src_path_bits = cmd->base.alt_dest.src_path_bits; - attr->alt_ah_attr.static_rate = cmd->base.alt_dest.static_rate; - attr->alt_ah_attr.ah_flags = cmd->base.alt_dest.is_global ? - IB_AH_GRH : 0; - attr->alt_ah_attr.port_num = cmd->base.alt_dest.port_num; - - if (qp->real_qp == qp) { - if (cmd->base.attr_mask & IB_QP_AV) { - ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); - if (ret) - goto release_qp; - } - ret = qp->device->modify_qp(qp, attr, - modify_qp_mask(qp->qp_type, - cmd->base.attr_mask), - udata); + attr->ah_attr.type = rdma_ah_find_type(qp->device, + cmd->base.dest.port_num); + if (cmd->base.dest.is_global) { + rdma_ah_set_grh(&attr->ah_attr, NULL, + cmd->base.dest.flow_label, + cmd->base.dest.sgid_index, + cmd->base.dest.hop_limit, + cmd->base.dest.traffic_class); + rdma_ah_set_dgid_raw(&attr->ah_attr, cmd->base.dest.dgid); + } else { + rdma_ah_set_ah_flags(&attr->ah_attr, 0); + } + rdma_ah_set_dlid(&attr->ah_attr, cmd->base.dest.dlid); + rdma_ah_set_sl(&attr->ah_attr, cmd->base.dest.sl); + rdma_ah_set_path_bits(&attr->ah_attr, cmd->base.dest.src_path_bits); + rdma_ah_set_static_rate(&attr->ah_attr, cmd->base.dest.static_rate); + rdma_ah_set_port_num(&attr->ah_attr, + cmd->base.dest.port_num); + + attr->alt_ah_attr.type = rdma_ah_find_type(qp->device, + cmd->base.dest.port_num); + if (cmd->base.alt_dest.is_global) { + rdma_ah_set_grh(&attr->alt_ah_attr, NULL, + cmd->base.alt_dest.flow_label, + cmd->base.alt_dest.sgid_index, + cmd->base.alt_dest.hop_limit, + cmd->base.alt_dest.traffic_class); + rdma_ah_set_dgid_raw(&attr->alt_ah_attr, + cmd->base.alt_dest.dgid); } else { - ret = ib_modify_qp(qp, attr, - modify_qp_mask(qp->qp_type, - cmd->base.attr_mask)); + rdma_ah_set_ah_flags(&attr->alt_ah_attr, 0); } -release_qp: - put_qp_read(qp); + rdma_ah_set_dlid(&attr->alt_ah_attr, cmd->base.alt_dest.dlid); + rdma_ah_set_sl(&attr->alt_ah_attr, cmd->base.alt_dest.sl); + rdma_ah_set_path_bits(&attr->alt_ah_attr, + cmd->base.alt_dest.src_path_bits); + rdma_ah_set_static_rate(&attr->alt_ah_attr, + cmd->base.alt_dest.static_rate); + rdma_ah_set_port_num(&attr->alt_ah_attr, + cmd->base.alt_dest.port_num); + + ret = ib_modify_qp_with_udata(qp, attr, + modify_qp_mask(qp->qp_type, + cmd->base.attr_mask), + udata); +release_qp: + uobj_put_obj_read(qp); out: kfree(attr); @@ -2489,7 +2087,6 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, struct ib_uverbs_destroy_qp cmd; struct ib_uverbs_destroy_qp_resp resp; struct ib_uobject *uobj; - struct ib_qp *qp; struct ib_uqp_object *obj; int ret = -EINVAL; @@ -2498,40 +2095,26 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, memset(&resp, 0, sizeof resp); - uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext); - if (!uobj) - return -EINVAL; - qp = uobj->object; - obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); - - if (!list_empty(&obj->mcast_list)) { - put_uobj_write(uobj); - return -EBUSY; - } - - ret = ib_destroy_qp(qp); - if (!ret) - uobj->live = 0; + uobj = uobj_get_write(uobj_get_type(qp), cmd.qp_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - put_uobj_write(uobj); + obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); + /* + * Make sure we don't free the memory in remove_commit as we still + * needs the uobject memory to create the response. + */ + uverbs_uobject_get(uobj); - if (ret) + ret = uobj_remove_commit(uobj); + if (ret) { + uverbs_uobject_put(uobj); return ret; - - if (obj->uxrcd) - atomic_dec(&obj->uxrcd->refcnt); - - idr_remove_uobj(&ib_uverbs_qp_idr, uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - ib_uverbs_release_uevent(file, &obj->uevent); + } resp.events_reported = obj->uevent.events_reported; - - put_uobj(uobj); + uverbs_uobject_put(uobj); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -2542,9 +2125,13 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, static void *alloc_wr(size_t wr_size, __u32 num_sge) { + if (num_sge >= (U32_MAX - ALIGN(wr_size, sizeof (struct ib_sge))) / + sizeof (struct ib_sge)) + return NULL; + return kmalloc(ALIGN(wr_size, sizeof (struct ib_sge)) + num_sge * sizeof (struct ib_sge), GFP_KERNEL); -}; +} ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, struct ib_device *ib_dev, @@ -2575,7 +2162,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, if (!user_wr) return -ENOMEM; - qp = idr_read_qp(cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); if (!qp) goto out; @@ -2611,7 +2198,8 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, goto out_put; } - ud->ah = idr_read_ah(user_wr->wr.ud.ah, file->ucontext); + ud->ah = uobj_get_obj_read(ah, user_wr->wr.ud.ah, + file->ucontext); if (!ud->ah) { kfree(ud); ret = -EINVAL; @@ -2718,11 +2306,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, ret = -EFAULT; out_put: - put_qp_read(qp); + uobj_put_obj_read(qp); while (wr) { if (is_ud && ud_wr(wr)->ah) - put_ah_read(ud_wr(wr)->ah); + uobj_put_obj_read(ud_wr(wr)->ah); next = wr->next; kfree(wr); wr = next; @@ -2771,6 +2359,13 @@ static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf, goto err; } + if (user_wr->num_sge >= + (U32_MAX - ALIGN(sizeof *next, sizeof (struct ib_sge))) / + sizeof (struct ib_sge)) { + ret = -EINVAL; + goto err; + } + next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) + user_wr->num_sge * sizeof (struct ib_sge), GFP_KERNEL); @@ -2839,21 +2434,21 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, if (IS_ERR(wr)) return PTR_ERR(wr); - qp = idr_read_qp(cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); if (!qp) goto out; resp.bad_wr = 0; ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr); - put_qp_read(qp); - - if (ret) + uobj_put_obj_read(qp); + if (ret) { for (next = wr; next; next = next->next) { ++resp.bad_wr; if (next == bad_wr) break; } + } if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -2889,14 +2484,14 @@ ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, if (IS_ERR(wr)) return PTR_ERR(wr); - srq = idr_read_srq(cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, cmd.srq_handle, file->ucontext); if (!srq) goto out; resp.bad_wr = 0; ret = srq->device->post_srq_recv(srq, wr, &bad_wr); - put_srq_read(srq); + uobj_put_obj_read(srq); if (ret) for (next = wr; next; next = next->next) { @@ -2929,9 +2524,10 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, struct ib_uobject *uobj; struct ib_pd *pd; struct ib_ah *ah; - struct ib_ah_attr attr; + struct rdma_ah_attr attr; int ret; struct ib_udata udata; + u8 *dmac; if (out_len < sizeof resp) return -ENOSPC; @@ -2939,35 +2535,42 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; + if (!rdma_is_port_valid(ib_dev, cmd.attr.port_num)) + return -EINVAL; + INIT_UDATA(&udata, buf + sizeof(cmd), (unsigned long)cmd.response + sizeof(resp), in_len - sizeof(cmd), out_len - sizeof(resp)); - uobj = kmalloc(sizeof *uobj, GFP_KERNEL); - if (!uobj) - return -ENOMEM; - - init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class); - down_write(&uobj->mutex); + uobj = uobj_alloc(uobj_get_type(ah), file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err; } - attr.dlid = cmd.attr.dlid; - attr.sl = cmd.attr.sl; - attr.src_path_bits = cmd.attr.src_path_bits; - attr.static_rate = cmd.attr.static_rate; - attr.ah_flags = cmd.attr.is_global ? IB_AH_GRH : 0; - attr.port_num = cmd.attr.port_num; - attr.grh.flow_label = cmd.attr.grh.flow_label; - attr.grh.sgid_index = cmd.attr.grh.sgid_index; - attr.grh.hop_limit = cmd.attr.grh.hop_limit; - attr.grh.traffic_class = cmd.attr.grh.traffic_class; - memset(&attr.dmac, 0, sizeof(attr.dmac)); - memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16); + attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num); + rdma_ah_set_dlid(&attr, cmd.attr.dlid); + rdma_ah_set_sl(&attr, cmd.attr.sl); + rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits); + rdma_ah_set_static_rate(&attr, cmd.attr.static_rate); + rdma_ah_set_port_num(&attr, cmd.attr.port_num); + + if (cmd.attr.is_global) { + rdma_ah_set_grh(&attr, NULL, cmd.attr.grh.flow_label, + cmd.attr.grh.sgid_index, + cmd.attr.grh.hop_limit, + cmd.attr.grh.traffic_class); + rdma_ah_set_dgid_raw(&attr, cmd.attr.grh.dgid); + } else { + rdma_ah_set_ah_flags(&attr, 0); + } + dmac = rdma_ah_retrieve_dmac(&attr); + if (dmac) + memset(dmac, 0, ETH_ALEN); ah = pd->device->create_ah(pd, &attr, &udata); @@ -2980,12 +2583,9 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, ah->pd = pd; atomic_inc(&pd->usecnt); ah->uobject = uobj; + uobj->user_handle = cmd.user_handle; uobj->object = ah; - ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj); - if (ret) - goto err_destroy; - resp.ah_handle = uobj->id; if (copy_to_user((void __user *) (unsigned long) cmd.response, @@ -2994,29 +2594,19 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, goto err_copy; } - put_pd_read(pd); - - mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->ah_list); - mutex_unlock(&file->mutex); - - uobj->live = 1; - - up_write(&uobj->mutex); + uobj_put_obj_read(pd); + uobj_alloc_commit(uobj); return in_len; err_copy: - idr_remove_uobj(&ib_uverbs_ah_idr, uobj); - -err_destroy: - ib_destroy_ah(ah); + rdma_destroy_ah(ah); err_put: - put_pd_read(pd); + uobj_put_obj_read(pd); err: - put_uobj_write(uobj); + uobj_alloc_abort(uobj); return ret; } @@ -3025,36 +2615,19 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_ah cmd; - struct ib_ah *ah; struct ib_uobject *uobj; int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext); - if (!uobj) - return -EINVAL; - ah = uobj->object; - - ret = ib_destroy_ah(ah); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); - - if (ret) - return ret; - - idr_remove_uobj(&ib_uverbs_ah_idr, uobj); + uobj = uobj_get_write(uobj_get_type(ah), cmd.ah_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - put_uobj(uobj); - - return in_len; + ret = uobj_remove_commit(uobj); + return ret ?: in_len; } ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, @@ -3071,12 +2644,13 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - qp = idr_write_qp(cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + mutex_lock(&obj->mcast_lock); list_for_each_entry(mcast, &obj->mcast_list, list) if (cmd.mlid == mcast->lid && !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { @@ -3100,7 +2674,8 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, kfree(mcast); out_put: - put_qp_write(qp); + mutex_unlock(&obj->mcast_lock); + uobj_put_obj_read(qp); return ret ? ret : in_len; } @@ -3115,34 +2690,66 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; int ret = -EINVAL; + bool found = false; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - qp = idr_write_qp(cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); if (!qp) return -EINVAL; - ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid); - if (ret) - goto out_put; - obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject); + mutex_lock(&obj->mcast_lock); list_for_each_entry(mcast, &obj->mcast_list, list) if (cmd.mlid == mcast->lid && !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) { list_del(&mcast->list); kfree(mcast); + found = true; break; } -out_put: - put_qp_write(qp); + if (!found) { + ret = -EINVAL; + goto out_put; + } + ret = ib_detach_mcast(qp, (union ib_gid *)cmd.gid, cmd.mlid); + +out_put: + mutex_unlock(&obj->mcast_lock); + uobj_put_obj_read(qp); return ret ? ret : in_len; } +static int kern_spec_to_ib_spec_action(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + ib_spec->type = kern_spec->type; + switch (ib_spec->type) { + case IB_FLOW_SPEC_ACTION_TAG: + if (kern_spec->flow_tag.size != + sizeof(struct ib_uverbs_flow_spec_action_tag)) + return -EINVAL; + + ib_spec->flow_tag.size = sizeof(struct ib_flow_spec_action_tag); + ib_spec->flow_tag.tag_id = kern_spec->flow_tag.tag_id; + break; + case IB_FLOW_SPEC_ACTION_DROP: + if (kern_spec->drop.size != + sizeof(struct ib_uverbs_flow_spec_action_drop)) + return -EINVAL; + + ib_spec->drop.size = sizeof(struct ib_flow_spec_action_drop); + break; + default: + return -EINVAL; + } + return 0; +} + static size_t kern_spec_filter_sz(struct ib_uverbs_flow_spec_hdr *spec) { /* Returns user space filter size, includes padding */ @@ -3167,8 +2774,8 @@ static ssize_t spec_filter_size(void *kern_spec_filter, u16 kern_filter_size, return kern_filter_size; } -static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, - union ib_flow_spec *ib_spec) +static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) { ssize_t actual_filter_sz; ssize_t kern_filter_sz; @@ -3263,6 +2870,18 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, return 0; } +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, + union ib_flow_spec *ib_spec) +{ + if (kern_spec->reserved) + return -EINVAL; + + if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG) + return kern_spec_to_ib_spec_action(kern_spec, ib_spec); + else + return kern_spec_to_ib_spec_filter(kern_spec, ib_spec); +} + int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, struct ib_device *ib_dev, struct ib_udata *ucore, @@ -3300,20 +2919,18 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EOPNOTSUPP; - obj = kmalloc(sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; + obj = (struct ib_uwq_object *)uobj_alloc(uobj_get_type(wq), + file->ucontext); + if (IS_ERR(obj)) + return PTR_ERR(obj); - init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, - &wq_lock_class); - down_write(&obj->uevent.uobject.mutex); - pd = idr_read_pd(cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, cmd.pd_handle, file->ucontext); if (!pd) { err = -EINVAL; goto err_uobj; } - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + cq = uobj_get_obj_read(cq, cmd.cq_handle, file->ucontext); if (!cq) { err = -EINVAL; goto err_put_pd; @@ -3325,6 +2942,9 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, wq_init_attr.wq_context = file; wq_init_attr.wq_type = cmd.wq_type; wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + if (ucore->inlen >= (offsetof(typeof(cmd), create_flags) + + sizeof(cmd.create_flags))) + wq_init_attr.create_flags = cmd.create_flags; obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); wq = pd->device->create_wq(pd, &wq_init_attr, uhw); @@ -3345,9 +2965,6 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, atomic_inc(&cq->usecnt); wq->uobject = &obj->uevent.uobject; obj->uevent.uobject.object = wq; - err = idr_add_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); - if (err) - goto destroy_wq; memset(&resp, 0, sizeof(resp)); resp.wq_handle = obj->uevent.uobject.id; @@ -3360,27 +2977,19 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, if (err) goto err_copy; - put_pd_read(pd); - put_cq_read(cq); - - mutex_lock(&file->mutex); - list_add_tail(&obj->uevent.uobject.list, &file->ucontext->wq_list); - mutex_unlock(&file->mutex); - - obj->uevent.uobject.live = 1; - up_write(&obj->uevent.uobject.mutex); + uobj_put_obj_read(pd); + uobj_put_obj_read(cq); + uobj_alloc_commit(&obj->uevent.uobject); return 0; err_copy: - idr_remove_uobj(&ib_uverbs_wq_idr, &obj->uevent.uobject); -destroy_wq: ib_destroy_wq(wq); err_put_cq: - put_cq_read(cq); + uobj_put_obj_read(cq); err_put_pd: - put_pd_read(pd); + uobj_put_obj_read(pd); err_uobj: - put_uobj_write(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject); return err; } @@ -3392,7 +3001,6 @@ int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, { struct ib_uverbs_ex_destroy_wq cmd = {}; struct ib_uverbs_ex_destroy_wq_resp resp = {}; - struct ib_wq *wq; struct ib_uobject *uobj; struct ib_uwq_object *obj; size_t required_cmd_sz; @@ -3421,36 +3029,25 @@ int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, return -EOPNOTSUPP; resp.response_length = required_resp_len; - uobj = idr_write_uobj(&ib_uverbs_wq_idr, cmd.wq_handle, - file->ucontext); - if (!uobj) - return -EINVAL; + uobj = uobj_get_write(uobj_get_type(wq), cmd.wq_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - wq = uobj->object; obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); - ret = ib_destroy_wq(wq); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); - if (ret) - return ret; - - idr_remove_uobj(&ib_uverbs_wq_idr, uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); + /* + * Make sure we don't free the memory in remove_commit as we still + * needs the uobject memory to create the response. + */ + uverbs_uobject_get(uobj); - ib_uverbs_release_uevent(file, &obj->uevent); + ret = uobj_remove_commit(uobj); resp.events_reported = obj->uevent.events_reported; - put_uobj(uobj); - - ret = ib_copy_to_udata(ucore, &resp, resp.response_length); + uverbs_uobject_put(uobj); if (ret) return ret; - return 0; + return ib_copy_to_udata(ucore, &resp, resp.response_length); } int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, @@ -3480,17 +3077,21 @@ int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, if (!cmd.attr_mask) return -EINVAL; - if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE)) + if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS)) return -EINVAL; - wq = idr_read_wq(cmd.wq_handle, file->ucontext); + wq = uobj_get_obj_read(wq, cmd.wq_handle, file->ucontext); if (!wq) return -EINVAL; wq_attr.curr_wq_state = cmd.curr_wq_state; wq_attr.wq_state = cmd.wq_state; + if (cmd.attr_mask & IB_WQ_FLAGS) { + wq_attr.flags = cmd.flags; + wq_attr.flags_mask = cmd.flags_mask; + } ret = wq->device->modify_wq(wq, &wq_attr, cmd.attr_mask, uhw); - put_wq_read(wq); + uobj_put_obj_read(wq); return ret; } @@ -3568,7 +3169,8 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, for (num_read_wqs = 0; num_read_wqs < num_wq_handles; num_read_wqs++) { - wq = idr_read_wq(wqs_handles[num_read_wqs], file->ucontext); + wq = uobj_get_obj_read(wq, wqs_handles[num_read_wqs], + file->ucontext); if (!wq) { err = -EINVAL; goto put_wqs; @@ -3577,14 +3179,12 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, wqs[num_read_wqs] = wq; } - uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); - if (!uobj) { - err = -ENOMEM; + uobj = uobj_alloc(uobj_get_type(rwq_ind_table), file->ucontext); + if (IS_ERR(uobj)) { + err = PTR_ERR(uobj); goto put_wqs; } - init_uobj(uobj, 0, file->ucontext, &rwq_ind_table_lock_class); - down_write(&uobj->mutex); init_attr.log_ind_tbl_size = cmd.log_ind_tbl_size; init_attr.ind_tbl = wqs; rwq_ind_tbl = ib_dev->create_rwq_ind_table(ib_dev, &init_attr, uhw); @@ -3604,10 +3204,6 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, for (i = 0; i < num_wq_handles; i++) atomic_inc(&wqs[i]->usecnt); - err = idr_add_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); - if (err) - goto destroy_ind_tbl; - resp.ind_tbl_handle = uobj->id; resp.ind_tbl_num = rwq_ind_tbl->ind_tbl_num; resp.response_length = required_resp_len; @@ -3620,26 +3216,18 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, kfree(wqs_handles); for (j = 0; j < num_read_wqs; j++) - put_wq_read(wqs[j]); - - mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->rwq_ind_tbl_list); - mutex_unlock(&file->mutex); + uobj_put_obj_read(wqs[j]); - uobj->live = 1; - - up_write(&uobj->mutex); + uobj_alloc_commit(uobj); return 0; err_copy: - idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); -destroy_ind_tbl: ib_destroy_rwq_ind_table(rwq_ind_tbl); err_uobj: - put_uobj_write(uobj); + uobj_alloc_abort(uobj); put_wqs: for (j = 0; j < num_read_wqs; j++) - put_wq_read(wqs[j]); + uobj_put_obj_read(wqs[j]); err_free: kfree(wqs_handles); kfree(wqs); @@ -3652,10 +3240,8 @@ int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, struct ib_udata *uhw) { struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {}; - struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_uobject *uobj; int ret; - struct ib_wq **ind_tbl; size_t required_cmd_sz; required_cmd_sz = offsetof(typeof(cmd), ind_tbl_handle) + sizeof(cmd.ind_tbl_handle); @@ -3675,31 +3261,12 @@ int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EOPNOTSUPP; - uobj = idr_write_uobj(&ib_uverbs_rwq_ind_tbl_idr, cmd.ind_tbl_handle, - file->ucontext); - if (!uobj) - return -EINVAL; - rwq_ind_tbl = uobj->object; - ind_tbl = rwq_ind_tbl->ind_tbl; - - ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); + uobj = uobj_get_write(uobj_get_type(rwq_ind_table), cmd.ind_tbl_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - if (ret) - return ret; - - idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - put_uobj(uobj); - kfree(ind_tbl); - return ret; + return uobj_remove_commit(uobj); } int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, @@ -3773,15 +3340,13 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, kern_flow_attr = &cmd.flow_attr; } - uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); - if (!uobj) { - err = -ENOMEM; + uobj = uobj_alloc(uobj_get_type(flow), file->ucontext); + if (IS_ERR(uobj)) { + err = PTR_ERR(uobj); goto err_free_attr; } - init_uobj(uobj, 0, file->ucontext, &rule_lock_class); - down_write(&uobj->mutex); - qp = idr_read_qp(cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, cmd.qp_handle, file->ucontext); if (!qp) { err = -EINVAL; goto err_uobj; @@ -3830,10 +3395,6 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, flow_id->uobject = uobj; uobj->object = flow_id; - err = idr_add_uobj(&ib_uverbs_rule_idr, uobj); - if (err) - goto destroy_flow; - memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; @@ -3842,28 +3403,20 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, if (err) goto err_copy; - put_qp_read(qp); - mutex_lock(&file->mutex); - list_add_tail(&uobj->list, &file->ucontext->rule_list); - mutex_unlock(&file->mutex); - - uobj->live = 1; - - up_write(&uobj->mutex); + uobj_put_obj_read(qp); + uobj_alloc_commit(uobj); kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); return 0; err_copy: - idr_remove_uobj(&ib_uverbs_rule_idr, uobj); -destroy_flow: ib_destroy_flow(flow_id); err_free: kfree(flow_attr); err_put: - put_qp_read(qp); + uobj_put_obj_read(qp); err_uobj: - put_uobj_write(uobj); + uobj_alloc_abort(uobj); err_free_attr: if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); @@ -3876,7 +3429,6 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, struct ib_udata *uhw) { struct ib_uverbs_destroy_flow cmd; - struct ib_flow *flow_id; struct ib_uobject *uobj; int ret; @@ -3890,26 +3442,12 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EINVAL; - uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, - file->ucontext); - if (!uobj) - return -EINVAL; - flow_id = uobj->object; - - ret = ib_destroy_flow(flow_id); - if (!ret) - uobj->live = 0; - - put_uobj_write(uobj); - - idr_remove_uobj(&ib_uverbs_rule_idr, uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - put_uobj(uobj); + uobj = uobj_get_write(uobj_get_type(flow), cmd.flow_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + ret = uobj_remove_commit(uobj); return ret; } @@ -3926,31 +3464,37 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_srq_init_attr attr; int ret; - obj = kmalloc(sizeof *obj, GFP_KERNEL); - if (!obj) - return -ENOMEM; - - init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class); - down_write(&obj->uevent.uobject.mutex); + obj = (struct ib_usrq_object *)uobj_alloc(uobj_get_type(srq), + file->ucontext); + if (IS_ERR(obj)) + return PTR_ERR(obj); if (cmd->srq_type == IB_SRQT_XRC) { - attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj); - if (!attr.ext.xrc.xrcd) { + xrcd_uobj = uobj_get_read(uobj_get_type(xrcd), cmd->xrcd_handle, + file->ucontext); + if (IS_ERR(xrcd_uobj)) { ret = -EINVAL; goto err; } + attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!attr.ext.xrc.xrcd) { + ret = -EINVAL; + goto err_put_xrcd; + } + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); - attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0); + attr.ext.xrc.cq = uobj_get_obj_read(cq, cmd->cq_handle, + file->ucontext); if (!attr.ext.xrc.cq) { ret = -EINVAL; goto err_put_xrcd; } } - pd = idr_read_pd(cmd->pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, cmd->pd_handle, file->ucontext); if (!pd) { ret = -EINVAL; goto err_put_cq; @@ -3990,9 +3534,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, atomic_set(&srq->usecnt, 0); obj->uevent.uobject.object = srq; - ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); - if (ret) - goto err_destroy; + obj->uevent.uobject.user_handle = cmd->user_handle; memset(&resp, 0, sizeof resp); resp.srq_handle = obj->uevent.uobject.id; @@ -4008,42 +3550,32 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, } if (cmd->srq_type == IB_SRQT_XRC) { - put_uobj_read(xrcd_uobj); - put_cq_read(attr.ext.xrc.cq); + uobj_put_read(xrcd_uobj); + uobj_put_obj_read(attr.ext.xrc.cq); } - put_pd_read(pd); - - mutex_lock(&file->mutex); - list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list); - mutex_unlock(&file->mutex); - - obj->uevent.uobject.live = 1; - - up_write(&obj->uevent.uobject.mutex); + uobj_put_obj_read(pd); + uobj_alloc_commit(&obj->uevent.uobject); return 0; err_copy: - idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject); - -err_destroy: ib_destroy_srq(srq); err_put: - put_pd_read(pd); + uobj_put_obj_read(pd); err_put_cq: if (cmd->srq_type == IB_SRQT_XRC) - put_cq_read(attr.ext.xrc.cq); + uobj_put_obj_read(attr.ext.xrc.cq); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { atomic_dec(&obj->uxrcd->refcnt); - put_uobj_read(xrcd_uobj); + uobj_put_read(xrcd_uobj); } err: - put_uobj_write(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject); return ret; } @@ -4128,7 +3660,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); - srq = idr_read_srq(cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, cmd.srq_handle, file->ucontext); if (!srq) return -EINVAL; @@ -4137,7 +3669,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata); - put_srq_read(srq); + uobj_put_obj_read(srq); return ret ? ret : in_len; } @@ -4159,13 +3691,13 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - srq = idr_read_srq(cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, cmd.srq_handle, file->ucontext); if (!srq) return -EINVAL; ret = ib_query_srq(srq, &attr); - put_srq_read(srq); + uobj_put_obj_read(srq); if (ret) return ret; @@ -4191,54 +3723,38 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, struct ib_uverbs_destroy_srq cmd; struct ib_uverbs_destroy_srq_resp resp; struct ib_uobject *uobj; - struct ib_srq *srq; struct ib_uevent_object *obj; int ret = -EINVAL; - struct ib_usrq_object *us; - enum ib_srq_type srq_type; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext); - if (!uobj) - return -EINVAL; - srq = uobj->object; - obj = container_of(uobj, struct ib_uevent_object, uobject); - srq_type = srq->srq_type; + uobj = uobj_get_write(uobj_get_type(srq), cmd.srq_handle, + file->ucontext); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); - ret = ib_destroy_srq(srq); - if (!ret) - uobj->live = 0; + obj = container_of(uobj, struct ib_uevent_object, uobject); + /* + * Make sure we don't free the memory in remove_commit as we still + * needs the uobject memory to create the response. + */ + uverbs_uobject_get(uobj); - put_uobj_write(uobj); + memset(&resp, 0, sizeof(resp)); - if (ret) + ret = uobj_remove_commit(uobj); + if (ret) { + uverbs_uobject_put(uobj); return ret; - - if (srq_type == IB_SRQT_XRC) { - us = container_of(obj, struct ib_usrq_object, uevent); - atomic_dec(&us->uxrcd->refcnt); } - - idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); - - ib_uverbs_release_uevent(file, obj); - - memset(&resp, 0, sizeof resp); resp.events_reported = obj->events_reported; + uverbs_uobject_put(uobj); + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + return -EFAULT; - put_uobj(uobj); - - if (copy_to_user((void __user *) (unsigned long) cmd.response, - &resp, sizeof resp)) - ret = -EFAULT; - - return ret ? ret : in_len; + return in_len; } int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, @@ -4323,6 +3839,12 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.max_wq_type_rq = attr.max_wq_type_rq; resp.response_length += sizeof(resp.max_wq_type_rq); + + if (ucore->outlen < resp.response_length + sizeof(resp.raw_packet_caps)) + goto end; + + resp.raw_packet_caps = attr.raw_packet_caps; + resp.response_length += sizeof(resp.raw_packet_caps); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index b3f95d453fba..3d2609608f58 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -51,6 +51,8 @@ #include <rdma/ib.h> #include "uverbs.h" +#include "core_priv.h" +#include "rdma_core.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace verbs access"); @@ -66,19 +68,6 @@ enum { static struct class *uverbs_class; -DEFINE_SPINLOCK(ib_uverbs_idr_lock); -DEFINE_IDR(ib_uverbs_pd_idr); -DEFINE_IDR(ib_uverbs_mr_idr); -DEFINE_IDR(ib_uverbs_mw_idr); -DEFINE_IDR(ib_uverbs_ah_idr); -DEFINE_IDR(ib_uverbs_cq_idr); -DEFINE_IDR(ib_uverbs_qp_idr); -DEFINE_IDR(ib_uverbs_srq_idr); -DEFINE_IDR(ib_uverbs_xrcd_idr); -DEFINE_IDR(ib_uverbs_rule_idr); -DEFINE_IDR(ib_uverbs_wq_idr); -DEFINE_IDR(ib_uverbs_rwq_ind_tbl_idr); - static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -167,37 +156,37 @@ static struct kobj_type ib_uverbs_dev_ktype = { .release = ib_uverbs_release_dev, }; -static void ib_uverbs_release_event_file(struct kref *ref) +static void ib_uverbs_release_async_event_file(struct kref *ref) { - struct ib_uverbs_event_file *file = - container_of(ref, struct ib_uverbs_event_file, ref); + struct ib_uverbs_async_event_file *file = + container_of(ref, struct ib_uverbs_async_event_file, ref); kfree(file); } void ib_uverbs_release_ucq(struct ib_uverbs_file *file, - struct ib_uverbs_event_file *ev_file, + struct ib_uverbs_completion_event_file *ev_file, struct ib_ucq_object *uobj) { struct ib_uverbs_event *evt, *tmp; if (ev_file) { - spin_lock_irq(&ev_file->lock); + spin_lock_irq(&ev_file->ev_queue.lock); list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) { list_del(&evt->list); kfree(evt); } - spin_unlock_irq(&ev_file->lock); + spin_unlock_irq(&ev_file->ev_queue.lock); - kref_put(&ev_file->ref, ib_uverbs_release_event_file); + uverbs_uobject_put(&ev_file->uobj_file.uobj); } - spin_lock_irq(&file->async_file->lock); + spin_lock_irq(&file->async_file->ev_queue.lock); list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) { list_del(&evt->list); kfree(evt); } - spin_unlock_irq(&file->async_file->lock); + spin_unlock_irq(&file->async_file->ev_queue.lock); } void ib_uverbs_release_uevent(struct ib_uverbs_file *file, @@ -205,16 +194,16 @@ void ib_uverbs_release_uevent(struct ib_uverbs_file *file, { struct ib_uverbs_event *evt, *tmp; - spin_lock_irq(&file->async_file->lock); + spin_lock_irq(&file->async_file->ev_queue.lock); list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) { list_del(&evt->list); kfree(evt); } - spin_unlock_irq(&file->async_file->lock); + spin_unlock_irq(&file->async_file->ev_queue.lock); } -static void ib_uverbs_detach_umcast(struct ib_qp *qp, - struct ib_uqp_object *uobj) +void ib_uverbs_detach_umcast(struct ib_qp *qp, + struct ib_uqp_object *uobj) { struct ib_uverbs_mcast_entry *mcast, *tmp; @@ -226,124 +215,16 @@ static void ib_uverbs_detach_umcast(struct ib_qp *qp, } static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, - struct ib_ucontext *context) + struct ib_ucontext *context, + bool device_removed) { - struct ib_uobject *uobj, *tmp; - context->closing = 1; - - list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) { - struct ib_ah *ah = uobj->object; - - idr_remove_uobj(&ib_uverbs_ah_idr, uobj); - ib_destroy_ah(ah); - kfree(uobj); - } - - /* Remove MWs before QPs, in order to support type 2A MWs. */ - list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) { - struct ib_mw *mw = uobj->object; - - idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - uverbs_dealloc_mw(mw); - kfree(uobj); - } - - list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) { - struct ib_flow *flow_id = uobj->object; - - idr_remove_uobj(&ib_uverbs_rule_idr, uobj); - ib_destroy_flow(flow_id); - kfree(uobj); - } - - list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) { - struct ib_qp *qp = uobj->object; - struct ib_uqp_object *uqp = - container_of(uobj, struct ib_uqp_object, uevent.uobject); - - idr_remove_uobj(&ib_uverbs_qp_idr, uobj); - if (qp == qp->real_qp) - ib_uverbs_detach_umcast(qp, uqp); - ib_destroy_qp(qp); - ib_uverbs_release_uevent(file, &uqp->uevent); - kfree(uqp); - } - - list_for_each_entry_safe(uobj, tmp, &context->rwq_ind_tbl_list, list) { - struct ib_rwq_ind_table *rwq_ind_tbl = uobj->object; - struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; - - idr_remove_uobj(&ib_uverbs_rwq_ind_tbl_idr, uobj); - ib_destroy_rwq_ind_table(rwq_ind_tbl); - kfree(ind_tbl); - kfree(uobj); - } - - list_for_each_entry_safe(uobj, tmp, &context->wq_list, list) { - struct ib_wq *wq = uobj->object; - struct ib_uwq_object *uwq = - container_of(uobj, struct ib_uwq_object, uevent.uobject); - - idr_remove_uobj(&ib_uverbs_wq_idr, uobj); - ib_destroy_wq(wq); - ib_uverbs_release_uevent(file, &uwq->uevent); - kfree(uwq); - } - - list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { - struct ib_srq *srq = uobj->object; - struct ib_uevent_object *uevent = - container_of(uobj, struct ib_uevent_object, uobject); - - idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - ib_destroy_srq(srq); - ib_uverbs_release_uevent(file, uevent); - kfree(uevent); - } - - list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) { - struct ib_cq *cq = uobj->object; - struct ib_uverbs_event_file *ev_file = cq->cq_context; - struct ib_ucq_object *ucq = - container_of(uobj, struct ib_ucq_object, uobject); - - idr_remove_uobj(&ib_uverbs_cq_idr, uobj); - ib_destroy_cq(cq); - ib_uverbs_release_ucq(file, ev_file, ucq); - kfree(ucq); - } - - list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { - struct ib_mr *mr = uobj->object; - - idr_remove_uobj(&ib_uverbs_mr_idr, uobj); - ib_dereg_mr(mr); - kfree(uobj); - } - - mutex_lock(&file->device->xrcd_tree_mutex); - list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) { - struct ib_xrcd *xrcd = uobj->object; - struct ib_uxrcd_object *uxrcd = - container_of(uobj, struct ib_uxrcd_object, uobject); - - idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj); - ib_uverbs_dealloc_xrcd(file->device, xrcd); - kfree(uxrcd); - } - mutex_unlock(&file->device->xrcd_tree_mutex); - - list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) { - struct ib_pd *pd = uobj->object; - - idr_remove_uobj(&ib_uverbs_pd_idr, uobj); - ib_dealloc_pd(pd); - kfree(uobj); - } - + uverbs_cleanup_ucontext(context, device_removed); put_pid(context->tgid); + ib_rdmacg_uncharge(&context->cg_obj, context->device, + RDMACG_RESOURCE_HCA_HANDLE); + return context->device->dealloc_ucontext(context); } @@ -352,7 +233,7 @@ static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) complete(&dev->comp); } -static void ib_uverbs_release_file(struct kref *ref) +void ib_uverbs_release_file(struct kref *ref) { struct ib_uverbs_file *file = container_of(ref, struct ib_uverbs_file, ref); @@ -372,58 +253,54 @@ static void ib_uverbs_release_file(struct kref *ref) kfree(file); } -static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf, - size_t count, loff_t *pos) +static ssize_t ib_uverbs_event_read(struct ib_uverbs_event_queue *ev_queue, + struct ib_uverbs_file *uverbs_file, + struct file *filp, char __user *buf, + size_t count, loff_t *pos, + size_t eventsz) { - struct ib_uverbs_event_file *file = filp->private_data; struct ib_uverbs_event *event; - int eventsz; int ret = 0; - spin_lock_irq(&file->lock); + spin_lock_irq(&ev_queue->lock); - while (list_empty(&file->event_list)) { - spin_unlock_irq(&file->lock); + while (list_empty(&ev_queue->event_list)) { + spin_unlock_irq(&ev_queue->lock); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; - if (wait_event_interruptible(file->poll_wait, - (!list_empty(&file->event_list) || + if (wait_event_interruptible(ev_queue->poll_wait, + (!list_empty(&ev_queue->event_list) || /* The barriers built into wait_event_interruptible() * and wake_up() guarentee this will see the null set * without using RCU */ - !file->uverbs_file->device->ib_dev))) + !uverbs_file->device->ib_dev))) return -ERESTARTSYS; /* If device was disassociated and no event exists set an error */ - if (list_empty(&file->event_list) && - !file->uverbs_file->device->ib_dev) + if (list_empty(&ev_queue->event_list) && + !uverbs_file->device->ib_dev) return -EIO; - spin_lock_irq(&file->lock); + spin_lock_irq(&ev_queue->lock); } - event = list_entry(file->event_list.next, struct ib_uverbs_event, list); - - if (file->is_async) - eventsz = sizeof (struct ib_uverbs_async_event_desc); - else - eventsz = sizeof (struct ib_uverbs_comp_event_desc); + event = list_entry(ev_queue->event_list.next, struct ib_uverbs_event, list); if (eventsz > count) { ret = -EINVAL; event = NULL; } else { - list_del(file->event_list.next); + list_del(ev_queue->event_list.next); if (event->counter) { ++(*event->counter); list_del(&event->obj_list); } } - spin_unlock_irq(&file->lock); + spin_unlock_irq(&ev_queue->lock); if (event) { if (copy_to_user(buf, event, eventsz)) @@ -437,87 +314,158 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf, return ret; } -static unsigned int ib_uverbs_event_poll(struct file *filp, +static ssize_t ib_uverbs_async_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_async_event_file *file = filp->private_data; + + return ib_uverbs_event_read(&file->ev_queue, file->uverbs_file, filp, + buf, count, pos, + sizeof(struct ib_uverbs_async_event_desc)); +} + +static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf, + size_t count, loff_t *pos) +{ + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; + + return ib_uverbs_event_read(&comp_ev_file->ev_queue, + comp_ev_file->uobj_file.ufile, filp, + buf, count, pos, + sizeof(struct ib_uverbs_comp_event_desc)); +} + +static unsigned int ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue, + struct file *filp, struct poll_table_struct *wait) { unsigned int pollflags = 0; - struct ib_uverbs_event_file *file = filp->private_data; - poll_wait(filp, &file->poll_wait, wait); + poll_wait(filp, &ev_queue->poll_wait, wait); - spin_lock_irq(&file->lock); - if (!list_empty(&file->event_list)) + spin_lock_irq(&ev_queue->lock); + if (!list_empty(&ev_queue->event_list)) pollflags = POLLIN | POLLRDNORM; - spin_unlock_irq(&file->lock); + spin_unlock_irq(&ev_queue->lock); return pollflags; } -static int ib_uverbs_event_fasync(int fd, struct file *filp, int on) +static unsigned int ib_uverbs_async_event_poll(struct file *filp, + struct poll_table_struct *wait) +{ + return ib_uverbs_event_poll(filp->private_data, filp, wait); +} + +static unsigned int ib_uverbs_comp_event_poll(struct file *filp, + struct poll_table_struct *wait) { - struct ib_uverbs_event_file *file = filp->private_data; + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; - return fasync_helper(fd, filp, on, &file->async_queue); + return ib_uverbs_event_poll(&comp_ev_file->ev_queue, filp, wait); } -static int ib_uverbs_event_close(struct inode *inode, struct file *filp) +static int ib_uverbs_async_event_fasync(int fd, struct file *filp, int on) { - struct ib_uverbs_event_file *file = filp->private_data; + struct ib_uverbs_event_queue *ev_queue = filp->private_data; + + return fasync_helper(fd, filp, on, &ev_queue->async_queue); +} + +static int ib_uverbs_comp_event_fasync(int fd, struct file *filp, int on) +{ + struct ib_uverbs_completion_event_file *comp_ev_file = + filp->private_data; + + return fasync_helper(fd, filp, on, &comp_ev_file->ev_queue.async_queue); +} + +static int ib_uverbs_async_event_close(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_async_event_file *file = filp->private_data; + struct ib_uverbs_file *uverbs_file = file->uverbs_file; struct ib_uverbs_event *entry, *tmp; int closed_already = 0; - mutex_lock(&file->uverbs_file->device->lists_mutex); - spin_lock_irq(&file->lock); - closed_already = file->is_closed; - file->is_closed = 1; - list_for_each_entry_safe(entry, tmp, &file->event_list, list) { + mutex_lock(&uverbs_file->device->lists_mutex); + spin_lock_irq(&file->ev_queue.lock); + closed_already = file->ev_queue.is_closed; + file->ev_queue.is_closed = 1; + list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) { if (entry->counter) list_del(&entry->obj_list); kfree(entry); } - spin_unlock_irq(&file->lock); + spin_unlock_irq(&file->ev_queue.lock); if (!closed_already) { list_del(&file->list); - if (file->is_async) - ib_unregister_event_handler(&file->uverbs_file-> - event_handler); + ib_unregister_event_handler(&uverbs_file->event_handler); } - mutex_unlock(&file->uverbs_file->device->lists_mutex); + mutex_unlock(&uverbs_file->device->lists_mutex); - kref_put(&file->uverbs_file->ref, ib_uverbs_release_file); - kref_put(&file->ref, ib_uverbs_release_event_file); + kref_put(&uverbs_file->ref, ib_uverbs_release_file); + kref_put(&file->ref, ib_uverbs_release_async_event_file); return 0; } -static const struct file_operations uverbs_event_fops = { +static int ib_uverbs_comp_event_close(struct inode *inode, struct file *filp) +{ + struct ib_uverbs_completion_event_file *file = filp->private_data; + struct ib_uverbs_event *entry, *tmp; + + spin_lock_irq(&file->ev_queue.lock); + list_for_each_entry_safe(entry, tmp, &file->ev_queue.event_list, list) { + if (entry->counter) + list_del(&entry->obj_list); + kfree(entry); + } + spin_unlock_irq(&file->ev_queue.lock); + + uverbs_close_fd(filp); + + return 0; +} + +const struct file_operations uverbs_event_fops = { .owner = THIS_MODULE, - .read = ib_uverbs_event_read, - .poll = ib_uverbs_event_poll, - .release = ib_uverbs_event_close, - .fasync = ib_uverbs_event_fasync, + .read = ib_uverbs_comp_event_read, + .poll = ib_uverbs_comp_event_poll, + .release = ib_uverbs_comp_event_close, + .fasync = ib_uverbs_comp_event_fasync, + .llseek = no_llseek, +}; + +static const struct file_operations uverbs_async_event_fops = { + .owner = THIS_MODULE, + .read = ib_uverbs_async_event_read, + .poll = ib_uverbs_async_event_poll, + .release = ib_uverbs_async_event_close, + .fasync = ib_uverbs_async_event_fasync, .llseek = no_llseek, }; void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) { - struct ib_uverbs_event_file *file = cq_context; + struct ib_uverbs_event_queue *ev_queue = cq_context; struct ib_ucq_object *uobj; struct ib_uverbs_event *entry; unsigned long flags; - if (!file) + if (!ev_queue) return; - spin_lock_irqsave(&file->lock, flags); - if (file->is_closed) { - spin_unlock_irqrestore(&file->lock, flags); + spin_lock_irqsave(&ev_queue->lock, flags); + if (ev_queue->is_closed) { + spin_unlock_irqrestore(&ev_queue->lock, flags); return; } entry = kmalloc(sizeof *entry, GFP_ATOMIC); if (!entry) { - spin_unlock_irqrestore(&file->lock, flags); + spin_unlock_irqrestore(&ev_queue->lock, flags); return; } @@ -526,12 +474,12 @@ void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) entry->desc.comp.cq_handle = cq->uobject->user_handle; entry->counter = &uobj->comp_events_reported; - list_add_tail(&entry->list, &file->event_list); + list_add_tail(&entry->list, &ev_queue->event_list); list_add_tail(&entry->obj_list, &uobj->comp_list); - spin_unlock_irqrestore(&file->lock, flags); + spin_unlock_irqrestore(&ev_queue->lock, flags); - wake_up_interruptible(&file->poll_wait); - kill_fasync(&file->async_queue, SIGIO, POLL_IN); + wake_up_interruptible(&ev_queue->poll_wait); + kill_fasync(&ev_queue->async_queue, SIGIO, POLL_IN); } static void ib_uverbs_async_handler(struct ib_uverbs_file *file, @@ -542,15 +490,15 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file, struct ib_uverbs_event *entry; unsigned long flags; - spin_lock_irqsave(&file->async_file->lock, flags); - if (file->async_file->is_closed) { - spin_unlock_irqrestore(&file->async_file->lock, flags); + spin_lock_irqsave(&file->async_file->ev_queue.lock, flags); + if (file->async_file->ev_queue.is_closed) { + spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); return; } entry = kmalloc(sizeof *entry, GFP_ATOMIC); if (!entry) { - spin_unlock_irqrestore(&file->async_file->lock, flags); + spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); return; } @@ -559,13 +507,13 @@ static void ib_uverbs_async_handler(struct ib_uverbs_file *file, entry->desc.async.reserved = 0; entry->counter = counter; - list_add_tail(&entry->list, &file->async_file->event_list); + list_add_tail(&entry->list, &file->async_file->ev_queue.event_list); if (obj_list) list_add_tail(&entry->obj_list, obj_list); - spin_unlock_irqrestore(&file->async_file->lock, flags); + spin_unlock_irqrestore(&file->async_file->ev_queue.lock, flags); - wake_up_interruptible(&file->async_file->poll_wait); - kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN); + wake_up_interruptible(&file->async_file->ev_queue.poll_wait); + kill_fasync(&file->async_file->ev_queue.async_queue, SIGIO, POLL_IN); } void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) @@ -583,7 +531,7 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr) struct ib_uevent_object *uobj; /* for XRC target qp's, check that qp is live */ - if (!event->element.qp->uobject || !event->element.qp->uobject->live) + if (!event->element.qp->uobject) return; uobj = container_of(event->element.qp->uobject, @@ -628,15 +576,23 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file) { - kref_put(&file->async_file->ref, ib_uverbs_release_event_file); + kref_put(&file->async_file->ref, ib_uverbs_release_async_event_file); file->async_file = NULL; } -struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, - struct ib_device *ib_dev, - int is_async) +void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue) +{ + spin_lock_init(&ev_queue->lock); + INIT_LIST_HEAD(&ev_queue->event_list); + init_waitqueue_head(&ev_queue->poll_wait); + ev_queue->is_closed = 0; + ev_queue->async_queue = NULL; +} + +struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, + struct ib_device *ib_dev) { - struct ib_uverbs_event_file *ev_file; + struct ib_uverbs_async_event_file *ev_file; struct file *filp; int ret; @@ -644,16 +600,11 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, if (!ev_file) return ERR_PTR(-ENOMEM); - kref_init(&ev_file->ref); - spin_lock_init(&ev_file->lock); - INIT_LIST_HEAD(&ev_file->event_list); - init_waitqueue_head(&ev_file->poll_wait); + ib_uverbs_init_event_queue(&ev_file->ev_queue); ev_file->uverbs_file = uverbs_file; kref_get(&ev_file->uverbs_file->ref); - ev_file->async_queue = NULL; - ev_file->is_closed = 0; - - filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops, + kref_init(&ev_file->ref); + filp = anon_inode_getfile("[infinibandevent]", &uverbs_async_event_fops, ev_file, O_RDONLY); if (IS_ERR(filp)) goto err_put_refs; @@ -663,64 +614,33 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, &uverbs_file->device->uverbs_events_file_list); mutex_unlock(&uverbs_file->device->lists_mutex); - if (is_async) { - WARN_ON(uverbs_file->async_file); - uverbs_file->async_file = ev_file; - kref_get(&uverbs_file->async_file->ref); - INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, - ib_dev, - ib_uverbs_event_handler); - ret = ib_register_event_handler(&uverbs_file->event_handler); - if (ret) - goto err_put_file; - - /* At that point async file stuff was fully set */ - ev_file->is_async = 1; - } + WARN_ON(uverbs_file->async_file); + uverbs_file->async_file = ev_file; + kref_get(&uverbs_file->async_file->ref); + INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, + ib_dev, + ib_uverbs_event_handler); + ret = ib_register_event_handler(&uverbs_file->event_handler); + if (ret) + goto err_put_file; + + /* At that point async file stuff was fully set */ return filp; err_put_file: fput(filp); - kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file); + kref_put(&uverbs_file->async_file->ref, + ib_uverbs_release_async_event_file); uverbs_file->async_file = NULL; return ERR_PTR(ret); err_put_refs: kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); - kref_put(&ev_file->ref, ib_uverbs_release_event_file); + kref_put(&ev_file->ref, ib_uverbs_release_async_event_file); return filp; } -/* - * Look up a completion event file by FD. If lookup is successful, - * takes a ref to the event file struct that it returns; if - * unsuccessful, returns NULL. - */ -struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd) -{ - struct ib_uverbs_event_file *ev_file = NULL; - struct fd f = fdget(fd); - - if (!f.file) - return NULL; - - if (f.file->f_op != &uverbs_event_fops) - goto out; - - ev_file = f.file->private_data; - if (ev_file->is_async) { - ev_file = NULL; - goto out; - } - - kref_get(&ev_file->ref); - -out: - fdput(f); - return ev_file; -} - static int verify_command_mask(struct ib_device *ib_dev, __u32 command) { u64 mask; @@ -966,6 +886,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) } file->device = dev; + spin_lock_init(&file->idr_lock); + idr_init(&file->idr); file->ucontext = NULL; file->async_file = NULL; kref_init(&file->ref); @@ -999,10 +921,11 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) mutex_lock(&file->cleanup_mutex); if (file->ucontext) { - ib_uverbs_cleanup_ucontext(file, file->ucontext); + ib_uverbs_cleanup_ucontext(file, file->ucontext, false); file->ucontext = NULL; } mutex_unlock(&file->cleanup_mutex); + idr_destroy(&file->idr); mutex_lock(&file->device->lists_mutex); if (!file->is_closed) { @@ -1012,7 +935,8 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) mutex_unlock(&file->device->lists_mutex); if (file->async_file) - kref_put(&file->async_file->ref, ib_uverbs_release_event_file); + kref_put(&file->async_file->ref, + ib_uverbs_release_async_event_file); kref_put(&file->ref, ib_uverbs_release_file); kobject_put(&dev->kobj); @@ -1169,12 +1093,12 @@ static void ib_uverbs_add_one(struct ib_device *device) cdev_init(&uverbs_dev->cdev, NULL); uverbs_dev->cdev.owner = THIS_MODULE; uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; - uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj; + cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj); kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); if (cdev_add(&uverbs_dev->cdev, base, 1)) goto err_cdev; - uverbs_dev->dev = device_create(uverbs_class, device->dma_device, + uverbs_dev->dev = device_create(uverbs_class, device->dev.parent, uverbs_dev->cdev.dev, uverbs_dev, "uverbs%d", uverbs_dev->devnum); if (IS_ERR(uverbs_dev->dev)) @@ -1211,7 +1135,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, struct ib_device *ib_dev) { struct ib_uverbs_file *file; - struct ib_uverbs_event_file *event_file; + struct ib_uverbs_async_event_file *event_file; struct ib_event event; /* Pending running commands to terminate */ @@ -1248,7 +1172,9 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, * (e.g mmput). */ ib_dev->disassociate_ucontext(ucontext); - ib_uverbs_cleanup_ucontext(file, ucontext); + mutex_lock(&file->cleanup_mutex); + ib_uverbs_cleanup_ucontext(file, ucontext, true); + mutex_unlock(&file->cleanup_mutex); } mutex_lock(&uverbs_dev->lists_mutex); @@ -1258,21 +1184,20 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, while (!list_empty(&uverbs_dev->uverbs_events_file_list)) { event_file = list_first_entry(&uverbs_dev-> uverbs_events_file_list, - struct ib_uverbs_event_file, + struct ib_uverbs_async_event_file, list); - spin_lock_irq(&event_file->lock); - event_file->is_closed = 1; - spin_unlock_irq(&event_file->lock); + spin_lock_irq(&event_file->ev_queue.lock); + event_file->ev_queue.is_closed = 1; + spin_unlock_irq(&event_file->ev_queue.lock); list_del(&event_file->list); - if (event_file->is_async) { - ib_unregister_event_handler(&event_file->uverbs_file-> - event_handler); - event_file->uverbs_file->event_handler.device = NULL; - } + ib_unregister_event_handler( + &event_file->uverbs_file->event_handler); + event_file->uverbs_file->event_handler.device = + NULL; - wake_up_interruptible(&event_file->poll_wait); - kill_fasync(&event_file->async_queue, SIGIO, POLL_IN); + wake_up_interruptible(&event_file->ev_queue.poll_wait); + kill_fasync(&event_file->ev_queue.async_queue, SIGIO, POLL_IN); } mutex_unlock(&uverbs_dev->lists_mutex); } @@ -1376,13 +1301,6 @@ static void __exit ib_uverbs_cleanup(void) unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); if (overflow_maj) unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES); - idr_destroy(&ib_uverbs_pd_idr); - idr_destroy(&ib_uverbs_mr_idr); - idr_destroy(&ib_uverbs_mw_idr); - idr_destroy(&ib_uverbs_ah_idr); - idr_destroy(&ib_uverbs_cq_idr); - idr_destroy(&ib_uverbs_qp_idr); - idr_destroy(&ib_uverbs_srq_idr); } module_init(ib_uverbs_init); diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index af020f80d50f..94fd989c9060 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -34,20 +34,25 @@ #include <rdma/ib_marshall.h> void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, - struct ib_ah_attr *src) + struct rdma_ah_attr *src) { - memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid); - dst->grh.flow_label = src->grh.flow_label; - dst->grh.sgid_index = src->grh.sgid_index; - dst->grh.hop_limit = src->grh.hop_limit; - dst->grh.traffic_class = src->grh.traffic_class; memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); - dst->dlid = src->dlid; - dst->sl = src->sl; - dst->src_path_bits = src->src_path_bits; - dst->static_rate = src->static_rate; - dst->is_global = src->ah_flags & IB_AH_GRH ? 1 : 0; - dst->port_num = src->port_num; + dst->dlid = rdma_ah_get_dlid(src); + dst->sl = rdma_ah_get_sl(src); + dst->src_path_bits = rdma_ah_get_path_bits(src); + dst->static_rate = rdma_ah_get_static_rate(src); + dst->is_global = rdma_ah_get_ah_flags(src) & + IB_AH_GRH ? 1 : 0; + if (dst->is_global) { + const struct ib_global_route *grh = rdma_ah_read_grh(src); + + memcpy(dst->grh.dgid, grh->dgid.raw, sizeof(grh->dgid)); + dst->grh.flow_label = grh->flow_label; + dst->grh.sgid_index = grh->sgid_index; + dst->grh.hop_limit = grh->hop_limit; + dst->grh.traffic_class = grh->traffic_class; + } + dst->port_num = rdma_ah_get_port_num(src); dst->reserved = 0; } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); @@ -91,15 +96,15 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, } EXPORT_SYMBOL(ib_copy_qp_attr_to_user); -void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, - struct ib_sa_path_rec *src) +static void __ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct sa_path_rec *src) { - memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid); - memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid); + memcpy(dst->dgid, src->dgid.raw, sizeof(src->dgid)); + memcpy(dst->sgid, src->sgid.raw, sizeof(src->sgid)); - dst->dlid = src->dlid; - dst->slid = src->slid; - dst->raw_traffic = src->raw_traffic; + dst->dlid = htons(ntohl(sa_path_get_dlid(src))); + dst->slid = htons(ntohl(sa_path_get_slid(src))); + dst->raw_traffic = sa_path_get_raw_traffic(src); dst->flow_label = src->flow_label; dst->hop_limit = src->hop_limit; dst->traffic_class = src->traffic_class; @@ -115,17 +120,43 @@ void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, dst->preference = src->preference; dst->packet_life_time_selector = src->packet_life_time_selector; } + +void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, + struct sa_path_rec *src) +{ + struct sa_path_rec rec; + + if (src->rec_type == SA_PATH_REC_TYPE_OPA) { + sa_convert_path_opa_to_ib(&rec, src); + __ib_copy_path_rec_to_user(dst, &rec); + return; + } + __ib_copy_path_rec_to_user(dst, src); +} EXPORT_SYMBOL(ib_copy_path_rec_to_user); -void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, +void ib_copy_path_rec_from_user(struct sa_path_rec *dst, struct ib_user_path_rec *src) { + __be32 slid, dlid; + + memset(dst, 0, sizeof(*dst)); + if ((ib_is_opa_gid((union ib_gid *)src->sgid)) || + (ib_is_opa_gid((union ib_gid *)src->dgid))) { + dst->rec_type = SA_PATH_REC_TYPE_OPA; + slid = htonl(opa_get_lid_from_gid((union ib_gid *)src->sgid)); + dlid = htonl(opa_get_lid_from_gid((union ib_gid *)src->dgid)); + } else { + dst->rec_type = SA_PATH_REC_TYPE_IB; + slid = htonl(ntohs(src->slid)); + dlid = htonl(ntohs(src->dlid)); + } memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); - dst->dlid = src->dlid; - dst->slid = src->slid; - dst->raw_traffic = src->raw_traffic; + sa_path_set_dlid(dst, dlid); + sa_path_set_slid(dst, slid); + sa_path_set_raw_traffic(dst, src->raw_traffic); dst->flow_label = src->flow_label; dst->hop_limit = src->hop_limit; dst->traffic_class = src->traffic_class; @@ -141,9 +172,9 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, dst->preference = src->preference; dst->packet_life_time_selector = src->packet_life_time_selector; - memset(dst->dmac, 0, sizeof(dst->dmac)); - dst->net = NULL; - dst->ifindex = 0; - dst->gid_type = IB_GID_TYPE_IB; + /* TODO: No need to set this */ + sa_path_set_dmac_zero(dst); + sa_path_set_ndev(dst, NULL); + sa_path_set_ifindex(dst, 0); } EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c new file mode 100644 index 000000000000..ef293379f37a --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <rdma/uverbs_std_types.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> +#include <linux/bug.h> +#include <linux/file.h> +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_ah(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + return rdma_destroy_ah((struct ib_ah *)uobject->object); +} + +static int uverbs_free_flow(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + return ib_destroy_flow((struct ib_flow *)uobject->object); +} + +static int uverbs_free_mw(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + return uverbs_dealloc_mw((struct ib_mw *)uobject->object); +} + +static int uverbs_free_qp(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_qp *qp = uobject->object; + struct ib_uqp_object *uqp = + container_of(uobject, struct ib_uqp_object, uevent.uobject); + int ret; + + if (why == RDMA_REMOVE_DESTROY) { + if (!list_empty(&uqp->mcast_list)) + return -EBUSY; + } else if (qp == qp->real_qp) { + ib_uverbs_detach_umcast(qp, uqp); + } + + ret = ib_destroy_qp(qp); + if (ret && why == RDMA_REMOVE_DESTROY) + return ret; + + if (uqp->uxrcd) + atomic_dec(&uqp->uxrcd->refcnt); + + ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent); + return ret; +} + +static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object; + struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; + int ret; + + ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); + if (!ret || why != RDMA_REMOVE_DESTROY) + kfree(ind_tbl); + return ret; +} + +static int uverbs_free_wq(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_wq *wq = uobject->object; + struct ib_uwq_object *uwq = + container_of(uobject, struct ib_uwq_object, uevent.uobject); + int ret; + + ret = ib_destroy_wq(wq); + if (!ret || why != RDMA_REMOVE_DESTROY) + ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent); + return ret; +} + +static int uverbs_free_srq(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_srq *srq = uobject->object; + struct ib_uevent_object *uevent = + container_of(uobject, struct ib_uevent_object, uobject); + enum ib_srq_type srq_type = srq->srq_type; + int ret; + + ret = ib_destroy_srq(srq); + + if (ret && why == RDMA_REMOVE_DESTROY) + return ret; + + if (srq_type == IB_SRQT_XRC) { + struct ib_usrq_object *us = + container_of(uevent, struct ib_usrq_object, uevent); + + atomic_dec(&us->uxrcd->refcnt); + } + + ib_uverbs_release_uevent(uobject->context->ufile, uevent); + return ret; +} + +static int uverbs_free_cq(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_cq *cq = uobject->object; + struct ib_uverbs_event_queue *ev_queue = cq->cq_context; + struct ib_ucq_object *ucq = + container_of(uobject, struct ib_ucq_object, uobject); + int ret; + + ret = ib_destroy_cq(cq); + if (!ret || why != RDMA_REMOVE_DESTROY) + ib_uverbs_release_ucq(uobject->context->ufile, ev_queue ? + container_of(ev_queue, + struct ib_uverbs_completion_event_file, + ev_queue) : NULL, + ucq); + return ret; +} + +static int uverbs_free_mr(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + return ib_dereg_mr((struct ib_mr *)uobject->object); +} + +static int uverbs_free_xrcd(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_xrcd *xrcd = uobject->object; + struct ib_uxrcd_object *uxrcd = + container_of(uobject, struct ib_uxrcd_object, uobject); + int ret; + + mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex); + if (why == RDMA_REMOVE_DESTROY && atomic_read(&uxrcd->refcnt)) + ret = -EBUSY; + else + ret = ib_uverbs_dealloc_xrcd(uobject->context->ufile->device, + xrcd, why); + mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex); + + return ret; +} + +static int uverbs_free_pd(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_pd *pd = uobject->object; + + if (why == RDMA_REMOVE_DESTROY && atomic_read(&pd->usecnt)) + return -EBUSY; + + ib_dealloc_pd((struct ib_pd *)uobject->object); + return 0; +} + +static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_file, + enum rdma_remove_reason why) +{ + struct ib_uverbs_completion_event_file *comp_event_file = + container_of(uobj_file, struct ib_uverbs_completion_event_file, + uobj_file); + struct ib_uverbs_event_queue *event_queue = &comp_event_file->ev_queue; + + spin_lock_irq(&event_queue->lock); + event_queue->is_closed = 1; + spin_unlock_irq(&event_queue->lock); + + if (why == RDMA_REMOVE_DRIVER_REMOVE) { + wake_up_interruptible(&event_queue->poll_wait); + kill_fasync(&event_queue->async_queue, SIGIO, POLL_IN); + } + return 0; +}; + +const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel = { + .type = UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), 0), + .context_closed = uverbs_hot_unplug_completion_event_file, + .fops = &uverbs_event_fops, + .name = "[infinibandevent]", + .flags = O_RDONLY, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_cq = { + .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0), + .destroy_object = uverbs_free_cq, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_qp = { + .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0), + .destroy_object = uverbs_free_qp, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_mw = { + .type = UVERBS_TYPE_ALLOC_IDR(0), + .destroy_object = uverbs_free_mw, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_mr = { + /* 1 is used in order to free the MR after all the MWs */ + .type = UVERBS_TYPE_ALLOC_IDR(1), + .destroy_object = uverbs_free_mr, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_srq = { + .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0), + .destroy_object = uverbs_free_srq, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_ah = { + .type = UVERBS_TYPE_ALLOC_IDR(0), + .destroy_object = uverbs_free_ah, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_flow = { + .type = UVERBS_TYPE_ALLOC_IDR(0), + .destroy_object = uverbs_free_flow, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_wq = { + .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0), + .destroy_object = uverbs_free_wq, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table = { + .type = UVERBS_TYPE_ALLOC_IDR(0), + .destroy_object = uverbs_free_rwq_ind_tbl, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd = { + .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0), + .destroy_object = uverbs_free_xrcd, +}; + +const struct uverbs_obj_idr_type uverbs_type_attrs_pd = { + /* 2 is used in order to free the PD after MRs */ + .type = UVERBS_TYPE_ALLOC_IDR(2), + .destroy_object = uverbs_free_pd, +}; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 71580cc28c9e..fb98ed67d5bc 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -44,6 +44,7 @@ #include <linux/in.h> #include <linux/in6.h> #include <net/addrconf.h> +#include <linux/security.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -311,7 +312,7 @@ EXPORT_SYMBOL(ib_dealloc_pd); /* Address handles */ -struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr) { struct ib_ah *ah; @@ -321,12 +322,13 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) ah->device = pd->device; ah->pd = pd; ah->uobject = NULL; + ah->type = ah_attr->type; atomic_inc(&pd->usecnt); } return ah; } -EXPORT_SYMBOL(ib_create_ah); +EXPORT_SYMBOL(rdma_create_ah); int ib_get_rdma_header_version(const union rdma_network_hdr *hdr) { @@ -450,9 +452,22 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, } EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); +/* + * This function creates ah from the incoming packet. + * Incoming packet has dgid of the receiver node on which this code is + * getting executed and, sgid contains the GID of the sender. + * + * When resolving mac address of destination, the arrived dgid is used + * as sgid and, sgid is used as dgid because sgid contains destinations + * GID whom to respond to. + * + * This is why when calling rdma_addr_find_l2_eth_by_grh() function, the + * position of arguments dgid and sgid do not match the order of the + * parameters. + */ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, - struct ib_ah_attr *ah_attr) + struct rdma_ah_attr *ah_attr) { u32 flow_class; u16 gid_index; @@ -464,6 +479,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, union ib_gid sgid; memset(ah_attr, 0, sizeof *ah_attr); + ah_attr->type = rdma_ah_find_type(device, port_num); if (rdma_cap_eth_ah(device, port_num)) { if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) net_type = wc->network_hdr_type; @@ -494,7 +510,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, return -ENODEV; ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, - ah_attr->dmac, + ah_attr->roce.dmac, wc->wc_flags & IB_WC_WITH_VLAN ? NULL : &vlan_id, &if_index, &hoplimit); @@ -504,11 +520,6 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, } resolved_dev = dev_get_by_index(&init_net, if_index); - if (resolved_dev->flags & IFF_LOOPBACK) { - dev_put(resolved_dev); - resolved_dev = idev; - dev_hold(resolved_dev); - } rcu_read_lock(); if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, resolved_dev)) @@ -525,15 +536,12 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, return ret; } - ah_attr->dlid = wc->slid; - ah_attr->sl = wc->sl; - ah_attr->src_path_bits = wc->dlid_path_bits; - ah_attr->port_num = port_num; + rdma_ah_set_dlid(ah_attr, wc->slid); + rdma_ah_set_sl(ah_attr, wc->sl); + rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); + rdma_ah_set_port_num(ah_attr, port_num); if (wc->wc_flags & IB_WC_GRH) { - ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = sgid; - if (!rdma_cap_eth_ah(device, port_num)) { if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { ret = ib_find_cached_gid_by_port(device, &dgid, @@ -547,11 +555,12 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, } } - ah_attr->grh.sgid_index = (u8) gid_index; flow_class = be32_to_cpu(grh->version_tclass_flow); - ah_attr->grh.flow_label = flow_class & 0xFFFFF; - ah_attr->grh.hop_limit = hoplimit; - ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + } return 0; } @@ -560,34 +569,37 @@ EXPORT_SYMBOL(ib_init_ah_from_wc); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u8 port_num) { - struct ib_ah_attr ah_attr; + struct rdma_ah_attr ah_attr; int ret; ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); if (ret) return ERR_PTR(ret); - return ib_create_ah(pd, &ah_attr); + return rdma_create_ah(pd, &ah_attr); } EXPORT_SYMBOL(ib_create_ah_from_wc); -int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { + if (ah->type != ah_attr->type) + return -EINVAL; + return ah->device->modify_ah ? ah->device->modify_ah(ah, ah_attr) : -ENOSYS; } -EXPORT_SYMBOL(ib_modify_ah); +EXPORT_SYMBOL(rdma_modify_ah); -int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { return ah->device->query_ah ? ah->device->query_ah(ah, ah_attr) : -ENOSYS; } -EXPORT_SYMBOL(ib_query_ah); +EXPORT_SYMBOL(rdma_query_ah); -int ib_destroy_ah(struct ib_ah *ah) +int rdma_destroy_ah(struct ib_ah *ah) { struct ib_pd *pd; int ret; @@ -599,7 +611,7 @@ int ib_destroy_ah(struct ib_ah *ah) return ret; } -EXPORT_SYMBOL(ib_destroy_ah); +EXPORT_SYMBOL(rdma_destroy_ah); /* Shared receive queues */ @@ -710,12 +722,20 @@ static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, { struct ib_qp *qp; unsigned long flags; + int err; qp = kzalloc(sizeof *qp, GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); qp->real_qp = real_qp; + err = ib_open_shared_qp_security(qp, real_qp->device); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->real_qp = real_qp; atomic_inc(&real_qp->usecnt); qp->device = real_qp->device; qp->event_handler = event_handler; @@ -801,6 +821,12 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, if (IS_ERR(qp)) return qp; + ret = ib_create_qp_security(qp, device); + if (ret) { + ib_destroy_qp(qp); + return ERR_PTR(ret); + } + qp->device = device; qp->real_qp = qp; qp->uobject = NULL; @@ -869,6 +895,7 @@ static const struct { } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, + [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .req_param = { @@ -1201,20 +1228,22 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, EXPORT_SYMBOL(ib_modify_qp_is_ok); int ib_resolve_eth_dmac(struct ib_device *device, - struct ib_ah_attr *ah_attr) + struct rdma_ah_attr *ah_attr) { int ret = 0; + struct ib_global_route *grh; - if (ah_attr->port_num < rdma_start_port(device) || - ah_attr->port_num > rdma_end_port(device)) + if (!rdma_is_port_valid(device, rdma_ah_get_port_num(ah_attr))) return -EINVAL; - if (!rdma_cap_eth_ah(device, ah_attr->port_num)) + if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE) return 0; - if (rdma_link_local_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { - rdma_get_ll_mac((struct in6_addr *)ah_attr->grh.dgid.raw, - ah_attr->dmac); + grh = rdma_ah_retrieve_grh(ah_attr); + + if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) { + rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, + ah_attr->roce.dmac); } else { union ib_gid sgid; struct ib_gid_attr sgid_attr; @@ -1222,8 +1251,8 @@ int ib_resolve_eth_dmac(struct ib_device *device, int hop_limit; ret = ib_query_gid(device, - ah_attr->port_num, - ah_attr->grh.sgid_index, + rdma_ah_get_port_num(ah_attr), + grh->sgid_index, &sgid, &sgid_attr); if (ret || !sgid_attr.ndev) { @@ -1234,34 +1263,50 @@ int ib_resolve_eth_dmac(struct ib_device *device, ifindex = sgid_attr.ndev->ifindex; - ret = rdma_addr_find_l2_eth_by_grh(&sgid, - &ah_attr->grh.dgid, - ah_attr->dmac, - NULL, &ifindex, &hop_limit); + ret = + rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, + ah_attr->roce.dmac, + NULL, &ifindex, &hop_limit); dev_put(sgid_attr.ndev); - ah_attr->grh.hop_limit = hop_limit; + grh->hop_limit = hop_limit; } out: return ret; } EXPORT_SYMBOL(ib_resolve_eth_dmac); -int ib_modify_qp(struct ib_qp *qp, - struct ib_qp_attr *qp_attr, - int qp_attr_mask) +/** + * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. + * @qp: The QP to modify. + * @attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + * @udata: pointer to user's input output buffer information + * are being modified. + * It returns 0 on success and returns appropriate error code on error. + */ +int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) { + int ret; - if (qp_attr_mask & IB_QP_AV) { - int ret; - - ret = ib_resolve_eth_dmac(qp->device, &qp_attr->ah_attr); + if (attr_mask & IB_QP_AV) { + ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) return ret; } + return ib_security_modify_qp(qp, attr, attr_mask, udata); +} +EXPORT_SYMBOL(ib_modify_qp_with_udata); - return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); +int ib_modify_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask) +{ + return ib_modify_qp_with_udata(qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); @@ -1290,6 +1335,7 @@ int ib_close_qp(struct ib_qp *qp) spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags); atomic_dec(&real_qp->usecnt); + ib_close_shared_qp_security(qp->qp_sec); kfree(qp); return 0; @@ -1330,6 +1376,7 @@ int ib_destroy_qp(struct ib_qp *qp) struct ib_cq *scq, *rcq; struct ib_srq *srq; struct ib_rwq_ind_table *ind_tbl; + struct ib_qp_security *sec; int ret; WARN_ON_ONCE(qp->mrs_used > 0); @@ -1345,6 +1392,9 @@ int ib_destroy_qp(struct ib_qp *qp) rcq = qp->recv_cq; srq = qp->srq; ind_tbl = qp->rwq_ind_tbl; + sec = qp->qp_sec; + if (sec) + ib_destroy_qp_security_begin(sec); if (!qp->uobject) rdma_rw_cleanup_mrs(qp); @@ -1361,6 +1411,11 @@ int ib_destroy_qp(struct ib_qp *qp) atomic_dec(&srq->usecnt); if (ind_tbl) atomic_dec(&ind_tbl->usecnt); + if (sec) + ib_destroy_qp_security_end(sec); + } else { + if (sec) + ib_destroy_qp_security_abort(sec); } return ret; @@ -1520,7 +1575,9 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->attach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || + lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + lid == be16_to_cpu(IB_LID_PERMISSIVE)) return -EINVAL; ret = qp->device->attach_mcast(qp, gid, lid); @@ -1536,7 +1593,9 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->detach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || + lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + lid == be16_to_cpu(IB_LID_PERMISSIVE)) return -EINVAL; ret = qp->device->detach_mcast(qp, gid, lid); @@ -1949,17 +2008,12 @@ static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) */ static void __ib_drain_sq(struct ib_qp *qp) { + struct ib_cq *cq = qp->send_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe sdrain; struct ib_send_wr swr = {}, *bad_swr; int ret; - if (qp->send_cq->poll_ctx == IB_POLL_DIRECT) { - WARN_ONCE(qp->send_cq->poll_ctx == IB_POLL_DIRECT, - "IB_POLL_DIRECT poll_ctx not supported for drain\n"); - return; - } - swr.wr_cqe = &sdrain.cqe; sdrain.cqe.done = ib_drain_qp_done; init_completion(&sdrain.done); @@ -1976,7 +2030,11 @@ static void __ib_drain_sq(struct ib_qp *qp) return; } - wait_for_completion(&sdrain.done); + if (cq->poll_ctx == IB_POLL_DIRECT) + while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + else + wait_for_completion(&sdrain.done); } /* @@ -1984,17 +2042,12 @@ static void __ib_drain_sq(struct ib_qp *qp) */ static void __ib_drain_rq(struct ib_qp *qp) { + struct ib_cq *cq = qp->recv_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe rdrain; struct ib_recv_wr rwr = {}, *bad_rwr; int ret; - if (qp->recv_cq->poll_ctx == IB_POLL_DIRECT) { - WARN_ONCE(qp->recv_cq->poll_ctx == IB_POLL_DIRECT, - "IB_POLL_DIRECT poll_ctx not supported for drain\n"); - return; - } - rwr.wr_cqe = &rdrain.cqe; rdrain.cqe.done = ib_drain_qp_done; init_completion(&rdrain.done); @@ -2011,7 +2064,11 @@ static void __ib_drain_rq(struct ib_qp *qp) return; } - wait_for_completion(&rdrain.done); + if (cq->poll_ctx == IB_POLL_DIRECT) + while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + else + wait_for_completion(&rdrain.done); } /** @@ -2028,8 +2085,7 @@ static void __ib_drain_rq(struct ib_qp *qp) * ensure there is room in the CQ and SQ for the drain work request and * completion. * - * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be - * IB_POLL_DIRECT. + * allocate the CQ using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. @@ -2057,8 +2113,7 @@ EXPORT_SYMBOL(ib_drain_sq); * ensure there is room in the CQ and RQ for the drain work request and * completion. * - * allocate the CQ using ib_alloc_cq() and the CQ poll context cannot be - * IB_POLL_DIRECT. + * allocate the CQ using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. @@ -2082,8 +2137,7 @@ EXPORT_SYMBOL(ib_drain_rq); * ensure there is room in the CQ(s), SQ, and RQ for drain work requests * and completions. * - * allocate the CQs using ib_alloc_cq() and the CQ poll context cannot be - * IB_POLL_DIRECT. + * allocate the CQs using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. |