Diffstat (limited to 'net/sunrpc')
-rw-r--r--  net/sunrpc/auth.c                          |  16
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c             |   2
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c          |   4
-rw-r--r--  net/sunrpc/auth_null.c                     |   3
-rw-r--r--  net/sunrpc/auth_unix.c                     |  18
-rw-r--r--  net/sunrpc/cache.c                         | 121
-rw-r--r--  net/sunrpc/clnt.c                          |  51
-rw-r--r--  net/sunrpc/debugfs.c                       |  35
-rw-r--r--  net/sunrpc/svc.c                           |  28
-rw-r--r--  net/sunrpc/svcauth_unix.c                  |   4
-rw-r--r--  net/sunrpc/svcsock.c                       |   5
-rw-r--r--  net/sunrpc/xdr.c                           |  34
-rw-r--r--  net/sunrpc/xprt.c                          |   2
-rw-r--r--  net/sunrpc/xprtrdma/fmr_ops.c              |   5
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c             |  11
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c             |  82
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_backchannel.c |  17
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_marshal.c     | 299
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c    |  20
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c      |  22
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c   |  69
-rw-r--r--  net/sunrpc/xprtrdma/transport.c            |   6
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c                |  96
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h            |  30
-rw-r--r--  net/sunrpc/xprtsock.c                      | 102
25 files changed, 553 insertions, 529 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 2bff63a73cf8..d2623b9f23d6 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -8,6 +8,7 @@ #include <linux/types.h> #include <linux/sched.h> +#include <linux/cred.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/errno.h> @@ -464,8 +465,10 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) * Note that the cred_unused list must be time-ordered. */ if (time_in_range(cred->cr_expire, expired, jiffies) && - test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) + test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { + freed = SHRINK_STOP; break; + } list_del_init(&cred->cr_lru); number_cred_unused--; @@ -520,7 +523,7 @@ static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return (number_cred_unused / 100) * sysctl_vfs_cache_pressure; + return number_cred_unused * sysctl_vfs_cache_pressure / 100; } static void @@ -646,9 +649,6 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, cred->cr_auth = auth; cred->cr_ops = ops; cred->cr_expire = jiffies; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - cred->cr_magic = RPCAUTH_CRED_MAGIC; -#endif cred->cr_uid = acred->uid; } EXPORT_SYMBOL_GPL(rpcauth_init_cred); @@ -876,8 +876,12 @@ int __init rpcauth_init_module(void) err = rpc_init_generic_auth(); if (err < 0) goto out2; - register_shrinker(&rpc_cred_shrinker); + err = register_shrinker(&rpc_cred_shrinker); + if (err < 0) + goto out3; return 0; +out3: + rpc_destroy_generic_auth(); out2: rpc_destroy_authunix(); out1: diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index cdeb1d814833..4f16953e4954 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -763,7 +763,7 @@ err_put_ctx: err: kfree(buf); out: - dprintk("RPC: %s returning %Zd\n", __func__, err); + dprintk("RPC: %s returning %zd\n", __func__, err); return err; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 153082598522..a54a7a3d28f5 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; - rsci->h.expiry_time = seconds_since_boot(); - set_bit(CACHE_NEGATIVE, &rsci->h.flags); + /* Delete the entry from the cache_list and call cache_put */ + sunrpc_cache_unhash(sn->rsc_cache, &rsci->h); if (resv->iov_len + 4 > PAGE_SIZE) goto drop; svc_putnl(resv, RPC_SUCCESS); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 4d17376b2acb..5f3d527dff65 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -139,7 +139,4 @@ struct rpc_cred null_cred = { .cr_ops = &null_credops, .cr_count = ATOMIC_INIT(1), .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - .cr_magic = RPCAUTH_CRED_MAGIC, -#endif }; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 306fc0f54596..82337e1ec9cd 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -14,12 +14,10 @@ #include <linux/sunrpc/auth.h> #include <linux/user_namespace.h> -#define NFS_NGROUPS 16 - struct unx_cred { struct rpc_cred uc_base; kgid_t uc_gid; - kgid_t uc_gids[NFS_NGROUPS]; + kgid_t uc_gids[UNX_NGROUPS]; }; #define uc_uid uc_base.cr_uid @@ -82,13 +80,13 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t if (acred->group_info != NULL) 
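A quick illustration of the rpcauth_cache_shrink_count() change in the auth.c hunk above: dividing by 100 before multiplying truncates small caches to zero, so the shrinker never saw any work. A standalone sketch with hypothetical counts (not part of the patch):

#include <stdio.h>

int main(void)
{
	/* hypothetical values: 50 unused creds, default cache pressure */
	unsigned long number_cred_unused = 50;
	unsigned long sysctl_vfs_cache_pressure = 100;

	/* old expression: (50 / 100) * 100 == 0, shrinker reports no work */
	unsigned long old_count =
		(number_cred_unused / 100) * sysctl_vfs_cache_pressure;

	/* new expression: 50 * 100 / 100 == 50 */
	unsigned long new_count =
		number_cred_unused * sysctl_vfs_cache_pressure / 100;

	printf("old=%lu new=%lu\n", old_count, new_count);
	return 0;
}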
groups = acred->group_info->ngroups; - if (groups > NFS_NGROUPS) - groups = NFS_NGROUPS; + if (groups > UNX_NGROUPS) + groups = UNX_NGROUPS; cred->uc_gid = acred->gid; for (i = 0; i < groups; i++) cred->uc_gids[i] = acred->group_info->gid[i]; - if (i < NFS_NGROUPS) + if (i < UNX_NGROUPS) cred->uc_gids[i] = INVALID_GID; return &cred->uc_base; @@ -132,12 +130,12 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) if (acred->group_info != NULL) groups = acred->group_info->ngroups; - if (groups > NFS_NGROUPS) - groups = NFS_NGROUPS; + if (groups > UNX_NGROUPS) + groups = UNX_NGROUPS; for (i = 0; i < groups ; i++) if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) return 0; - if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) + if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups])) return 0; return 1; } @@ -166,7 +164,7 @@ unx_marshal(struct rpc_task *task, __be32 *p) *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid)); *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid)); hold = p++; - for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++) + for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++) *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i])); *hold = htonl(p - hold - 1); /* gid array length */ *base = htonl((p - base - 1) << 2); /* cred length */ diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index f39e3e11f9aa..79d55d949d9a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) cache_purge(cd); spin_lock(&cache_list_lock); write_lock(&cd->hash_lock); - if (cd->entries) { - write_unlock(&cd->hash_lock); - spin_unlock(&cache_list_lock); - goto out; - } if (current_detail == cd) current_detail = NULL; list_del_init(&cd->others); @@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd) /* module must be being unloaded so its safe to kill the worker */ cancel_delayed_work_sync(&cache_cleaner); } - return; -out: - printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name); } EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail); @@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush); void cache_purge(struct cache_detail *detail) { - time_t now = seconds_since_boot(); - if (detail->flush_time >= now) - now = detail->flush_time + 1; - /* 'now' is the maximum value any 'last_refresh' can have */ - detail->flush_time = now; - detail->nextcheck = seconds_since_boot(); - cache_flush(); + struct cache_head *ch = NULL; + struct hlist_head *head = NULL; + struct hlist_node *tmp = NULL; + int i = 0; + + write_lock(&detail->hash_lock); + if (!detail->entries) { + write_unlock(&detail->hash_lock); + return; + } + + dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name); + for (i = 0; i < detail->hash_size; i++) { + head = &detail->hash_table[i]; + hlist_for_each_entry_safe(ch, tmp, head, cache_list) { + hlist_del_init(&ch->cache_list); + detail->entries--; + + set_bit(CACHE_CLEANED, &ch->flags); + write_unlock(&detail->hash_lock); + cache_fresh_unlocked(ch, detail); + cache_put(ch, detail); + write_lock(&detail->hash_lock); + } + } + write_unlock(&detail->hash_lock); } EXPORT_SYMBOL_GPL(cache_purge); @@ -717,7 +728,7 @@ void cache_clean_deferred(void *owner) /* * communicate with user-space * - * We have a magic /proc file - /proc/sunrpc/<cachename>/channel. + * We have a magic /proc file - /proc/net/rpc/<cachename>/channel. * On read, you get a full request, or block. 
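The UNX_NGROUPS rename above keeps the long-standing AUTH_UNIX behaviour: at most 16 supplementary groups are copied, and a short list is terminated with an invalid-gid sentinel that unx_match() later checks. A minimal userspace mirror of that capping logic, with made-up group IDs:

#include <stdio.h>

#define UNX_NGROUPS	16
#define INVALID_GID	((unsigned int)-1)	/* stand-in for the kernel's INVALID_GID */

int main(void)
{
	unsigned int src[20], dst[UNX_NGROUPS];
	unsigned int i, groups = 20;

	for (i = 0; i < groups; i++)
		src[i] = 1000 + i;

	if (groups > UNX_NGROUPS)
		groups = UNX_NGROUPS;
	for (i = 0; i < groups; i++)
		dst[i] = src[i];
	if (i < UNX_NGROUPS)
		dst[i] = INVALID_GID;	/* sentinel terminates a short list */

	printf("kept %u of 20 groups, dst[0]=%u\n", groups, dst[0]);
	return 0;
}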
* On write, an update request is processed. * Poll works if anything to read, and always allows write. @@ -1272,7 +1283,7 @@ EXPORT_SYMBOL_GPL(qword_get); /* - * support /proc/sunrpc/cache/$CACHENAME/content + * support /proc/net/rpc/$CACHENAME/content * as a seqfile. * We call ->cache_show passing NULL for the item to * get a header, then pass each real item in the cache @@ -1427,20 +1438,11 @@ static ssize_t read_flush(struct file *file, char __user *buf, struct cache_detail *cd) { char tbuf[22]; - unsigned long p = *ppos; size_t len; - snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time)); - len = strlen(tbuf); - if (p >= len) - return 0; - len -= p; - if (len > count) - len = count; - if (copy_to_user(buf, (void*)(tbuf+p), len)) - return -EFAULT; - *ppos += len; - return len; + len = snprintf(tbuf, sizeof(tbuf), "%lu\n", + convert_to_wallclock(cd->flush_time)); + return simple_read_from_buffer(buf, count, ppos, tbuf, len); } static ssize_t write_flush(struct file *file, const char __user *buf, @@ -1600,21 +1602,12 @@ static const struct file_operations cache_flush_operations_procfs = { .llseek = no_llseek, }; -static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net) +static void remove_cache_proc_entries(struct cache_detail *cd) { - struct sunrpc_net *sn; - - if (cd->u.procfs.proc_ent == NULL) - return; - if (cd->u.procfs.flush_ent) - remove_proc_entry("flush", cd->u.procfs.proc_ent); - if (cd->u.procfs.channel_ent) - remove_proc_entry("channel", cd->u.procfs.proc_ent); - if (cd->u.procfs.content_ent) - remove_proc_entry("content", cd->u.procfs.proc_ent); - cd->u.procfs.proc_ent = NULL; - sn = net_generic(net, sunrpc_net_id); - remove_proc_entry(cd->name, sn->proc_net_rpc); + if (cd->procfs) { + proc_remove(cd->procfs); + cd->procfs = NULL; + } } #ifdef CONFIG_PROC_FS @@ -1624,38 +1617,30 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); - cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc); - if (cd->u.procfs.proc_ent == NULL) + cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); + if (cd->procfs == NULL) goto out_nomem; - cd->u.procfs.channel_ent = NULL; - cd->u.procfs.content_ent = NULL; p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, - cd->u.procfs.proc_ent, - &cache_flush_operations_procfs, cd); - cd->u.procfs.flush_ent = p; + cd->procfs, &cache_flush_operations_procfs, cd); if (p == NULL) goto out_nomem; if (cd->cache_request || cd->cache_parse) { p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, - cd->u.procfs.proc_ent, - &cache_file_operations_procfs, cd); - cd->u.procfs.channel_ent = p; + cd->procfs, &cache_file_operations_procfs, cd); if (p == NULL) goto out_nomem; } if (cd->cache_show) { p = proc_create_data("content", S_IFREG|S_IRUSR, - cd->u.procfs.proc_ent, - &content_file_operations_procfs, cd); - cd->u.procfs.content_ent = p; + cd->procfs, &content_file_operations_procfs, cd); if (p == NULL) goto out_nomem; } return 0; out_nomem: - remove_cache_proc_entries(cd, net); + remove_cache_proc_entries(cd); return -ENOMEM; } #else /* CONFIG_PROC_FS */ @@ -1684,7 +1669,7 @@ EXPORT_SYMBOL_GPL(cache_register_net); void cache_unregister_net(struct cache_detail *cd, struct net *net) { - remove_cache_proc_entries(cd, net); + remove_cache_proc_entries(cd); sunrpc_destroy_cache_detail(cd); } EXPORT_SYMBOL_GPL(cache_unregister_net); @@ -1843,15 +1828,29 @@ int sunrpc_cache_register_pipefs(struct dentry *parent, struct dentry 
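The read_flush() hunk above replaces an open-coded partial-read loop with simple_read_from_buffer(). A userspace stand-in showing the semantics that helper provides (clamp to what is available from *ppos, copy, advance *ppos); the flush timestamp below is a made-up value:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Userspace stand-in for simple_read_from_buffer(): this is the logic
 * read_flush() used to open-code before the hunk above. */
static ssize_t read_from_buffer(char *to, size_t count, long *ppos,
				const char *from, size_t available)
{
	size_t pos = *ppos;

	if (pos >= available)
		return 0;
	if (count > available - pos)
		count = available - pos;
	memcpy(to, from + pos, count);
	*ppos = pos + count;
	return count;
}

int main(void)
{
	char tbuf[22], out[8];
	long ppos = 0;
	ssize_t n;
	size_t len = snprintf(tbuf, sizeof(tbuf), "%lu\n", 1488888888UL);

	while ((n = read_from_buffer(out, sizeof(out), &ppos, tbuf, len)) > 0)
		printf("read %zd bytes\n", n);
	return 0;
}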
*dir = rpc_create_cache_dir(parent, name, umode, cd); if (IS_ERR(dir)) return PTR_ERR(dir); - cd->u.pipefs.dir = dir; + cd->pipefs = dir; return 0; } EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs); void sunrpc_cache_unregister_pipefs(struct cache_detail *cd) { - rpc_remove_cache_dir(cd->u.pipefs.dir); - cd->u.pipefs.dir = NULL; + if (cd->pipefs) { + rpc_remove_cache_dir(cd->pipefs); + cd->pipefs = NULL; + } } EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs); +void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h) +{ + write_lock(&cd->hash_lock); + if (!hlist_unhashed(&h->cache_list)){ + hlist_del_init(&h->cache_list); + cd->entries--; + write_unlock(&cd->hash_lock); + cache_put(h, cd); + } else + write_unlock(&cd->hash_lock); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_unhash); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 1dc9f3bac099..52da3ce54bb5 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1453,21 +1453,6 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt) EXPORT_SYMBOL_GPL(rpc_max_bc_payload); /** - * rpc_get_timeout - Get timeout for transport in units of HZ - * @clnt: RPC client to query - */ -unsigned long rpc_get_timeout(struct rpc_clnt *clnt) -{ - unsigned long ret; - - rcu_read_lock(); - ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval; - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(rpc_get_timeout); - -/** * rpc_force_rebind - force transport to check that remote port is unchanged * @clnt: client to rebind * @@ -2699,6 +2684,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; + unsigned long connect_timeout; unsigned long reconnect_timeout; unsigned char resvport; int ret = 0; @@ -2711,6 +2697,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, return -EAGAIN; } resvport = xprt->resvport; + connect_timeout = xprt->connect_timeout; reconnect_timeout = xprt->max_reconnect_timeout; rcu_read_unlock(); @@ -2720,7 +2707,10 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, goto out_put_switch; } xprt->resvport = resvport; - xprt->max_reconnect_timeout = reconnect_timeout; + if (xprt->ops->set_connect_timeout != NULL) + xprt->ops->set_connect_timeout(xprt, + connect_timeout, + reconnect_timeout); rpc_xprt_switch_set_roundrobin(xps); if (setup) { @@ -2737,26 +2727,39 @@ out_put_switch: } EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); +struct connect_timeout_data { + unsigned long connect_timeout; + unsigned long reconnect_timeout; +}; + static int -rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt, +rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - unsigned long timeout = *((unsigned long *)data); + struct connect_timeout_data *timeo = data; - if (timeout < xprt->max_reconnect_timeout) - xprt->max_reconnect_timeout = timeout; + if (xprt->ops->set_connect_timeout) + xprt->ops->set_connect_timeout(xprt, + timeo->connect_timeout, + timeo->reconnect_timeout); return 0; } void -rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo) +rpc_set_connect_timeout(struct rpc_clnt *clnt, + unsigned long connect_timeout, + unsigned long reconnect_timeout) { + struct connect_timeout_data timeout = { + .connect_timeout = connect_timeout, + .reconnect_timeout = reconnect_timeout, + }; rpc_clnt_iterate_for_each_xprt(clnt, - rpc_xprt_cap_max_reconnect_timeout, - &timeo); + rpc_xprt_set_connect_timeout, + &timeout); } -EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); +EXPORT_SYMBOL_GPL(rpc_set_connect_timeout); void rpc_clnt_xprt_switch_put(struct 
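For context on the clnt.c change above: rpc_get_timeout() and rpc_cap_max_reconnect_timeout() give way to a single rpc_set_connect_timeout() that pushes both values down through each transport's ->set_connect_timeout op. A hypothetical in-kernel caller, not part of this patch; the 90-second figures are invented for illustration:

#include <linux/sunrpc/clnt.h>

/* Hypothetical upper-layer helper: ask every transport under @clnt for
 * a 90 second connect timeout and cap reconnect backoff at 90 seconds. */
static void example_tune_transports(struct rpc_clnt *clnt)
{
	rpc_set_connect_timeout(clnt, 90 * HZ, 90 * HZ);
}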
rpc_clnt *clnt) { diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index e7b4d93566df..c8fd0b6c1618 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -16,11 +16,6 @@ static struct dentry *rpc_xprt_dir; unsigned int rpc_inject_disconnect; -struct rpc_clnt_iter { - struct rpc_clnt *clnt; - loff_t pos; -}; - static int tasks_show(struct seq_file *f, void *v) { @@ -47,12 +42,10 @@ static void * tasks_start(struct seq_file *f, loff_t *ppos) __acquires(&clnt->cl_lock) { - struct rpc_clnt_iter *iter = f->private; + struct rpc_clnt *clnt = f->private; loff_t pos = *ppos; - struct rpc_clnt *clnt = iter->clnt; struct rpc_task *task; - iter->pos = pos + 1; spin_lock(&clnt->cl_lock); list_for_each_entry(task, &clnt->cl_tasks, tk_task) if (pos-- == 0) @@ -63,12 +56,10 @@ tasks_start(struct seq_file *f, loff_t *ppos) static void * tasks_next(struct seq_file *f, void *v, loff_t *pos) { - struct rpc_clnt_iter *iter = f->private; - struct rpc_clnt *clnt = iter->clnt; + struct rpc_clnt *clnt = f->private; struct rpc_task *task = v; struct list_head *next = task->tk_task.next; - ++iter->pos; ++*pos; /* If there's another task on list, return it */ @@ -81,9 +72,7 @@ static void tasks_stop(struct seq_file *f, void *v) __releases(&clnt->cl_lock) { - struct rpc_clnt_iter *iter = f->private; - struct rpc_clnt *clnt = iter->clnt; - + struct rpc_clnt *clnt = f->private; spin_unlock(&clnt->cl_lock); } @@ -96,17 +85,13 @@ static const struct seq_operations tasks_seq_operations = { static int tasks_open(struct inode *inode, struct file *filp) { - int ret = seq_open_private(filp, &tasks_seq_operations, - sizeof(struct rpc_clnt_iter)); - + int ret = seq_open(filp, &tasks_seq_operations); if (!ret) { struct seq_file *seq = filp->private_data; - struct rpc_clnt_iter *iter = seq->private; - - iter->clnt = inode->i_private; + struct rpc_clnt *clnt = seq->private = inode->i_private; - if (!atomic_inc_not_zero(&iter->clnt->cl_count)) { - seq_release_private(inode, filp); + if (!atomic_inc_not_zero(&clnt->cl_count)) { + seq_release(inode, filp); ret = -EINVAL; } } @@ -118,10 +103,10 @@ static int tasks_release(struct inode *inode, struct file *filp) { struct seq_file *seq = filp->private_data; - struct rpc_clnt_iter *iter = seq->private; + struct rpc_clnt *clnt = seq->private; - rpc_release_client(iter->clnt); - return seq_release_private(inode, filp); + rpc_release_client(clnt); + return seq_release(inode, filp); } static const struct file_operations tasks_fops = { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 75f290bddca1..a08aeb56b8e4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -11,7 +11,7 @@ */ #include <linux/linkage.h> -#include <linux/sched.h> +#include <linux/sched/signal.h> #include <linux/errno.h> #include <linux/net.h> #include <linux/in.h> @@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv) for (i = 0; i < progp->pg_nvers; i++) { if (progp->pg_vers[i] == NULL) continue; - if (progp->pg_vers[i]->vs_hidden == 0) + if (!progp->pg_vers[i]->vs_hidden) return 1; } } @@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net, if (vers->vs_hidden) continue; + /* + * Don't register a UDP port if we need congestion + * control. 
+ */ + if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP) + continue; + error = __svc_register(net, progp->pg_name, progp->pg_prog, i, family, proto, port); @@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) !(versp = progp->pg_vers[vers])) goto err_bad_vers; + /* + * Some protocol versions (namely NFSv4) require some form of + * congestion control. (See RFC 7530 section 3.1 paragraph 2) + * In other words, UDP is not allowed. We mark those when setting + * up the svc_xprt, and verify that here. + * + * The spec is not very clear about what error should be returned + * when someone tries to access a server that is listening on UDP + * for lower versions. RPC_PROG_MISMATCH seems to be the closest + * fit. + */ + if (versp->vs_need_cong_ctrl && + !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags)) + goto err_bad_vers; + procp = versp->vs_proc + proc; if (proc >= versp->vs_nproc || !procp->pc_func) goto err_bad_proc; @@ -1260,7 +1282,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) return 0; err_short_len: - svc_printk(rqstp, "short len %Zd, dropping request\n", + svc_printk(rqstp, "short len %zd, dropping request\n", argv->iov_len); goto close; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 64af4f034de6..f81eaa8e0888 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -403,7 +403,7 @@ svcauth_unix_info_release(struct svc_xprt *xpt) /**************************************************************************** * auth.unix.gid cache * simple cache to map a UID to a list of GIDs - * because AUTH_UNIX aka AUTH_SYS has a max of 16 + * because AUTH_UNIX aka AUTH_SYS has a max of UNX_NGROUPS */ #define GID_HASHBITS 8 #define GID_HASHMAX (1<<GID_HASHBITS) @@ -810,7 +810,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */ cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */ slen = svc_getnl(argv); /* gids length */ - if (slen > 16 || (len -= (slen + 2)*4) < 0) + if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0) goto badcred; cred->cr_group_info = groups_alloc(slen); if (cred->cr_group_info == NULL) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index de066acdb34e..8931e33b6541 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -278,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) rqstp->rq_respages[0], tailoff); out: - dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", + dprintk("svc: socket %p sendto([%p %zu... 
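The two svc.c hunks and the svcsock.c hunk above work together: a program version that requires congestion control sets vs_need_cong_ctrl, UDP registration is skipped for it, and dispatch rejects it on transports that never set XPT_CONG_CTRL (the TCP hunk above sets that bit; the RDMA transport does the same later in this patch). A sketch of how such a version entry might look; the procedure table, dispatch routine and sizes are placeholders, only the vs_need_cong_ctrl field comes from this patch:

#include <linux/sunrpc/svc.h>

/* Placeholder procedure table and dispatch function. */
static struct svc_procedure example_procedures4[2];

static int example_dispatch(struct svc_rqst *rqstp, __be32 *statp)
{
	return 1;	/* placeholder */
}

static struct svc_version example_version4 = {
	.vs_vers		= 4,
	.vs_nproc		= 2,
	.vs_proc		= example_procedures4,
	.vs_dispatch		= example_dispatch,
	.vs_xdrsize		= 1024,
	.vs_need_cong_ctrl	= true,	/* new in this patch: no UDP for this version */
};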
], %d) = %d (addr %s)\n", svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); @@ -346,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, if (len == buflen) set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", + dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n", svsk, iov[0].iov_base, iov[0].iov_len, len); return len; } @@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class, &svsk->sk_xprt, serv); set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 7f1071e103ca..1f7082144e01 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1518,3 +1518,37 @@ out: } EXPORT_SYMBOL_GPL(xdr_process_buf); +/** + * xdr_stream_decode_string_dup - Decode and duplicate variable length string + * @xdr: pointer to xdr_stream + * @str: location to store pointer to string + * @maxlen: maximum acceptable string length + * @gfp_flags: GFP mask to use + * + * Return values: + * On success, returns length of NUL-terminated string stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the string would exceed @maxlen + * %-ENOMEM on memory allocation failure + */ +ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, + size_t maxlen, gfp_t gfp_flags) +{ + void *p; + ssize_t ret; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); + if (ret > 0) { + char *s = kmalloc(ret + 1, gfp_flags); + if (s != NULL) { + memcpy(s, p, ret); + s[ret] = '\0'; + *str = s; + return strlen(s); + } + ret = -ENOMEM; + } + *str = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9a6be030ca7d..b530a2852ba8 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -897,13 +897,11 @@ static void xprt_timer(struct rpc_task *task) return; dprintk("RPC: %5u xprt_timer\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); if (!req->rq_reply_bytes_recvd) { if (xprt->ops->timer) xprt->ops->timer(xprt, task); } else task->tk_status = 0; - spin_unlock_bh(&xprt->transport_lock); } /** diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 1ebb09e1ac4f..59e64025ed96 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -310,10 +310,7 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpcrdma_mw *mw; while (!list_empty(&req->rl_registered)) { - mw = list_first_entry(&req->rl_registered, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); - + mw = rpcrdma_pop_mw(&req->rl_registered); if (sync) fmr_op_recover_mr(mw); else diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 47bed5333c7f..f81dd93176c0 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -466,8 +466,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct ib_send_wr *first, **prev, *last, *bad_wr; struct rpcrdma_rep *rep = req->rl_reply; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mw *mw, *tmp; struct rpcrdma_frmr *f; + struct rpcrdma_mw *mw; int count, rc; dprintk("RPC: %s: req %p\n", __func__, 
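The new xdr_stream_decode_string_dup() documented in the xdr.c hunk above returns a kmalloc'd, NUL-terminated copy of an XDR string. A hypothetical caller (the 128-byte limit and the function name below are invented for illustration):

#include <linux/sunrpc/xdr.h>
#include <linux/slab.h>

/* Hypothetical decoder: duplicate an XDR string of at most 128 bytes,
 * or propagate -EBADMSG / -EMSGSIZE / -ENOMEM from the helper. */
static int example_decode_hostname(struct xdr_stream *xdr, char **name)
{
	ssize_t len;

	len = xdr_stream_decode_string_dup(xdr, name, 128, GFP_KERNEL);
	if (len < 0)
		return len;
	/* *name was kmalloc'd by the helper; kfree() it when done. */
	return 0;
}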
req); @@ -534,10 +534,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * them to the free MW list. */ unmap: - list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { + while (!list_empty(&req->rl_registered)) { + mw = rpcrdma_pop_mw(&req->rl_registered); dprintk("RPC: %s: DMA unmapping frmr %p\n", __func__, &mw->frmr); - list_del_init(&mw->mw_list); ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); @@ -571,10 +571,7 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct rpcrdma_mw *mw; while (!list_empty(&req->rl_registered)) { - mw = list_first_entry(&req->rl_registered, - struct rpcrdma_mw, mw_list); - list_del_init(&mw->mw_list); - + mw = rpcrdma_pop_mw(&req->rl_registered); if (sync) frwr_op_recover_mr(mw); else diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c52e0f2ffe52..a044be2d6ad7 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -125,14 +125,34 @@ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) /* The client can send a request inline as long as the RPCRDMA header * plus the RPC call fit under the transport's inline limit. If the * combined call message size exceeds that limit, the client must use - * the read chunk list for this operation. + * a Read chunk for this operation. + * + * A Read chunk is also required if sending the RPC call inline would + * exceed this device's max_sge limit. */ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct xdr_buf *xdr = &rqst->rq_snd_buf; + unsigned int count, remaining, offset; + + if (xdr->len > r_xprt->rx_ia.ri_max_inline_write) + return false; + + if (xdr->page_len) { + remaining = xdr->page_len; + offset = xdr->page_base & ~PAGE_MASK; + count = 0; + while (remaining) { + remaining -= min_t(unsigned int, + PAGE_SIZE - offset, remaining); + offset = 0; + if (++count > r_xprt->rx_ia.ri_max_send_sges) + return false; + } + } - return rqst->rq_snd_buf.len <= ia->ri_max_inline_write; + return true; } /* The client can't know how large the actual reply will be. Thus it @@ -186,9 +206,9 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) */ static int -rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, - bool reminv_expected) +rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, + unsigned int pos, enum rpcrdma_chunktype type, + struct rpcrdma_mr_seg *seg) { int len, n, p, page_base; struct page **ppages; @@ -226,22 +246,21 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, if (len && n == RPCRDMA_MAX_SEGS) goto out_overflow; - /* When encoding the read list, the tail is always sent inline */ - if (type == rpcrdma_readch) + /* When encoding a Read chunk, the tail iovec contains an + * XDR pad and may be omitted. + */ + if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) return n; - /* When encoding the Write list, some servers need to see an extra - * segment for odd-length Write chunks. The upper layer provides - * space in the tail iovec for this purpose. + /* When encoding a Write chunk, some servers need to see an + * extra segment for non-XDR-aligned Write chunks. The upper + * layer provides space in the tail iovec that may be used + * for this purpose. 
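rpcrdma_args_inline() above now also bounds the number of send SGEs an inline call would need: one per page touched by the xdr_buf's page list. A userspace mirror of that counting loop, using hypothetical lengths and offsets:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* How many send SGEs does a page list of @page_len bytes starting at
 * offset @page_base need?  Mirrors the loop added above. */
static unsigned int count_page_sges(unsigned long page_base,
				    unsigned int page_len)
{
	unsigned int remaining = page_len;
	unsigned int offset = page_base & ~PAGE_MASK;
	unsigned int count = 0;

	while (remaining) {
		unsigned int chunk = PAGE_SIZE - offset;

		if (chunk > remaining)
			chunk = remaining;
		remaining -= chunk;
		offset = 0;
		count++;
	}
	return count;
}

int main(void)
{
	/* 9000 bytes starting 100 bytes into a page spans 3 pages */
	printf("%u SGEs\n", count_page_sges(100, 9000));
	return 0;
}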
*/ - if (type == rpcrdma_writech && reminv_expected) + if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) return n; if (xdrbuf->tail[0].iov_len) { - /* the rpcrdma protocol allows us to omit any trailing - * xdr pad bytes, saving the server an RDMA operation. */ - if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) - return n; n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); if (n == RPCRDMA_MAX_SEGS) goto out_overflow; @@ -293,7 +312,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, if (rtype == rpcrdma_areadch) pos = 0; seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false); + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, + rtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -302,7 +322,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, false, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); *iptr++ = xdr_one; /* item present */ @@ -355,10 +375,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, - wtype, seg, - r_xprt->rx_ia.ri_reminv_expected); + wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -371,7 +390,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, true, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); iptr = xdr_encode_rdma_segment(iptr, mw); @@ -423,8 +442,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, } seg = req->rl_segments; - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, - r_xprt->rx_ia.ri_reminv_expected); + nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return ERR_PTR(nsegs); @@ -437,7 +455,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, true, &mw); if (n < 0) return ERR_PTR(n); - list_add(&mw->mw_list, &req->rl_registered); + rpcrdma_push_mw(mw, &req->rl_registered); iptr = xdr_encode_rdma_segment(iptr, mw); @@ -741,13 +759,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) iptr = headerp->rm_body.rm_chunks; iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); if (IS_ERR(iptr)) - goto out_unmap; + goto out_err; hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", @@ -758,12 +776,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, &rqst->rq_snd_buf, rtype)) { iptr = ERR_PTR(-EIO); - goto out_unmap; + goto out_err; } return 0; -out_unmap: - r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); +out_err: + pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", + PTR_ERR(iptr)); + r_xprt->rx_stats.failed_marshal_count++; return PTR_ERR(iptr); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index cb1e48e54eb1..ff1df40f0d26 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -201,19 +201,20 @@ rpcrdma_bc_send_request(struct 
svcxprt_rdma *rdma, struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + __be32 *p; int rc; /* Space in the send buffer for an RPC/RDMA header is reserved * via xprt->tsh_size. */ - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); - headerp->rm_type = rdma_msg; - headerp->rm_body.rm_chunks[0] = xdr_zero; - headerp->rm_body.rm_chunks[1] = xdr_zero; - headerp->rm_body.rm_chunks[2] = xdr_zero; + p = rqst->rq_buffer; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 0ba9887f3e22..1c4aabf0f657 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016 Oracle. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * * This software is available to you under a choice of one of two @@ -47,102 +48,43 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -/* - * Decodes a read chunk list. The expected format is as follows: - * descrim : xdr_one - * position : __be32 offset into XDR stream - * handle : __be32 RKEY - * . . . - * end-of-list: xdr_zero - */ -static __be32 *decode_read_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_read_list(__be32 *p, __be32 *end) { - struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + __be32 *next; - while (ch->rc_discrim != xdr_zero) { - if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > - (unsigned long)vaend) { - dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + while (*p++ != xdr_zero) { + next = p + rpcrdma_readchunk_maxsz - 1; + if (next > end) return NULL; - } - ch++; + p = next; } - return &ch->rc_position; + return p; } -/* - * Decodes a write chunk list. The expected format is as follows: - * descrim : xdr_one - * nchunks : <count> - * handle : __be32 RKEY ---+ - * length : __be32 <len of segment> | - * offset : remove va + <count> - * . . . 
| - * ---+ - */ -static __be32 *decode_write_list(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_write_list(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; + __be32 *next; - /* Check for not write-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + while (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - /* - * rs_length is the 2nd 4B field in wc_target and taking its - * address skips the list terminator - */ - return &ary->wc_array[nchunks].wc_target.rs_length; + return p; } -static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) +static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end) { - unsigned long start, end; - int nchunks; - struct rpcrdma_write_array *ary = - (struct rpcrdma_write_array *)va; - - /* Check for no reply-array */ - if (ary->wc_discrim == xdr_zero) - return &ary->wc_nchunks; - - if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > - (unsigned long)vaend) { - dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); - return NULL; - } - nchunks = be32_to_cpu(ary->wc_nchunks); - - start = (unsigned long)&ary->wc_array[0]; - end = (unsigned long)vaend; - if (nchunks < 0 || - nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) || - (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) { - dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", - ary, nchunks, vaend); - return NULL; + __be32 *next; + + if (*p++ != xdr_zero) { + next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + if (next > end) + return NULL; + p = next; } - return (__be32 *)&ary->wc_array[nchunks]; + return p; } /** @@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) */ int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { - struct rpcrdma_msg *rmsgp; - __be32 *va, *vaend; - unsigned int len; - u32 hdr_len; + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { - dprintk("svcrdma: header too short = %d\n", - rq_arg->len); - return -EINVAL; - } + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; - rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; - if (rmsgp->rm_vers != rpcrdma_version) { - dprintk("%s: bad version %u\n", __func__, - be32_to_cpu(rmsgp->rm_vers)); - return -EPROTONOSUPPORT; - } + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; - switch (be32_to_cpu(rmsgp->rm_type)) { - case RDMA_MSG: - case RDMA_NOMSG: + switch (*(rdma_argp + 3)) { + case rdma_msg: + case rdma_nomsg: break; - case RDMA_DONE: - /* Just drop it */ - dprintk("svcrdma: dropping RDMA_DONE message\n"); - return 0; - - case RDMA_ERROR: - /* Possible if this is a backchannel reply. - * XXX: We should cancel this XID, though. 
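The rewritten checkers above only sanity-check the transport header's chunk lists. On the wire, each Write list entry is a one-word discriminator, a segment count, and four words per segment, with a zero discriminator ending the list. A small userspace walk over a hand-built list, mirroring xdr_check_write_list(); the buffer contents are made up:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define SEGMENT_WORDS	4	/* handle + length + 64-bit offset */

/* Walk a (hypothetical, already bounds-checked) Write list and return
 * a pointer just past its terminator. */
static const uint32_t *skip_write_list(const uint32_t *p)
{
	while (ntohl(*p++) != 0) {
		uint32_t nsegs = ntohl(*p++);

		p += nsegs * SEGMENT_WORDS;
	}
	return p;
}

int main(void)
{
	/* one Write chunk with two segments, then the list terminator */
	uint32_t buf[1 + 1 + 2 * SEGMENT_WORDS + 1] = { htonl(1), htonl(2) };

	buf[1 + 1 + 2 * SEGMENT_WORDS] = htonl(0);

	const uint32_t *end = skip_write_list(buf);

	printf("write list occupies %zu words\n", (size_t)(end - buf));
	return 0;
}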
- */ - dprintk("svcrdma: dropping RDMA_ERROR message\n"); - return 0; - - case RDMA_MSGP: - /* Pull in the extra for the padded case, bump our pointer */ - rmsgp->rm_body.rm_padded.rm_align = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align); - rmsgp->rm_body.rm_padded.rm_thresh = - be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); - - va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rq_arg->head[0].iov_base = va; - len = (u32)((unsigned long)va - (unsigned long)rmsgp); - rq_arg->head[0].iov_len -= len; - if (len > rq_arg->len) - return -EINVAL; - return len; - default: - dprintk("svcrdma: bad rdma procedure (%u)\n", - be32_to_cpu(rmsgp->rm_type)); - return -EINVAL; - } + case rdma_done: + goto out_drop; - /* The chunk list may contain either a read chunk list or a write - * chunk list and a reply chunk list. - */ - va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); - va = decode_read_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode read list\n"); - return -EINVAL; - } - va = decode_write_list(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode write list\n"); - return -EINVAL; - } - va = decode_reply_array(va, vaend); - if (!va) { - dprintk("svcrdma: failed to decode reply chunk\n"); - return -EINVAL; + case rdma_error: + goto out_drop; + + default: + goto out_proc; } - rq_arg->head[0].iov_base = va; - hdr_len = (unsigned long)va - (unsigned long)rmsgp; + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; rq_arg->head[0].iov_len -= hdr_len; return hdr_len; + +out_short: + dprintk("svcrdma: header too short = %d\n", rq_arg->len); + return -EINVAL; + +out_version: + dprintk("svcrdma: bad xprt version: %u\n", + be32_to_cpup(rdma_argp + 1)); + return -EPROTONOSUPPORT; + +out_drop: + dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + return 0; + +out_proc: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpup(rdma_argp + 3)); + return -EINVAL; + +out_inval: + dprintk("svcrdma: failed to parse transport header\n"); + return -EINVAL; } int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, @@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, *va++ = rmsgp->rm_xid; *va++ = rmsgp->rm_vers; - *va++ = cpu_to_be32(xprt->sc_max_requests); + *va++ = xprt->sc_fc_credits; *va++ = rdma_error; *va++ = cpu_to_be32(err); if (err == ERR_VERS) { @@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, return (int)((unsigned long)va - (unsigned long)startp); } -int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +/** + * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header + * @rdma_resp: buffer containing Reply transport header + * + * Returns length of transport header, in bytes. + */ +unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) { - struct rpcrdma_write_array *wr_ary; + unsigned int nsegs; + __be32 *p; - /* There is no read-list in a reply */ + p = rdma_resp; - /* skip write list */ - wr_ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]. 
- wc_target.rs_length; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - /* skip reply array */ - if (wr_ary->wc_discrim) - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)]; - else - wr_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_nchunks; - - return (unsigned long) wr_ary - (unsigned long) rmsgp; + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; } void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) @@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, seg->rs_offset = rs_offset; seg->rs_length = cpu_to_be32(write_len); } - -void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rdma_argp, - struct rpcrdma_msg *rdma_resp, - enum rpcrdma_proc rdma_type) -{ - rdma_resp->rm_xid = rdma_argp->rm_xid; - rdma_resp->rm_vers = rdma_argp->rm_vers; - rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests); - rdma_resp->rm_type = cpu_to_be32(rdma_type); - - /* Encode <nul> chunks lists */ - rdma_resp->rm_body.rm_chunks[0] = xdr_zero; - rdma_resp->rm_body.rm_chunks[1] = xdr_zero; - rdma_resp->rm_body.rm_chunks[2] = xdr_zero; -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 172b537f8cfc..f7b2daf72a86 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dprintk("svcrdma: rqstp=%p\n", rqstp); - spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_lock(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { - ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); rdma_read_complete(rqstp, ctxt); goto complete; } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { - ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, - struct svc_rdma_op_ctxt, - dto_q); - list_del_init(&ctxt->dto_q); + ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q, + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); } else { atomic_inc(&rdma_stat_rq_starve); clear_bit(XPT_DATA, &xprt->xpt_flags); ctxt = NULL; } - spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); + spin_unlock(&rdma_xprt->sc_rq_dto_lock); if (!ctxt) { /* This is the EAGAIN path. 
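svc_rdma_xdr_get_reply_hdr_len() above simply walks the reply's Write list and Reply chunk to find where the RPC message begins. A worked example, assuming the usual RPC-over-RDMA v1 sizes (4 fixed header words, 4 words per segment), for a hypothetical reply carrying one Write chunk of three segments and no Reply chunk:

#include <stdio.h>

enum {
	rpcrdma_fixed_maxsz	= 4,	/* xid, vers, credits, proc */
	rpcrdma_segment_maxsz	= 4,	/* handle, length, 64-bit offset */
};

int main(void)
{
	unsigned int words = 0;

	words += rpcrdma_fixed_maxsz + 1;		/* fixed fields + empty Read list */
	words += 1 + 1 + 3 * rpcrdma_segment_maxsz;	/* Write chunk: discrim, count, segments */
	words += 1;					/* Write list terminator */
	words += 1;					/* empty Reply chunk */

	printf("transport header is %u words (%u bytes)\n", words, words * 4);
	return 0;
}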
The svc_recv routine will * return -EAGAIN, the nfsd thread will go to call into diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index ad4d286a83c5..515221b16d09 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma, /* Prepare the SGE for the RPCRDMA Header */ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].length = + svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, ctxt->sge[0].length, DMA_TO_DEVICE); @@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct rpcrdma_msg *rdma_argp; struct rpcrdma_msg *rdma_resp; struct rpcrdma_write_array *wr_ary, *rp_ary; - enum rpcrdma_proc reply_type; int ret; int inline_bytes; struct page *res_page; struct svc_rdma_req_map *vec; u32 inv_rkey; + __be32 *p; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!res_page) goto err0; rdma_resp = page_address(res_page); - if (rp_ary) - reply_type = RDMA_NOMSG; - else - reply_type = RDMA_MSG; - svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, - rdma_resp, reply_type); + + p = &rdma_resp->rm_xid; + *p++ = rdma_argp->rm_xid; + *p++ = rdma_argp->rm_vers; + *p++ = rdma->sc_fc_credits; + *p++ = rp_ary ? rdma_nomsg : rdma_msg; + + /* Start with empty chunks */ + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; /* Send any write-chunk data and build resp write-list */ if (wr_ary) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 39652d390a9c..c13a5c35ce14 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, ctxt = kmalloc(sizeof(*ctxt), flags); if (ctxt) { ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->free); - INIT_LIST_HEAD(&ctxt->dto_q); + INIT_LIST_HEAD(&ctxt->list); } return ctxt; } @@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) dprintk("svcrdma: No memory for RDMA ctxt\n"); return false; } - list_add(&ctxt->free, &xprt->sc_ctxts); + list_add(&ctxt->list, &xprt->sc_ctxts); } return true; } @@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt = NULL; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used++; if (list_empty(&xprt->sc_ctxts)) goto out_empty; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del_init(&ctxt->free); - spin_unlock_bh(&xprt->sc_ctxt_lock); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); + spin_unlock(&xprt->sc_ctxt_lock); out: ctxt->count = 0; @@ -209,15 +208,15 @@ out_empty: /* Either pre-allocation missed the mark, or send * queue accounting is broken. 
*/ - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); ctxt = alloc_ctxt(xprt, GFP_NOIO); if (ctxt) goto out; - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - spin_unlock_bh(&xprt->sc_ctxt_lock); + spin_unlock(&xprt->sc_ctxt_lock); WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); return NULL; } @@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - spin_lock_bh(&xprt->sc_ctxt_lock); + spin_lock(&xprt->sc_ctxt_lock); xprt->sc_ctxt_used--; - list_add(&ctxt->free, &xprt->sc_ctxts); - spin_unlock_bh(&xprt->sc_ctxt_lock); + list_add(&ctxt->list, &xprt->sc_ctxts); + spin_unlock(&xprt->sc_ctxt_lock); } static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) @@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&xprt->sc_ctxts, - struct svc_rdma_op_ctxt, free); - list_del(&ctxt->free); + struct svc_rdma_op_ctxt, list); + list_del(&ctxt->list); kfree(ctxt); } } @@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* All wc fields are now known to be valid */ ctxt->byte_len = wc->byte_len; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q); spin_unlock(&xprt->sc_rq_dto_lock); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); @@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc) read_hdr = ctxt->read_hdr; spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&read_hdr->dto_q, + list_add_tail(&read_hdr->list, &xprt->sc_read_complete_q); spin_unlock(&xprt->sc_rq_dto_lock); @@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return NULL; svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); - INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); @@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_map_lock); + /* + * Note that this implies that the underlying transport support + * has some form of congestion control (see RFC 7530 section 3.1 + * paragraph 2). For now, we assume that all supported RDMA + * transports are suitable here. 
+	 */
+	set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
 	if (listener)
 		set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
 
@@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_fastreg_mr *frmr = NULL;
 
-	spin_lock_bh(&rdma->sc_frmr_q_lock);
+	spin_lock(&rdma->sc_frmr_q_lock);
 	if (!list_empty(&rdma->sc_frmr_q)) {
 		frmr = list_entry(rdma->sc_frmr_q.next,
 				  struct svc_rdma_fastreg_mr, frmr_list);
 		list_del_init(&frmr->frmr_list);
 		frmr->sg_nents = 0;
 	}
-	spin_unlock_bh(&rdma->sc_frmr_q_lock);
+	spin_unlock(&rdma->sc_frmr_q_lock);
 	if (frmr)
 		return frmr;
 
@@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
 	if (frmr) {
 		ib_dma_unmap_sg(rdma->sc_cm_id->device, frmr->sg,
 				frmr->sg_nents, frmr->direction);
-		spin_lock_bh(&rdma->sc_frmr_q_lock);
+		spin_lock(&rdma->sc_frmr_q_lock);
 		WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
 		list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
-		spin_unlock_bh(&rdma->sc_frmr_q_lock);
+		spin_unlock(&rdma->sc_frmr_q_lock);
 	}
 }
 
@@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	newxprt->sc_max_req_size = svcrdma_max_req_size;
 	newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
 					 svcrdma_max_requests);
+	newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
 	newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
 					    svcrdma_max_bc_requests);
 	newxprt->sc_rq_depth = newxprt->sc_max_requests +
@@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		goto errout;
 	}
 	newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
-					0, IB_POLL_SOFTIRQ);
+					0, IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_sq_cq)) {
 		dprintk("svcrdma: error creating SQ CQ for connect request\n");
 		goto errout;
 	}
 	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
-					0, IB_POLL_SOFTIRQ);
+					0, IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_rq_cq)) {
 		dprintk("svcrdma: error creating RQ CQ for connect request\n");
 		goto errout;
@@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work)
 	 */
 	while (!list_empty(&rdma->sc_read_complete_q)) {
 		struct svc_rdma_op_ctxt *ctxt;
-		ctxt = list_entry(rdma->sc_read_complete_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma->sc_read_complete_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		svc_rdma_put_context(ctxt, 1);
 	}
 
 	/* Destroy queued, but not processed recv completions */
 	while (!list_empty(&rdma->sc_rq_dto_q)) {
 		struct svc_rdma_op_ctxt *ctxt;
-		ctxt = list_entry(rdma->sc_rq_dto_q.next,
-				  struct svc_rdma_op_ctxt,
-				  dto_q);
-		list_del_init(&ctxt->dto_q);
+		ctxt = list_first_entry(&rdma->sc_rq_dto_q,
+					struct svc_rdma_op_ctxt, list);
+		list_del(&ctxt->list);
 		svc_rdma_put_context(ctxt, 1);
 	}
 
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 534c178d2a7e..c717f5410776 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -67,7 +67,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
-		int xprt_rdma_pad_optimize = 1;
+		int xprt_rdma_pad_optimize = 0;
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 
@@ -709,10 +709,6 @@ xprt_rdma_send_request(struct rpc_task *task)
 	return 0;
 
 failed_marshal:
-	dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
-		__func__, rc);
-	if (rc == -EIO)
-		r_xprt->rx_stats.failed_marshal_count++;
 	if (rc != -ENOTCONN)
 		return rc;
 drop_connection:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 11d07748f699..81cd31acf690 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -54,6 +54,7 @@
 #include <linux/sunrpc/svc_rdma.h>
 #include <asm/bitops.h>
 #include <linux/module.h> /* try_module_get()/module_put() */
+#include <rdma/ib_cm.h>
 
 #include "xprt_rdma.h"
 
@@ -208,6 +209,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
 
 	/* Default settings for RPC-over-RDMA Version One */
 	r_xprt->rx_ia.ri_reminv_expected = false;
+	r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
 	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
 	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
 
@@ -215,6 +217,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
 	    pmsg->cp_magic == rpcrdma_cmp_magic &&
 	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
 		r_xprt->rx_ia.ri_reminv_expected = true;
+		r_xprt->rx_ia.ri_implicit_roundup = true;
 		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
 		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
 	}
@@ -277,7 +280,14 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 		connstate = -ENETDOWN;
 		goto connected;
 	case RDMA_CM_EVENT_REJECTED:
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+		pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n",
+			sap, rpc_get_port(sap), ia->ri_device->name,
+			rdma_reject_msg(id, event->status));
+#endif
 		connstate = -ECONNREFUSED;
+		if (event->status == IB_CM_REJ_STALE_CONN)
+			connstate = -EAGAIN;
 		goto connected;
 	case RDMA_CM_EVENT_DISCONNECTED:
 		connstate = -ECONNABORTED;
@@ -486,18 +496,19 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
 */
 int
 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
-				struct rpcrdma_create_data_internal *cdata)
+		  struct rpcrdma_create_data_internal *cdata)
 {
 	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
+	unsigned int max_qp_wr, max_sge;
 	struct ib_cq *sendcq, *recvcq;
-	unsigned int max_qp_wr;
 	int rc;
 
-	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
-		dprintk("RPC: %s: insufficient sge's available\n",
-			__func__);
+	max_sge = min(ia->ri_device->attrs.max_sge, RPCRDMA_MAX_SEND_SGES);
+	if (max_sge < RPCRDMA_MIN_SEND_SGES) {
+		pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
 		return -ENOMEM;
 	}
+	ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;
 
 	if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
 		dprintk("RPC: %s: insufficient wqe's available\n",
@@ -522,7 +533,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
 	ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
-	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
+	ep->rep_attr.cap.max_send_sge = max_sge;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -640,20 +651,21 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 int
 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
 {
+	struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
+						   rx_ia);
 	struct rdma_cm_id *id, *old;
+	struct sockaddr *sap;
+	unsigned int extras;
 	int rc = 0;
-	int retry_count = 0;
 
 	if (ep->rep_connected != 0) {
-		struct rpcrdma_xprt *xprt;
retry:
 		dprintk("RPC: %s: reconnecting...\n", __func__);
 
 		rpcrdma_ep_disconnect(ep, ia);
 
-		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
-		id = rpcrdma_create_id(xprt, ia,
-				(struct sockaddr *)&xprt->rx_data.addr);
+		sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+		id = rpcrdma_create_id(r_xprt, ia, sap);
 		if (IS_ERR(id)) {
 			rc = -EHOSTUNREACH;
 			goto out;
@@ -708,51 +720,18 @@ retry:
 	}
 
 	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
-
-	/*
-	 * Check state. A non-peer reject indicates no listener
-	 * (ECONNREFUSED), which may be a transient state. All
-	 * others indicate a transport condition which has already
-	 * undergone a best-effort.
-	 */
-	if (ep->rep_connected == -ECONNREFUSED &&
-	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
-		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
-		goto retry;
-	}
 	if (ep->rep_connected <= 0) {
-		/* Sometimes, the only way to reliably connect to remote
-		 * CMs is to use same nonzero values for ORD and IRD. */
-		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
-		    (ep->rep_remote_cma.responder_resources == 0 ||
-		     ep->rep_remote_cma.initiator_depth !=
-		     ep->rep_remote_cma.responder_resources)) {
-			if (ep->rep_remote_cma.responder_resources == 0)
-				ep->rep_remote_cma.responder_resources = 1;
-			ep->rep_remote_cma.initiator_depth =
-				ep->rep_remote_cma.responder_resources;
+		if (ep->rep_connected == -EAGAIN)
 			goto retry;
-		}
 		rc = ep->rep_connected;
-	} else {
-		struct rpcrdma_xprt *r_xprt;
-		unsigned int extras;
-
-		dprintk("RPC: %s: connected\n", __func__);
-
-		r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
-		extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
-
-		if (extras) {
-			rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
-			if (rc) {
-				pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
-					__func__, rc);
-				rc = 0;
-			}
-		}
+		goto out;
 	}
 
+	dprintk("RPC: %s: connected\n", __func__);
+	extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
+	if (extras)
+		rpcrdma_ep_post_extra_recv(r_xprt, extras);
+
 out:
 	if (rc)
 		ep->rep_connected = rc;
@@ -797,9 +776,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
 
 	spin_lock(&buf->rb_recovery_lock);
 	while (!list_empty(&buf->rb_stale_mrs)) {
-		mw = list_first_entry(&buf->rb_stale_mrs,
-				      struct rpcrdma_mw, mw_list);
-		list_del_init(&mw->mw_list);
+		mw = rpcrdma_pop_mw(&buf->rb_stale_mrs);
 		spin_unlock(&buf->rb_recovery_lock);
 
 		dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
@@ -817,7 +794,7 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 
 	spin_lock(&buf->rb_recovery_lock);
-	list_add(&mw->mw_list, &buf->rb_stale_mrs);
+	rpcrdma_push_mw(mw, &buf->rb_stale_mrs);
 	spin_unlock(&buf->rb_recovery_lock);
 
 	schedule_delayed_work(&buf->rb_recovery_worker, 0);
@@ -1093,11 +1070,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
 	struct rpcrdma_mw *mw = NULL;
 
 	spin_lock(&buf->rb_mwlock);
-	if (!list_empty(&buf->rb_mws)) {
-		mw = list_first_entry(&buf->rb_mws,
-				      struct rpcrdma_mw, mw_list);
-		list_del_init(&mw->mw_list);
-	}
+	if (!list_empty(&buf->rb_mws))
+		mw = rpcrdma_pop_mw(&buf->rb_mws);
 	spin_unlock(&buf->rb_mwlock);
 
 	if (!mw)
@@ -1120,7 +1094,7 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
 	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
 
 	spin_lock(&buf->rb_mwlock);
-	list_add_tail(&mw->mw_list, &buf->rb_mws);
+	rpcrdma_push_mw(mw, &buf->rb_mws);
 	spin_unlock(&buf->rb_mwlock);
 }
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index e35efd4ac1e4..171a35116de9 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,7 +74,9 @@ struct rpcrdma_ia {
 	unsigned int		ri_max_frmr_depth;
 	unsigned int		ri_max_inline_write;
 	unsigned int		ri_max_inline_read;
+	unsigned int		ri_max_send_sges;
 	bool			ri_reminv_expected;
+	bool			ri_implicit_roundup;
 	enum ib_mr_type		ri_mrtype;
 	struct ib_qp_attr	ri_qp_attr;
 	struct ib_qp_init_attr	ri_qp_init_attr;
@@ -303,15 +305,19 @@ struct rpcrdma_mr_seg {		/* chunk descriptors */
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 
-/* Reserve enough Send SGEs to send a maximum size inline request:
+/* The Send SGE array is provisioned to send a maximum size
+ * inline request:
  * - RPC-over-RDMA header
  * - xdr_buf head iovec
- * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - RPCRDMA_MAX_INLINE bytes, in pages
  * - xdr_buf tail iovec
+ *
+ * The actual number of array elements consumed by each RPC
+ * depends on the device's max_sge limit.
  */
 enum {
-	RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
-	RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+	RPCRDMA_MIN_SEND_SGES = 3,
+	RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT,
 	RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
 };
 
@@ -348,6 +354,22 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
 	return rqst->rq_xprtdata;
 }
 
+static inline void
+rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list)
+{
+	list_add_tail(&mw->mw_list, list);
+}
+
+static inline struct rpcrdma_mw *
+rpcrdma_pop_mw(struct list_head *list)
+{
+	struct rpcrdma_mw *mw;
+
+	mw = list_first_entry(list, struct rpcrdma_mw, mw_list);
+	list_del(&mw->mw_list);
+	return mw;
+}
+
 /*
  * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
  * inline requests/replies, and client/server credits.
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index af392d9b9cec..16aff8ddc16f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -52,6 +52,8 @@
 #include "sunrpc.h"
 
 static void xs_close(struct rpc_xprt *xprt);
+static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
+		struct socket *sock);
 
 /*
  * xprtsock tunables
@@ -666,6 +668,9 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	if (task->tk_flags & RPC_TASK_SENT)
 		zerocopy = false;
 
+	if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
+		xs_tcp_set_socket_timeouts(xprt, transport->sock);
+
 	/* Continue transmitting the packet/record. We must be careful
 	 * to cope with writespace callbacks arriving _after_ we have
 	 * called sendmsg(). */
@@ -1188,7 +1193,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r
 	char *p;
 
 	len = sizeof(transport->tcp_xid) - transport->tcp_offset;
-	dprintk("RPC: reading XID (%Zu bytes)\n", len);
+	dprintk("RPC: reading XID (%zu bytes)\n", len);
 	p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
 	used = xdr_skb_read_bits(desc, p, len);
 	transport->tcp_offset += used;
@@ -1219,7 +1224,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
 	 */
 	offset = transport->tcp_offset - sizeof(transport->tcp_xid);
 	len = sizeof(transport->tcp_calldir) - offset;
-	dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len);
+	dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len);
 	p = ((char *) &transport->tcp_calldir) + offset;
 	used = xdr_skb_read_bits(desc, p, len);
 	transport->tcp_offset += used;
@@ -1310,7 +1315,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
 		return;
 	}
 
-	dprintk("RPC: XID %08x read %Zd bytes\n",
+	dprintk("RPC: XID %08x read %zd bytes\n",
 			ntohl(transport->tcp_xid), r);
 	dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
 			"tcp_reclen = %u\n", xprt, transport->tcp_copied,
@@ -1456,7 +1461,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_s
 	desc->count -= len;
 	desc->offset += len;
 	transport->tcp_offset += len;
-	dprintk("RPC: discarded %Zu bytes\n", len);
+	dprintk("RPC: discarded %zu bytes\n", len);
 
 	xs_tcp_check_fraghdr(transport);
 }
@@ -1734,7 +1739,9 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t
  */
 static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
 {
+	spin_lock_bh(&xprt->transport_lock);
 	xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
+	spin_unlock_bh(&xprt->transport_lock);
 }
 
 static unsigned short xs_get_random_port(void)
@@ -2235,6 +2242,66 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
 	xs_reset_transport(transport);
 }
 
+static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
+		struct socket *sock)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	unsigned int keepidle;
+	unsigned int keepcnt;
+	unsigned int opt_on = 1;
+	unsigned int timeo;
+
+	spin_lock_bh(&xprt->transport_lock);
+	keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
+	keepcnt = xprt->timeout->to_retries + 1;
+	timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
+		(xprt->timeout->to_retries + 1);
+	clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
+	spin_unlock_bh(&xprt->transport_lock);
+
+	/* TCP Keepalive options */
+	kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+			(char *)&opt_on, sizeof(opt_on));
+	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+			(char *)&keepidle, sizeof(keepidle));
+	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+			(char *)&keepidle, sizeof(keepidle));
+	kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+			(char *)&keepcnt, sizeof(keepcnt));
+
+	/* TCP user timeout (see RFC5482) */
+	kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+			(char *)&timeo, sizeof(timeo));
+}
+
+static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
+		unsigned long connect_timeout,
+		unsigned long reconnect_timeout)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+	struct rpc_timeout to;
+	unsigned long initval;
+
+	spin_lock_bh(&xprt->transport_lock);
+	if (reconnect_timeout < xprt->max_reconnect_timeout)
+		xprt->max_reconnect_timeout = reconnect_timeout;
+	if (connect_timeout < xprt->connect_timeout) {
+		memcpy(&to, xprt->timeout, sizeof(to));
+		initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1);
+		/* Arbitrary lower limit */
+		if (initval < XS_TCP_INIT_REEST_TO << 1)
+			initval = XS_TCP_INIT_REEST_TO << 1;
+		to.to_initval = initval;
+		to.to_maxval = initval;
+		memcpy(&transport->tcp_timeout, &to,
+			sizeof(transport->tcp_timeout));
+		xprt->timeout = &transport->tcp_timeout;
+		xprt->connect_timeout = connect_timeout;
+	}
+	set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
+	spin_unlock_bh(&xprt->transport_lock);
+}
+
 static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2242,22 +2309,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 
 	if (!transport->inet) {
 		struct sock *sk = sock->sk;
-		unsigned int keepidle = xprt->timeout->to_initval / HZ;
-		unsigned int keepcnt = xprt->timeout->to_retries + 1;
-		unsigned int opt_on = 1;
-		unsigned int timeo;
 		unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
 
-		/* TCP Keepalive options */
-		kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
-				(char *)&opt_on, sizeof(opt_on));
-		kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
-				(char *)&keepidle, sizeof(keepidle));
-		kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
-				(char *)&keepidle, sizeof(keepidle));
-		kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
-				(char *)&keepcnt, sizeof(keepcnt));
-
 		/* Avoid temporary address, they are bad for long-lived
 		 * connections such as NFS mounts.
 		 * RFC4941, section 3.6 suggests that:
@@ -2268,11 +2321,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 		kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
 				(char *)&addr_pref, sizeof(addr_pref));
 
-		/* TCP user timeout (see RFC5482) */
-		timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
-			(xprt->timeout->to_retries + 1);
-		kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
-				(char *)&timeo, sizeof(timeo));
+		xs_tcp_set_socket_timeouts(xprt, sock);
 
 		write_lock_bh(&sk->sk_callback_lock);
 
@@ -2721,6 +2770,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
 	.close			= xs_tcp_shutdown,
 	.destroy		= xs_destroy,
+	.set_connect_timeout	= xs_tcp_set_connect_timeout,
 	.print_stats		= xs_tcp_print_stats,
 	.enable_swap		= xs_enable_swap,
 	.disable_swap		= xs_disable_swap,
@@ -3007,6 +3057,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
 	xprt->timeout = &xs_tcp_default_timeout;
 
 	xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
+	xprt->connect_timeout = xprt->timeout->to_initval *
+		(xprt->timeout->to_retries + 1);
 
 	INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
 	INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
@@ -3209,7 +3261,9 @@ static int param_set_uint_minmax(const char *val,
 	if (!val)
 		return -EINVAL;
 	ret = kstrtouint(val, 0, &num);
-	if (ret == -EINVAL || num < min || num > max)
+	if (ret)
+		return ret;
+	if (num < min || num > max)
 		return -EINVAL;
 	*((unsigned int *)kp->arg) = num;
 	return 0;