aboutsummaryrefslogtreecommitdiff
path: root/net/sunrpc
diff options
context:
space:
mode:
Diffstat (limited to 'net/sunrpc')
-rw-r--r--net/sunrpc/auth.c16
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c4
-rw-r--r--net/sunrpc/auth_null.c3
-rw-r--r--net/sunrpc/auth_unix.c18
-rw-r--r--net/sunrpc/cache.c121
-rw-r--r--net/sunrpc/clnt.c51
-rw-r--r--net/sunrpc/debugfs.c35
-rw-r--r--net/sunrpc/svc.c28
-rw-r--r--net/sunrpc/svcauth_unix.c4
-rw-r--r--net/sunrpc/svcsock.c5
-rw-r--r--net/sunrpc/xdr.c34
-rw-r--r--net/sunrpc/xprt.c2
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c5
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c11
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c82
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c17
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c299
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c20
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c22
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c69
-rw-r--r--net/sunrpc/xprtrdma/transport.c6
-rw-r--r--net/sunrpc/xprtrdma/verbs.c96
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h30
-rw-r--r--net/sunrpc/xprtsock.c102
25 files changed, 553 insertions, 529 deletions
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 2bff63a73cf8..d2623b9f23d6 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -8,6 +8,7 @@
#include <linux/types.h>
#include <linux/sched.h>
+#include <linux/cred.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/errno.h>
@@ -464,8 +465,10 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
* Note that the cred_unused list must be time-ordered.
*/
if (time_in_range(cred->cr_expire, expired, jiffies) &&
- test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
+ test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) {
+ freed = SHRINK_STOP;
break;
+ }
list_del_init(&cred->cr_lru);
number_cred_unused--;
@@ -520,7 +523,7 @@ static unsigned long
rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
- return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+ return number_cred_unused * sysctl_vfs_cache_pressure / 100;
}
static void
@@ -646,9 +649,6 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred,
cred->cr_auth = auth;
cred->cr_ops = ops;
cred->cr_expire = jiffies;
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- cred->cr_magic = RPCAUTH_CRED_MAGIC;
-#endif
cred->cr_uid = acred->uid;
}
EXPORT_SYMBOL_GPL(rpcauth_init_cred);
@@ -876,8 +876,12 @@ int __init rpcauth_init_module(void)
err = rpc_init_generic_auth();
if (err < 0)
goto out2;
- register_shrinker(&rpc_cred_shrinker);
+ err = register_shrinker(&rpc_cred_shrinker);
+ if (err < 0)
+ goto out3;
return 0;
+out3:
+ rpc_destroy_generic_auth();
out2:
rpc_destroy_authunix();
out1:
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index cdeb1d814833..4f16953e4954 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -763,7 +763,7 @@ err_put_ctx:
err:
kfree(buf);
out:
- dprintk("RPC: %s returning %Zd\n", __func__, err);
+ dprintk("RPC: %s returning %zd\n", __func__, err);
return err;
}
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 153082598522..a54a7a3d28f5 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1489,8 +1489,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
case RPC_GSS_PROC_DESTROY:
if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
goto auth_err;
- rsci->h.expiry_time = seconds_since_boot();
- set_bit(CACHE_NEGATIVE, &rsci->h.flags);
+ /* Delete the entry from the cache_list and call cache_put */
+ sunrpc_cache_unhash(sn->rsc_cache, &rsci->h);
if (resv->iov_len + 4 > PAGE_SIZE)
goto drop;
svc_putnl(resv, RPC_SUCCESS);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 4d17376b2acb..5f3d527dff65 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -139,7 +139,4 @@ struct rpc_cred null_cred = {
.cr_ops = &null_credops,
.cr_count = ATOMIC_INIT(1),
.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE,
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- .cr_magic = RPCAUTH_CRED_MAGIC,
-#endif
};
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 306fc0f54596..82337e1ec9cd 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -14,12 +14,10 @@
#include <linux/sunrpc/auth.h>
#include <linux/user_namespace.h>
-#define NFS_NGROUPS 16
-
struct unx_cred {
struct rpc_cred uc_base;
kgid_t uc_gid;
- kgid_t uc_gids[NFS_NGROUPS];
+ kgid_t uc_gids[UNX_NGROUPS];
};
#define uc_uid uc_base.cr_uid
@@ -82,13 +80,13 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t
if (acred->group_info != NULL)
groups = acred->group_info->ngroups;
- if (groups > NFS_NGROUPS)
- groups = NFS_NGROUPS;
+ if (groups > UNX_NGROUPS)
+ groups = UNX_NGROUPS;
cred->uc_gid = acred->gid;
for (i = 0; i < groups; i++)
cred->uc_gids[i] = acred->group_info->gid[i];
- if (i < NFS_NGROUPS)
+ if (i < UNX_NGROUPS)
cred->uc_gids[i] = INVALID_GID;
return &cred->uc_base;
@@ -132,12 +130,12 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags)
if (acred->group_info != NULL)
groups = acred->group_info->ngroups;
- if (groups > NFS_NGROUPS)
- groups = NFS_NGROUPS;
+ if (groups > UNX_NGROUPS)
+ groups = UNX_NGROUPS;
for (i = 0; i < groups ; i++)
if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i]))
return 0;
- if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups]))
+ if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups]))
return 0;
return 1;
}
@@ -166,7 +164,7 @@ unx_marshal(struct rpc_task *task, __be32 *p)
*p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid));
*p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid));
hold = p++;
- for (i = 0; i < 16 && gid_valid(cred->uc_gids[i]); i++)
+ for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++)
*p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i]));
*hold = htonl(p - hold - 1); /* gid array length */
*base = htonl((p - base - 1) << 2); /* cred length */
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index f39e3e11f9aa..79d55d949d9a 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -362,11 +362,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
cache_purge(cd);
spin_lock(&cache_list_lock);
write_lock(&cd->hash_lock);
- if (cd->entries) {
- write_unlock(&cd->hash_lock);
- spin_unlock(&cache_list_lock);
- goto out;
- }
if (current_detail == cd)
current_detail = NULL;
list_del_init(&cd->others);
@@ -376,9 +371,6 @@ void sunrpc_destroy_cache_detail(struct cache_detail *cd)
/* module must be being unloaded so its safe to kill the worker */
cancel_delayed_work_sync(&cache_cleaner);
}
- return;
-out:
- printk(KERN_ERR "RPC: failed to unregister %s cache\n", cd->name);
}
EXPORT_SYMBOL_GPL(sunrpc_destroy_cache_detail);
@@ -497,13 +489,32 @@ EXPORT_SYMBOL_GPL(cache_flush);
void cache_purge(struct cache_detail *detail)
{
- time_t now = seconds_since_boot();
- if (detail->flush_time >= now)
- now = detail->flush_time + 1;
- /* 'now' is the maximum value any 'last_refresh' can have */
- detail->flush_time = now;
- detail->nextcheck = seconds_since_boot();
- cache_flush();
+ struct cache_head *ch = NULL;
+ struct hlist_head *head = NULL;
+ struct hlist_node *tmp = NULL;
+ int i = 0;
+
+ write_lock(&detail->hash_lock);
+ if (!detail->entries) {
+ write_unlock(&detail->hash_lock);
+ return;
+ }
+
+ dprintk("RPC: %d entries in %s cache\n", detail->entries, detail->name);
+ for (i = 0; i < detail->hash_size; i++) {
+ head = &detail->hash_table[i];
+ hlist_for_each_entry_safe(ch, tmp, head, cache_list) {
+ hlist_del_init(&ch->cache_list);
+ detail->entries--;
+
+ set_bit(CACHE_CLEANED, &ch->flags);
+ write_unlock(&detail->hash_lock);
+ cache_fresh_unlocked(ch, detail);
+ cache_put(ch, detail);
+ write_lock(&detail->hash_lock);
+ }
+ }
+ write_unlock(&detail->hash_lock);
}
EXPORT_SYMBOL_GPL(cache_purge);
@@ -717,7 +728,7 @@ void cache_clean_deferred(void *owner)
/*
* communicate with user-space
*
- * We have a magic /proc file - /proc/sunrpc/<cachename>/channel.
+ * We have a magic /proc file - /proc/net/rpc/<cachename>/channel.
* On read, you get a full request, or block.
* On write, an update request is processed.
* Poll works if anything to read, and always allows write.
@@ -1272,7 +1283,7 @@ EXPORT_SYMBOL_GPL(qword_get);
/*
- * support /proc/sunrpc/cache/$CACHENAME/content
+ * support /proc/net/rpc/$CACHENAME/content
* as a seqfile.
* We call ->cache_show passing NULL for the item to
* get a header, then pass each real item in the cache
@@ -1427,20 +1438,11 @@ static ssize_t read_flush(struct file *file, char __user *buf,
struct cache_detail *cd)
{
char tbuf[22];
- unsigned long p = *ppos;
size_t len;
- snprintf(tbuf, sizeof(tbuf), "%lu\n", convert_to_wallclock(cd->flush_time));
- len = strlen(tbuf);
- if (p >= len)
- return 0;
- len -= p;
- if (len > count)
- len = count;
- if (copy_to_user(buf, (void*)(tbuf+p), len))
- return -EFAULT;
- *ppos += len;
- return len;
+ len = snprintf(tbuf, sizeof(tbuf), "%lu\n",
+ convert_to_wallclock(cd->flush_time));
+ return simple_read_from_buffer(buf, count, ppos, tbuf, len);
}
static ssize_t write_flush(struct file *file, const char __user *buf,
@@ -1600,21 +1602,12 @@ static const struct file_operations cache_flush_operations_procfs = {
.llseek = no_llseek,
};
-static void remove_cache_proc_entries(struct cache_detail *cd, struct net *net)
+static void remove_cache_proc_entries(struct cache_detail *cd)
{
- struct sunrpc_net *sn;
-
- if (cd->u.procfs.proc_ent == NULL)
- return;
- if (cd->u.procfs.flush_ent)
- remove_proc_entry("flush", cd->u.procfs.proc_ent);
- if (cd->u.procfs.channel_ent)
- remove_proc_entry("channel", cd->u.procfs.proc_ent);
- if (cd->u.procfs.content_ent)
- remove_proc_entry("content", cd->u.procfs.proc_ent);
- cd->u.procfs.proc_ent = NULL;
- sn = net_generic(net, sunrpc_net_id);
- remove_proc_entry(cd->name, sn->proc_net_rpc);
+ if (cd->procfs) {
+ proc_remove(cd->procfs);
+ cd->procfs = NULL;
+ }
}
#ifdef CONFIG_PROC_FS
@@ -1624,38 +1617,30 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
struct sunrpc_net *sn;
sn = net_generic(net, sunrpc_net_id);
- cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
- if (cd->u.procfs.proc_ent == NULL)
+ cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc);
+ if (cd->procfs == NULL)
goto out_nomem;
- cd->u.procfs.channel_ent = NULL;
- cd->u.procfs.content_ent = NULL;
p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
- cd->u.procfs.proc_ent,
- &cache_flush_operations_procfs, cd);
- cd->u.procfs.flush_ent = p;
+ cd->procfs, &cache_flush_operations_procfs, cd);
if (p == NULL)
goto out_nomem;
if (cd->cache_request || cd->cache_parse) {
p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
- cd->u.procfs.proc_ent,
- &cache_file_operations_procfs, cd);
- cd->u.procfs.channel_ent = p;
+ cd->procfs, &cache_file_operations_procfs, cd);
if (p == NULL)
goto out_nomem;
}
if (cd->cache_show) {
p = proc_create_data("content", S_IFREG|S_IRUSR,
- cd->u.procfs.proc_ent,
- &content_file_operations_procfs, cd);
- cd->u.procfs.content_ent = p;
+ cd->procfs, &content_file_operations_procfs, cd);
if (p == NULL)
goto out_nomem;
}
return 0;
out_nomem:
- remove_cache_proc_entries(cd, net);
+ remove_cache_proc_entries(cd);
return -ENOMEM;
}
#else /* CONFIG_PROC_FS */
@@ -1684,7 +1669,7 @@ EXPORT_SYMBOL_GPL(cache_register_net);
void cache_unregister_net(struct cache_detail *cd, struct net *net)
{
- remove_cache_proc_entries(cd, net);
+ remove_cache_proc_entries(cd);
sunrpc_destroy_cache_detail(cd);
}
EXPORT_SYMBOL_GPL(cache_unregister_net);
@@ -1843,15 +1828,29 @@ int sunrpc_cache_register_pipefs(struct dentry *parent,
struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd);
if (IS_ERR(dir))
return PTR_ERR(dir);
- cd->u.pipefs.dir = dir;
+ cd->pipefs = dir;
return 0;
}
EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
void sunrpc_cache_unregister_pipefs(struct cache_detail *cd)
{
- rpc_remove_cache_dir(cd->u.pipefs.dir);
- cd->u.pipefs.dir = NULL;
+ if (cd->pipefs) {
+ rpc_remove_cache_dir(cd->pipefs);
+ cd->pipefs = NULL;
+ }
}
EXPORT_SYMBOL_GPL(sunrpc_cache_unregister_pipefs);
+void sunrpc_cache_unhash(struct cache_detail *cd, struct cache_head *h)
+{
+ write_lock(&cd->hash_lock);
+ if (!hlist_unhashed(&h->cache_list)){
+ hlist_del_init(&h->cache_list);
+ cd->entries--;
+ write_unlock(&cd->hash_lock);
+ cache_put(h, cd);
+ } else
+ write_unlock(&cd->hash_lock);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_unhash);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 1dc9f3bac099..52da3ce54bb5 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1453,21 +1453,6 @@ size_t rpc_max_bc_payload(struct rpc_clnt *clnt)
EXPORT_SYMBOL_GPL(rpc_max_bc_payload);
/**
- * rpc_get_timeout - Get timeout for transport in units of HZ
- * @clnt: RPC client to query
- */
-unsigned long rpc_get_timeout(struct rpc_clnt *clnt)
-{
- unsigned long ret;
-
- rcu_read_lock();
- ret = rcu_dereference(clnt->cl_xprt)->timeout->to_initval;
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(rpc_get_timeout);
-
-/**
* rpc_force_rebind - force transport to check that remote port is unchanged
* @clnt: client to rebind
*
@@ -2699,6 +2684,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
{
struct rpc_xprt_switch *xps;
struct rpc_xprt *xprt;
+ unsigned long connect_timeout;
unsigned long reconnect_timeout;
unsigned char resvport;
int ret = 0;
@@ -2711,6 +2697,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
return -EAGAIN;
}
resvport = xprt->resvport;
+ connect_timeout = xprt->connect_timeout;
reconnect_timeout = xprt->max_reconnect_timeout;
rcu_read_unlock();
@@ -2720,7 +2707,10 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
goto out_put_switch;
}
xprt->resvport = resvport;
- xprt->max_reconnect_timeout = reconnect_timeout;
+ if (xprt->ops->set_connect_timeout != NULL)
+ xprt->ops->set_connect_timeout(xprt,
+ connect_timeout,
+ reconnect_timeout);
rpc_xprt_switch_set_roundrobin(xps);
if (setup) {
@@ -2737,26 +2727,39 @@ out_put_switch:
}
EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
+struct connect_timeout_data {
+ unsigned long connect_timeout;
+ unsigned long reconnect_timeout;
+};
+
static int
-rpc_xprt_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
+rpc_xprt_set_connect_timeout(struct rpc_clnt *clnt,
struct rpc_xprt *xprt,
void *data)
{
- unsigned long timeout = *((unsigned long *)data);
+ struct connect_timeout_data *timeo = data;
- if (timeout < xprt->max_reconnect_timeout)
- xprt->max_reconnect_timeout = timeout;
+ if (xprt->ops->set_connect_timeout)
+ xprt->ops->set_connect_timeout(xprt,
+ timeo->connect_timeout,
+ timeo->reconnect_timeout);
return 0;
}
void
-rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
+rpc_set_connect_timeout(struct rpc_clnt *clnt,
+ unsigned long connect_timeout,
+ unsigned long reconnect_timeout)
{
+ struct connect_timeout_data timeout = {
+ .connect_timeout = connect_timeout,
+ .reconnect_timeout = reconnect_timeout,
+ };
rpc_clnt_iterate_for_each_xprt(clnt,
- rpc_xprt_cap_max_reconnect_timeout,
- &timeo);
+ rpc_xprt_set_connect_timeout,
+ &timeout);
}
-EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
+EXPORT_SYMBOL_GPL(rpc_set_connect_timeout);
void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
{
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index e7b4d93566df..c8fd0b6c1618 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -16,11 +16,6 @@ static struct dentry *rpc_xprt_dir;
unsigned int rpc_inject_disconnect;
-struct rpc_clnt_iter {
- struct rpc_clnt *clnt;
- loff_t pos;
-};
-
static int
tasks_show(struct seq_file *f, void *v)
{
@@ -47,12 +42,10 @@ static void *
tasks_start(struct seq_file *f, loff_t *ppos)
__acquires(&clnt->cl_lock)
{
- struct rpc_clnt_iter *iter = f->private;
+ struct rpc_clnt *clnt = f->private;
loff_t pos = *ppos;
- struct rpc_clnt *clnt = iter->clnt;
struct rpc_task *task;
- iter->pos = pos + 1;
spin_lock(&clnt->cl_lock);
list_for_each_entry(task, &clnt->cl_tasks, tk_task)
if (pos-- == 0)
@@ -63,12 +56,10 @@ tasks_start(struct seq_file *f, loff_t *ppos)
static void *
tasks_next(struct seq_file *f, void *v, loff_t *pos)
{
- struct rpc_clnt_iter *iter = f->private;
- struct rpc_clnt *clnt = iter->clnt;
+ struct rpc_clnt *clnt = f->private;
struct rpc_task *task = v;
struct list_head *next = task->tk_task.next;
- ++iter->pos;
++*pos;
/* If there's another task on list, return it */
@@ -81,9 +72,7 @@ static void
tasks_stop(struct seq_file *f, void *v)
__releases(&clnt->cl_lock)
{
- struct rpc_clnt_iter *iter = f->private;
- struct rpc_clnt *clnt = iter->clnt;
-
+ struct rpc_clnt *clnt = f->private;
spin_unlock(&clnt->cl_lock);
}
@@ -96,17 +85,13 @@ static const struct seq_operations tasks_seq_operations = {
static int tasks_open(struct inode *inode, struct file *filp)
{
- int ret = seq_open_private(filp, &tasks_seq_operations,
- sizeof(struct rpc_clnt_iter));
-
+ int ret = seq_open(filp, &tasks_seq_operations);
if (!ret) {
struct seq_file *seq = filp->private_data;
- struct rpc_clnt_iter *iter = seq->private;
-
- iter->clnt = inode->i_private;
+ struct rpc_clnt *clnt = seq->private = inode->i_private;
- if (!atomic_inc_not_zero(&iter->clnt->cl_count)) {
- seq_release_private(inode, filp);
+ if (!atomic_inc_not_zero(&clnt->cl_count)) {
+ seq_release(inode, filp);
ret = -EINVAL;
}
}
@@ -118,10 +103,10 @@ static int
tasks_release(struct inode *inode, struct file *filp)
{
struct seq_file *seq = filp->private_data;
- struct rpc_clnt_iter *iter = seq->private;
+ struct rpc_clnt *clnt = seq->private;
- rpc_release_client(iter->clnt);
- return seq_release_private(inode, filp);
+ rpc_release_client(clnt);
+ return seq_release(inode, filp);
}
static const struct file_operations tasks_fops = {
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 75f290bddca1..a08aeb56b8e4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -11,7 +11,7 @@
*/
#include <linux/linkage.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
#include <linux/errno.h>
#include <linux/net.h>
#include <linux/in.h>
@@ -385,7 +385,7 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
for (i = 0; i < progp->pg_nvers; i++) {
if (progp->pg_vers[i] == NULL)
continue;
- if (progp->pg_vers[i]->vs_hidden == 0)
+ if (!progp->pg_vers[i]->vs_hidden)
return 1;
}
}
@@ -976,6 +976,13 @@ int svc_register(const struct svc_serv *serv, struct net *net,
if (vers->vs_hidden)
continue;
+ /*
+ * Don't register a UDP port if we need congestion
+ * control.
+ */
+ if (vers->vs_need_cong_ctrl && proto == IPPROTO_UDP)
+ continue;
+
error = __svc_register(net, progp->pg_name, progp->pg_prog,
i, family, proto, port);
@@ -1169,6 +1176,21 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
!(versp = progp->pg_vers[vers]))
goto err_bad_vers;
+ /*
+ * Some protocol versions (namely NFSv4) require some form of
+ * congestion control. (See RFC 7530 section 3.1 paragraph 2)
+ * In other words, UDP is not allowed. We mark those when setting
+ * up the svc_xprt, and verify that here.
+ *
+ * The spec is not very clear about what error should be returned
+ * when someone tries to access a server that is listening on UDP
+ * for lower versions. RPC_PROG_MISMATCH seems to be the closest
+ * fit.
+ */
+ if (versp->vs_need_cong_ctrl &&
+ !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags))
+ goto err_bad_vers;
+
procp = versp->vs_proc + proc;
if (proc >= versp->vs_nproc || !procp->pc_func)
goto err_bad_proc;
@@ -1260,7 +1282,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
return 0;
err_short_len:
- svc_printk(rqstp, "short len %Zd, dropping request\n",
+ svc_printk(rqstp, "short len %zd, dropping request\n",
argv->iov_len);
goto close;
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 64af4f034de6..f81eaa8e0888 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -403,7 +403,7 @@ svcauth_unix_info_release(struct svc_xprt *xpt)
/****************************************************************************
* auth.unix.gid cache
* simple cache to map a UID to a list of GIDs
- * because AUTH_UNIX aka AUTH_SYS has a max of 16
+ * because AUTH_UNIX aka AUTH_SYS has a max of UNX_NGROUPS
*/
#define GID_HASHBITS 8
#define GID_HASHMAX (1<<GID_HASHBITS)
@@ -810,7 +810,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
cred->cr_uid = make_kuid(&init_user_ns, svc_getnl(argv)); /* uid */
cred->cr_gid = make_kgid(&init_user_ns, svc_getnl(argv)); /* gid */
slen = svc_getnl(argv); /* gids length */
- if (slen > 16 || (len -= (slen + 2)*4) < 0)
+ if (slen > UNX_NGROUPS || (len -= (slen + 2)*4) < 0)
goto badcred;
cred->cr_group_info = groups_alloc(slen);
if (cred->cr_group_info == NULL)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index de066acdb34e..8931e33b6541 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -278,7 +278,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
rqstp->rq_respages[0], tailoff);
out:
- dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
+ dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
@@ -346,7 +346,7 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
if (len == buflen)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
- dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+ dprintk("svc: socket %p recvfrom(%p, %zu) = %d\n",
svsk, iov[0].iov_base, iov[0].iov_len, len);
return len;
}
@@ -1306,6 +1306,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv)
svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_tcp_class,
&svsk->sk_xprt, serv);
set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
+ set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags);
if (sk->sk_state == TCP_LISTEN) {
dprintk("setting up TCP socket for listening\n");
set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 7f1071e103ca..1f7082144e01 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1518,3 +1518,37 @@ out:
}
EXPORT_SYMBOL_GPL(xdr_process_buf);
+/**
+ * xdr_stream_decode_string_dup - Decode and duplicate variable length string
+ * @xdr: pointer to xdr_stream
+ * @str: location to store pointer to string
+ * @maxlen: maximum acceptable string length
+ * @gfp_flags: GFP mask to use
+ *
+ * Return values:
+ * On success, returns length of NUL-terminated string stored in *@ptr
+ * %-EBADMSG on XDR buffer overflow
+ * %-EMSGSIZE if the size of the string would exceed @maxlen
+ * %-ENOMEM on memory allocation failure
+ */
+ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
+ size_t maxlen, gfp_t gfp_flags)
+{
+ void *p;
+ ssize_t ret;
+
+ ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen);
+ if (ret > 0) {
+ char *s = kmalloc(ret + 1, gfp_flags);
+ if (s != NULL) {
+ memcpy(s, p, ret);
+ s[ret] = '\0';
+ *str = s;
+ return strlen(s);
+ }
+ ret = -ENOMEM;
+ }
+ *str = NULL;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_decode_string_dup);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 9a6be030ca7d..b530a2852ba8 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -897,13 +897,11 @@ static void xprt_timer(struct rpc_task *task)
return;
dprintk("RPC: %5u xprt_timer\n", task->tk_pid);
- spin_lock_bh(&xprt->transport_lock);
if (!req->rq_reply_bytes_recvd) {
if (xprt->ops->timer)
xprt->ops->timer(xprt, task);
} else
task->tk_status = 0;
- spin_unlock_bh(&xprt->transport_lock);
}
/**
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 1ebb09e1ac4f..59e64025ed96 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -310,10 +310,7 @@ fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpcrdma_mw *mw;
while (!list_empty(&req->rl_registered)) {
- mw = list_first_entry(&req->rl_registered,
- struct rpcrdma_mw, mw_list);
- list_del_init(&mw->mw_list);
-
+ mw = rpcrdma_pop_mw(&req->rl_registered);
if (sync)
fmr_op_recover_mr(mw);
else
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 47bed5333c7f..f81dd93176c0 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -466,8 +466,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
struct ib_send_wr *first, **prev, *last, *bad_wr;
struct rpcrdma_rep *rep = req->rl_reply;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_mw *mw, *tmp;
struct rpcrdma_frmr *f;
+ struct rpcrdma_mw *mw;
int count, rc;
dprintk("RPC: %s: req %p\n", __func__, req);
@@ -534,10 +534,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* them to the free MW list.
*/
unmap:
- list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+ while (!list_empty(&req->rl_registered)) {
+ mw = rpcrdma_pop_mw(&req->rl_registered);
dprintk("RPC: %s: DMA unmapping frmr %p\n",
__func__, &mw->frmr);
- list_del_init(&mw->mw_list);
ib_dma_unmap_sg(ia->ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
rpcrdma_put_mw(r_xprt, mw);
@@ -571,10 +571,7 @@ frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
struct rpcrdma_mw *mw;
while (!list_empty(&req->rl_registered)) {
- mw = list_first_entry(&req->rl_registered,
- struct rpcrdma_mw, mw_list);
- list_del_init(&mw->mw_list);
-
+ mw = rpcrdma_pop_mw(&req->rl_registered);
if (sync)
frwr_op_recover_mr(mw);
else
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c52e0f2ffe52..a044be2d6ad7 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -125,14 +125,34 @@ void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
/* The client can send a request inline as long as the RPCRDMA header
* plus the RPC call fit under the transport's inline limit. If the
* combined call message size exceeds that limit, the client must use
- * the read chunk list for this operation.
+ * a Read chunk for this operation.
+ *
+ * A Read chunk is also required if sending the RPC call inline would
+ * exceed this device's max_sge limit.
*/
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct xdr_buf *xdr = &rqst->rq_snd_buf;
+ unsigned int count, remaining, offset;
+
+ if (xdr->len > r_xprt->rx_ia.ri_max_inline_write)
+ return false;
+
+ if (xdr->page_len) {
+ remaining = xdr->page_len;
+ offset = xdr->page_base & ~PAGE_MASK;
+ count = 0;
+ while (remaining) {
+ remaining -= min_t(unsigned int,
+ PAGE_SIZE - offset, remaining);
+ offset = 0;
+ if (++count > r_xprt->rx_ia.ri_max_send_sges)
+ return false;
+ }
+ }
- return rqst->rq_snd_buf.len <= ia->ri_max_inline_write;
+ return true;
}
/* The client can't know how large the actual reply will be. Thus it
@@ -186,9 +206,9 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
*/
static int
-rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
- enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
- bool reminv_expected)
+rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
+ unsigned int pos, enum rpcrdma_chunktype type,
+ struct rpcrdma_mr_seg *seg)
{
int len, n, p, page_base;
struct page **ppages;
@@ -226,22 +246,21 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
if (len && n == RPCRDMA_MAX_SEGS)
goto out_overflow;
- /* When encoding the read list, the tail is always sent inline */
- if (type == rpcrdma_readch)
+ /* When encoding a Read chunk, the tail iovec contains an
+ * XDR pad and may be omitted.
+ */
+ if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
return n;
- /* When encoding the Write list, some servers need to see an extra
- * segment for odd-length Write chunks. The upper layer provides
- * space in the tail iovec for this purpose.
+ /* When encoding a Write chunk, some servers need to see an
+ * extra segment for non-XDR-aligned Write chunks. The upper
+ * layer provides space in the tail iovec that may be used
+ * for this purpose.
*/
- if (type == rpcrdma_writech && reminv_expected)
+ if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
return n;
if (xdrbuf->tail[0].iov_len) {
- /* the rpcrdma protocol allows us to omit any trailing
- * xdr pad bytes, saving the server an RDMA operation. */
- if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
- return n;
n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
if (n == RPCRDMA_MAX_SEGS)
goto out_overflow;
@@ -293,7 +312,8 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
if (rtype == rpcrdma_areadch)
pos = 0;
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
+ nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
+ rtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -302,7 +322,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
false, &mw);
if (n < 0)
return ERR_PTR(n);
- list_add(&mw->mw_list, &req->rl_registered);
+ rpcrdma_push_mw(mw, &req->rl_registered);
*iptr++ = xdr_one; /* item present */
@@ -355,10 +375,9 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
}
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
+ nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
- wtype, seg,
- r_xprt->rx_ia.ri_reminv_expected);
+ wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -371,7 +390,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
true, &mw);
if (n < 0)
return ERR_PTR(n);
- list_add(&mw->mw_list, &req->rl_registered);
+ rpcrdma_push_mw(mw, &req->rl_registered);
iptr = xdr_encode_rdma_segment(iptr, mw);
@@ -423,8 +442,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
}
seg = req->rl_segments;
- nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
- r_xprt->rx_ia.ri_reminv_expected);
+ nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
return ERR_PTR(nsegs);
@@ -437,7 +455,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
true, &mw);
if (n < 0)
return ERR_PTR(n);
- list_add(&mw->mw_list, &req->rl_registered);
+ rpcrdma_push_mw(mw, &req->rl_registered);
iptr = xdr_encode_rdma_segment(iptr, mw);
@@ -741,13 +759,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
iptr = headerp->rm_body.rm_chunks;
iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
if (IS_ERR(iptr))
- goto out_unmap;
+ goto out_err;
iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
if (IS_ERR(iptr))
- goto out_unmap;
+ goto out_err;
iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
if (IS_ERR(iptr))
- goto out_unmap;
+ goto out_err;
hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
@@ -758,12 +776,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
&rqst->rq_snd_buf, rtype)) {
iptr = ERR_PTR(-EIO);
- goto out_unmap;
+ goto out_err;
}
return 0;
-out_unmap:
- r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
+out_err:
+ pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
+ PTR_ERR(iptr));
+ r_xprt->rx_stats.failed_marshal_count++;
return PTR_ERR(iptr);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index cb1e48e54eb1..ff1df40f0d26 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -201,19 +201,20 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
{
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+ __be32 *p;
int rc;
/* Space in the send buffer for an RPC/RDMA header is reserved
* via xprt->tsh_size.
*/
- headerp->rm_xid = rqst->rq_xid;
- headerp->rm_vers = rpcrdma_version;
- headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
- headerp->rm_type = rdma_msg;
- headerp->rm_body.rm_chunks[0] = xdr_zero;
- headerp->rm_body.rm_chunks[1] = xdr_zero;
- headerp->rm_body.rm_chunks[2] = xdr_zero;
+ p = rqst->rq_buffer;
+ *p++ = rqst->rq_xid;
+ *p++ = rpcrdma_version;
+ *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+ *p++ = rdma_msg;
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
+ *p = xdr_zero;
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index 0ba9887f3e22..1c4aabf0f657 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
@@ -47,102 +48,43 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-/*
- * Decodes a read chunk list. The expected format is as follows:
- * descrim : xdr_one
- * position : __be32 offset into XDR stream
- * handle : __be32 RKEY
- * . . .
- * end-of-list: xdr_zero
- */
-static __be32 *decode_read_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_read_list(__be32 *p, __be32 *end)
{
- struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+ __be32 *next;
- while (ch->rc_discrim != xdr_zero) {
- if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
- (unsigned long)vaend) {
- dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+ while (*p++ != xdr_zero) {
+ next = p + rpcrdma_readchunk_maxsz - 1;
+ if (next > end)
return NULL;
- }
- ch++;
+ p = next;
}
- return &ch->rc_position;
+ return p;
}
-/*
- * Decodes a write chunk list. The expected format is as follows:
- * descrim : xdr_one
- * nchunks : <count>
- * handle : __be32 RKEY ---+
- * length : __be32 <len of segment> |
- * offset : remove va + <count>
- * . . . |
- * ---+
- */
-static __be32 *decode_write_list(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_write_list(__be32 *p, __be32 *end)
{
- unsigned long start, end;
- int nchunks;
-
- struct rpcrdma_write_array *ary =
- (struct rpcrdma_write_array *)va;
+ __be32 *next;
- /* Check for not write-array */
- if (ary->wc_discrim == xdr_zero)
- return &ary->wc_nchunks;
-
- if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
- (unsigned long)vaend) {
- dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
- return NULL;
- }
- nchunks = be32_to_cpu(ary->wc_nchunks);
-
- start = (unsigned long)&ary->wc_array[0];
- end = (unsigned long)vaend;
- if (nchunks < 0 ||
- nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
- (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
- dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
- ary, nchunks, vaend);
- return NULL;
+ while (*p++ != xdr_zero) {
+ next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+ if (next > end)
+ return NULL;
+ p = next;
}
- /*
- * rs_length is the 2nd 4B field in wc_target and taking its
- * address skips the list terminator
- */
- return &ary->wc_array[nchunks].wc_target.rs_length;
+ return p;
}
-static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
+static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end)
{
- unsigned long start, end;
- int nchunks;
- struct rpcrdma_write_array *ary =
- (struct rpcrdma_write_array *)va;
-
- /* Check for no reply-array */
- if (ary->wc_discrim == xdr_zero)
- return &ary->wc_nchunks;
-
- if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
- (unsigned long)vaend) {
- dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
- return NULL;
- }
- nchunks = be32_to_cpu(ary->wc_nchunks);
-
- start = (unsigned long)&ary->wc_array[0];
- end = (unsigned long)vaend;
- if (nchunks < 0 ||
- nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
- (start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
- dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
- ary, nchunks, vaend);
- return NULL;
+ __be32 *next;
+
+ if (*p++ != xdr_zero) {
+ next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+ if (next > end)
+ return NULL;
+ p = next;
}
- return (__be32 *)&ary->wc_array[nchunks];
+ return p;
}
/**
@@ -158,87 +100,71 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
*/
int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
{
- struct rpcrdma_msg *rmsgp;
- __be32 *va, *vaend;
- unsigned int len;
- u32 hdr_len;
+ __be32 *p, *end, *rdma_argp;
+ unsigned int hdr_len;
/* Verify that there's enough bytes for header + something */
- if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) {
- dprintk("svcrdma: header too short = %d\n",
- rq_arg->len);
- return -EINVAL;
- }
+ if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
+ goto out_short;
- rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base;
- if (rmsgp->rm_vers != rpcrdma_version) {
- dprintk("%s: bad version %u\n", __func__,
- be32_to_cpu(rmsgp->rm_vers));
- return -EPROTONOSUPPORT;
- }
+ rdma_argp = rq_arg->head[0].iov_base;
+ if (*(rdma_argp + 1) != rpcrdma_version)
+ goto out_version;
- switch (be32_to_cpu(rmsgp->rm_type)) {
- case RDMA_MSG:
- case RDMA_NOMSG:
+ switch (*(rdma_argp + 3)) {
+ case rdma_msg:
+ case rdma_nomsg:
break;
- case RDMA_DONE:
- /* Just drop it */
- dprintk("svcrdma: dropping RDMA_DONE message\n");
- return 0;
-
- case RDMA_ERROR:
- /* Possible if this is a backchannel reply.
- * XXX: We should cancel this XID, though.
- */
- dprintk("svcrdma: dropping RDMA_ERROR message\n");
- return 0;
-
- case RDMA_MSGP:
- /* Pull in the extra for the padded case, bump our pointer */
- rmsgp->rm_body.rm_padded.rm_align =
- be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
- rmsgp->rm_body.rm_padded.rm_thresh =
- be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh);
-
- va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
- rq_arg->head[0].iov_base = va;
- len = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rq_arg->head[0].iov_len -= len;
- if (len > rq_arg->len)
- return -EINVAL;
- return len;
- default:
- dprintk("svcrdma: bad rdma procedure (%u)\n",
- be32_to_cpu(rmsgp->rm_type));
- return -EINVAL;
- }
+ case rdma_done:
+ goto out_drop;
- /* The chunk list may contain either a read chunk list or a write
- * chunk list and a reply chunk list.
- */
- va = &rmsgp->rm_body.rm_chunks[0];
- vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len);
- va = decode_read_list(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode read list\n");
- return -EINVAL;
- }
- va = decode_write_list(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode write list\n");
- return -EINVAL;
- }
- va = decode_reply_array(va, vaend);
- if (!va) {
- dprintk("svcrdma: failed to decode reply chunk\n");
- return -EINVAL;
+ case rdma_error:
+ goto out_drop;
+
+ default:
+ goto out_proc;
}
- rq_arg->head[0].iov_base = va;
- hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+ end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
+ p = xdr_check_read_list(rdma_argp + 4, end);
+ if (!p)
+ goto out_inval;
+ p = xdr_check_write_list(p, end);
+ if (!p)
+ goto out_inval;
+ p = xdr_check_reply_chunk(p, end);
+ if (!p)
+ goto out_inval;
+ if (p > end)
+ goto out_inval;
+
+ rq_arg->head[0].iov_base = p;
+ hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
rq_arg->head[0].iov_len -= hdr_len;
return hdr_len;
+
+out_short:
+ dprintk("svcrdma: header too short = %d\n", rq_arg->len);
+ return -EINVAL;
+
+out_version:
+ dprintk("svcrdma: bad xprt version: %u\n",
+ be32_to_cpup(rdma_argp + 1));
+ return -EPROTONOSUPPORT;
+
+out_drop:
+ dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n");
+ return 0;
+
+out_proc:
+ dprintk("svcrdma: bad rdma procedure (%u)\n",
+ be32_to_cpup(rdma_argp + 3));
+ return -EINVAL;
+
+out_inval:
+ dprintk("svcrdma: failed to parse transport header\n");
+ return -EINVAL;
}
int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
@@ -249,7 +175,7 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
*va++ = rmsgp->rm_xid;
*va++ = rmsgp->rm_vers;
- *va++ = cpu_to_be32(xprt->sc_max_requests);
+ *va++ = xprt->sc_fc_credits;
*va++ = rdma_error;
*va++ = cpu_to_be32(err);
if (err == ERR_VERS) {
@@ -260,32 +186,35 @@ int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
return (int)((unsigned long)va - (unsigned long)startp);
}
-int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+/**
+ * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
+ * @rdma_resp: buffer containing Reply transport header
+ *
+ * Returns length of transport header, in bytes.
+ */
+unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
{
- struct rpcrdma_write_array *wr_ary;
+ unsigned int nsegs;
+ __be32 *p;
- /* There is no read-list in a reply */
+ p = rdma_resp;
- /* skip write list */
- wr_ary = (struct rpcrdma_write_array *)
- &rmsgp->rm_body.rm_chunks[1];
- if (wr_ary->wc_discrim)
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)].
- wc_target.rs_length;
- else
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_nchunks;
-
- /* skip reply array */
- if (wr_ary->wc_discrim)
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_array[be32_to_cpu(wr_ary->wc_nchunks)];
- else
- wr_ary = (struct rpcrdma_write_array *)
- &wr_ary->wc_nchunks;
-
- return (unsigned long) wr_ary - (unsigned long) rmsgp;
+ /* RPC-over-RDMA V1 replies never have a Read list. */
+ p += rpcrdma_fixed_maxsz + 1;
+
+ /* Skip Write list. */
+ while (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
+
+ /* Skip Reply chunk. */
+ if (*p++ != xdr_zero) {
+ nsegs = be32_to_cpup(p++);
+ p += nsegs * rpcrdma_segment_maxsz;
+ }
+
+ return (unsigned long)p - (unsigned long)rdma_resp;
}
void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
@@ -326,19 +255,3 @@ void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
seg->rs_offset = rs_offset;
seg->rs_length = cpu_to_be32(write_len);
}
-
-void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rdma_argp,
- struct rpcrdma_msg *rdma_resp,
- enum rpcrdma_proc rdma_type)
-{
- rdma_resp->rm_xid = rdma_argp->rm_xid;
- rdma_resp->rm_vers = rdma_argp->rm_vers;
- rdma_resp->rm_credit = cpu_to_be32(xprt->sc_max_requests);
- rdma_resp->rm_type = cpu_to_be32(rdma_type);
-
- /* Encode <nul> chunks lists */
- rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
- rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
- rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
-}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 172b537f8cfc..f7b2daf72a86 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -606,26 +606,24 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
dprintk("svcrdma: rqstp=%p\n", rqstp);
- spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+ spin_lock(&rdma_xprt->sc_rq_dto_lock);
if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
- ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
- spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ ctxt = list_first_entry(&rdma_xprt->sc_read_complete_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
rdma_read_complete(rqstp, ctxt);
goto complete;
} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
- ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma_xprt->sc_rq_dto_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
} else {
atomic_inc(&rdma_stat_rq_starve);
clear_bit(XPT_DATA, &xprt->xpt_flags);
ctxt = NULL;
}
- spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+ spin_unlock(&rdma_xprt->sc_rq_dto_lock);
if (!ctxt) {
/* This is the EAGAIN path. The svc_recv routine will
* return -EAGAIN, the nfsd thread will go to call into
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index ad4d286a83c5..515221b16d09 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -476,7 +476,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
/* Prepare the SGE for the RPCRDMA Header */
ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
- ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
+ ctxt->sge[0].length =
+ svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
ctxt->sge[0].length, DMA_TO_DEVICE);
@@ -559,12 +560,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
struct rpcrdma_msg *rdma_argp;
struct rpcrdma_msg *rdma_resp;
struct rpcrdma_write_array *wr_ary, *rp_ary;
- enum rpcrdma_proc reply_type;
int ret;
int inline_bytes;
struct page *res_page;
struct svc_rdma_req_map *vec;
u32 inv_rkey;
+ __be32 *p;
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
@@ -596,12 +597,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
if (!res_page)
goto err0;
rdma_resp = page_address(res_page);
- if (rp_ary)
- reply_type = RDMA_NOMSG;
- else
- reply_type = RDMA_MSG;
- svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
- rdma_resp, reply_type);
+
+ p = &rdma_resp->rm_xid;
+ *p++ = rdma_argp->rm_xid;
+ *p++ = rdma_argp->rm_vers;
+ *p++ = rdma->sc_fc_credits;
+ *p++ = rp_ary ? rdma_nomsg : rdma_msg;
+
+ /* Start with empty chunks */
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
+ *p = xdr_zero;
/* Send any write-chunk data and build resp write-list */
if (wr_ary) {
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 39652d390a9c..c13a5c35ce14 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -157,8 +157,7 @@ static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
ctxt = kmalloc(sizeof(*ctxt), flags);
if (ctxt) {
ctxt->xprt = xprt;
- INIT_LIST_HEAD(&ctxt->free);
- INIT_LIST_HEAD(&ctxt->dto_q);
+ INIT_LIST_HEAD(&ctxt->list);
}
return ctxt;
}
@@ -180,7 +179,7 @@ static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
dprintk("svcrdma: No memory for RDMA ctxt\n");
return false;
}
- list_add(&ctxt->free, &xprt->sc_ctxts);
+ list_add(&ctxt->list, &xprt->sc_ctxts);
}
return true;
}
@@ -189,15 +188,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
{
struct svc_rdma_op_ctxt *ctxt = NULL;
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used++;
if (list_empty(&xprt->sc_ctxts))
goto out_empty;
ctxt = list_first_entry(&xprt->sc_ctxts,
- struct svc_rdma_op_ctxt, free);
- list_del_init(&ctxt->free);
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
+ spin_unlock(&xprt->sc_ctxt_lock);
out:
ctxt->count = 0;
@@ -209,15 +208,15 @@ out_empty:
/* Either pre-allocation missed the mark, or send
* queue accounting is broken.
*/
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ spin_unlock(&xprt->sc_ctxt_lock);
ctxt = alloc_ctxt(xprt, GFP_NOIO);
if (ctxt)
goto out;
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used--;
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ spin_unlock(&xprt->sc_ctxt_lock);
WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
return NULL;
}
@@ -254,10 +253,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
for (i = 0; i < ctxt->count; i++)
put_page(ctxt->pages[i]);
- spin_lock_bh(&xprt->sc_ctxt_lock);
+ spin_lock(&xprt->sc_ctxt_lock);
xprt->sc_ctxt_used--;
- list_add(&ctxt->free, &xprt->sc_ctxts);
- spin_unlock_bh(&xprt->sc_ctxt_lock);
+ list_add(&ctxt->list, &xprt->sc_ctxts);
+ spin_unlock(&xprt->sc_ctxt_lock);
}
static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
@@ -266,8 +265,8 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
struct svc_rdma_op_ctxt *ctxt;
ctxt = list_first_entry(&xprt->sc_ctxts,
- struct svc_rdma_op_ctxt, free);
- list_del(&ctxt->free);
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
kfree(ctxt);
}
}
@@ -404,7 +403,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
/* All wc fields are now known to be valid */
ctxt->byte_len = wc->byte_len;
spin_lock(&xprt->sc_rq_dto_lock);
- list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+ list_add_tail(&ctxt->list, &xprt->sc_rq_dto_q);
spin_unlock(&xprt->sc_rq_dto_lock);
set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
@@ -525,7 +524,7 @@ void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
read_hdr = ctxt->read_hdr;
spin_lock(&xprt->sc_rq_dto_lock);
- list_add_tail(&read_hdr->dto_q,
+ list_add_tail(&read_hdr->list,
&xprt->sc_read_complete_q);
spin_unlock(&xprt->sc_rq_dto_lock);
@@ -557,7 +556,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
return NULL;
svc_xprt_init(&init_net, &svc_rdma_class, &cma_xprt->sc_xprt, serv);
INIT_LIST_HEAD(&cma_xprt->sc_accept_q);
- INIT_LIST_HEAD(&cma_xprt->sc_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
@@ -571,6 +569,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
spin_lock_init(&cma_xprt->sc_ctxt_lock);
spin_lock_init(&cma_xprt->sc_map_lock);
+ /*
+ * Note that this implies that the underlying transport support
+ * has some form of congestion control (see RFC 7530 section 3.1
+ * paragraph 2). For now, we assume that all supported RDMA
+ * transports are suitable here.
+ */
+ set_bit(XPT_CONG_CTRL, &cma_xprt->sc_xprt.xpt_flags);
+
if (listener)
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
@@ -923,14 +929,14 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
{
struct svc_rdma_fastreg_mr *frmr = NULL;
- spin_lock_bh(&rdma->sc_frmr_q_lock);
+ spin_lock(&rdma->sc_frmr_q_lock);
if (!list_empty(&rdma->sc_frmr_q)) {
frmr = list_entry(rdma->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
frmr->sg_nents = 0;
}
- spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ spin_unlock(&rdma->sc_frmr_q_lock);
if (frmr)
return frmr;
@@ -943,10 +949,10 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
if (frmr) {
ib_dma_unmap_sg(rdma->sc_cm_id->device,
frmr->sg, frmr->sg_nents, frmr->direction);
- spin_lock_bh(&rdma->sc_frmr_q_lock);
+ spin_lock(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
- spin_unlock_bh(&rdma->sc_frmr_q_lock);
+ spin_unlock(&rdma->sc_frmr_q_lock);
}
}
@@ -1002,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt->sc_max_req_size = svcrdma_max_req_size;
newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
svcrdma_max_requests);
+ newxprt->sc_fc_credits = cpu_to_be32(newxprt->sc_max_requests);
newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
svcrdma_max_bc_requests);
newxprt->sc_rq_depth = newxprt->sc_max_requests +
@@ -1027,13 +1034,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
goto errout;
}
newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
- 0, IB_POLL_SOFTIRQ);
+ 0, IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_sq_cq)) {
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
- 0, IB_POLL_SOFTIRQ);
+ 0, IB_POLL_WORKQUEUE);
if (IS_ERR(newxprt->sc_rq_cq)) {
dprintk("svcrdma: error creating RQ CQ for connect request\n");
goto errout;
@@ -1213,20 +1220,18 @@ static void __svc_rdma_free(struct work_struct *work)
*/
while (!list_empty(&rdma->sc_read_complete_q)) {
struct svc_rdma_op_ctxt *ctxt;
- ctxt = list_entry(rdma->sc_read_complete_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma->sc_read_complete_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
svc_rdma_put_context(ctxt, 1);
}
/* Destroy queued, but not processed recv completions */
while (!list_empty(&rdma->sc_rq_dto_q)) {
struct svc_rdma_op_ctxt *ctxt;
- ctxt = list_entry(rdma->sc_rq_dto_q.next,
- struct svc_rdma_op_ctxt,
- dto_q);
- list_del_init(&ctxt->dto_q);
+ ctxt = list_first_entry(&rdma->sc_rq_dto_q,
+ struct svc_rdma_op_ctxt, list);
+ list_del(&ctxt->list);
svc_rdma_put_context(ctxt, 1);
}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 534c178d2a7e..c717f5410776 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -67,7 +67,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
- int xprt_rdma_pad_optimize = 1;
+ int xprt_rdma_pad_optimize = 0;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -709,10 +709,6 @@ xprt_rdma_send_request(struct rpc_task *task)
return 0;
failed_marshal:
- dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
- __func__, rc);
- if (rc == -EIO)
- r_xprt->rx_stats.failed_marshal_count++;
if (rc != -ENOTCONN)
return rc;
drop_connection:
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 11d07748f699..81cd31acf690 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -54,6 +54,7 @@
#include <linux/sunrpc/svc_rdma.h>
#include <asm/bitops.h>
#include <linux/module.h> /* try_module_get()/module_put() */
+#include <rdma/ib_cm.h>
#include "xprt_rdma.h"
@@ -208,6 +209,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
/* Default settings for RPC-over-RDMA Version One */
r_xprt->rx_ia.ri_reminv_expected = false;
+ r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
@@ -215,6 +217,7 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
pmsg->cp_magic == rpcrdma_cmp_magic &&
pmsg->cp_version == RPCRDMA_CMP_VERSION) {
r_xprt->rx_ia.ri_reminv_expected = true;
+ r_xprt->rx_ia.ri_implicit_roundup = true;
rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
@@ -277,7 +280,14 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
connstate = -ENETDOWN;
goto connected;
case RDMA_CM_EVENT_REJECTED:
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+ pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n",
+ sap, rpc_get_port(sap), ia->ri_device->name,
+ rdma_reject_msg(id, event->status));
+#endif
connstate = -ECONNREFUSED;
+ if (event->status == IB_CM_REJ_STALE_CONN)
+ connstate = -EAGAIN;
goto connected;
case RDMA_CM_EVENT_DISCONNECTED:
connstate = -ECONNABORTED;
@@ -486,18 +496,19 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
*/
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
- struct rpcrdma_create_data_internal *cdata)
+ struct rpcrdma_create_data_internal *cdata)
{
struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
+ unsigned int max_qp_wr, max_sge;
struct ib_cq *sendcq, *recvcq;
- unsigned int max_qp_wr;
int rc;
- if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
- dprintk("RPC: %s: insufficient sge's available\n",
- __func__);
+ max_sge = min(ia->ri_device->attrs.max_sge, RPCRDMA_MAX_SEND_SGES);
+ if (max_sge < RPCRDMA_MIN_SEND_SGES) {
+ pr_warn("rpcrdma: HCA provides only %d send SGEs\n", max_sge);
return -ENOMEM;
}
+ ia->ri_max_send_sges = max_sge - RPCRDMA_MIN_SEND_SGES;
if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
dprintk("RPC: %s: insufficient wqe's available\n",
@@ -522,7 +533,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */
- ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
+ ep->rep_attr.cap.max_send_sge = max_sge;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -640,20 +651,21 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
+ struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
+ rx_ia);
struct rdma_cm_id *id, *old;
+ struct sockaddr *sap;
+ unsigned int extras;
int rc = 0;
- int retry_count = 0;
if (ep->rep_connected != 0) {
- struct rpcrdma_xprt *xprt;
retry:
dprintk("RPC: %s: reconnecting...\n", __func__);
rpcrdma_ep_disconnect(ep, ia);
- xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
- id = rpcrdma_create_id(xprt, ia,
- (struct sockaddr *)&xprt->rx_data.addr);
+ sap = (struct sockaddr *)&r_xprt->rx_data.addr;
+ id = rpcrdma_create_id(r_xprt, ia, sap);
if (IS_ERR(id)) {
rc = -EHOSTUNREACH;
goto out;
@@ -708,51 +720,18 @@ retry:
}
wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
-
- /*
- * Check state. A non-peer reject indicates no listener
- * (ECONNREFUSED), which may be a transient state. All
- * others indicate a transport condition which has already
- * undergone a best-effort.
- */
- if (ep->rep_connected == -ECONNREFUSED &&
- ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
- dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
- goto retry;
- }
if (ep->rep_connected <= 0) {
- /* Sometimes, the only way to reliably connect to remote
- * CMs is to use same nonzero values for ORD and IRD. */
- if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
- (ep->rep_remote_cma.responder_resources == 0 ||
- ep->rep_remote_cma.initiator_depth !=
- ep->rep_remote_cma.responder_resources)) {
- if (ep->rep_remote_cma.responder_resources == 0)
- ep->rep_remote_cma.responder_resources = 1;
- ep->rep_remote_cma.initiator_depth =
- ep->rep_remote_cma.responder_resources;
+ if (ep->rep_connected == -EAGAIN)
goto retry;
- }
rc = ep->rep_connected;
- } else {
- struct rpcrdma_xprt *r_xprt;
- unsigned int extras;
-
- dprintk("RPC: %s: connected\n", __func__);
-
- r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
- extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
-
- if (extras) {
- rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
- if (rc) {
- pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
- __func__, rc);
- rc = 0;
- }
- }
+ goto out;
}
+ dprintk("RPC: %s: connected\n", __func__);
+ extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
+ if (extras)
+ rpcrdma_ep_post_extra_recv(r_xprt, extras);
+
out:
if (rc)
ep->rep_connected = rc;
@@ -797,9 +776,7 @@ rpcrdma_mr_recovery_worker(struct work_struct *work)
spin_lock(&buf->rb_recovery_lock);
while (!list_empty(&buf->rb_stale_mrs)) {
- mw = list_first_entry(&buf->rb_stale_mrs,
- struct rpcrdma_mw, mw_list);
- list_del_init(&mw->mw_list);
+ mw = rpcrdma_pop_mw(&buf->rb_stale_mrs);
spin_unlock(&buf->rb_recovery_lock);
dprintk("RPC: %s: recovering MR %p\n", __func__, mw);
@@ -817,7 +794,7 @@ rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
spin_lock(&buf->rb_recovery_lock);
- list_add(&mw->mw_list, &buf->rb_stale_mrs);
+ rpcrdma_push_mw(mw, &buf->rb_stale_mrs);
spin_unlock(&buf->rb_recovery_lock);
schedule_delayed_work(&buf->rb_recovery_worker, 0);
@@ -1093,11 +1070,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_mw *mw = NULL;
spin_lock(&buf->rb_mwlock);
- if (!list_empty(&buf->rb_mws)) {
- mw = list_first_entry(&buf->rb_mws,
- struct rpcrdma_mw, mw_list);
- list_del_init(&mw->mw_list);
- }
+ if (!list_empty(&buf->rb_mws))
+ mw = rpcrdma_pop_mw(&buf->rb_mws);
spin_unlock(&buf->rb_mwlock);
if (!mw)
@@ -1120,7 +1094,7 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
spin_lock(&buf->rb_mwlock);
- list_add_tail(&mw->mw_list, &buf->rb_mws);
+ rpcrdma_push_mw(mw, &buf->rb_mws);
spin_unlock(&buf->rb_mwlock);
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index e35efd4ac1e4..171a35116de9 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -74,7 +74,9 @@ struct rpcrdma_ia {
unsigned int ri_max_frmr_depth;
unsigned int ri_max_inline_write;
unsigned int ri_max_inline_read;
+ unsigned int ri_max_send_sges;
bool ri_reminv_expected;
+ bool ri_implicit_roundup;
enum ib_mr_type ri_mrtype;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
@@ -303,15 +305,19 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
char *mr_offset; /* kva if no page, else offset */
};
-/* Reserve enough Send SGEs to send a maximum size inline request:
+/* The Send SGE array is provisioned to send a maximum size
+ * inline request:
* - RPC-over-RDMA header
* - xdr_buf head iovec
- * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - RPCRDMA_MAX_INLINE bytes, in pages
* - xdr_buf tail iovec
+ *
+ * The actual number of array elements consumed by each RPC
+ * depends on the device's max_sge limit.
*/
enum {
- RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
- RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+ RPCRDMA_MIN_SEND_SGES = 3,
+ RPCRDMA_MAX_PAGE_SGES = RPCRDMA_MAX_INLINE >> PAGE_SHIFT,
RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
};
@@ -348,6 +354,22 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
return rqst->rq_xprtdata;
}
+static inline void
+rpcrdma_push_mw(struct rpcrdma_mw *mw, struct list_head *list)
+{
+ list_add_tail(&mw->mw_list, list);
+}
+
+static inline struct rpcrdma_mw *
+rpcrdma_pop_mw(struct list_head *list)
+{
+ struct rpcrdma_mw *mw;
+
+ mw = list_first_entry(list, struct rpcrdma_mw, mw_list);
+ list_del(&mw->mw_list);
+ return mw;
+}
+
/*
* struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
* inline requests/replies, and client/server credits.
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index af392d9b9cec..16aff8ddc16f 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -52,6 +52,8 @@
#include "sunrpc.h"
static void xs_close(struct rpc_xprt *xprt);
+static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
+ struct socket *sock);
/*
* xprtsock tunables
@@ -666,6 +668,9 @@ static int xs_tcp_send_request(struct rpc_task *task)
if (task->tk_flags & RPC_TASK_SENT)
zerocopy = false;
+ if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state))
+ xs_tcp_set_socket_timeouts(xprt, transport->sock);
+
/* Continue transmitting the packet/record. We must be careful
* to cope with writespace callbacks arriving _after_ we have
* called sendmsg(). */
@@ -1188,7 +1193,7 @@ static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_r
char *p;
len = sizeof(transport->tcp_xid) - transport->tcp_offset;
- dprintk("RPC: reading XID (%Zu bytes)\n", len);
+ dprintk("RPC: reading XID (%zu bytes)\n", len);
p = ((char *) &transport->tcp_xid) + transport->tcp_offset;
used = xdr_skb_read_bits(desc, p, len);
transport->tcp_offset += used;
@@ -1219,7 +1224,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport,
*/
offset = transport->tcp_offset - sizeof(transport->tcp_xid);
len = sizeof(transport->tcp_calldir) - offset;
- dprintk("RPC: reading CALL/REPLY flag (%Zu bytes)\n", len);
+ dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len);
p = ((char *) &transport->tcp_calldir) + offset;
used = xdr_skb_read_bits(desc, p, len);
transport->tcp_offset += used;
@@ -1310,7 +1315,7 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
return;
}
- dprintk("RPC: XID %08x read %Zd bytes\n",
+ dprintk("RPC: XID %08x read %zd bytes\n",
ntohl(transport->tcp_xid), r);
dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
"tcp_reclen = %u\n", xprt, transport->tcp_copied,
@@ -1456,7 +1461,7 @@ static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_s
desc->count -= len;
desc->offset += len;
transport->tcp_offset += len;
- dprintk("RPC: discarded %Zu bytes\n", len);
+ dprintk("RPC: discarded %zu bytes\n", len);
xs_tcp_check_fraghdr(transport);
}
@@ -1734,7 +1739,9 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t
*/
static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task)
{
+ spin_lock_bh(&xprt->transport_lock);
xprt_adjust_cwnd(xprt, task, -ETIMEDOUT);
+ spin_unlock_bh(&xprt->transport_lock);
}
static unsigned short xs_get_random_port(void)
@@ -2235,6 +2242,66 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt)
xs_reset_transport(transport);
}
+static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
+ struct socket *sock)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ unsigned int keepidle;
+ unsigned int keepcnt;
+ unsigned int opt_on = 1;
+ unsigned int timeo;
+
+ spin_lock_bh(&xprt->transport_lock);
+ keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
+ keepcnt = xprt->timeout->to_retries + 1;
+ timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
+ (xprt->timeout->to_retries + 1);
+ clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
+ spin_unlock_bh(&xprt->transport_lock);
+
+ /* TCP Keepalive options */
+ kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&opt_on, sizeof(opt_on));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+ (char *)&keepidle, sizeof(keepidle));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+ (char *)&keepidle, sizeof(keepidle));
+ kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+ (char *)&keepcnt, sizeof(keepcnt));
+
+ /* TCP user timeout (see RFC5482) */
+ kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+ (char *)&timeo, sizeof(timeo));
+}
+
+static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt,
+ unsigned long connect_timeout,
+ unsigned long reconnect_timeout)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ struct rpc_timeout to;
+ unsigned long initval;
+
+ spin_lock_bh(&xprt->transport_lock);
+ if (reconnect_timeout < xprt->max_reconnect_timeout)
+ xprt->max_reconnect_timeout = reconnect_timeout;
+ if (connect_timeout < xprt->connect_timeout) {
+ memcpy(&to, xprt->timeout, sizeof(to));
+ initval = DIV_ROUND_UP(connect_timeout, to.to_retries + 1);
+ /* Arbitrary lower limit */
+ if (initval < XS_TCP_INIT_REEST_TO << 1)
+ initval = XS_TCP_INIT_REEST_TO << 1;
+ to.to_initval = initval;
+ to.to_maxval = initval;
+ memcpy(&transport->tcp_timeout, &to,
+ sizeof(transport->tcp_timeout));
+ xprt->timeout = &transport->tcp_timeout;
+ xprt->connect_timeout = connect_timeout;
+ }
+ set_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state);
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -2242,22 +2309,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
if (!transport->inet) {
struct sock *sk = sock->sk;
- unsigned int keepidle = xprt->timeout->to_initval / HZ;
- unsigned int keepcnt = xprt->timeout->to_retries + 1;
- unsigned int opt_on = 1;
- unsigned int timeo;
unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC;
- /* TCP Keepalive options */
- kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
- (char *)&opt_on, sizeof(opt_on));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
- (char *)&keepidle, sizeof(keepidle));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
- (char *)&keepidle, sizeof(keepidle));
- kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
- (char *)&keepcnt, sizeof(keepcnt));
-
/* Avoid temporary address, they are bad for long-lived
* connections such as NFS mounts.
* RFC4941, section 3.6 suggests that:
@@ -2268,11 +2321,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES,
(char *)&addr_pref, sizeof(addr_pref));
- /* TCP user timeout (see RFC5482) */
- timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
- (xprt->timeout->to_retries + 1);
- kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
- (char *)&timeo, sizeof(timeo));
+ xs_tcp_set_socket_timeouts(xprt, sock);
write_lock_bh(&sk->sk_callback_lock);
@@ -2721,6 +2770,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
.set_retrans_timeout = xprt_set_retrans_timeout_def,
.close = xs_tcp_shutdown,
.destroy = xs_destroy,
+ .set_connect_timeout = xs_tcp_set_connect_timeout,
.print_stats = xs_tcp_print_stats,
.enable_swap = xs_enable_swap,
.disable_swap = xs_disable_swap,
@@ -3007,6 +3057,8 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->timeout = &xs_tcp_default_timeout;
xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
+ xprt->connect_timeout = xprt->timeout->to_initval *
+ (xprt->timeout->to_retries + 1);
INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
@@ -3209,7 +3261,9 @@ static int param_set_uint_minmax(const char *val,
if (!val)
return -EINVAL;
ret = kstrtouint(val, 0, &num);
- if (ret == -EINVAL || num < min || num > max)
+ if (ret)
+ return ret;
+ if (num < min || num > max)
return -EINVAL;
*((unsigned int *)kp->arg) = num;
return 0;