From f38a5181d9f3e004b1f50f9d7e1f2a8492ce240a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2013 14:30:24 -0700 Subject: ceph: Convert to immutable biovecs Now that we've got a mechanism for immutable biovecs - bi_iter.bi_bvec_done - we need to convert drivers to use primitives that respect it instead of using the bvec array directly. Signed-off-by: Kent Overstreet Cc: Jens Axboe Cc: Sage Weil Cc: ceph-devel@vger.kernel.org --- net/ceph/messenger.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4a5df7b1cc9f..18c039b95c22 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -777,13 +777,12 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, bio = data->bio; BUG_ON(!bio); - BUG_ON(!bio->bi_vcnt); cursor->resid = min(length, data->bio_length); cursor->bio = bio; - cursor->vector_index = 0; - cursor->vector_offset = 0; - cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; + cursor->bvec_iter = bio->bi_iter; + cursor->last_piece = + cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, @@ -792,71 +791,63 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, { struct ceph_msg_data *data = cursor->data; struct bio *bio; - struct bio_vec *bio_vec; - unsigned int index; + struct bio_vec bio_vec; BUG_ON(data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); - index = cursor->vector_index; - BUG_ON(index >= (unsigned int) bio->bi_vcnt); + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); - bio_vec = &bio->bi_io_vec[index]; - BUG_ON(cursor->vector_offset >= bio_vec->bv_len); - *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); + *page_offset = (size_t) bio_vec.bv_offset; BUG_ON(*page_offset >= PAGE_SIZE); if (cursor->last_piece) /* pagelist offset is always 0 */ *length = cursor->resid; else - *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + *length = (size_t) bio_vec.bv_len; BUG_ON(*length > cursor->resid); BUG_ON(*page_offset + *length > PAGE_SIZE); - return bio_vec->bv_page; + return bio_vec.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct bio *bio; - struct bio_vec *bio_vec; - unsigned int index; + struct bio_vec bio_vec; BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); bio = cursor->bio; BUG_ON(!bio); - index = cursor->vector_index; - BUG_ON(index >= (unsigned int) bio->bi_vcnt); - bio_vec = &bio->bi_io_vec[index]; + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); /* Advance the cursor offset */ BUG_ON(cursor->resid < bytes); cursor->resid -= bytes; - cursor->vector_offset += bytes; - if (cursor->vector_offset < bio_vec->bv_len) + + bio_advance_iter(bio, &cursor->bvec_iter, bytes); + + if (bytes < bio_vec.bv_len) return false; /* more bytes to process in this segment */ - BUG_ON(cursor->vector_offset != bio_vec->bv_len); /* Move on to the next segment, and possibly the next bio */ - if (++index == (unsigned int) bio->bi_vcnt) { + if (!cursor->bvec_iter.bi_size) { bio = bio->bi_next; - index = 0; + cursor->bvec_iter = bio->bi_iter; } cursor->bio = bio; - cursor->vector_index = index; - cursor->vector_offset = 0; if (!cursor->last_piece) { BUG_ON(!cursor->resid); BUG_ON(!bio); /* A short read is OK, so use <= rather than == */ - if (cursor->resid <= bio->bi_io_vec[index].bv_len) + if 
(cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) cursor->last_piece = true; } -- cgit From 4b9a445e3eeb8bd9278b1ae51c1b3a651e370cd6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 14 Nov 2013 07:25:17 -0500 Subject: sunrpc: create a new dummy pipe for gssd to hold open rpc.gssd will naturally hold open any pipe named */clnt*/gssd that shows up under rpc_pipefs. That behavior gives us a reliable mechanism to tell whether it's actually running or not. Create a new toplevel "gssd" directory in rpc_pipefs when it's mounted. Under that directory create another directory called "clntXX", and then within that a pipe called "gssd". We'll never send an upcall along that pipe, and any downcall written to it will just return -EINVAL. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/sunrpc/rpc_pipe_fs.h | 3 +- net/sunrpc/netns.h | 1 + net/sunrpc/rpc_pipe.c | 93 ++++++++++++++++++++++++++++++++++++-- net/sunrpc/sunrpc_syms.c | 8 +++- 4 files changed, 100 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index a353e0300b54..85f13424647c 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -84,7 +84,8 @@ enum { extern struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name); -extern void rpc_pipefs_init_net(struct net *net); +extern int rpc_pipefs_init_net(struct net *net); +extern void rpc_pipefs_exit_net(struct net *net); extern struct super_block *rpc_get_sb_net(const struct net *net); extern void rpc_put_sb_net(const struct net *net); diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 779742cfc1ff..8a8e841d1547 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -14,6 +14,7 @@ struct sunrpc_net { struct cache_detail *rsi_cache; struct super_block *pipefs_sb; + struct rpc_pipe *gssd_dummy; struct mutex pipefs_sb_lock; struct list_head all_clients; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index bf04b30a788a..c23458b464c4 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -38,7 +38,7 @@ #define NET_NAME(net) ((net == &init_net) ? 
" (init_net)" : "") static struct file_system_type rpc_pipe_fs_type; - +static const struct rpc_pipe_ops gssd_dummy_pipe_ops; static struct kmem_cache *rpc_inode_cachep __read_mostly; @@ -1159,6 +1159,7 @@ enum { RPCAUTH_nfsd4_cb, RPCAUTH_cache, RPCAUTH_nfsd, + RPCAUTH_gssd, RPCAUTH_RootEOF }; @@ -1195,6 +1196,10 @@ static const struct rpc_filelist files[] = { .name = "nfsd", .mode = S_IFDIR | S_IRUGO | S_IXUGO, }, + [RPCAUTH_gssd] = { + .name = "gssd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, }; /* @@ -1208,13 +1213,25 @@ struct dentry *rpc_d_lookup_sb(const struct super_block *sb, } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); -void rpc_pipefs_init_net(struct net *net) +int rpc_pipefs_init_net(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + sn->gssd_dummy = rpc_mkpipe_data(&gssd_dummy_pipe_ops, 0); + if (IS_ERR(sn->gssd_dummy)) + return PTR_ERR(sn->gssd_dummy); + mutex_init(&sn->pipefs_sb_lock); sn->gssd_running = 1; sn->pipe_version = -1; + return 0; +} + +void rpc_pipefs_exit_net(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + rpc_destroy_pipe_data(sn->gssd_dummy); } /* @@ -1244,11 +1261,73 @@ void rpc_put_sb_net(const struct net *net) } EXPORT_SYMBOL_GPL(rpc_put_sb_net); +static const struct rpc_filelist gssd_dummy_clnt_dir[] = { + [0] = { + .name = "clntXX", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, +}; + +static ssize_t +dummy_downcall(struct file *filp, const char __user *src, size_t len) +{ + return -EINVAL; +} + +static const struct rpc_pipe_ops gssd_dummy_pipe_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = dummy_downcall, +}; + +/** + * rpc_gssd_dummy_populate - create a dummy gssd pipe + * @root: root of the rpc_pipefs filesystem + * @pipe_data: pipe data created when netns is initialized + * + * Create a dummy set of directories and a pipe that gssd can hold open to + * indicate that it is up and running. 
+ */ +static struct dentry * +rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) +{ + int ret = 0; + struct dentry *gssd_dentry; + struct dentry *clnt_dentry = NULL; + struct dentry *pipe_dentry = NULL; + struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name, + strlen(files[RPCAUTH_gssd].name)); + + /* We should never get this far if "gssd" doesn't exist */ + gssd_dentry = d_hash_and_lookup(root, &q); + if (!gssd_dentry) + return ERR_PTR(-ENOENT); + + ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL); + if (ret) { + pipe_dentry = ERR_PTR(ret); + goto out; + } + + q.name = gssd_dummy_clnt_dir[0].name; + q.len = strlen(gssd_dummy_clnt_dir[0].name); + clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); + if (!clnt_dentry) { + pipe_dentry = ERR_PTR(-ENOENT); + goto out; + } + + pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); +out: + dput(clnt_dentry); + dput(gssd_dentry); + return pipe_dentry; +} + static int rpc_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct dentry *root; + struct dentry *root, *gssd_dentry; struct net *net = data; struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int err; @@ -1266,6 +1345,13 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; + + gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy); + if (IS_ERR(gssd_dentry)) { + __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF); + return PTR_ERR(gssd_dentry); + } + dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", net, NET_NAME(net)); mutex_lock(&sn->pipefs_sb_lock); @@ -1280,6 +1366,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return 0; err_depopulate: + dput(gssd_dentry); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 3d6498af9adc..cd30120de9e4 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -44,12 +44,17 @@ static __net_init int sunrpc_init_net(struct net *net) if (err) goto err_unixgid; - rpc_pipefs_init_net(net); + err = rpc_pipefs_init_net(net); + if (err) + goto err_pipefs; + INIT_LIST_HEAD(&sn->all_clients); spin_lock_init(&sn->rpc_client_lock); spin_lock_init(&sn->rpcb_clnt_lock); return 0; +err_pipefs: + unix_gid_cache_destroy(net); err_unixgid: ip_map_cache_destroy(net); err_ipmap: @@ -60,6 +65,7 @@ err_proc: static __net_exit void sunrpc_exit_net(struct net *net) { + rpc_pipefs_exit_net(net); unix_gid_cache_destroy(net); ip_map_cache_destroy(net); rpc_proc_exit(net); -- cgit From 89f842435c630f8426f414e6030bc2ffea0d6f81 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 14 Nov 2013 07:25:18 -0500 Subject: sunrpc: replace sunrpc_net->gssd_running flag with a more reliable check Now that we have a more reliable method to tell if gssd is running, we can replace the sn->gssd_running flag with a function that will query to see if it's up and running. There's also no need to attempt an upcall that we know will fail, so just return -EACCES if gssd isn't running. Finally, fix the warn_gss() message not to claim that that the upcall timed out since we don't necesarily perform one now when gssd isn't running, and remove the extraneous newline from the message. 
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/sunrpc/rpc_pipe_fs.h | 2 ++ net/sunrpc/auth_gss/auth_gss.c | 17 +++++++---------- net/sunrpc/netns.h | 2 -- net/sunrpc/rpc_pipe.c | 14 ++++++++++---- 4 files changed, 19 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 85f13424647c..7f490bef9e99 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -131,5 +131,7 @@ extern int rpc_unlink(struct dentry *); extern int register_rpc_pipefs(void); extern void unregister_rpc_pipefs(void); +extern bool gssd_running(struct net *net); + #endif #endif diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 42fdfc634e56..0a2aee060f9f 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -536,8 +536,7 @@ static void warn_gssd(void) unsigned long now = jiffies; if (time_after(now, ratelimit)) { - printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" - "Please check user daemon is running.\n"); + pr_warn("RPC: AUTH_GSS upcall failed. Please check user daemon is running.\n"); ratelimit = now + 15*HZ; } } @@ -600,7 +599,6 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; - unsigned long timeout; DEFINE_WAIT(wait); int err; @@ -608,17 +606,16 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) __func__, from_kuid(&init_user_ns, cred->cr_uid)); retry: err = 0; - /* Default timeout is 15s unless we know that gssd is not running */ - timeout = 15 * HZ; - if (!sn->gssd_running) - timeout = HZ >> 2; + /* if gssd is down, just skip upcalling altogether */ + if (!gssd_running(net)) { + warn_gssd(); + return -EACCES; + } gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, - sn->pipe_version >= 0, timeout); + sn->pipe_version >= 0, 15 * HZ); if (sn->pipe_version < 0) { - if (err == 0) - sn->gssd_running = 0; warn_gssd(); err = -EACCES; } diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 8a8e841d1547..94e506f9d72b 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -33,8 +33,6 @@ struct sunrpc_net { int pipe_version; atomic_t pipe_users; struct proc_dir_entry *use_gssp_proc; - - unsigned int gssd_running; }; extern int sunrpc_net_id; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index c23458b464c4..5cd7ad1225a3 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -216,14 +216,11 @@ rpc_destroy_inode(struct inode *inode) static int rpc_pipe_open(struct inode *inode, struct file *filp) { - struct net *net = inode->i_sb->s_fs_info; - struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; int first_open; int res = -ENXIO; mutex_lock(&inode->i_mutex); - sn->gssd_running = 1; pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -1222,7 +1219,6 @@ int rpc_pipefs_init_net(struct net *net) return PTR_ERR(sn->gssd_dummy); mutex_init(&sn->pipefs_sb_lock); - sn->gssd_running = 1; sn->pipe_version = -1; return 0; } @@ -1376,6 +1372,16 @@ err_depopulate: return err; } +bool +gssd_running(struct net *net) +{ + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + struct rpc_pipe *pipe = sn->gssd_dummy; + + return pipe->nreaders || pipe->nwriters; +} +EXPORT_SYMBOL_GPL(gssd_running); + static struct dentry * 
rpc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) -- cgit From 3396f92f8be606ea485b0a82d4e7749a448b013b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Dec 2013 07:33:49 -0500 Subject: rpc_pipe: remove the clntXX dir if creating the pipe fails In the event that we create the gssd/clntXX dir, but the pipe creation subsequently fails, then we should remove the clntXX dir before returning. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 5cd7ad1225a3..0b74c61db7b4 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1313,6 +1313,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) } pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); + if (IS_ERR(pipe_dentry)) + __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); out: dput(clnt_dentry); dput(gssd_dentry); -- cgit From e2f0c83a9de331d9352185ca3642616c13127539 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Dec 2013 07:34:44 -0500 Subject: sunrpc: add an "info" file for the dummy gssd pipe rpc.gssd expects to see an "info" file in each clntXX dir. Since adding the dummy gssd pipe, users that run rpc.gssd see a lot of these messages spamming the logs: rpc.gssd[508]: ERROR: can't open /var/lib/nfs/rpc_pipefs/gssd/clntXX/info: No such file or directory rpc.gssd[508]: ERROR: failed to read service info Add a dummy gssd/clntXX/info file to help silence these messages. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 0b74c61db7b4..5d973b25b5b0 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -1275,6 +1276,44 @@ static const struct rpc_pipe_ops gssd_dummy_pipe_ops = { .downcall = dummy_downcall, }; +/* + * Here we present a bogus "info" file to keep rpc.gssd happy. We don't expect + * that it will ever use this info to handle an upcall, but rpc.gssd expects + * that this file will be there and have a certain format. 
+ */ +static int +rpc_show_dummy_info(struct seq_file *m, void *v) +{ + seq_printf(m, "RPC server: %s\n", utsname()->nodename); + seq_printf(m, "service: foo (1) version 0\n"); + seq_printf(m, "address: 127.0.0.1\n"); + seq_printf(m, "protocol: tcp\n"); + seq_printf(m, "port: 0\n"); + return 0; +} + +static int +rpc_dummy_info_open(struct inode *inode, struct file *file) +{ + return single_open(file, rpc_show_dummy_info, NULL); +} + +static const struct file_operations rpc_dummy_info_operations = { + .owner = THIS_MODULE, + .open = rpc_dummy_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct rpc_filelist gssd_dummy_info_file[] = { + [0] = { + .name = "info", + .i_fop = &rpc_dummy_info_operations, + .mode = S_IFREG | S_IRUSR, + }, +}; + /** * rpc_gssd_dummy_populate - create a dummy gssd pipe * @root: root of the rpc_pipefs filesystem @@ -1312,9 +1351,18 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) goto out; } + ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL); + if (ret) { + __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); + pipe_dentry = ERR_PTR(ret); + goto out; + } + pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data); - if (IS_ERR(pipe_dentry)) + if (IS_ERR(pipe_dentry)) { + __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1); __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); + } out: dput(clnt_dentry); dput(gssd_dentry); -- cgit From 23e66ba97127ff3b064d4c6c5138aa34eafc492f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 9 Dec 2013 09:38:00 -0500 Subject: rpc_pipe: fix cleanup of dummy gssd directory when notification fails Currently, it could leak dentry references in some cases. Make sure we clean up properly. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 5d973b25b5b0..b18554898562 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1369,6 +1369,18 @@ out: return pipe_dentry; } +static void +rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry) +{ + struct dentry *clnt_dir = pipe_dentry->d_parent; + struct dentry *gssd_dir = clnt_dir->d_parent; + + __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry); + __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1); + __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1); + dput(pipe_dentry); +} + static int rpc_fill_super(struct super_block *sb, void *data, int silent) { @@ -1412,7 +1424,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return 0; err_depopulate: - dput(gssd_dentry); + rpc_gssd_dummy_depopulate(gssd_dentry); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); -- cgit From 28303ca3090c0aa0dbbb72714c51fceb4b939f6d Mon Sep 17 00:00:00 2001 From: Weng Meiling Date: Sat, 30 Nov 2013 17:56:44 +0800 Subject: sunrpc: fix some typos Signed-off-by: Weng Meiling Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/xprtsock.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6eecfc2e4f98..b631642318c5 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -368,7 +368,7 @@ struct svc_program { struct svc_program * pg_next; /* other programs (same xprt) */ u32 pg_prog; /* program number */ unsigned int pg_lovers; /* lowest version */ - unsigned int pg_hivers; /* lowest version */ + unsigned int pg_hivers; /* highest version */ unsigned int pg_nvers; /* number of versions */ struct svc_version ** pg_vers; /* version array */ char * pg_name; /* service name */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index dd9d295813cf..0f4f39174a70 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2932,10 +2932,9 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) /* * Once we've associated a backchannel xprt with a connection, - * we want to keep it around as long as long as the connection - * lasts, in case we need to start using it for a backchannel - * again; this reference won't be dropped until bc_xprt is - * destroyed. + * we want to keep it around as long as the connection lasts, + * in case we need to start using it for a backchannel again; + * this reference won't be dropped until bc_xprt is destroyed. */ xprt_get(xprt); args->bc_xprt->xpt_bc_xprt = xprt; -- cgit From 056785ea51a657f55953f6b0195baac2cceb1f16 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 12 Dec 2013 15:49:21 +0200 Subject: net/sunrpc/cache: simplify code by using hex_pack_byte() hex_pack_byte() is a fast way to convert a byte in its ASCII representation. We may use it instead of custom approach. Signed-off-by: Andy Shevchenko Signed-off-by: J. 
Bruce Fields --- net/sunrpc/cache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index a72de074172d..0877db0787b1 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1111,9 +1111,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen) *bp++ = 'x'; len -= 2; while (blen && len >= 2) { - unsigned char c = *buf++; - *bp++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); - *bp++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + bp = hex_byte_pack(bp, *buf++); len -= 2; blen--; } -- cgit From 37c89bde5d402c25211a9e31e3166067f85aa31b Mon Sep 17 00:00:00 2001 From: Li Wang Date: Wed, 27 Nov 2013 22:28:14 +0800 Subject: ceph: Add necessary clean up if invalid reply received in handle_reply() Wake up possible waiters, invoke the call back if any, unregister the request Signed-off-by: Li Wang Signed-off-by: Yunchuan Wen Signed-off-by: Sage Weil --- net/ceph/osd_client.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2b4b32aaa893..a17eaae820f8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1581,6 +1581,13 @@ done: return; bad_put: + req->r_result = -EIO; + __unregister_request(osdc, req); + if (req->r_callback) + req->r_callback(req, msg); + else + complete_all(&req->r_completion); + complete_request(req); ceph_osdc_put_request(req); bad_mutex: mutex_unlock(&osdc->request_mutex); -- cgit From d29adb34a94715174c88ca93e8aba955850c9bde Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 2 Dec 2013 19:11:48 -0800 Subject: libceph: block I/O when PAUSE or FULL osd map flags are set The PAUSEWR and PAUSERD flags are meant to stop the cluster from processing writes and reads, respectively. The FULL flag is set when the cluster determines that it is out of space, and will no longer process writes. PAUSEWR and PAUSERD are purely client-side settings already implemented in userspace clients. The osd does nothing special with these flags. When the FULL flag is set, however, the osd responds to all writes with -ENOSPC. For cephfs, this makes sense, but for rbd the block layer translates this into EIO. If a cluster goes from full to non-full quickly, a filesystem on top of rbd will not behave well, since some writes succeed while others get EIO. Fix this by blocking any writes when the FULL flag is set in the osd client. This is the same strategy used by userspace, so apply it by default. A follow-on patch makes this configurable. __map_request() is called to re-target osd requests in case the available osds changed. Add a paused field to a ceph_osd_request, and set it whenever an appropriate osd map flag is set. Avoid queueing paused requests in __map_request(), but force them to be resent if they become unpaused. Also subscribe to the next osd map from the monitor if any of these flags are set, so paused requests can be unblocked as soon as possible. 
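The policy described above is a pure client-side table: reads are held only while PAUSERD is set, writes while either PAUSEWR or FULL is set. A stand-alone sketch of that decision follows; the flag values and the test harness are invented for illustration and are not the kernel definitions, which live in the ceph osdmap headers.

#include <stdbool.h>
#include <stdio.h>

/* Invented flag values for illustration only. */
#define PAUSERD 0x1
#define PAUSEWR 0x2
#define FULL    0x4

/* Reads are paused by PAUSERD; writes are paused by PAUSEWR or FULL. */
static bool should_pause(unsigned map_flags, bool is_write)
{
	if (is_write)
		return map_flags & (PAUSEWR | FULL);
	return map_flags & PAUSERD;
}

int main(void)
{
	printf("read,  FULL set:    %d\n", should_pause(FULL, false));    /* 0: reads still flow */
	printf("write, FULL set:    %d\n", should_pause(FULL, true));     /* 1: held until not full */
	printf("write, PAUSEWR set: %d\n", should_pause(PAUSEWR, true));  /* 1 */
	printf("read,  PAUSERD set: %d\n", should_pause(PAUSERD, false)); /* 1 */
	return 0;
}
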
Fixes: http://tracker.ceph.com/issues/6079 Reviewed-by: Sage Weil Signed-off-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 29 +++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8f47625a0661..4fb6a8938957 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -138,6 +138,7 @@ struct ceph_osd_request { __le64 *r_request_pool; void *r_request_pgid; __le32 *r_request_attempts; + bool r_paused; struct ceph_eversion *r_request_reassert_version; int r_result; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a17eaae820f8..1ad9866dc707 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1231,6 +1231,22 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_set_request_linger); +/* + * Returns whether a request should be blocked from being sent + * based on the current osdmap and osd_client settings. + * + * Caller should hold map_sem for read. + */ +static bool __req_should_be_paused(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || + (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + /* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is @@ -1248,6 +1264,7 @@ static int __map_request(struct ceph_osd_client *osdc, int acting[CEPH_PG_MAX_SIZE]; int o = -1, num = 0; int err; + bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, @@ -1264,12 +1281,18 @@ static int __map_request(struct ceph_osd_client *osdc, num = err; } + was_paused = req->r_paused; + req->r_paused = __req_should_be_paused(osdc, req); + if (was_paused && !req->r_paused) + force_resend = 1; + if ((!force_resend && req->r_osd && req->r_osd->o_osd == o && req->r_sent >= req->r_osd->o_incarnation && req->r_num_pg_osds == num && memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || - (req->r_osd == NULL && o == -1)) + (req->r_osd == NULL && o == -1) || + req->r_paused) return 0; /* no change */ dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", @@ -1811,7 +1834,9 @@ done: * we find out when we are no longer full and stop returning * ENOSPC. */ - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) ceph_monc_request_next_osdmap(&osdc->client->monc); mutex_lock(&osdc->request_mutex); -- cgit From 9a1ea2dbff11547a8e664f143c1ffefc586a577a Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Tue, 10 Dec 2013 09:35:13 -0800 Subject: libceph: resend all writes after the osdmap loses the full flag With the current full handling, there is a race between osds and clients getting the first map marked full. If the osd wins, it will return -ENOSPC to any writes, but the client may already have writes in flight. This results in the client getting the error and propagating it up the stack. For rbd, the block layer turns this into EIO, which can cause corruption in filesystems above it. 
To avoid this race, osds are being changed to drop writes that came from clients with an osdmap older than the last osdmap marked full. In order for this to work, clients must resend all writes after they encounter a full -> not full transition in the osdmap. osds will wait for an updated map instead of processing a request from a client with a newer map, so resent writes will not be dropped by the osd unless there is another not full -> full transition. This approach requires both osds and clients to be fixed to avoid the race. Old clients talking to osds with this fix may hang instead of returning EIO and potentially corrupting an fs. New clients talking to old osds have the same behavior as before if they encounter this race. Fixes: http://tracker.ceph.com/issues/6938 Reviewed-by: Sage Weil Signed-off-by: Josh Durgin --- net/ceph/osd_client.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1ad9866dc707..9f1993582ff7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1643,14 +1643,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) * * Caller should hold map_sem for read. */ -static void kick_requests(struct ceph_osd_client *osdc, int force_resend) +static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, + bool force_resend_writes) { struct ceph_osd_request *req, *nreq; struct rb_node *p; int needmap = 0; int err; + bool force_resend_req; - dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); + dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", + force_resend_writes ? " (force resend writes)" : ""); mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; ) { req = rb_entry(p, struct ceph_osd_request, r_node); @@ -1675,7 +1678,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) continue; } - err = __map_request(osdc, req, force_resend); + force_resend_req = force_resend || + (force_resend_writes && + req->r_flags & CEPH_OSD_FLAG_WRITE); + err = __map_request(osdc, req, force_resend_req); if (err < 0) continue; /* error */ if (req->r_osd == NULL) { @@ -1695,7 +1701,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) r_linger_item) { dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - err = __map_request(osdc, req, force_resend); + err = __map_request(osdc, req, + force_resend || force_resend_writes); dout("__map_request returned %d\n", err); if (err == 0) continue; /* no change and no osd was specified */ @@ -1737,6 +1744,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) struct ceph_osdmap *newmap = NULL, *oldmap; int err; struct ceph_fsid fsid; + bool was_full; dout("handle_map have %u\n", osdc->osdmap ? 
osdc->osdmap->epoch : 0); p = msg->front.iov_base; @@ -1750,6 +1758,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) down_write(&osdc->map_sem); + was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + /* incremental maps */ ceph_decode_32_safe(&p, end, nr_maps, bad); dout(" %d inc maps\n", nr_maps); @@ -1774,7 +1784,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ceph_osdmap_destroy(osdc->osdmap); osdc->osdmap = newmap; } - kick_requests(osdc, 0); + was_full = was_full || + ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_FULL); + kick_requests(osdc, 0, was_full); } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); @@ -1817,7 +1830,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) skipped_map = 1; ceph_osdmap_destroy(oldmap); } - kick_requests(osdc, skipped_map); + was_full = was_full || + ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_FULL); + kick_requests(osdc, skipped_map, was_full); } p += maplen; nr_maps--; -- cgit From 12b4629a9fb80fecaebadc217b13b8776ed8dbef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:23 +0200 Subject: libceph: all features fields must be u64 In preparation for ceph_features.h update, change all features fields from unsigned int/u32 to u64. (ceph.git has ~40 feature bits at this point.) Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 14 +++++++------- fs/ceph/super.c | 4 ++-- include/linux/ceph/libceph.h | 8 ++++---- include/linux/ceph/messenger.h | 10 +++++----- net/ceph/ceph_common.c | 4 ++-- net/ceph/messenger.c | 4 ++-- 6 files changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d90861f45210..4a13f6e72069 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops; */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - int features) + u64 features) { int err = -EIO; @@ -98,7 +98,7 @@ bad: */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { int err; @@ -145,7 +145,7 @@ out_bad: */ static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { u32 num, i = 0; int err; @@ -217,7 +217,7 @@ out_bad: */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -238,7 +238,7 @@ bad: */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { if (*p == end) { @@ -262,7 +262,7 @@ bad: */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); @@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end, */ static int parse_reply_info(struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { void *p, *end; u32 len; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c6740e43a351..2df963f1cf5a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -490,10 +490,10 @@ static struct 
ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const unsigned supported_features = + const u64 supported_features = CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; - const unsigned required_features = 0; + const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 2e3024881a5e..7d704db60cbb 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -122,8 +122,8 @@ struct ceph_client { int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; struct ceph_messenger msgr; /* messenger instance */ struct ceph_mon_client monc; @@ -192,8 +192,8 @@ extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); extern struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - unsigned supported_features, - unsigned required_features); + u64 supported_features, + u64 required_features); extern u64 ceph_client_id(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 7c1420bb1dce..c1d3f5a65273 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -60,8 +60,8 @@ struct ceph_messenger { u32 global_seq; spinlock_t global_seq_lock; - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; }; enum ceph_msg_data_type { @@ -192,7 +192,7 @@ struct ceph_connection { struct ceph_entity_name peer_name; /* peer name */ - unsigned peer_features; + u64 peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@ -256,8 +256,8 @@ extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc); extern void ceph_con_init(struct ceph_connection *con, void *private, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 34b11ee8124e..43d8177a52e1 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -461,8 +461,8 @@ EXPORT_SYMBOL(ceph_client_id); * create a fresh client instance */ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - unsigned int supported_features, - unsigned int required_features) + u64 supported_features, + u64 required_features) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4a5df7b1cc9f..260670558746 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2853,8 +2853,8 @@ static void con_fault(struct ceph_connection *con) */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc) { msgr->supported_features = supported_features; -- cgit From 2b3e0c905af43cfe402a2ef3f800be5dc1684005 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: libceph: update ceph_features.h This 
updates ceph_features.h so that it has all feature bits defined in ceph.git. In the interim since the last update, ceph.git crossed the "32 feature bits" point, and, the addition of the 33rd bit wasn't handled correctly. The work-around is squashed into this commit and reflects ceph.git commit 053659d05e0349053ef703b414f44965f368b9f0. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 96 +++++++++++++++++++++++++------------- net/ceph/messenger.c | 4 +- 2 files changed, 67 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 4c42080347af..003ea719be65 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -4,42 +4,73 @@ /* * feature bits */ -#define CEPH_FEATURE_UID (1<<0) -#define CEPH_FEATURE_NOSRCADDR (1<<1) -#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) -#define CEPH_FEATURE_FLOCK (1<<3) -#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) -#define CEPH_FEATURE_MONNAMES (1<<5) -#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) -#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) -#define CEPH_FEATURE_OBJECTLOCATOR (1<<8) -#define CEPH_FEATURE_PGID64 (1<<9) -#define CEPH_FEATURE_INCSUBOSDMAP (1<<10) -#define CEPH_FEATURE_PGPOOL3 (1<<11) -#define CEPH_FEATURE_OSDREPLYMUX (1<<12) -#define CEPH_FEATURE_OSDENC (1<<13) -#define CEPH_FEATURE_OMAP (1<<14) -#define CEPH_FEATURE_MONENC (1<<15) -#define CEPH_FEATURE_QUERY_T (1<<16) -#define CEPH_FEATURE_INDEP_PG_MAP (1<<17) -#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) -#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) -#define CEPH_FEATURE_MON_NULLROUTE (1<<20) -#define CEPH_FEATURE_MON_GV (1<<21) -#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) -#define CEPH_FEATURE_MSG_AUTH (1<<23) -#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) -#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) -#define CEPH_FEATURE_CREATEPOOLID (1<<26) -#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) -#define CEPH_FEATURE_OSD_HBMSGS (1<<28) -#define CEPH_FEATURE_MDSENC (1<<29) -#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) +#define CEPH_FEATURE_UID (1ULL<<0) +#define CEPH_FEATURE_NOSRCADDR (1ULL<<1) +#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2) +#define CEPH_FEATURE_FLOCK (1ULL<<3) +#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) +#define CEPH_FEATURE_MONNAMES (1ULL<<5) +#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7) +#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) +#define CEPH_FEATURE_PGID64 (1ULL<<9) +#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) +#define CEPH_FEATURE_PGPOOL3 (1ULL<<11) +#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) +#define CEPH_FEATURE_OSDENC (1ULL<<13) +#define CEPH_FEATURE_OMAP (1ULL<<14) +#define CEPH_FEATURE_MONENC (1ULL<<15) +#define CEPH_FEATURE_QUERY_T (1ULL<<16) +#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17) +#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18) +#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19) +#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20) +#define CEPH_FEATURE_MON_GV (1ULL<<21) +#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22) +#define CEPH_FEATURE_MSG_AUTH (1ULL<<23) +#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24) +#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25) +#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26) +#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27) +#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28) +#define CEPH_FEATURE_MDSENC (1ULL<<29) +#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30) +#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31) +#define CEPH_FEATURE_OSD_SNAPMAPPER 
(1ULL<<32) +#define CEPH_FEATURE_MON_SCRUB (1ULL<<33) +#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34) +#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35) +#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ +#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) +#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) + +/* + * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature + * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 + * to mean 33 bit ~0, and introduce a helper below to do the + * translation. + * + * This was introduced by ceph.git commit + * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8 + * and fixed by ceph.git commit + * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c + */ +#define CEPH_FEATURE_RESERVED (1ULL<<63) + +static inline u64 ceph_sanitize_features(u64 features) +{ + if (features & CEPH_FEATURE_RESERVED) { + /* everything through OSD_SNAPMAPPER */ + return 0x1ffffffffull; + } else { + return features; + } +} /* * Features supported. */ -#define CEPH_FEATURES_SUPPORTED_DEFAULT \ +#define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ @@ -56,4 +87,5 @@ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC) + #endif diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 260670558746..bd172e1ee0ae 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -1945,7 +1946,8 @@ static int process_connect(struct ceph_connection *con) { u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); + u64 server_feat = ceph_sanitize_features( + le64_to_cpu(con->in_reply.features)); int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); -- cgit From b3b33b0e43323af4fb697f4378218d3c268d02cd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: crush: pass weight vector size to map function Pass the size of the weight vector into crush_do_rule() to ensure that we don't access values past the end. This can happen if the caller misbehaves and passes a weight vector that is smaller than max_devices. Currently the monitor tries to prevent that from happening, but this will gracefully tolerate previous bad osdmaps that got into this state. It's also a bit more defensive. Reflects ceph.git commit 5922e2c2b8335b5e46c9504349c3a55b7434c01a. 
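The defensive behaviour amounts to treating any device index that falls outside the caller's weight vector as "out". A simplified stand-alone sketch of that guard is below; it is illustrative only and ignores the probabilistic middle range of weights that the real is_out() also handles.

#include <stdio.h>

/* Treat anything past the end of the weight vector as out (weight 0). */
static int is_out_sketch(const unsigned *weight, int weight_max, int item)
{
	if (item >= weight_max)
		return 1;	/* defensive: no weight known for this item */
	return weight[item] == 0;
}

int main(void)
{
	unsigned weights[] = { 0x10000, 0, 0x8000 };	/* fully in, out, partially weighted */

	printf("%d %d %d\n",
	       is_out_sketch(weights, 3, 0),	/* 0: fully in */
	       is_out_sketch(weights, 3, 1),	/* 1: weight 0 */
	       is_out_sketch(weights, 3, 7));	/* 1: past the end of the vector */
	return 0;
}
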
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/mapper.h | 2 +- net/ceph/crush/mapper.c | 17 ++++++++++++----- net/ceph/osdmap.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 5772dee3ecbf..69310b031875 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -14,6 +14,6 @@ extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, i extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weights); + const __u32 *weights, int weight_max); #endif diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index cbd06a91941c..18d2cf66f102 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -264,8 +264,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) * true if device is marked "out" (failed, fully offloaded) * of the cluster */ -static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) +static int is_out(const struct crush_map *map, + const __u32 *weight, int weight_max, + int item, int x) { + if (item >= weight_max) + return 1; if (weight[item] >= 0x10000) return 0; if (weight[item] == 0) @@ -292,7 +296,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in */ static int crush_choose(const struct crush_map *map, struct crush_bucket *bucket, - const __u32 *weight, + const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, int firstn, int recurse_to_leaf, @@ -396,7 +400,7 @@ static int crush_choose(const struct crush_map *map, if (item < 0) { if (crush_choose(map, map->buckets[-1-item], - weight, + weight, weight_max, x, outpos+1, 0, out2, outpos, firstn, 0, @@ -414,6 +418,7 @@ static int crush_choose(const struct crush_map *map, /* out? */ if (itemtype == 0) reject = is_out(map, weight, + weight_max, item, x); else reject = 0; @@ -470,10 +475,12 @@ reject: * @x: hash input * @result: pointer to result vector * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weight) + const __u32 *weight, int weight_max) { int result_len; int a[CRUSH_MAX_SET]; @@ -545,7 +552,7 @@ int crush_do_rule(const struct crush_map *map, j = 0; osize += crush_choose(map, map->buckets[-1-w[i]], - weight, + weight, weight_max, x, numrep, curstep->arg2, o+osize, j, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index dbd9a4792427..6477a68ddecb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1165,7 +1165,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } r = crush_do_rule(osdmap->crush, ruleno, pps, osds, min_t(int, pool->size, *num), - osdmap->osd_weight); + osdmap->osd_weight, osdmap->max_osd); if (r < 0) { pr_err("error %d from crush rule: pool %lld ruleset %d type %d" " size %d\n", r, pgid.pool, pool->crush_ruleset, -- cgit From bfb16d7d69f0272451ad85a6e50aab3c4262fbc0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: crush: factor out (trivial) crush_destroy_rule() Reflects ceph.git commit 43a01c9973c4b83f2eaa98be87429941a227ddde. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 1 + net/ceph/crush/crush.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 6a1101f24cfb..09561a04c127 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -174,6 +174,7 @@ extern void crush_destroy_bucket_list(struct crush_bucket_list *b); extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); extern void crush_destroy_bucket(struct crush_bucket *b); +extern void crush_destroy_rule(struct crush_rule *r); extern void crush_destroy(struct crush_map *map); static inline int crush_calc_tree_node(int i) diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 089613234f03..16bc199d9a62 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -116,11 +116,14 @@ void crush_destroy(struct crush_map *map) if (map->rules) { __u32 b; for (b = 0; b < map->max_rules; b++) - kfree(map->rules[b]); + crush_destroy_rule(map->rules[b]); kfree(map->rules); } kfree(map); } - +void crush_destroy_rule(struct crush_rule *rule) +{ + kfree(rule); +} -- cgit From 8f99c85b7ad55faecaab9d29fd7d12f1601c2eea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: crush: reduce scope of some local variables Reflects ceph.git commit e7d47827f0333c96ad43d257607fb92ed4176550. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 18d2cf66f102..71192b1f8501 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -189,7 +189,7 @@ static int terminal(int x) static int bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) { - int n, l; + int n; __u32 w; __u64 t; @@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, n = bucket->num_nodes >> 1; while (!terminal(n)) { + int l; /* pick point in [0, w) */ w = bucket->node_weights[n]; t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, @@ -496,7 +497,6 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int firstn; const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { @@ -510,9 +510,9 @@ int crush_do_rule(const struct crush_map *map, o = b; for (step = 0; step < rule->len; step++) { + int firstn = 0; struct crush_rule_step *curstep = &rule->steps[step]; - firstn = 0; switch (curstep->op) { case CRUSH_RULE_TAKE: w[0] = curstep->arg1; -- cgit From 2a4ba74ef67ad3a1645d78487ed7ccd0f40063c0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: crush: fix some comments Reflects ceph.git commit 3cef755428761f2481b1dd0e0fbd0464ac483fc5. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 71192b1f8501..82cab7d3e89d 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -565,7 +565,7 @@ int crush_do_rule(const struct crush_map *map, /* copy final _leaf_ values to output set */ memcpy(o, c, osize*sizeof(*o)); - /* swap t and w arrays */ + /* swap o and w arrays */ tmp = o; o = w; w = tmp; -- cgit From e8ef19c4ad161768e1d8309d5ae18481c098eb81 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: crush: eliminate CRUSH_MAX_SET result size limitation This is only present to size the temporary scratch arrays that we put on the stack. Let the caller allocate them as they wish and remove the limitation. Reflects ceph.git commit 1cfe140bf2dab99517589a82a916f4c75b9492d1. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 1 - include/linux/crush/mapper.h | 3 ++- net/ceph/crush/mapper.c | 10 ++++++---- net/ceph/osdmap.c | 16 +++++++++++++--- 4 files changed, 21 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 09561a04c127..83543c504b5a 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -21,7 +21,6 @@ #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ -#define CRUSH_MAX_SET 10 /* max size of a mapping result */ /* diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 69310b031875..eab367446eea 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -14,6 +14,7 @@ extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, i extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weights, int weight_max); + const __u32 *weights, int weight_max, + int *scratch); #endif diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 82cab7d3e89d..dcf48bc504ea 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -478,15 +478,17 @@ reject: * @result_max: maximum result size * @weight: weight vector (for map leaves) * @weight_max: size of weight vector + * @scratch: scratch vector for private use; must be >= 3 * result_max */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weight, int weight_max) + const __u32 *weight, int weight_max, + int *scratch) { int result_len; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int c[CRUSH_MAX_SET]; + int *a = scratch; + int *b = scratch + result_max; + int *c = scratch + result_max*2; int recurse_to_leaf; int *w; int wsize = 0; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 6477a68ddecb..8b1a6b48bb5d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1110,6 +1110,16 @@ int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, } EXPORT_SYMBOL(ceph_calc_ceph_pg); +static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, + int *result, int result_max, + const __u32 *weight, int weight_max) +{ + int scratch[result_max * 3]; + + return crush_do_rule(map, ruleno, x, result, result_max, + weight, weight_max, scratch); +} + /* * Calculate raw osd vector for the given pgid. Return pointer to osd * array, or NULL on failure. 
@@ -1163,9 +1173,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pgp_num_mask) + (unsigned)pgid.pool; } - r = crush_do_rule(osdmap->crush, ruleno, pps, osds, - min_t(int, pool->size, *num), - osdmap->osd_weight, osdmap->max_osd); + r = crush_do_rule_ary(osdmap->crush, ruleno, pps, + osds, min_t(int, pool->size, *num), + osdmap->osd_weight, osdmap->max_osd); if (r < 0) { pr_err("error %d from crush rule: pool %lld ruleset %d type %d" " size %d\n", r, pgid.pool, pool->crush_ruleset, -- cgit From c6d98a603a02594f6ecf16d0a0af989ae9fa7abd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:25 +0200 Subject: crush: return CRUSH_ITEM_UNDEF for failed placements with indep For firstn mode, if we fail to make a valid placement choice, we just continue and return a short result to the caller. For indep mode, however, we need to make the position stable, and return an undefined value on failed placements to avoid shifting later results to the left. Reflects ceph.git commit b1d4dd4eb044875874a1d01c01c7d766db5d0a80. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 3 ++- net/ceph/crush/mapper.c | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 83543c504b5a..3d6a12928560 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -19,10 +19,11 @@ #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ - #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ +#define CRUSH_ITEM_UNDEF 0x7fffffff /* undefined result */ + /* * CRUSH uses user-defined "rules" to describe how inputs should be * mapped to devices. A rule consists of sequence of steps to perform diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index dcf48bc504ea..a8605245d190 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -455,8 +455,12 @@ reject: } while (retry_descent); if (skip_rep) { - dprintk("skip rep\n"); - continue; + if (firstn) { + dprintk("skip rep\n"); + continue; + } + dprintk("undef rep, continuing\n"); + item = CRUSH_ITEM_UNDEF; } dprintk("CHOOSE got %d\n", item); -- cgit From 9a3b490a20e06368c81d7a81506e99388e733379 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:25 +0200 Subject: crush: use breadth-first search for indep mode Reflects ceph.git commit 86e978036a4ecbac4c875e7c00f6c5bbe37282d3. 
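To make the positional-stability point concrete, here is a small stand-alone illustration of the two result shapes. The OSD ids are invented, the sentinel value is only a stand-in for the "no result" marker, and slot 1 is assumed to have failed.

#include <stdio.h>

#define UNDEF 0x7fffffff	/* stand-in for the "no result" sentinel */

int main(void)
{
	/* Invented OSD ids; assume the choice for replica slot 1 failed. */
	int firstn_result[] = { 4, 9 };		/* short result: later picks shift left */
	int indep_result[]  = { 4, UNDEF, 9 };	/* the hole keeps its slot, later picks keep position */
	int i;

	printf("firstn:");
	for (i = 0; i < 2; i++)
		printf(" %d", firstn_result[i]);
	printf("\nindep: ");
	for (i = 0; i < 3; i++)
		printf(" %d", indep_result[i]);
	printf("\n");
	return 0;
}
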
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 3 +- net/ceph/crush/mapper.c | 172 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 165 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 3d6a12928560..4023b1b52296 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -22,7 +22,8 @@ #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ -#define CRUSH_ITEM_UNDEF 0x7fffffff /* undefined result */ +#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ +#define CRUSH_ITEM_NONE 0x7fffffff /* no result */ /* * CRUSH uses user-defined "rules" to describe how inputs should be diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a8605245d190..caeb1066bea3 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -473,6 +473,148 @@ reject: } +/** + * choose indep: alternative breadth-first positionally stable mapping + * + */ +static void crush_choose_indep(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int numrep, int type, + int *out, int outpos, + int recurse_to_leaf, + int *out2) +{ + struct crush_bucket *in = bucket; + int left = numrep - outpos; + int rep; + unsigned int ftotal; + int r; + int i; + int item = 0; + int itemtype; + int collide; + + dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep); + + /* initially my result is undefined */ + for (rep = outpos; rep < numrep; rep++) { + out[rep] = CRUSH_ITEM_UNDEF; + if (out2) + out2[rep] = CRUSH_ITEM_UNDEF; + } + + for (ftotal = 0; left > 0 && ftotal < map->choose_total_tries; ftotal++) { + for (rep = outpos; rep < numrep; rep++) { + if (out[rep] != CRUSH_ITEM_UNDEF) + continue; + + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + for (;;) { + r = rep; + + /* be careful */ + if (in->alg == CRUSH_BUCKET_UNIFORM && + in->size % numrep == 0) + /* r'=r+(n+1)*f_total */ + r += (numrep+1) * ftotal; + else + /* r' = r + n*f_total */ + r += numrep * ftotal; + + /* bucket choose */ + if (in->size == 0) { + dprintk(" empty bucket\n"); + break; + } + + item = crush_bucket_choose(in, x, r); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = CRUSH_ITEM_NONE; + left--; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = + CRUSH_ITEM_NONE; + left--; + break; + } + in = map->buckets[-1-item]; + continue; + } + + /* collision? */ + collide = 0; + for (i = outpos; i < numrep; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + if (collide) + break; + + if (recurse_to_leaf) { + if (item < 0) { + crush_choose_indep(map, + map->buckets[-1-item], + weight, weight_max, + x, rep+1, 0, + out2, rep, + 0, NULL); + if (out2[rep] == CRUSH_ITEM_NONE) { + /* placed nothing; no leaf */ + break; + } + } else { + /* we already have a leaf! */ + out2[rep] = item; + } + } + + /* out? */ + if (itemtype == 0 && + is_out(map, weight, weight_max, item, x)) + break; + + /* yay! 
*/ + out[rep] = item; + left--; + break; + } + } + } + for (rep = outpos; rep < numrep; rep++) { + if (out[rep] == CRUSH_ITEM_UNDEF) { + out[rep] = CRUSH_ITEM_NONE; + } + if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { + out2[rep] = CRUSH_ITEM_NONE; + } + } +} + /** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map @@ -556,15 +698,27 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; - osize += crush_choose(map, - map->buckets[-1-w[i]], - weight, weight_max, - x, numrep, - curstep->arg2, - o+osize, j, - firstn, - recurse_to_leaf, - descend_once, c+osize); + if (firstn) { + osize += crush_choose(map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + firstn, + recurse_to_leaf, + descend_once, c+osize); + } else { + crush_choose_indep(map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + recurse_to_leaf, + c+osize); + osize += numrep; + } } if (recurse_to_leaf) -- cgit From 3102b0a5b4cd63acccac78ca541b73fe28b4faa6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:25 +0200 Subject: crush: add note about r in recursive choose Reflects ceph.git commit 4551fee9ad89d0427ed865d766d0d44004d3e3e1. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index caeb1066bea3..77b7a73e65cf 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -514,6 +514,14 @@ static void crush_choose_indep(const struct crush_map *map, /* choose through intervening buckets */ for (;;) { + /* note: we base the choice on the position + * even in the nested call. that means that + * if the first layer chooses the same bucket + * in a different position, we will tend to + * choose a different item in that bucket. + * this will involve more devices in data + * movement and tend to distribute the load. + */ r = rep; /* be careful */ -- cgit From 9fe07182827d9913daf85f3e6a950578a3fd4c5d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:25 +0200 Subject: crush: strip firstn conditionals out of crush_choose, rename Now that indep is handled by crush_choose_indep, rename crush_choose to crush_choose_firstn and remove all the conditionals. This ends up stripping out *lots* of code. Note that it *also* makes it obvious that the shenanigans we were playing with r' for uniform buckets were broken for firstn mode. This appears to have happened waaaay back in commit dae8bec9 (or earlier)... 2007. Reflects ceph.git commit 94350996cb2035850bcbece6a77a9b0394177ec9. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 88 +++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 77b7a73e65cf..a71f8c4c78ec 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -282,7 +282,7 @@ static int is_out(const struct crush_map *map, } /** - * crush_choose - choose numrep distinct items of given type + * crush_choose_firstn - choose numrep distinct items of given type * @map: the crush_map * @bucket: the bucket we are choose an item from * @x: crush input value @@ -290,18 +290,17 @@ static int is_out(const struct crush_map *map, * @type: the type of item to choose * @out: pointer to output vector * @outpos: our position in that vector - * @firstn: true if choosing "first n" items, false if choosing "indep" * @recurse_to_leaf: true if we want one device under each item of given type * @descend_once: true if we should only try one descent before giving up * @out2: second output vector for leaf items (if @recurse_to_leaf) */ -static int crush_choose(const struct crush_map *map, - struct crush_bucket *bucket, - const __u32 *weight, int weight_max, - int x, int numrep, int type, - int *out, int outpos, - int firstn, int recurse_to_leaf, - int descend_once, int *out2) +static int crush_choose_firstn(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int numrep, int type, + int *out, int outpos, + int recurse_to_leaf, + int descend_once, int *out2) { int rep; unsigned int ftotal, flocal; @@ -330,26 +329,8 @@ static int crush_choose(const struct crush_map *map, collide = 0; retry_bucket = 0; r = rep; - if (in->alg == CRUSH_BUCKET_UNIFORM) { - /* be careful */ - if (firstn || (__u32)numrep >= in->size) - /* r' = r + f_total */ - r += ftotal; - else if (in->size % numrep == 0) - /* r'=r+(n+1)*f_local */ - r += (numrep+1) * - (flocal+ftotal); - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } else { - if (firstn) - /* r' = r + f_total */ - r += ftotal; - else - /* r' = r + n*f_local */ - r += numrep * (flocal+ftotal); - } + /* r' = r + f_total */ + r += ftotal; /* bucket choose */ if (in->size == 0) { @@ -399,12 +380,12 @@ static int crush_choose(const struct crush_map *map, reject = 0; if (!collide && recurse_to_leaf) { if (item < 0) { - if (crush_choose(map, + if (crush_choose_firstn(map, map->buckets[-1-item], weight, weight_max, x, outpos+1, 0, out2, outpos, - firstn, 0, + 0, map->chooseleaf_descend_once, NULL) <= outpos) /* didn't get leaf */ @@ -455,12 +436,8 @@ reject: } while (retry_descent); if (skip_rep) { - if (firstn) { - dprintk("skip rep\n"); - continue; - } - dprintk("undef rep, continuing\n"); - item = CRUSH_ITEM_UNDEF; + dprintk("skip rep\n"); + continue; } dprintk("CHOOSE got %d\n", item); @@ -474,7 +451,7 @@ reject: /** - * choose indep: alternative breadth-first positionally stable mapping + * crush_choose_indep: alternative breadth-first positionally stable mapping * */ static void crush_choose_indep(const struct crush_map *map, @@ -707,24 +684,25 @@ int crush_do_rule(const struct crush_map *map, } j = 0; if (firstn) { - osize += crush_choose(map, - map->buckets[-1-w[i]], - weight, weight_max, - x, numrep, - curstep->arg2, - o+osize, j, - firstn, - recurse_to_leaf, - descend_once, c+osize); + osize += crush_choose_firstn( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + 
o+osize, j, + recurse_to_leaf, + descend_once, c+osize); } else { - crush_choose_indep(map, - map->buckets[-1-w[i]], - weight, weight_max, - x, numrep, - curstep->arg2, - o+osize, j, - recurse_to_leaf, - c+osize); + crush_choose_indep( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + recurse_to_leaf, + c+osize); osize += numrep; } } -- cgit From ab4ce2b5bdb5ca416756df3df6f5c63667a05065 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:25 +0200 Subject: crush: clarify numrep vs endpos Pass numrep (the width of the result) separately from the number of results we want *this* iteration. This makes things less awkward when we do a recursive call (for chooseleaf) and want only one item. Reflects ceph.git commit 1b567ee08972f268c11b43fc881e57b5984dd08b. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a71f8c4c78ec..125dbd04f2b6 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -457,13 +457,13 @@ reject: static void crush_choose_indep(const struct crush_map *map, struct crush_bucket *bucket, const __u32 *weight, int weight_max, - int x, int numrep, int type, + int x, int left, int numrep, int type, int *out, int outpos, int recurse_to_leaf, int *out2) { struct crush_bucket *in = bucket; - int left = numrep - outpos; + int endpos = outpos + left; int rep; unsigned int ftotal; int r; @@ -476,14 +476,14 @@ static void crush_choose_indep(const struct crush_map *map, bucket->id, x, outpos, numrep); /* initially my result is undefined */ - for (rep = outpos; rep < numrep; rep++) { + for (rep = outpos; rep < endpos; rep++) { out[rep] = CRUSH_ITEM_UNDEF; if (out2) out2[rep] = CRUSH_ITEM_UNDEF; } for (ftotal = 0; left > 0 && ftotal < map->choose_total_tries; ftotal++) { - for (rep = outpos; rep < numrep; rep++) { + for (rep = outpos; rep < endpos; rep++) { if (out[rep] != CRUSH_ITEM_UNDEF) continue; @@ -551,7 +551,7 @@ static void crush_choose_indep(const struct crush_map *map, /* collision? */ collide = 0; - for (i = outpos; i < numrep; i++) { + for (i = outpos; i < endpos; i++) { if (out[i] == item) { collide = 1; break; @@ -565,7 +565,7 @@ static void crush_choose_indep(const struct crush_map *map, crush_choose_indep(map, map->buckets[-1-item], weight, weight_max, - x, rep+1, 0, + x, 1, numrep, 0, out2, rep, 0, NULL); if (out2[rep] == CRUSH_ITEM_NONE) { @@ -590,7 +590,7 @@ static void crush_choose_indep(const struct crush_map *map, } } } - for (rep = outpos; rep < numrep; rep++) { + for (rep = outpos; rep < endpos; rep++) { if (out[rep] == CRUSH_ITEM_UNDEF) { out[rep] = CRUSH_ITEM_NONE; } @@ -698,7 +698,7 @@ int crush_do_rule(const struct crush_map *map, map, map->buckets[-1-w[i]], weight, weight_max, - x, numrep, + x, numrep, numrep, curstep->arg2, o+osize, j, recurse_to_leaf, -- cgit From 4158608139de02f5265ec4f41774af7418911016 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:25 +0200 Subject: crush: pass parent r value for indep call Pass down the parent's 'r' value so that we will sample different values in the recursive call when the parent tries multiple times. This avoids doing useless work (calling multiple times and trying the same values). Reflects ceph.git commit 2731d3030d7a3e80922b7f1b7756f9a4a124bac5. 
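Illustrative arithmetic only, mirroring the patch's non-uniform branch with made-up values: for rep = 0, numrep = 3 and a parent retry with ftotal = 2,

        r = rep + parent_r;        /* top level: 0 + 0 = 0 */
        r += numrep * ftotal;      /* top level: 0 + 3*2 = 6 */
        /* the recursive chooseleaf call now receives parent_r = 6,
         * so it starts sampling from a different point than attempt 0 did */

which is why the recursion probes a different leaf instead of re-testing the one that already failed.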
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 125dbd04f2b6..c727836b5860 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -460,7 +460,8 @@ static void crush_choose_indep(const struct crush_map *map, int x, int left, int numrep, int type, int *out, int outpos, int recurse_to_leaf, - int *out2) + int *out2, + int parent_r) { struct crush_bucket *in = bucket; int endpos = outpos + left; @@ -499,7 +500,7 @@ static void crush_choose_indep(const struct crush_map *map, * this will involve more devices in data * movement and tend to distribute the load. */ - r = rep; + r = rep + parent_r; /* be careful */ if (in->alg == CRUSH_BUCKET_UNIFORM && @@ -567,7 +568,7 @@ static void crush_choose_indep(const struct crush_map *map, weight, weight_max, x, 1, numrep, 0, out2, rep, - 0, NULL); + 0, NULL, r); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -702,7 +703,8 @@ int crush_do_rule(const struct crush_map *map, curstep->arg2, o+osize, j, recurse_to_leaf, - c+osize); + c+osize, + 0); osize += numrep; } } -- cgit From be3226acc5544bcc91e756eb3ee6ca7b74f6f0a8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: new SET_CHOOSE_LEAF_TRIES command Explicitly control the number of sample attempts, and allow the number of tries in the recursive call to be explicitly controlled via the rule. This is important because the amount of time we want to spend looking for a solution may be rule dependent (e.g., higher for the wide indep pool than the rep pools). (We should do the same for the other tunables, by the way!) Reflects ceph.git commit c43c893be872f709c787bc57f46c0e97876ff681. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 2 ++ net/ceph/crush/mapper.c | 31 +++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 4023b1b52296..2e50bab91655 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -46,6 +46,8 @@ enum { CRUSH_RULE_EMIT = 4, /* no args */ CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, + + CRUSH_RULE_SET_CHOOSE_LEAF_TRIES = 9, }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index c727836b5860..e3ade074541c 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -455,11 +455,13 @@ reject: * */ static void crush_choose_indep(const struct crush_map *map, - struct crush_bucket *bucket, - const __u32 *weight, int weight_max, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, int x, int left, int numrep, int type, - int *out, int outpos, - int recurse_to_leaf, + int *out, int outpos, + unsigned int attempts, + unsigned int recurse_attempts, + int recurse_to_leaf, int *out2, int parent_r) { @@ -483,7 +485,7 @@ static void crush_choose_indep(const struct crush_map *map, out2[rep] = CRUSH_ITEM_UNDEF; } - for (ftotal = 0; left > 0 && ftotal < map->choose_total_tries; ftotal++) { + for (ftotal = 0; left > 0 && ftotal < attempts; ftotal++) { for (rep = outpos; rep < endpos; rep++) { if (out[rep] != CRUSH_ITEM_UNDEF) continue; @@ -564,11 +566,12 @@ static void crush_choose_indep(const struct crush_map *map, if (recurse_to_leaf) { if (item < 0) { crush_choose_indep(map, - map->buckets[-1-item], - weight, weight_max, - x, 1, numrep, 0, - out2, rep, - 0, NULL, r); + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_attempts, 0, + 0, NULL, r); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -631,6 +634,7 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; + int choose_leaf_tries = 1; const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { @@ -653,6 +657,11 @@ int crush_do_rule(const struct crush_map *map, wsize = 1; break; + case CRUSH_RULE_SET_CHOOSE_LEAF_TRIES: + if (curstep->arg1 > 0) + choose_leaf_tries = curstep->arg1; + break; + case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -702,6 +711,8 @@ int crush_do_rule(const struct crush_map *map, x, numrep, numrep, curstep->arg2, o+osize, j, + map->choose_total_tries, + choose_leaf_tries, recurse_to_leaf, c+osize, 0); -- cgit From f18650ace38ef200dd1578257c75e9407297953c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: apply chooseleaf_tries to firstn mode too Parameterize the attempts for the _firstn choose method, and apply the rule-specified tries count to firstn mode as well. Note that we have slightly different behavior here than with indep: If the firstn value is not specified for firstn, we pass through the normal attempt count. This maintains compatibility with legacy behavior. Note that this is usually *not* actually N^2 work, though, because of the descend_once tunable. However, descend_once is unfortunately *not* the same thing as 1 chooseleaf try because it is only checked on a reject but not on a collision. Sigh. In contrast, for indep, if tries is not specified we default to 1 recursive attempt, because that is simply more sane, and we have the option to do so. 
The descend_once tunable has no effect for indep. Reflects ceph.git commit 64aeded50d80942d66a5ec7b604ff2fcbf5d7b63. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 5 ++++- net/ceph/crush/mapper.c | 14 ++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 2e50bab91655..07b8fd4f81fc 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -165,7 +165,10 @@ struct crush_map { __u32 choose_local_fallback_tries; /* choose attempts before giving up */ __u32 choose_total_tries; - /* attempt chooseleaf inner descent once; on failure retry outer descent */ + /* attempt chooseleaf inner descent once for firstn mode; on + * reject retry outer descent. Note that this does *not* + * apply to a collision: in that case we will retry as we used + * to. */ __u32 chooseleaf_descend_once; }; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index e3ade074541c..c34320518c8b 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -299,6 +299,8 @@ static int crush_choose_firstn(const struct crush_map *map, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, + unsigned int attempts, + unsigned int recurse_attempts, int recurse_to_leaf, int descend_once, int *out2) { @@ -385,6 +387,7 @@ static int crush_choose_firstn(const struct crush_map *map, weight, weight_max, x, outpos+1, 0, out2, outpos, + recurse_attempts, 0, 0, map->chooseleaf_descend_once, NULL) <= outpos) @@ -421,7 +424,7 @@ reject: flocal <= in->size + map->choose_local_fallback_tries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal <= map->choose_total_tries) + else if (ftotal <= attempts) /* then retry descent */ retry_descent = 1; else @@ -634,7 +637,8 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int choose_leaf_tries = 1; + int choose_tries = map->choose_total_tries; + int choose_leaf_tries = 0; const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { @@ -701,6 +705,8 @@ int crush_do_rule(const struct crush_map *map, x, numrep, curstep->arg2, o+osize, j, + choose_tries, + choose_leaf_tries ? choose_leaf_tries : choose_tries, recurse_to_leaf, descend_once, c+osize); } else { @@ -711,8 +717,8 @@ int crush_do_rule(const struct crush_map *map, x, numrep, numrep, curstep->arg2, o+osize, j, - map->choose_total_tries, - choose_leaf_tries, + choose_tries, + choose_leaf_tries ? choose_leaf_tries : 1, recurse_to_leaf, c+osize, 0); -- cgit From cc10df4a3a5c34cb1d5b635ac70dd1fc406153ce Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: add SET_CHOOSE_TRIES rule step Since we can specify the recursive retries in a rule, we may as well also specify the non-recursive tries too for completeness. Reflects ceph.git commit d1b97462cffccc871914859eaee562f2786abfd1. 
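On the user-visible side, these two steps show up in the decompiled crush map language; a sketch of an erasure-coded rule using them might look like the following (illustrative only, exact syntax depends on the Ceph userspace version):

        rule ecpool {
                ruleset 1
                type erasure
                min_size 3
                max_size 20
                step set_chooseleaf_tries 5
                step set_choose_tries 100
                step take default
                step chooseleaf indep 0 type host
                step emit
        }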
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 3 ++- net/ceph/crush/mapper.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 07b8fd4f81fc..5f95969347ec 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -47,7 +47,8 @@ enum { CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, - CRUSH_RULE_SET_CHOOSE_LEAF_TRIES = 9, + CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ + CRUSH_RULE_SET_CHOOSE_LEAF_TRIES = 9, /* override chooseleaf_descend_once */ }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index c34320518c8b..a1acdea935bf 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -661,6 +661,11 @@ int crush_do_rule(const struct crush_map *map, wsize = 1; break; + case CRUSH_RULE_SET_CHOOSE_TRIES: + if (curstep->arg1 > 0) + choose_tries = curstep->arg1; + break; + case CRUSH_RULE_SET_CHOOSE_LEAF_TRIES: if (curstep->arg1 > 0) choose_leaf_tries = curstep->arg1; -- cgit From 917edad5d1d62070436b74ecbf5ea019b651ff69 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: CHOOSE_LEAF -> CHOOSELEAF throughout This aligns the internal identifier names with the user-visible names in the decompiled crush map language. Reflects ceph.git commit caa0e22e15e4226c3671318ba1f61314bf6da2a6. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 6 +++--- net/ceph/crush/mapper.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 5f95969347ec..7b0fc4aba75b 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -44,11 +44,11 @@ enum { /* arg2 = type */ CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ CRUSH_RULE_EMIT = 4, /* no args */ - CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, - CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, + CRUSH_RULE_CHOOSELEAF_FIRSTN = 6, + CRUSH_RULE_CHOOSELEAF_INDEP = 7, CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ - CRUSH_RULE_SET_CHOOSE_LEAF_TRIES = 9, /* override chooseleaf_descend_once */ + CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a1acdea935bf..e9256a30e60d 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -666,25 +666,25 @@ int crush_do_rule(const struct crush_map *map, choose_tries = curstep->arg1; break; - case CRUSH_RULE_SET_CHOOSE_LEAF_TRIES: + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: if (curstep->arg1 > 0) choose_leaf_tries = curstep->arg1; break; - case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; /* fall through */ - case CRUSH_RULE_CHOOSE_LEAF_INDEP: + case CRUSH_RULE_CHOOSELEAF_INDEP: case CRUSH_RULE_CHOOSE_INDEP: if (wsize == 0) break; recurse_to_leaf = curstep->op == - CRUSH_RULE_CHOOSE_LEAF_FIRSTN || + CRUSH_RULE_CHOOSELEAF_FIRSTN || curstep->op == - CRUSH_RULE_CHOOSE_LEAF_INDEP; + CRUSH_RULE_CHOOSELEAF_INDEP; /* reset output */ osize = 0; -- cgit From d390bb2a83086f2b79c152e2c1734813bd257d9b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: generalize descend_once The legacy behavior is to make the normal number of tries for the recursive chooseleaf call. 
The descend_once tunable changed this to making a single try and bail if we get a reject (note that it is impossible to collide in the recursive case). The new set_chooseleaf_tries lets you select the number of recursive chooseleaf attempts for indep mode, or default to 1. Use the same behavior for firstn, except default to total_tries when the legacy tunables are set (for compatibility). This makes the rule step override the (new) default of 1 recursive attempt, keeping behavior consistent with indep mode. Reflects ceph.git commit 685c6950ef3df325ef04ce7c986e36ca2514c5f1. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index e9256a30e60d..0613dd2d5fa3 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -291,7 +291,6 @@ static int is_out(const struct crush_map *map, * @out: pointer to output vector * @outpos: our position in that vector * @recurse_to_leaf: true if we want one device under each item of given type - * @descend_once: true if we should only try one descent before giving up * @out2: second output vector for leaf items (if @recurse_to_leaf) */ static int crush_choose_firstn(const struct crush_map *map, @@ -302,7 +301,7 @@ static int crush_choose_firstn(const struct crush_map *map, unsigned int attempts, unsigned int recurse_attempts, int recurse_to_leaf, - int descend_once, int *out2) + int *out2) { int rep; unsigned int ftotal, flocal; @@ -389,7 +388,6 @@ static int crush_choose_firstn(const struct crush_map *map, out2, outpos, recurse_attempts, 0, 0, - map->chooseleaf_descend_once, NULL) <= outpos) /* didn't get leaf */ reject = 1; @@ -414,10 +412,7 @@ reject: ftotal++; flocal++; - if (reject && descend_once) - /* let outer call try again */ - skip_rep = 1; - else if (collide && flocal <= map->choose_local_tries) + if (collide && flocal <= map->choose_local_tries) /* retry locally a few times */ retry_bucket = 1; else if (map->choose_local_fallback_tries > 0 && @@ -639,7 +634,6 @@ int crush_do_rule(const struct crush_map *map, int numrep; int choose_tries = map->choose_total_tries; int choose_leaf_tries = 0; - const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -703,6 +697,14 @@ int crush_do_rule(const struct crush_map *map, } j = 0; if (firstn) { + int recurse_tries; + if (choose_leaf_tries) + recurse_tries = + choose_leaf_tries; + else if (map->chooseleaf_descend_once) + recurse_tries = 1; + else + recurse_tries = choose_tries; osize += crush_choose_firstn( map, map->buckets[-1-w[i]], @@ -711,9 +713,9 @@ int crush_do_rule(const struct crush_map *map, curstep->arg2, o+osize, j, choose_tries, - choose_leaf_tries ? choose_leaf_tries : choose_tries, + recurse_tries, recurse_to_leaf, - descend_once, c+osize); + c+osize); } else { crush_choose_indep( map, @@ -723,7 +725,8 @@ int crush_do_rule(const struct crush_map *map, curstep->arg2, o+osize, j, choose_tries, - choose_leaf_tries ? choose_leaf_tries : 1, + choose_leaf_tries ? + choose_leaf_tries : 1, recurse_to_leaf, c+osize, 0); -- cgit From f046bf92080cbdc4a94c6e86698c5a3f10716445 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:27 +0200 Subject: crush: add set_choose_local_[fallback_]tries steps This allows all of the tunables to be overridden by a specific rule. 
Reflects ceph.git commits d129e09e57fbc61cfd4f492e3ee77d0750c9d292, 0497db49e5973b50df26251ed0e3f4ac7578e66e. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 2 ++ net/ceph/crush/mapper.c | 28 +++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 7b0fc4aba75b..acaa5615d634 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -49,6 +49,8 @@ enum { CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ + CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, + CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 0613dd2d5fa3..8cde4818e18b 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -300,6 +300,8 @@ static int crush_choose_firstn(const struct crush_map *map, int *out, int outpos, unsigned int attempts, unsigned int recurse_attempts, + unsigned int local_tries, + unsigned int local_fallback_tries, int recurse_to_leaf, int *out2) { @@ -338,9 +340,9 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 1; goto reject; } - if (map->choose_local_fallback_tries > 0 && + if (local_fallback_tries > 0 && flocal >= (in->size>>1) && - flocal > map->choose_local_fallback_tries) + flocal > local_fallback_tries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -387,6 +389,8 @@ static int crush_choose_firstn(const struct crush_map *map, x, outpos+1, 0, out2, outpos, recurse_attempts, 0, + local_tries, + local_fallback_tries, 0, NULL) <= outpos) /* didn't get leaf */ @@ -412,11 +416,11 @@ reject: ftotal++; flocal++; - if (collide && flocal <= map->choose_local_tries) + if (collide && flocal <= local_tries) /* retry locally a few times */ retry_bucket = 1; - else if (map->choose_local_fallback_tries > 0 && - flocal <= in->size + map->choose_local_fallback_tries) + else if (local_fallback_tries > 0 && + flocal <= in->size + local_fallback_tries) /* exhaustive bucket search */ retry_bucket = 1; else if (ftotal <= attempts) @@ -633,6 +637,8 @@ int crush_do_rule(const struct crush_map *map, int i, j; int numrep; int choose_tries = map->choose_total_tries; + int choose_local_tries = map->choose_local_tries; + int choose_local_fallback_tries = map->choose_local_fallback_tries; int choose_leaf_tries = 0; if ((__u32)ruleno >= map->max_rules) { @@ -665,6 +671,16 @@ int crush_do_rule(const struct crush_map *map, choose_leaf_tries = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: + if (curstep->arg1 > 0) + choose_local_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: + if (curstep->arg1 > 0) + choose_local_fallback_tries = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -714,6 +730,8 @@ int crush_do_rule(const struct crush_map *map, o+osize, j, choose_tries, recurse_tries, + choose_local_tries, + choose_local_fallback_tries, recurse_to_leaf, c+osize); } else { -- cgit From 2d8be0bc8bc2dc7d3fc3d322c4db3fdf6c924660 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:27 +0200 Subject: crush: attempts -> tries Reflects ceph.git commit ea3a0bb8b773360d73b8b77fa32115ef091c9857. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 8cde4818e18b..71ce4f12a7c9 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -298,8 +298,8 @@ static int crush_choose_firstn(const struct crush_map *map, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, - unsigned int attempts, - unsigned int recurse_attempts, + unsigned int tries, + unsigned int recurse_tries, unsigned int local_tries, unsigned int local_fallback_tries, int recurse_to_leaf, @@ -388,7 +388,7 @@ static int crush_choose_firstn(const struct crush_map *map, weight, weight_max, x, outpos+1, 0, out2, outpos, - recurse_attempts, 0, + recurse_tries, 0, local_tries, local_fallback_tries, 0, @@ -423,7 +423,7 @@ reject: flocal <= in->size + local_fallback_tries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal <= attempts) + else if (ftotal <= tries) /* then retry descent */ retry_descent = 1; else @@ -461,8 +461,8 @@ static void crush_choose_indep(const struct crush_map *map, const __u32 *weight, int weight_max, int x, int left, int numrep, int type, int *out, int outpos, - unsigned int attempts, - unsigned int recurse_attempts, + unsigned int tries, + unsigned int recurse_tries, int recurse_to_leaf, int *out2, int parent_r) @@ -487,7 +487,7 @@ static void crush_choose_indep(const struct crush_map *map, out2[rep] = CRUSH_ITEM_UNDEF; } - for (ftotal = 0; left > 0 && ftotal < attempts; ftotal++) { + for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { for (rep = outpos; rep < endpos; rep++) { if (out[rep] != CRUSH_ITEM_UNDEF) continue; @@ -572,7 +572,7 @@ static void crush_choose_indep(const struct crush_map *map, weight, weight_max, x, 1, numrep, 0, out2, rep, - recurse_attempts, 0, + recurse_tries, 0, 0, NULL, r); if (out2[rep] == CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ -- cgit From 0e32d7126cdf30b610bc131c81f131c717bd0d77 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:27 +0200 Subject: crush: fix crush_choose_firstn comment Reflects ceph.git commit 8b38f10bc2ee3643a33ea5f9545ad5c00e4ac5b4. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/crush/mapper.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 71ce4f12a7c9..b703790b4e44 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -290,7 +290,11 @@ static int is_out(const struct crush_map *map, * @type: the type of item to choose * @out: pointer to output vector * @outpos: our position in that vector - * @recurse_to_leaf: true if we want one device under each item of given type + * @tries: number of attempts to make + * @recurse_tries: number of attempts to have recursive chooseleaf make + * @local_tries: localized retries + * @local_fallback_tries: localized fallback retries + * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) * @out2: second output vector for leaf items (if @recurse_to_leaf) */ static int crush_choose_firstn(const struct crush_map *map, -- cgit From f48db1e9ac6f1578ab7efef9f66c70279e2f0cb5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 30 Dec 2013 19:21:29 +0200 Subject: libceph: use CEPH_MON_PORT when the specified port is 0 Similar to userspace, don't bail with "parse_ips bad ip ..." if the specified port is port 0, instead use port CEPH_MON_PORT (6789, the default monitor port). Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/messenger.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index bd172e1ee0ae..d2cadb5b2b63 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1866,7 +1866,9 @@ int ceph_parse_ips(const char *c, const char *end, port = (port * 10) + (*p - '0'); p++; } - if (port > 65535 || port == 0) + if (port == 0) + port = CEPH_MON_PORT; + else if (port > 65535) goto bad; } else { port = CEPH_MON_PORT; -- cgit From 0fe8d04e8c3a1eb49089793e38b60a17cee564e3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 31 Dec 2013 13:13:30 -0500 Subject: SUNRPC: Ensure xprt_connect_status handles all potential connection errors Currently, xprt_connect_status will convert connection error values such as ECONNREFUSED, ECONNRESET, ... into EIO, which means that they never get handled. Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 04199bc8416f..ddd198e90292 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -749,6 +749,11 @@ static void xprt_connect_status(struct rpc_task *task) } switch (task->tk_status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ECONNABORTED: + case -ENETUNREACH: + case -EHOSTUNREACH: case -EAGAIN: dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid); break; -- cgit From df2772700c6ee706be7b2fd16c6bf2c1bf63cda0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 31 Dec 2013 13:11:43 -0500 Subject: SUNRPC: Handle connect errors ECONNABORTED and EHOSTUNREACH Ensure that call_bind_status, call_connect_status, call_transmit_status and call_status all are capable of handling ECONNABORTED and EHOSTUNREACH. 
Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f09b7db2c492..b9276a63eaf1 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1729,6 +1729,7 @@ call_bind_status(struct rpc_task *task) return; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: + case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: case -EHOSTUNREACH: @@ -1799,7 +1800,9 @@ call_connect_status(struct rpc_task *task) return; case -ECONNREFUSED: case -ECONNRESET: + case -ECONNABORTED: case -ENETUNREACH: + case -EHOSTUNREACH: /* retry with existing socket, after a delay */ rpc_delay(task, 3*HZ); if (RPC_IS_SOFTCONN(task)) @@ -1902,6 +1905,7 @@ call_transmit_status(struct rpc_task *task) break; } case -ECONNRESET: + case -ECONNABORTED: case -ENOTCONN: case -EPIPE: rpc_task_force_reencode(task); @@ -2011,8 +2015,9 @@ call_status(struct rpc_task *task) xprt_conditional_disconnect(req->rq_xprt, req->rq_connect_cookie); break; - case -ECONNRESET: case -ECONNREFUSED: + case -ECONNRESET: + case -ECONNABORTED: rpc_force_rebind(clnt); rpc_delay(task, 3*HZ); case -EPIPE: -- cgit From 2118071d3b0d57a03fad77885f4fdc364798aa87 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 31 Dec 2013 13:22:59 -0500 Subject: SUNRPC: Report connection error values to rpc_tasks on the pending queue Currently we only report EAGAIN, which is not descriptive enough for softconn tasks. Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index dd9d295813cf..ab006b7b7ab8 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -257,6 +257,7 @@ struct sock_xprt { void (*old_data_ready)(struct sock *, int); void (*old_state_change)(struct sock *); void (*old_write_space)(struct sock *); + void (*old_error_report)(struct sock *); }; /* @@ -274,6 +275,11 @@ struct sock_xprt { */ #define TCP_RPC_REPLY (1UL << 6) +static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) { return (struct sockaddr *) &xprt->addr; @@ -799,6 +805,7 @@ static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) transport->old_data_ready = sk->sk_data_ready; transport->old_state_change = sk->sk_state_change; transport->old_write_space = sk->sk_write_space; + transport->old_error_report = sk->sk_error_report; } static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk) @@ -806,6 +813,33 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s sk->sk_data_ready = transport->old_data_ready; sk->sk_state_change = transport->old_state_change; sk->sk_write_space = transport->old_write_space; + sk->sk_error_report = transport->old_error_report; +} + +/** + * xs_error_report - callback to handle TCP socket state errors + * @sk: socket + * + * Note: we don't call sock_error() since there may be a rpc_task + * using the socket, and so we don't want to clear sk->sk_err. 
+ */ +static void xs_error_report(struct sock *sk) +{ + struct rpc_xprt *xprt; + int err; + + read_lock_bh(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + + err = -sk->sk_err; + if (err == 0) + goto out; + dprintk("RPC: xs_error_report client %p, error=%d...\n", + xprt, -err); + xprt_wake_pending_tasks(xprt, err); + out: + read_unlock_bh(&sk->sk_callback_lock); } static void xs_reset_transport(struct sock_xprt *transport) @@ -885,11 +919,6 @@ static void xs_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) -{ - return (struct rpc_xprt *) sk->sk_user_data; -} - static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) { struct xdr_skb_reader desc = { @@ -1869,6 +1898,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_user_data = xprt; sk->sk_data_ready = xs_local_data_ready; sk->sk_write_space = xs_udp_write_space; + sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; xprt_clear_connected(xprt); @@ -2146,6 +2176,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; + sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_ATOMIC; /* socket options */ -- cgit From e8353c7682875329b8e70999e1652fd1bba8973d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 31 Dec 2013 13:39:22 -0500 Subject: SUNRPC: Add tracepoint for socket errors Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 1 + net/sunrpc/xprtsock.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index d51d16c7afd8..ddc179b7a105 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -301,6 +301,7 @@ DECLARE_EVENT_CLASS(xs_socket_event_done, DEFINE_RPC_SOCKET_EVENT(rpc_socket_state_change); DEFINE_RPC_SOCKET_EVENT_DONE(rpc_socket_connect); +DEFINE_RPC_SOCKET_EVENT_DONE(rpc_socket_error); DEFINE_RPC_SOCKET_EVENT_DONE(rpc_socket_reset_connection); DEFINE_RPC_SOCKET_EVENT(rpc_socket_close); DEFINE_RPC_SOCKET_EVENT(rpc_socket_shutdown); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ab006b7b7ab8..25dbfa971948 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -837,6 +837,7 @@ static void xs_error_report(struct sock *sk) goto out; dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -err); + trace_rpc_socket_error(xprt, sk->sk_socket, err); xprt_wake_pending_tasks(xprt, err); out: read_unlock_bh(&sk->sk_callback_lock); -- cgit From 7e55b59b2f32afc83452ae250dfd6173c9a7b515 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Tue, 31 Dec 2013 13:17:20 +0800 Subject: SUNRPC/NFSD: Support a new option for ignoring the result of svc_register NFSv4 clients can contact port 2049 directly instead of needing the portmapper. Therefore a failure to register to the portmapper when starting an NFSv4-only server isn't really a problem. But Gareth Williams reports that an attempt to start an NFSv4-only server without starting portmap fails: #rpc.nfsd -N 2 -N 3 rpc.nfsd: writing fd to kernel failed: errno 111 (Connection refused) rpc.nfsd: unable to set any sockets for nfsd Add a flag to svc_version to tell the rpc layer it can safely ignore an rpcbind failure in the NFSv4-only case. 
Reported-by: Gareth Williams Reviewed-by: Chuck Lever Signed-off-by: Kinglong Mee Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 1 + include/linux/sunrpc/svc.h | 4 +++- net/sunrpc/svc.c | 25 +++++++++++++++++-------- 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 7bac4bdbddee..41e34dfd4e5f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1882,6 +1882,7 @@ struct svc_version nfsd_version4 = { .vs_proc = nfsd_procedures4, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS4_SVC_XDRSIZE, + .vs_rpcb_optnl = 1, }; /* diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index b631642318c5..04e763221246 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -386,8 +386,10 @@ struct svc_version { struct svc_procedure * vs_proc; /* per-procedure info */ u32 vs_xdrsize; /* xdrsize needed for this version */ - unsigned int vs_hidden : 1; /* Don't register with portmapper. + unsigned int vs_hidden : 1, /* Don't register with portmapper. * Only used for nfsacl so far. */ + vs_rpcb_optnl:1;/* Don't care the result of register. + * Only used for nfsv4. */ /* Override dispatch function (e.g. when caching replies). * A return value of 0 means drop the request. diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index e7fbe368b4a3..5de6801cd924 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -916,9 +916,6 @@ static int __svc_register(struct net *net, const char *progname, #endif } - if (error < 0) - printk(KERN_WARNING "svc: failed to register %sv%u RPC " - "service (errno %d).\n", progname, version, -error); return error; } @@ -937,6 +934,7 @@ int svc_register(const struct svc_serv *serv, struct net *net, const unsigned short port) { struct svc_program *progp; + struct svc_version *vers; unsigned int i; int error = 0; @@ -946,7 +944,8 @@ int svc_register(const struct svc_serv *serv, struct net *net, for (progp = serv->sv_program; progp; progp = progp->pg_next) { for (i = 0; i < progp->pg_nvers; i++) { - if (progp->pg_vers[i] == NULL) + vers = progp->pg_vers[i]; + if (vers == NULL) continue; dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n", @@ -955,16 +954,26 @@ int svc_register(const struct svc_serv *serv, struct net *net, proto == IPPROTO_UDP? "udp" : "tcp", port, family, - progp->pg_vers[i]->vs_hidden? - " (but not telling portmap)" : ""); + vers->vs_hidden ? + " (but not telling portmap)" : ""); - if (progp->pg_vers[i]->vs_hidden) + if (vers->vs_hidden) continue; error = __svc_register(net, progp->pg_name, progp->pg_prog, i, family, proto, port); - if (error < 0) + + if (vers->vs_rpcb_optnl) { + error = 0; + continue; + } + + if (error < 0) { + printk(KERN_WARNING "svc: failed to register " + "%sv%u RPC service (errno %d).\n", + progp->pg_name, i, -error); break; + } } } -- cgit From 6ff33b7dd0228b7d7ed44791bbbc98b03fd15d9d Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 17 Dec 2013 12:16:11 -0500 Subject: sunrpc: Fix infinite loop in RPC state machine When a task enters call_refreshresult with status 0 from call_refresh and !rpcauth_uptodatecred(task) it enters call_refresh again with no rate-limiting or max number of retries. Instead of trying forever, make use of the retry path that other errors use. This only seems to be possible when the crrefresh callback is gss_refresh_null, which only happens when destroying the context. To reproduce: 1) mount with sec=krb5 (or sec=sys with krb5 negotiated for non FSID specific operations). 
2) reboot - the client will be stuck and will need to be hard rebooted BUG: soft lockup - CPU#0 stuck for 22s! [kworker/0:2:46] Modules linked in: rpcsec_gss_krb5 nfsv4 nfs fscache ppdev crc32c_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd serio_raw i2c_piix4 i2c_core e1000 parport_pc parport shpchp nfsd auth_rpcgss oid_registry exportfs nfs_acl lockd sunrpc autofs4 mptspi scsi_transport_spi mptscsih mptbase ata_generic floppy irq event stamp: 195724 hardirqs last enabled at (195723): [] restore_args+0x0/0x30 hardirqs last disabled at (195724): [] apic_timer_interrupt+0x6a/0x80 softirqs last enabled at (195722): [] __do_softirq+0x1df/0x276 softirqs last disabled at (195717): [] irq_exit+0x53/0x9a CPU: 0 PID: 46 Comm: kworker/0:2 Not tainted 3.13.0-rc3-branch-dros_testing+ #4 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013 Workqueue: rpciod rpc_async_schedule [sunrpc] task: ffff8800799c4260 ti: ffff880079002000 task.ti: ffff880079002000 RIP: 0010:[] [] __rpc_execute+0x8a/0x362 [sunrpc] RSP: 0018:ffff880079003d18 EFLAGS: 00000246 RAX: 0000000000000005 RBX: 0000000000000007 RCX: 0000000000000007 RDX: 0000000000000007 RSI: ffff88007aecbae8 RDI: ffff8800783d8900 RBP: ffff880079003d78 R08: ffff88006e30e9f8 R09: ffffffffa005a3d7 R10: ffff88006e30e7b0 R11: ffff8800783d8900 R12: ffffffffa006675e R13: ffff880079003ce8 R14: ffff88006e30e7b0 R15: ffff8800783d8900 FS: 0000000000000000(0000) GS:ffff88007f200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3072333000 CR3: 0000000001a0b000 CR4: 00000000001407f0 Stack: ffff880079003d98 0000000000000246 0000000000000000 ffff88007a9a4830 ffff880000000000 ffffffff81073f47 ffff88007f212b00 ffff8800799c4260 ffff8800783d8988 ffff88007f212b00 ffffe8ffff604800 0000000000000000 Call Trace: [] ? trace_hardirqs_on_caller+0x145/0x1a1 [] rpc_async_schedule+0x27/0x32 [sunrpc] [] process_one_work+0x211/0x3a5 [] ? process_one_work+0x172/0x3a5 [] worker_thread+0x134/0x202 [] ? rescuer_thread+0x280/0x280 [] ? rescuer_thread+0x280/0x280 [] kthread+0xc9/0xd1 [] ? __kthread_parkme+0x61/0x61 [] ret_from_fork+0x7c/0xb0 [] ? 
__kthread_parkme+0x61/0x61 Code: e8 87 63 fd e0 c6 05 10 dd 01 00 01 48 8b 43 70 4c 8d 6b 70 45 31 e4 a8 02 0f 85 d5 02 00 00 4c 8b 7b 48 48 c7 43 48 00 00 00 00 <4c> 8b 4b 50 4d 85 ff 75 0c 4d 85 c9 4d 89 cf 0f 84 32 01 00 00 And the output of "rpcdebug -m rpc -s all": RPC: 61 call_refresh (status 0) RPC: 61 call_refresh (status 0) RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 RPC: 61 call_refreshresult (status 0) RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 RPC: 61 call_refreshresult (status 0) RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 RPC: 61 call_refresh (status 0) RPC: 61 call_refreshresult (status 0) RPC: 61 call_refresh (status 0) RPC: 61 call_refresh (status 0) RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 RPC: 61 call_refreshresult (status 0) RPC: 61 call_refresh (status 0) RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 RPC: 61 call_refresh (status 0) RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 RPC: 61 call_refreshresult (status 0) RPC: 61 call_refresh (status 0) RPC: 61 call_refresh (status 0) RPC: 61 call_refresh (status 0) RPC: 61 call_refresh (status 0) RPC: 61 call_refreshresult (status 0) RPC: 61 refreshing RPCSEC_GSS cred ffff88007a413cf0 Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org # 2.6.37+ Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b9276a63eaf1..0edada973434 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1529,9 +1529,13 @@ call_refreshresult(struct rpc_task *task) task->tk_action = call_refresh; switch (status) { case 0: - if (rpcauth_uptodatecred(task)) + if (rpcauth_uptodatecred(task)) { task->tk_action = call_allocate; - return; + return; + } + /* Use rate-limiting and a max number of retries if refresh + * had status 0 but failed to update the cred. + */ case -ETIMEDOUT: rpc_delay(task, 3*HZ); case -EAGAIN: -- cgit From 1654a04cd702fd19c297c36300a6ab834cf8c072 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 4 Jan 2014 07:18:03 -0500 Subject: sunrpc: don't wait for write before allowing reads from use-gss-proxy file It doesn't make much sense to make reads from this procfile hang. As far as I can tell, only gssproxy itself will open this file and it never reads from it. Change it to just give the present setting of sn->use_gss_proxy without waiting for anything. Note that we do not want to call use_gss_proxy() in this codepath since an inopportune read of this file could cause it to be disabled prematurely. Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 -- net/sunrpc/auth_gss/svcauth_gss.c | 33 ++------------------------------- net/sunrpc/netns.h | 1 - 3 files changed, 2 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 458f85e9b0ba..abbb7dcd1689 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -137,7 +137,6 @@ void init_gssp_clnt(struct sunrpc_net *sn) { mutex_init(&sn->gssp_lock); sn->gssp_clnt = NULL; - init_waitqueue_head(&sn->gssp_wq); } int set_gssp_clnt(struct net *net) @@ -154,7 +153,6 @@ int set_gssp_clnt(struct net *net) sn->gssp_clnt = clnt; } mutex_unlock(&sn->gssp_lock); - wake_up(&sn->gssp_wq); return ret; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 008cdade5aae..1b94a9c8a242 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1295,34 +1295,9 @@ static int set_gss_proxy(struct net *net, int type) else ret = -EBUSY; spin_unlock(&use_gssp_lock); - wake_up(&sn->gssp_wq); return ret; } -static inline bool gssp_ready(struct sunrpc_net *sn) -{ - switch (sn->use_gss_proxy) { - case -1: - return false; - case 0: - return true; - case 1: - return sn->gssp_clnt; - } - WARN_ON_ONCE(1); - return false; -} - -static int wait_for_gss_proxy(struct net *net, struct file *file) -{ - struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - - if (file->f_flags & O_NONBLOCK && !gssp_ready(sn)) - return -EAGAIN; - return wait_event_interruptible(sn->gssp_wq, gssp_ready(sn)); -} - - static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -1355,16 +1330,12 @@ static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct net *net = PDE_DATA(file_inode(file)); + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; size_t len; - int ret; - - ret = wait_for_gss_proxy(net, file); - if (ret) - return ret; - snprintf(tbuf, sizeof(tbuf), "%d\n", use_gss_proxy(net)); + snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy); len = strlen(tbuf); if (p >= len) return 0; diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 779742cfc1ff..3a260e47fad2 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -26,7 +26,6 @@ struct sunrpc_net { unsigned int rpcb_is_af_local : 1; struct mutex gssp_lock; - wait_queue_head_t gssp_wq; struct rpc_clnt *gssp_clnt; int use_gss_proxy; int pipe_version; -- cgit From a92e5eb1103341e985a575e48e26f87fbb9b1679 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 4 Jan 2014 07:18:04 -0500 Subject: sunrpc: fix potential race between setting use_gss_proxy and the upcall rpc_clnt An nfsd thread can call use_gss_proxy and find it set to '1' but find gssp_clnt still NULL, so that when it attempts the upcall the result will be an unnecessary -EIO. So, ensure that gssp_clnt is created first, and set the use_gss_proxy variable only if that succeeds. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 1b94a9c8a242..60dc3700b2cb 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1317,10 +1317,10 @@ static ssize_t write_gssp(struct file *file, const char __user *buf, return res; if (i != 1) return -EINVAL; - res = set_gss_proxy(net, 1); + res = set_gssp_clnt(net); if (res) return res; - res = set_gssp_clnt(net); + res = set_gss_proxy(net, 1); if (res) return res; return count; -- cgit From 0fdc26785d0a5bb33d9adb572307fd2d7a406734 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 4 Jan 2014 07:18:05 -0500 Subject: sunrpc: get rid of use_gssp_lock We can achieve the same result with a cmpxchg(). This also fixes a potential race in use_gss_proxy(). The value of sn->use_gss_proxy could go from -1 to 1 just after we check it in use_gss_proxy() but before we acquire the spinlock. The procfile write would end up returning success but the value would flip to 0 soon afterward. With this method we not only avoid locking but the first "setter" always wins. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 42 +++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 60dc3700b2cb..2a935404047f 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1263,41 +1263,35 @@ out: return ret; } -DEFINE_SPINLOCK(use_gssp_lock); - -static bool use_gss_proxy(struct net *net) +/* + * Try to set the sn->use_gss_proxy variable to a new value. We only allow + * it to be changed if it's currently undefined (-1). If it's any other value + * then return -EBUSY unless the type wouldn't have changed anyway. + */ +static int set_gss_proxy(struct net *net, int type) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + int ret; - if (sn->use_gss_proxy != -1) - return sn->use_gss_proxy; - spin_lock(&use_gssp_lock); - /* - * If you wanted gss-proxy, you should have said so before - * starting to accept requests: - */ - sn->use_gss_proxy = 0; - spin_unlock(&use_gssp_lock); + WARN_ON_ONCE(type != 0 && type != 1); + ret = cmpxchg(&sn->use_gss_proxy, -1, type); + if (ret != -1 && ret != type) + return -EBUSY; return 0; } -#ifdef CONFIG_PROC_FS - -static int set_gss_proxy(struct net *net, int type) +static bool use_gss_proxy(struct net *net) { struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); - int ret = 0; - WARN_ON_ONCE(type != 0 && type != 1); - spin_lock(&use_gssp_lock); - if (sn->use_gss_proxy == -1 || sn->use_gss_proxy == type) - sn->use_gss_proxy = type; - else - ret = -EBUSY; - spin_unlock(&use_gssp_lock); - return ret; + /* If use_gss_proxy is still undefined, then try to disable it */ + if (sn->use_gss_proxy == -1) + set_gss_proxy(net, 0); + return sn->use_gss_proxy; } +#ifdef CONFIG_PROC_FS + static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { -- cgit From bba0f88bf7c4ad467eeb3a17443de1f9cd0437e0 Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Fri, 18 Jan 2013 11:33:08 -0500 Subject: minor svcauth_gss.c cleanup --- net/sunrpc/auth_gss/svcauth_gss.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 2a935404047f..0f73f4507746 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1591,8 +1591,7 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) BUG_ON(integ_len % 4); *p++ = htonl(integ_len); *p++ = htonl(gc->gc_seq); - if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, - integ_len)) + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) BUG(); if (resbuf->tail[0].iov_base == NULL) { if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE) @@ -1600,10 +1599,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) resbuf->tail[0].iov_base = resbuf->head[0].iov_base + resbuf->head[0].iov_len; resbuf->tail[0].iov_len = 0; - resv = &resbuf->tail[0]; - } else { - resv = &resbuf->tail[0]; } + resv = &resbuf->tail[0]; mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic)) goto out_err; -- cgit From 3cea4c3071d4e55e9d7356efe9d0ebf92f0c2204 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Jan 2014 20:08:21 +0200 Subject: libceph: rename ceph_msg::front_max to front_alloc_len Rename front_max field of struct ceph_msg to front_alloc_len to make its purpose more clear. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 6 +++--- net/ceph/mon_client.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c1d3f5a65273..861138f7c161 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -157,7 +157,7 @@ struct ceph_msg { bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; - int front_max; + int front_alloc_len; unsigned long ack_stamp; /* tx: when we were acked */ struct ceph_msgpool *pool; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d2cadb5b2b63..f4d411c017e7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3130,7 +3130,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->data); /* front */ - m->front_max = front_len; + m->front_alloc_len = front_len; if (front_len) { if (front_len > PAGE_CACHE_SIZE) { m->front.iov_base = __vmalloc(front_len, flags, @@ -3305,8 +3305,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, msg->data_length); + pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, + msg->front_alloc_len, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 1fe25cd29d0e..2ac9ef35110b 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -152,7 +152,7 @@ static int __open_session(struct ceph_mon_client *monc) /* initiatiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); __send_prepared_auth_request(monc, ret); } else { dout("open_session mon%d already open\n", monc->cur_mon); @@ -196,7 +196,7 @@ static void 
__send_subscribe(struct ceph_mon_client *monc) int num; p = msg->front.iov_base; - end = p + msg->front_max; + end = p + msg->front_alloc_len; num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; ceph_encode_32(&p, num); @@ -897,7 +897,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); if (ret < 0) { monc->client->auth_err = ret; wake_up_all(&monc->client->auth_wq); @@ -939,7 +939,7 @@ static int __validate_auth(struct ceph_mon_client *monc) return 0; ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); if (ret <= 0) return ret; /* either an error, or no need to authenticate */ __send_prepared_auth_request(monc, ret); -- cgit From 3f0a4ac55fe036902e3666be740da63528ad8639 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Jan 2014 20:08:21 +0200 Subject: libceph: rename front to front_len in get_reply() Rename front local variable to front_len in get_reply() to make its purpose more clear. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9f1993582ff7..7619c37bfed4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2502,7 +2502,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, struct ceph_osd_client *osdc = osd->o_osdc; struct ceph_msg *m; struct ceph_osd_request *req; - int front = le32_to_cpu(hdr->front_len); + int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid; @@ -2522,12 +2522,13 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply, req->r_reply->con); ceph_msg_revoke_incoming(req->r_reply); - if (front > req->r_reply->front.iov_len) { + if (front_len > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n", - front, (int)req->r_reply->front.iov_len, + front_len, (int)req->r_reply->front.iov_len, (unsigned int)con->peer_name.type, le64_to_cpu(con->peer_name.num)); - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, + false); if (!m) goto out; ceph_msg_put(req->r_reply); -- cgit From f2be82b0058e90b5d9ac2cb896b4914276fb50ef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Jan 2014 20:08:21 +0200 Subject: libceph: fix preallocation check in get_reply() The check that makes sure that we have enough memory allocated to read in the entire header of the message in question is currently busted. It compares front_len of the incoming message with iov_len field of ceph_msg::front structure, which is used primarily to indicate the amount of data already read in, and not the size of the allocated buffer. Under certain conditions (e.g. a short read from a socket followed by that socket's shutdown and owning ceph_connection reset) this results in a warning similar to [85688.975866] libceph: get_reply front 198 > preallocated 122 (4#0) and, through another bug, leads to forever hung tasks and forced reboots. Fix this by comparing front_len with front_alloc_len field of struct ceph_msg, which stores the actual size of the buffer. 
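To make the distinction concrete, a minimal sketch (the numbers are illustrative, taken from the warning quoted above; field names are those of struct ceph_msg):

	/*
	 * After a short read the two fields diverge:
	 *   req->r_reply->front_alloc_len   capacity of the preallocated buffer
	 *   req->r_reply->front.iov_len     bytes read in so far (here only 122)
	 * so the "is the reply too big" check must use front_alloc_len:
	 */
	if (front_len > req->r_reply->front_alloc_len)
		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false);
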
Fixes: http://tracker.ceph.com/issues/5425 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/messenger.c | 3 +-- net/ceph/osd_client.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f4d411c017e7..252ad4e01cf8 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3130,7 +3130,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->data); /* front */ - m->front_alloc_len = front_len; if (front_len) { if (front_len > PAGE_CACHE_SIZE) { m->front.iov_base = __vmalloc(front_len, flags, @@ -3147,7 +3146,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, } else { m->front.iov_base = NULL; } - m->front.iov_len = front_len; + m->front_alloc_len = m->front.iov_len = front_len; dout("ceph_msg_new %p front %d\n", m, front_len); return m; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7619c37bfed4..733195170490 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2522,9 +2522,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply, req->r_reply->con); ceph_msg_revoke_incoming(req->r_reply); - if (front_len > req->r_reply->front.iov_len) { + if (front_len > req->r_reply->front_alloc_len) { pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n", - front_len, (int)req->r_reply->front.iov_len, + front_len, req->r_reply->front_alloc_len, (unsigned int)con->peer_name.type, le64_to_cpu(con->peer_name.num)); m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, -- cgit From c692554bf4fe50a0187744663acc935d10e210a2 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Sun, 19 Jan 2014 21:50:51 +0000 Subject: gss_krb5: use lcm from kernel lib Replace hardcoded lowest common multiple algorithm by the lcm() function in kernel lib. Signed-off-by: Luis Henriques Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_krb5_keys.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 76e42e6be755..24589bd2a4b6 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -59,6 +59,7 @@ #include #include #include +#include #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -72,7 +73,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, u32 outbits, u8 *out) { - int a, b, c, lcm; + unsigned long ulcm; int byte, i, msbit; /* the code below is more readable if I make these bytes @@ -82,17 +83,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, outbits >>= 3; /* first compute lcm(n,k) */ - - a = outbits; - b = inbits; - - while (b != 0) { - c = b; - b = a%b; - a = c; - } - - lcm = outbits*inbits/a; + ulcm = lcm(inbits, outbits); /* now do the real work */ @@ -101,7 +92,7 @@ static void krb5_nfold(u32 inbits, const u8 *in, /* this will end up cycling through k lcm(k,n)/k times, which is correct */ - for (i = lcm-1; i >= 0; i--) { + for (i = ulcm-1; i >= 0; i--) { /* compute the msbit in k which gets added into this byte */ msbit = ( /* first, start with the msbit in the first, -- cgit From eeb0bed5572b1282009dfc2635604df5a35d1a02 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Jan 2014 20:08:21 +0200 Subject: libceph: add ceph_kv{malloc,free}() and switch to them Encapsulate kmalloc vs vmalloc memory allocation and freeing logic into two helpers, ceph_kvmalloc() and ceph_kvfree(), and switch to them. 
ceph_kvmalloc() kmalloc()'s a maximum of 8 pages, anything bigger is vmalloc()'ed with __GFP_HIGHMEM set. This changes the existing behaviour: - for buffers (ceph_buffer_new()), from trying to kmalloc() everything and using vmalloc() just as a fallback - for messages (ceph_msg_new()), from going to vmalloc() for anything bigger than a page - for messages (ceph_msg_new()), from disallowing vmalloc() to use high memory Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/buffer.h | 1 - include/linux/ceph/libceph.h | 11 +++++++---- include/linux/ceph/messenger.h | 1 - net/ceph/buffer.c | 22 ++++++---------------- net/ceph/ceph_common.c | 20 ++++++++++++++++++++ net/ceph/messenger.c | 13 ++----------- 6 files changed, 35 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h index 58d19014068f..07ad423cc37f 100644 --- a/include/linux/ceph/buffer.h +++ b/include/linux/ceph/buffer.h @@ -17,7 +17,6 @@ struct ceph_buffer { struct kref kref; struct kvec vec; size_t alloc_len; - bool is_vmalloc; }; extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 7d704db60cbb..2f49aa4c4f7f 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -173,15 +173,18 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_CACHE_SHIFT); } +extern struct kmem_cache *ceph_inode_cachep; +extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_dentry_cachep; +extern struct kmem_cache *ceph_file_cachep; + /* ceph_common.c */ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern struct kmem_cache *ceph_inode_cachep; -extern struct kmem_cache *ceph_cap_cachep; -extern struct kmem_cache *ceph_dentry_cachep; -extern struct kmem_cache *ceph_file_cachep; +extern void *ceph_kvmalloc(size_t size, gfp_t flags); +extern void ceph_kvfree(const void *ptr); extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 861138f7c161..20ee8b63a968 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -154,7 +154,6 @@ struct ceph_msg { struct list_head list_head; /* links for connection lists */ struct kref kref; - bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; int front_alloc_len; diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index bf3e6a13c215..621b5f65407f 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -6,6 +6,7 @@ #include #include +#include /* for ceph_kv{malloc,free} */ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -15,16 +16,10 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (!b) return NULL; - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); - if (b->vec.iov_base) { - b->is_vmalloc = false; - } else { - b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); - if (!b->vec.iov_base) { - kfree(b); - return NULL; - } - b->is_vmalloc = true; + b->vec.iov_base = ceph_kvmalloc(len, gfp); + if (!b->vec.iov_base) { + kfree(b); + return NULL; } kref_init(&b->kref); @@ -40,12 +35,7 @@ void ceph_buffer_release(struct kref *kref) struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); dout("buffer_release %p\n", b); - if (b->vec.iov_base) 
{ - if (b->is_vmalloc) - vfree(b->vec.iov_base); - else - kfree(b->vec.iov_base); - } + ceph_kvfree(b->vec.iov_base); kfree(b); } EXPORT_SYMBOL(ceph_buffer_release); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 43d8177a52e1..67d7721d237e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -170,6 +171,25 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +void *ceph_kvmalloc(size_t size, gfp_t flags) +{ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *ptr = kmalloc(size, flags | __GFP_NOWARN); + if (ptr) + return ptr; + } + + return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); +} + +void ceph_kvfree(const void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int parse_fsid(const char *str, struct ceph_fsid *fsid) { diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 252ad4e01cf8..2ed1304d22a7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3131,13 +3131,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, /* front */ if (front_len) { - if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, flags, - PAGE_KERNEL); - m->front_is_vmalloc = true; - } else { - m->front.iov_base = kmalloc(front_len, flags); - } + m->front.iov_base = ceph_kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); @@ -3259,10 +3253,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) void ceph_msg_kfree(struct ceph_msg *m) { dout("msg_kfree %p\n", m); - if (m->front_is_vmalloc) - vfree(m->front.iov_base); - else - kfree(m->front.iov_base); + ceph_kvfree(m->front.iov_base); kmem_cache_free(ceph_msg_cache, m); } -- cgit From 0b4af2e8c9f3fc9c31d2f9374b79af2c890ef897 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 16 Jan 2014 19:18:27 +0200 Subject: libceph: dout() is missing a newline Add a missing newline to a dout() in __reset_osd(). Signed-off-by: Ilya Dryomov --- net/ceph/osd_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 733195170490..959d332ed534 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1044,8 +1044,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) !ceph_con_opened(&osd->o_con)) { struct ceph_osd_request *req; - dout(" osd addr hasn't changed and connection never opened," - " letting msgr retry"); + dout("osd addr hasn't changed and connection never opened, " + "letting msgr retry\n"); /* touch each r_stamp for handle_timeout()'s benfit */ list_for_each_entry(req, &osd->o_requests, r_osd_item) req->r_stamp = jiffies; -- cgit From 8f22ba61b5d730a870cd6b10d299d23280d060fa Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sun, 26 Jan 2014 11:39:26 +0000 Subject: RxRPC: do not unlock unheld spinlock in rxrpc_connect_exclusive() If rx->conn is not NULL, rxrpc_connect_exclusive() does not acquire the transport's client lock, but it still releases it. The patch adds locking of the spinlock to this path. Found by Linux Driver Verification project (linuxtesting.org). 
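A condensed sketch of the balanced locking after the fix (simplified; allocation, error handling and the channel-attach logic are omitted):

	/* rxrpc_connect_exclusive(), abridged: both branches must leave
	 * trans->client_lock held, because the common tail releases it. */
	if (!rx->conn) {
		spin_lock(&trans->client_lock);
		/* ... allocate conn, rxrpc_assign_connection_id(conn) ... */
		rx->conn = conn;
	} else {
		spin_lock(&trans->client_lock);	/* the line this patch adds */
	}
	/* ... attach the call to a free channel on conn ... */
	spin_unlock(&trans->client_lock);
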
Signed-off-by: Alexey Khoroshilov Signed-off-by: David Howells --- net/rxrpc/ar-connection.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 4106ca95ec86..7bf5b5b9e8b9 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -381,6 +381,8 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx, rxrpc_assign_connection_id(conn); rx->conn = conn; + } else { + spin_lock(&trans->client_lock); } /* we've got a connection with a free channel and we can now attach the -- cgit From 24a9981ee9c7266fa171bbece62062bf78c4d246 Mon Sep 17 00:00:00 2001 From: Tim Smith Date: Sun, 26 Jan 2014 11:39:28 +0000 Subject: af_rxrpc: Avoid setting up double-free on checksum error skb_kill_datagram() does not dequeue the skb when MSG_PEEK is unset. This leaves a free'd skb on the queue, resulting a double-free later. Without this, the following oops can occur: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] skb_dequeue+0x47/0x70 PGD 0 Oops: 0002 [#1] SMP Modules linked in: af_rxrpc ... CPU: 0 PID: 1191 Comm: listen Not tainted 3.12.0+ #4 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8801183536b0 ti: ffff880035c92000 task.ti: ffff880035c92000 RIP: 0010:[] skb_dequeue+0x47/0x70 RSP: 0018:ffff880035c93db8 EFLAGS: 00010097 RAX: 0000000000000246 RBX: ffff8800d2754b00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000202 RDI: ffff8800d254c084 RBP: ffff880035c93dd0 R08: ffff880035c93cf0 R09: ffff8800d968f270 R10: 0000000000000000 R11: 0000000000000293 R12: ffff8800d254c070 R13: ffff8800d254c084 R14: ffff8800cd861240 R15: ffff880119b39720 FS: 00007f37a969d740(0000) GS:ffff88011fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000008 CR3: 00000000d4413000 CR4: 00000000000006f0 Stack: ffff8800d254c000 ffff8800d254c070 ffff8800d254c2c0 ffff880035c93df8 ffffffffa041a5b8 ffff8800cd844c80 ffffffffa04385a0 ffff8800cd844cb0 ffff880035c93e18 ffffffff81546cef ffff8800d45fea00 0000000000000008 Call Trace: [] rxrpc_release+0x128/0x2e0 [af_rxrpc] [] sock_release+0x1f/0x80 [] sock_close+0x12/0x20 [] __fput+0xe1/0x230 [] ____fput+0xe/0x10 [] task_work_run+0xbc/0xe0 [] do_exit+0x2be/0xa10 [] ? do_munmap+0x297/0x3b0 [] do_group_exit+0x3f/0xa0 [] SyS_exit_group+0x14/0x20 [] system_call_fastpath+0x16/0x1b Signed-off-by: Tim Smith Signed-off-by: David Howells --- net/rxrpc/ar-recvmsg.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 898492a8d61b..64cba2e35156 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -353,6 +353,10 @@ csum_copy_error: if (continue_call) rxrpc_put_call(continue_call); rxrpc_kill_skb(skb); + if (!(flags & MSG_PEEK)) { + if (skb_dequeue(&rx->sk.sk_receive_queue) != skb) + BUG(); + } skb_kill_datagram(&rx->sk, skb, flags); rxrpc_put_call(call); return -EAGAIN; -- cgit From 1ea427359dde1573815e19c411ce08fdf0c42cfe Mon Sep 17 00:00:00 2001 From: Tim Smith Date: Sun, 26 Jan 2014 11:39:31 +0000 Subject: af_rxrpc: Handle frames delivered from another VM On input, CHECKSUM_PARTIAL should be treated the same way as CHECKSUM_UNNECESSARY. 
See include/linux/skbuff.h Signed-off-by: Tim Smith Signed-off-by: David Howells --- net/rxrpc/ar-recvmsg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 64cba2e35156..34b5490dde65 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -180,7 +180,8 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (copy > len - copied) copy = len - copied; - if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->ip_summed == CHECKSUM_UNNECESSARY || + skb->ip_summed == CHECKSUM_PARTIAL) { ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); } else { -- cgit From de960aa9ab4decc3304959f69533eef64d05d8e8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 26 Jan 2014 10:58:16 +0100 Subject: net: add and use skb_gso_transport_seglen() This moves part of Eric Dumazets skb_gso_seglen helper from tbf sched to skbuff core so it may be reused by upcoming ip forwarding path patch. Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 25 +++++++++++++++++++++++++ net/sched/sch_tbf.c | 13 +++---------- 3 files changed, 29 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1f689e62e4cb..f589c9af8cbf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2456,6 +2456,7 @@ void skb_zerocopy(struct sk_buff *to, const struct sk_buff *from, void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct skb_checksum_ops { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8f519dbb358b..9ae6d11374d1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -47,6 +47,8 @@ #include #include #include +#include +#include #include #ifdef CONFIG_NET_CLS_ACT #include @@ -3916,3 +3918,26 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) nf_reset_trace(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); + +/** + * skb_gso_transport_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_transport_seglen is used to determine the real size of the + * individual segments, including Layer4 headers (TCP/UDP). + * + * The MAC/L2 or network (IP, IPv6) headers are not accounted for. + */ +unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int hdr_len; + + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + hdr_len = tcp_hdrlen(skb); + else + hdr_len = sizeof(struct udphdr); + return hdr_len + shinfo->gso_size; +} +EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index fbba5b0ec121..1cb413fead89 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -21,7 +21,6 @@ #include #include #include -#include /* Simple Token Bucket Filter. 
@@ -148,16 +147,10 @@ static u64 psched_ns_t2l(const struct psched_ratecfg *r, * Return length of individual segments of a gso packet, * including all headers (MAC, IP, TCP/UDP) */ -static unsigned int skb_gso_seglen(const struct sk_buff *skb) +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - const struct skb_shared_info *shinfo = skb_shinfo(skb); - - if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) - hdr_len += tcp_hdrlen(skb); - else - hdr_len += sizeof(struct udphdr); - return hdr_len + shinfo->gso_size; + return hdr_len + skb_gso_transport_seglen(skb); } /* GSO packet is too big, segment it so that tbf can transmit @@ -202,7 +195,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) int ret; if (qdisc_pkt_len(skb) > q->max_size) { - if (skb_is_gso(skb) && skb_gso_seglen(skb) <= q->max_size) + if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size) return tbf_segment(skb, sch); return qdisc_reshape_fail(skb, sch); } -- cgit From 0ea9de0ea6a4e4a1d343130b2a159b4f986e288e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 27 Jan 2014 14:53:27 -0500 Subject: sunrpc: turn warn_gssd() log message into a dprintk() The original printk() made sense when the GSSAPI codepaths were called only when sec=krb5* was explicitly requested. Now however, in many cases the nfs client will try to acquire GSSAPI credentials by default, even when it's not requested. Since we don't have a great mechanism to distinguish between the two cases, just turn the pr_warn into a dprintk instead. With this change we can also get rid of the ratelimiting. We do need to keep the EXPORT_SYMBOL(gssd_running) in place since auth_gss.ko needs it and sunrpc.ko provides it. We can however, eliminate the gssd_running call in the nfs code since that's a bit of a layering violation. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 5 +---- net/sunrpc/auth_gss/auth_gss.c | 8 +------- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 73d4ecda1e36..dbb3e1f30c68 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -372,10 +372,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = -EINVAL; - if (gssd_running(clp->cl_net)) - error = nfs_create_rpc_client(clp, timeparms, - RPC_AUTH_GSS_KRB5I); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 0a2aee060f9f..6c0513a7f992 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -532,13 +532,7 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) static void warn_gssd(void) { - static unsigned long ratelimit; - unsigned long now = jiffies; - - if (time_after(now, ratelimit)) { - pr_warn("RPC: AUTH_GSS upcall failed. Please check user daemon is running.\n"); - ratelimit = now + 15*HZ; - } + dprintk("AUTH_GSS upcall failed. Please check user daemon is running.\n"); } static inline int -- cgit From 27d79f3b1071b2a2d58443a130e92c381c838e5d Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Mon, 27 Jan 2014 12:13:57 +0530 Subject: net: ipv4: Use PTR_ERR_OR_ZERO PTR_RET is deprecated. 
Use PTR_ERR_OR_ZERO instead. While at it also include missing err.h header. Signed-off-by: Sachin Kamat Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c0e3cb72ad70..bd28f386bd02 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -930,7 +931,7 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, } rtnl_unlock(); - return PTR_RET(itn->fb_tunnel_dev); + return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -- cgit From 22116525baec1d63f4878eaa92f0b57946a78819 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: start using oloc abstraction Instead of relying on pool fields in ceph_file_layout (for mapping) and ceph_pg (for enconding), start using ceph_object_locator (oloc) abstraction. Note that userspace oloc currently consists of pool, key, nspace and hash fields, while this one contains only a pool. This is OK, because at this point we only send (i.e. encode) olocs and never have to receive (i.e. decode) them. This makes keeping a copy of ceph_file_layout in every osd request unnecessary, so ceph_osd_request::r_file_layout field is nuked. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- drivers/block/rbd.c | 8 ++++---- include/linux/ceph/osd_client.h | 3 ++- include/linux/ceph/osdmap.h | 3 +-- net/ceph/osd_client.c | 8 +++++--- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 72a7eec456a9..6614e8d95525 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1808,12 +1808,12 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; + osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + osd_req->r_oid_len = strlen(obj_request->object_name); rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - osd_req->r_file_layout = rbd_dev->layout; /* struct */ - return osd_req; } @@ -1849,12 +1849,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; + osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + osd_req->r_oid_len = strlen(obj_request->object_name); rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - osd_req->r_file_layout = rbd_dev->layout; /* struct */ - return osd_req; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4fb6a8938957..5b8555109789 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -159,12 +159,13 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ + struct ceph_object_locator r_oloc; + char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ - struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ }; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index d05cc4451af6..256134af4ad4 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -40,8 +40,7 @@ 
struct ceph_pg_pool_info { }; struct ceph_object_locator { - uint64_t pool; - char *key; + s64 pool; }; struct ceph_pg_mapping { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 959d332ed534..7130c5c83d79 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -368,6 +368,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); + req->r_oloc.pool = -1; + /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -761,7 +763,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (num_ops > 1) osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); - req->r_file_layout = *layout; /* keep a copy */ + req->r_oloc.pool = ceph_file_layout_pg_pool(*layout); snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, objnum); @@ -1268,7 +1270,7 @@ static int __map_request(struct ceph_osd_client *osdc, dout("map_request %p tid %lld\n", req, req->r_tid); err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, - ceph_file_layout_pg_pool(req->r_file_layout)); + req->r_oloc.pool); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; @@ -1354,7 +1356,7 @@ static void __send_request(struct ceph_osd_client *osdc, /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); + put_unaligned_le64(req->r_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); ceph_encode_32(&p, req->r_pgid.seed); -- cgit From 2d0ebc5d591f49131bf8f93b54c5424162c3fb7f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN In preparation for adding oid abstraction, rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- drivers/block/rbd.c | 6 +++--- include/linux/ceph/osd_client.h | 4 ++-- net/ceph/osd_client.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6614e8d95525..98792b2a7f65 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1088,9 +1088,9 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) name_format = "%s.%012llx"; if (rbd_dev->image_format == 2) name_format = "%s.%016llx"; - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, + ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { + if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { pr_err("error formatting segment name for #%llu (%d)\n", segment, ret); kfree(name); @@ -5350,7 +5350,7 @@ static int rbd_slab_init(void) rbd_assert(!rbd_segment_name_cache); rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", - MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); + CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); if (rbd_segment_name_cache) return 0; out_err: diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 5b8555109789..b42f15848cae 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -16,7 +16,7 @@ * Maximum object name size * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) */ -#define MAX_OBJ_NAME_SIZE 100 +#define CEPH_MAX_OID_NAME_LEN 100 struct ceph_msg; struct ceph_snap_context; @@ -161,7 +161,7 @@ struct ceph_osd_request { struct ceph_object_locator r_oloc; - char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ + char r_oid[CEPH_MAX_OID_NAME_LEN]; /* object name */ int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7130c5c83d79..a053e7e4a780 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -338,7 +338,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ - msg_size += 4 + MAX_OBJ_NAME_SIZE; + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ -- cgit From 4295f2217a5aa8ef2738e3a368db3c1ceab41212 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: introduce and start using oid abstraction In preparation for tiering support, which would require having two (base and target) object names for each osd request and also copying those names around, introduce struct ceph_object_id (oid) and a couple helpers to facilitate those copies and encapsulate the fact that object name is not necessarily a NUL-terminated string. 
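A minimal usage sketch of the new helpers (the object name is a made-up example; the pattern matches how rbd and osd_client use them in this series):

	struct ceph_object_id base_oid, target_oid;

	/* copy a NUL-terminated name into the fixed-size buffer and record
	 * its length, truncating (with a WARN) if it exceeds
	 * CEPH_MAX_OID_NAME_LEN */
	ceph_oid_set_name(&base_oid, "rb.0.1234.000000000000");

	/* duplicate one oid into another without assuming NUL termination */
	ceph_oid_copy(&target_oid, &base_oid);
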
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- drivers/block/rbd.c | 10 ++-------- include/linux/ceph/osd_client.h | 9 +-------- include/linux/ceph/osdmap.h | 36 ++++++++++++++++++++++++++++++++++++ net/ceph/debugfs.c | 3 ++- net/ceph/osd_client.c | 17 +++++++++-------- 5 files changed, 50 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 98792b2a7f65..6fdc5fc00447 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1809,10 +1809,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_priv = obj_request; osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); + ceph_oid_set_name(&osd_req->r_oid, obj_request->object_name); return osd_req; } @@ -1850,10 +1847,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_priv = obj_request; osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); + ceph_oid_set_name(&osd_req->r_oid, obj_request->object_name); return osd_req; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index b42f15848cae..8d8bb53994b1 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -12,12 +12,6 @@ #include #include -/* - * Maximum object name size - * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) - */ -#define CEPH_MAX_OID_NAME_LEN 100 - struct ceph_msg; struct ceph_snap_context; struct ceph_osd_request; @@ -160,9 +154,8 @@ struct ceph_osd_request { void *r_priv; /* ditto */ struct ceph_object_locator r_oloc; + struct ceph_object_id r_oid; - char r_oid[CEPH_MAX_OID_NAME_LEN]; /* object name */ - int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index f2679c384625..c85f7d43b861 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -43,6 +43,18 @@ struct ceph_object_locator { s64 pool; }; +/* + * Maximum supported by kernel client object name length + * + * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) + */ +#define CEPH_MAX_OID_NAME_LEN 100 + +struct ceph_object_id { + char name[CEPH_MAX_OID_NAME_LEN]; + int name_len; +}; + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -72,6 +84,30 @@ struct ceph_osdmap { struct crush_map *crush; }; +static inline void ceph_oid_set_name(struct ceph_object_id *oid, + const char *name) +{ + int len; + + len = strlen(name); + if (len > sizeof(oid->name)) { + WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", + name, len, sizeof(oid->name)); + len = sizeof(oid->name); + } + + memcpy(oid->name, name, len); + oid->name_len = len; +} + +static inline void ceph_oid_copy(struct ceph_object_id *dest, + struct ceph_object_id *src) +{ + BUG_ON(src->name_len > sizeof(dest->name)); + memcpy(dest->name, src->name, src->name_len); + dest->name_len = src->name_len; +} + static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 83661cdc0766..1f8562706393 
100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -132,7 +132,8 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ? req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); + seq_printf(s, "%.*s", req->r_oid.name_len, + req->r_oid.name); if (req->r_reassert_version.epoch) seq_printf(s, "\t%u'%llu", diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a053e7e4a780..2988d68b24c6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -765,9 +765,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_oloc.pool = ceph_file_layout_pg_pool(*layout); - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", - vino.ino, objnum); - req->r_oid_len = strlen(req->r_oid); + snprintf(req->r_oid.name, sizeof(req->r_oid.name), + "%llx.%08llx", vino.ino, objnum); + req->r_oid.name_len = strlen(req->r_oid.name); return req; } @@ -1269,7 +1269,7 @@ static int __map_request(struct ceph_osd_client *osdc, bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, + err = ceph_calc_ceph_pg(&pgid, req->r_oid.name, osdc->osdmap, req->r_oloc.pool); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); @@ -2118,10 +2118,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, ceph_encode_32(&p, -1); /* preferred */ /* oid */ - ceph_encode_32(&p, req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); - p += req->r_oid_len; + ceph_encode_32(&p, req->r_oid.name_len); + memcpy(p, req->r_oid.name, req->r_oid.name_len); + dout("oid '%.*s' len %d\n", req->r_oid.name_len, + req->r_oid.name, req->r_oid.name_len); + p += req->r_oid.name_len; /* ops--can imply data */ ceph_encode_16(&p, (u16)req->r_num_ops); -- cgit From 7c13cb64352230deac24d3cb058387a6c0676f83 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: replace ceph_calc_ceph_pg() with ceph_oloc_oid_to_pg() Switch ceph_calc_ceph_pg() to new oloc and oid abstractions and rename it to ceph_oloc_oid_to_pg() to make its purpose more clear. 
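A sketch of a call with the new signature (mirroring the __map_request() caller changed below; the error value follows the osdmap.c change):

	struct ceph_pg pgid;
	int err;

	/* map the (pool, object name) pair to a placement group */
	err = ceph_oloc_oid_to_pg(osdc->osdmap, &req->r_oloc, &req->r_oid, &pgid);
	if (err)	/* -EIO if the pool does not exist in this osdmap */
		return err;
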
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/ioctl.c | 8 ++++++-- include/linux/ceph/osdmap.h | 7 +++++-- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 29 +++++++++++++++++------------ 4 files changed, 30 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 669622fd1ae3..dc66c9e023e4 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; @@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index c85f7d43b861..ebb8ec285de6 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -163,8 +163,11 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool); +extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out); + extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *acting); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2988d68b24c6..10360dedcdad 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1269,8 +1269,8 @@ static int __map_request(struct ceph_osd_client *osdc, bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_ceph_pg(&pgid, req->r_oid.name, osdc->osdmap, - req->r_oloc.pool); + err = ceph_oloc_oid_to_pg(osdc->osdmap, &req->r_oloc, &req->r_oid, + &pgid); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 8b1a6b48bb5d..768dd04eb9b1 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1090,25 +1090,30 @@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * calculate an object layout (i.e. pgid) from an oid, - * file_layout, and osdmap + * Calculate mapping of a (oloc, oid) pair to a PG. Should only be + * called with target's (oloc, oid), since tiering isn't taken into + * account. 
*/ -int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool) +int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out) { - struct ceph_pg_pool_info *pool_info; + struct ceph_pg_pool_info *pi; - BUG_ON(!osdmap); - pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); - if (!pool_info) + pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + if (!pi) return -EIO; - pg->pool = pool; - pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); + pg_out->pool = oloc->pool; + pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + + dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + pg_out->pool, pg_out->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_ceph_pg); +EXPORT_SYMBOL(ceph_oloc_oid_to_pg); static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, int *result, int result_max, -- cgit From ce7f6a2790464047199f54b66420243d433142bd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: add ceph_pg_pool_by_id() "Lookup pool info by ID" function is hidden in osdmap.c. Expose it to the rest of libceph. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osdmap.h | 3 +++ net/ceph/osdmap.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ebb8ec285de6..7f894a64c6c7 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -174,6 +174,9 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); +extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, + u64 id); + extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 768dd04eb9b1..d69d23556f0c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -464,6 +464,11 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) return NULL; } +struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) +{ + return __lookup_pg_pool(&map->pg_pools, id); +} + const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; -- cgit From 17a13e4028e6ad7ded079cf32370c47bd0e0fc07 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: follow {read,write}_tier fields on osd request submission Overwrite ceph_osd_request::r_oloc.pool with read_tier for read ops and write_tier for write and read+write ops (aka basic tiering support). {read,write}_tier are part of pg_pool_t since v9. This commit bumps our pg_pool_t decode compat version from v7 to v9, all new fields except for {read,write}_tier are ignored. 
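In effect the mapping step now overrides the pool before computing the PG; a simplified sketch of that logic (mirroring __calc_request_pg() added below; the tier values themselves come from the decoded pg_pool_t):

	/* decoded from a v9 pg_pool_t; -1 means "no tier configured" */
	struct ceph_pg_pool_info *pi = ceph_pg_pool_by_id(osdmap, req->r_oloc.pool);

	if (pi && !(req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY)) {
		if ((req->r_flags & CEPH_OSD_FLAG_READ) && pi->read_tier >= 0)
			req->r_oloc.pool = pi->read_tier;
		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && pi->write_tier >= 0)
			req->r_oloc.pool = pi->write_tier;	/* wins for read+write */
	}
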
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osdmap.h | 2 ++ net/ceph/osd_client.c | 30 ++++++++++++++++++++++++++++-- net/ceph/osdmap.c | 28 +++++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 7f894a64c6c7..49ff69f0746b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -35,6 +35,8 @@ struct ceph_pg_pool_info { u8 object_hash; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; + s64 read_tier; + s64 write_tier; /* wins for read+write ops */ u64 flags; char *name; }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 10360dedcdad..0eb009eaf4ff 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1249,6 +1249,32 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc, (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); } +/* + * Calculate mapping of a request to a PG. Takes tiering into account. + */ +static int __calc_request_pg(struct ceph_osdmap *osdmap, + struct ceph_osd_request *req, + struct ceph_pg *pg_out) +{ + if ((req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdmap, req->r_oloc.pool); + if (pi) { + if ((req->r_flags & CEPH_OSD_FLAG_READ) && + pi->read_tier >= 0) + req->r_oloc.pool = pi->read_tier; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + pi->write_tier >= 0) + req->r_oloc.pool = pi->write_tier; + } + /* !pi is caught in ceph_oloc_oid_to_pg() */ + } + + return ceph_oloc_oid_to_pg(osdmap, &req->r_oloc, + &req->r_oid, pg_out); +} + /* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. 
If there is @@ -1269,8 +1295,8 @@ static int __map_request(struct ceph_osd_client *osdc, bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_oloc_oid_to_pg(osdc->osdmap, &req->r_oloc, &req->r_oid, - &pgid); + + err = __calc_request_pg(osdc->osdmap, req, &pgid); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d69d23556f0c..aade4a5c1c07 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -519,8 +519,8 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); return -EINVAL; } - if (cv > 7) { - pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); + if (cv > 9) { + pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); return -EINVAL; } len = ceph_decode_32(p); @@ -548,12 +548,34 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += len; } - /* skip removed snaps */ + /* skip removed_snaps */ num = ceph_decode_32(p); *p += num * (8 + 8); *p += 8; /* skip auid */ pi->flags = ceph_decode_64(p); + *p += 4; /* skip crash_replay_interval */ + + if (ev >= 7) + *p += 1; /* skip min_size */ + + if (ev >= 8) + *p += 8 + 8; /* skip quota_max_* */ + + if (ev >= 9) { + /* skip tiers */ + num = ceph_decode_32(p); + *p += num * 8; + + *p += 8; /* skip tier_of */ + *p += 1; /* skip cache_mode */ + + pi->read_tier = ceph_decode_64(p); + pi->write_tier = ceph_decode_64(p); + } else { + pi->read_tier = -1; + pi->write_tier = -1; + } /* ignore the rest */ -- cgit From 3c972c95c68f455d80ff185aa440857be046bbe0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:20 +0200 Subject: libceph: rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid} Rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid} before introducing r_target_{oloc,oid} needed for redirects. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- drivers/block/rbd.c | 8 ++++---- include/linux/ceph/osd_client.h | 4 ++-- net/ceph/debugfs.c | 4 ++-- net/ceph/osd_client.c | 30 +++++++++++++++--------------- 4 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6fdc5fc00447..16cab6635163 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1808,8 +1808,8 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_oid, obj_request->object_name); + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } @@ -1846,8 +1846,8 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_oid, obj_request->object_name); + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8d8bb53994b1..3170ca6d98b2 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -153,8 +153,8 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - struct ceph_object_locator r_oloc; - struct ceph_object_id r_oid; + struct ceph_object_locator r_base_oloc; + struct ceph_object_id r_base_oid; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 1f8562706393..258a382e75ed 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -132,8 +132,8 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ? 
req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_oid.name_len, - req->r_oid.name); + seq_printf(s, "%.*s", req->r_base_oid.name_len, + req->r_base_oid.name); if (req->r_reassert_version.epoch) seq_printf(s, "\t%u'%llu", diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0eb009eaf4ff..3997a87c4f51 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -368,7 +368,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); - req->r_oloc.pool = -1; + req->r_base_oloc.pool = -1; /* create reply message */ if (use_mempool) @@ -763,11 +763,11 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (num_ops > 1) osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); - req->r_oloc.pool = ceph_file_layout_pg_pool(*layout); + req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); - snprintf(req->r_oid.name, sizeof(req->r_oid.name), + snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), "%llx.%08llx", vino.ino, objnum); - req->r_oid.name_len = strlen(req->r_oid.name); + req->r_base_oid.name_len = strlen(req->r_base_oid.name); return req; } @@ -1259,20 +1259,20 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, if ((req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { struct ceph_pg_pool_info *pi; - pi = ceph_pg_pool_by_id(osdmap, req->r_oloc.pool); + pi = ceph_pg_pool_by_id(osdmap, req->r_base_oloc.pool); if (pi) { if ((req->r_flags & CEPH_OSD_FLAG_READ) && pi->read_tier >= 0) - req->r_oloc.pool = pi->read_tier; + req->r_base_oloc.pool = pi->read_tier; if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && pi->write_tier >= 0) - req->r_oloc.pool = pi->write_tier; + req->r_base_oloc.pool = pi->write_tier; } /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_oloc, - &req->r_oid, pg_out); + return ceph_oloc_oid_to_pg(osdmap, &req->r_base_oloc, + &req->r_base_oid, pg_out); } /* @@ -1382,7 +1382,7 @@ static void __send_request(struct ceph_osd_client *osdc, /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_oloc.pool, req->r_request_pool); + put_unaligned_le64(req->r_base_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); ceph_encode_32(&p, req->r_pgid.seed); @@ -2144,11 +2144,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, ceph_encode_32(&p, -1); /* preferred */ /* oid */ - ceph_encode_32(&p, req->r_oid.name_len); - memcpy(p, req->r_oid.name, req->r_oid.name_len); - dout("oid '%.*s' len %d\n", req->r_oid.name_len, - req->r_oid.name, req->r_oid.name_len); - p += req->r_oid.name_len; + ceph_encode_32(&p, req->r_base_oid.name_len); + memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); + dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + req->r_base_oid.name, req->r_base_oid.name_len); + p += req->r_base_oid.name_len; /* ops--can imply data */ ceph_encode_16(&p, (u16)req->r_num_ops); -- cgit From 205ee1187a671c3b067d7f1e974903b44036f270 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:20 +0200 Subject: libceph: follow redirect replies from osds Follow redirect replies from osds, for details see ceph.git commit fbbe3ad1220799b7bb00ea30fce581c5eadaf034. 
v1 (current) version of redirect reply consists of oloc and oid, which expands to pool, key, nspace, hash and oid. However, server-side code that would populate anything other than pool doesn't exist yet, and hence this commit adds support for pool redirects only. To make sure that future server-side updates don't break us, we decode all fields and, if any of key, nspace, hash or oid have a non-default value, error out with "corrupt osd_op_reply ..." message. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 6 ++ net/ceph/osd_client.c | 167 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 164 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3170ca6d98b2..fd47e872ebcc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -155,6 +155,8 @@ struct ceph_osd_request { struct ceph_object_locator r_base_oloc; struct ceph_object_id r_base_oid; + struct ceph_object_locator r_target_oloc; + struct ceph_object_id r_target_oid; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ @@ -162,6 +164,10 @@ struct ceph_osd_request { struct ceph_snap_context *r_snapc; /* snap context for writes */ }; +struct ceph_request_redirect { + struct ceph_object_locator oloc; +}; + struct ceph_osd_event { u64 cookie; int one_shot; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3997a87c4f51..010ff3bd58ad 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -369,6 +369,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_osd_item); req->r_base_oloc.pool = -1; + req->r_target_oloc.pool = -1; /* create reply message */ if (use_mempool) @@ -1256,23 +1257,36 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, struct ceph_osd_request *req, struct ceph_pg *pg_out) { - if ((req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + bool need_check_tiering; + + need_check_tiering = false; + if (req->r_target_oloc.pool == -1) { + req->r_target_oloc = req->r_base_oloc; /* struct */ + need_check_tiering = true; + } + if (req->r_target_oid.name_len == 0) { + ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); + need_check_tiering = true; + } + + if (need_check_tiering && + (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { struct ceph_pg_pool_info *pi; - pi = ceph_pg_pool_by_id(osdmap, req->r_base_oloc.pool); + pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); if (pi) { if ((req->r_flags & CEPH_OSD_FLAG_READ) && pi->read_tier >= 0) - req->r_base_oloc.pool = pi->read_tier; + req->r_target_oloc.pool = pi->read_tier; if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && pi->write_tier >= 0) - req->r_base_oloc.pool = pi->write_tier; + req->r_target_oloc.pool = pi->write_tier; } /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_base_oloc, - &req->r_base_oid, pg_out); + return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, + &req->r_target_oid, pg_out); } /* @@ -1382,7 +1396,7 @@ static void __send_request(struct ceph_osd_client *osdc, /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_base_oloc.pool, req->r_request_pool); + put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); ceph_encode_32(&p, 
req->r_pgid.seed); @@ -1483,6 +1497,109 @@ static void handle_osds_timeout(struct work_struct *work) round_jiffies_relative(delay)); } +static int ceph_oloc_decode(void **p, void *end, + struct ceph_object_locator *oloc) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret = 0; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_v < 3) { + pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + if (struct_cv > 6) { + pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + oloc->pool = ceph_decode_64(p); + *p += 4; /* skip preferred */ + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::key is set\n"); + goto e_inval; + } + + if (struct_v >= 5) { + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::nspace is set\n"); + goto e_inval; + } + } + + if (struct_v >= 6) { + s64 hash = ceph_decode_64(p); + if (hash != -1) { + pr_warn("ceph_object_locator::hash is set\n"); + goto e_inval; + } + } + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + +static int ceph_redirect_decode(void **p, void *end, + struct ceph_request_redirect *redir) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_cv > 1) { + pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + ret = ceph_oloc_decode(p, end, &redir->oloc); + if (ret) + goto out; + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_request_redirect::object_name is set\n"); + goto e_inval; + } + + len = ceph_decode_32(p); + *p += len; /* skip osd_instructions */ + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + static void complete_request(struct ceph_osd_request *req) { complete_all(&req->r_safe_completion); /* fsync waiter */ @@ -1497,6 +1614,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, { void *p, *end; struct ceph_osd_request *req; + struct ceph_request_redirect redir; u64 tid; int object_len; unsigned int numops; @@ -1576,10 +1694,41 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); - already_completed = req->r_got_reply; + if (le16_to_cpu(msg->hdr.version) >= 6) { + p += 8 + 4; /* skip replay_version */ + p += 8; /* skip user_version */ - if (!req->r_got_reply) { + err = ceph_redirect_decode(&p, end, &redir); + if (err) + goto bad_put; + } else { + redir.oloc.pool = -1; + } + if (redir.oloc.pool != -1) { + dout("redirect pool %lld\n", redir.oloc.pool); + + __unregister_request(osdc, req); + mutex_unlock(&osdc->request_mutex); + + req->r_target_oloc = redir.oloc; /* struct */ + + /* + * Start redirect requests with nofail=true. If + * mapping fails, request will end up on the notarget + * list, waiting for the new osdmap (which can take + * a while), even though the original request mapped + * successfully. 
In the future we might want to follow + * original request's nofail setting here. + */ + err = ceph_osdc_start_request(osdc, req, true); + BUG_ON(err); + + goto done; + } + + already_completed = req->r_got_reply; + if (!req->r_got_reply) { req->r_result = result; dout("handle_reply result %d bytes %d\n", req->r_result, bytes); -- cgit From a452ce345d63ddf92cd101e4196569f8718ad319 Mon Sep 17 00:00:00 2001 From: Holger Eitzenberger Date: Mon, 27 Jan 2014 10:33:18 +0100 Subject: net: Fix memory leak if TPROXY used with TCP early demux I see a memory leak when using a transparent HTTP proxy using TPROXY together with TCP early demux and Kernel v3.8.13.15 (Ubuntu stable): unreferenced object 0xffff88008cba4a40 (size 1696): comm "softirq", pid 0, jiffies 4294944115 (age 8907.520s) hex dump (first 32 bytes): 0a e0 20 6a 40 04 1b 37 92 be 32 e2 e8 b4 00 00 .. j@..7..2..... 02 00 07 01 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmem_cache_alloc+0xad/0xb9 [] sk_prot_alloc+0x29/0xc5 [] sk_clone_lock+0x14/0x283 [] inet_csk_clone_lock+0xf/0x7b [] netlink_broadcast+0x14/0x16 [] tcp_create_openreq_child+0x1b/0x4c3 [] tcp_v4_syn_recv_sock+0x38/0x25d [] tcp_check_req+0x25c/0x3d0 [] tcp_v4_do_rcv+0x287/0x40e [] ip_route_input_noref+0x843/0xa55 [] tcp_v4_rcv+0x4c9/0x725 [] ip_local_deliver_finish+0xe9/0x154 [] __netif_receive_skb+0x4b2/0x514 [] process_backlog+0xee/0x1c5 [] net_rx_action+0xa7/0x200 [] add_interrupt_randomness+0x39/0x157 But there are many more, resulting in the machine going OOM after some days. From looking at the TPROXY code, and with help from Florian, I see that the memory leak is introduced in tcp_v4_early_demux(): void tcp_v4_early_demux(struct sk_buff *skb) { /* ... */ iph = ip_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return; sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif); if (sk) { skb->sk = sk; where the socket is assigned unconditionally to skb->sk, also bumping the refcnt on it. This is problematic, because in our case the skb has already a socket assigned in the TPROXY target. This then results in the leak I see. The very same issue seems to be with IPv6, but haven't tested. Reviewed-by: Florian Westphal Signed-off-by: Holger Eitzenberger Signed-off-by: David S. 
Miller --- net/ipv4/ip_input.c | 2 +- net/ipv6/ip6_input.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 054a3e97d822..3d4da2c16b6a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -314,7 +314,7 @@ static int ip_rcv_finish(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && !skb_dst(skb)) { + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct net_protocol *ipprot; int protocol = iph->protocol; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 302d6fb1ff2b..51d54dc376f3 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,7 +49,7 @@ int ip6_rcv_finish(struct sk_buff *skb) { - if (sysctl_ip_early_demux && !skb_dst(skb)) { + if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); -- cgit From ce60e0c4df5f95086d5c2662c5cfa0beb8181c6d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 7 Jan 2014 12:52:43 +1100 Subject: net: 6lowpan: fixup for code movement Signed-off-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/ieee802154/6lowpan_iphc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan_iphc.c b/net/ieee802154/6lowpan_iphc.c index 083f905bf109..860aa2d445ba 100644 --- a/net/ieee802154/6lowpan_iphc.c +++ b/net/ieee802154/6lowpan_iphc.c @@ -678,7 +678,7 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, hc06_ptr += 3; } else { /* compress nothing */ - memcpy(hc06_ptr, &hdr, 4); + memcpy(hc06_ptr, hdr, 4); /* replace the top byte with new ECN | DSCP format */ *hc06_ptr = tmp; hc06_ptr += 4; -- cgit From c0c0c50ff7c3e331c90bab316d21f724fb9e1994 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Tue, 28 Jan 2014 11:49:43 +0800 Subject: net: gre: use icmp_hdr() to get inner ip header When dealing with icmp messages, the skb->data points the ip header that triggered the sending of the icmp message. In gre_cisco_err(), the parse_gre_header() is called, and the iptunnel_pull_header() is called to pull the skb at the end of the parse_gre_header(), so the skb->data doesn't point the inner ip header. Unfortunately, the ipgre_err still needs those ip addresses in inner ip header to look up tunnel by ip_tunnel_lookup(). So just use icmp_hdr() to get inner ip header instead of skb->data. Signed-off-by: Duan Jiong Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e7a92fdb36f6..ec4f762efda5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -178,7 +178,7 @@ static int ipgre_err(struct sk_buff *skb, u32 info, else itn = net_generic(net, ipgre_net_id); - iph = (const struct iphdr *)skb->data; + iph = (const struct iphdr *)(icmp_hdr(skb) + 1); t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->daddr, iph->saddr, tpi->key); -- cgit From 0f1a24c9a9f4682dd61f5c39b9952f915c5e952c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 28 Jan 2014 16:30:52 -0500 Subject: llc: remove noisy WARN from llc_mac_hdr_init Sending malformed llc packets triggers this spew, which seems excessive. 
WARNING: CPU: 1 PID: 6917 at net/llc/llc_output.c:46 llc_mac_hdr_init+0x85/0x90 [llc]() device type not supported: 0 CPU: 1 PID: 6917 Comm: trinity-c1 Not tainted 3.13.0+ #95 0000000000000009 00000000007e257d ffff88009232fbe8 ffffffffac737325 ffff88009232fc30 ffff88009232fc20 ffffffffac06d28d ffff88020e07f180 ffff88009232fec0 00000000000000c8 0000000000000000 ffff88009232fe70 Call Trace: [] dump_stack+0x4e/0x7a [] warn_slowpath_common+0x7d/0xa0 [] warn_slowpath_fmt+0x5c/0x80 [] llc_mac_hdr_init+0x85/0x90 [llc] [] llc_build_and_send_ui_pkt+0x79/0x90 [llc] [] llc_ui_sendmsg+0x23a/0x400 [llc2] [] sock_sendmsg+0x9c/0xe0 [] ? might_fault+0x47/0x50 [] SYSC_sendto+0x121/0x1c0 [] ? syscall_trace_enter+0x207/0x270 [] SyS_sendto+0xe/0x10 [] tracesys+0xdd/0xe2 Until 2009, this was a printk, when it was changed in bf9ae5386bc: "llc: use dev_hard_header". Let userland figure out what -EINVAL means by itself. Signed-off-by: Dave Jones Signed-off-by: David S. Miller --- net/llc/llc_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c index 2dae8a5df23f..94425e421213 100644 --- a/net/llc/llc_output.c +++ b/net/llc/llc_output.c @@ -43,7 +43,7 @@ int llc_mac_hdr_init(struct sk_buff *skb, rc = 0; break; default: - WARN(1, "device type not supported: %d\n", skb->dev->type); + break; } return rc; } -- cgit From 7fceb4de75f993a598d27af835e87b19b8be514e Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Wed, 29 Jan 2014 01:05:28 +0900 Subject: net: Fix warning on make htmldocs caused by skbuff.c This patch fixed following Warning while executing "make htmldocs". Warning(/net/core/skbuff.c:2164): No description found for parameter 'from' Warning(/net/core/skbuff.c:2164): Excess function parameter 'source' description in 'skb_zerocopy' Replace "@source" with "@from" fixed the warning. Signed-off-by: Masanari Iida Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9ae6d11374d1..5976ef0846bd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2121,7 +2121,7 @@ EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); /** * skb_zerocopy - Zero copy skb to skb * @to: destination buffer - * @source: source buffer + * @from: source buffer * @len: number of bytes to copy from source buffer * @hlen: size of linear headroom in destination buffer * -- cgit From 2def2ef2ae5f3990aabdbe8a755911902707d268 Mon Sep 17 00:00:00 2001 From: PaX Team Date: Thu, 30 Jan 2014 16:59:25 -0800 Subject: x86, x32: Correct invalid use of user timespec in the kernel The x32 case for the recvmsg() timout handling is broken: asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct compat_timespec __user *timeout) { int datagrams; struct timespec ktspec; if (flags & MSG_CMSG_COMPAT) return -EINVAL; if (COMPAT_USE_64BIT_TIME) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, (struct timespec *) timeout); ... The timeout pointer parameter is provided by userland (hence the __user annotation) but for x32 syscalls it's simply cast to a kernel pointer and is passed to __sys_recvmmsg which will eventually directly dereference it for both reading and writing. Other callers to __sys_recvmmsg properly copy from userland to the kernel first. 
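For illustration, a condensed sketch of the safe pattern the fix below switches the x32 path to (it omits the NULL-timeout shortcut that the real function keeps): copy the compat timespec from userland into a kernel struct before calling __sys_recvmmsg(), and copy it back only on success, instead of passing the __user pointer through a cast.

    struct timespec ktspec;
    int datagrams;

    if (compat_get_timespec(&ktspec, timeout))          /* user -> kernel copy */
            return -EFAULT;

    datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
                               flags | MSG_CMSG_COMPAT, &ktspec);

    if (datagrams > 0 && compat_put_timespec(&ktspec, timeout))  /* kernel -> user */
            datagrams = -EFAULT;

    return datagrams;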
The bug was introduced by commit ee4fa23c4bfc ("compat: Use COMPAT_USE_64BIT_TIME in net/compat.c") and should affect all kernels since 3.4 (and perhaps vendor kernels if they backported x32 support along with this code). Note that CONFIG_X86_X32_ABI gets enabled at build time and only if CONFIG_X86_X32 is enabled and ld can build x32 executables. Other uses of COMPAT_USE_64BIT_TIME seem fine. This addresses CVE-2014-0038. Signed-off-by: PaX Team Signed-off-by: H. Peter Anvin Cc: # v3.4+ Signed-off-by: Linus Torvalds --- net/compat.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index dd32e34c1e2c..f50161fb812e 100644 --- a/net/compat.c +++ b/net/compat.c @@ -780,21 +780,16 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, if (flags & MSG_CMSG_COMPAT) return -EINVAL; - if (COMPAT_USE_64BIT_TIME) - return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, - (struct timespec *) timeout); - if (timeout == NULL) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, NULL); - if (get_compat_timespec(&ktspec, timeout)) + if (compat_get_timespec(&ktspec, timeout)) return -EFAULT; datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, &ktspec); - if (datagrams > 0 && put_compat_timespec(&ktspec, timeout)) + if (datagrams > 0 && compat_put_timespec(&ktspec, timeout)) datagrams = -EFAULT; return datagrams; -- cgit From c172ec5c8dc8c09dd5958f4ae542fa321bb15a92 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 31 Jan 2014 17:49:22 +0200 Subject: libceph: fix error handling in ceph_osdc_init() msgpool_op_reply message pool isn't destroyed if workqueue construction fails. Fix it. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 010ff3bd58ad..166d4c7ba065 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -2504,9 +2504,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) err = -ENOMEM; osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); if (!osdc->notify_wq) - goto out_msgpool; + goto out_msgpool_reply; + return 0; +out_msgpool_reply: + ceph_msgpool_destroy(&osdc->msgpool_op_reply); out_msgpool: ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: -- cgit From 2f617435c3a6fe3f39efb9ae2baa77de2d6c97b8 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Sun, 12 Jan 2014 11:06:37 +0200 Subject: mac80211: move roc cookie assignment earlier ieee80211_start_roc_work() might add a new roc to existing roc, and tell cfg80211 it has already started. However, this might happen before the roc cookie was set, resulting in REMAIN_ON_CHANNEL (started) event with null cookie. Consequently, it can make wpa_supplicant go out of sync. Fix it by setting the roc cookie earlier. 
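A condensed sketch of the reordering the patch below makes (normal roc case; for mgmt TX the cookie is simply the skb pointer): the cookie becomes valid before the roc can be queued or coalesced with an already-running one, so the REMAIN_ON_CHANNEL (started) event can never carry a null cookie.

    /* local->mtx protects the counter */
    local->roc_cookie_counter++;
    roc->cookie = local->roc_cookie_counter;
    if (WARN_ON(roc->cookie == 0)) {        /* 64-bit wrap: almost certainly a bug */
            roc->cookie = 1;
            local->roc_cookie_counter++;
    }
    *cookie = roc->cookie;

    /* ...only after this point is the roc queued or merged, which may
     * immediately report it as started to cfg80211. */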
Cc: stable@vger.kernel.org Signed-off-by: Eliad Peller Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f9ae9b85d4c1..94b4acb5aabb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2638,6 +2638,24 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work); INIT_LIST_HEAD(&roc->dependents); + /* + * cookie is either the roc cookie (for normal roc) + * or the SKB (for mgmt TX) + */ + if (!txskb) { + /* local->mtx protects this */ + local->roc_cookie_counter++; + roc->cookie = local->roc_cookie_counter; + /* wow, you wrapped 64 bits ... more likely a bug */ + if (WARN_ON(roc->cookie == 0)) { + roc->cookie = 1; + local->roc_cookie_counter++; + } + *cookie = roc->cookie; + } else { + *cookie = (unsigned long)txskb; + } + /* if there's one pending or we're scanning, queue this one */ if (!list_empty(&local->roc_list) || local->scanning || local->radar_detect_enabled) @@ -2772,24 +2790,6 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, if (!queued) list_add_tail(&roc->list, &local->roc_list); - /* - * cookie is either the roc cookie (for normal roc) - * or the SKB (for mgmt TX) - */ - if (!txskb) { - /* local->mtx protects this */ - local->roc_cookie_counter++; - roc->cookie = local->roc_cookie_counter; - /* wow, you wrapped 64 bits ... more likely a bug */ - if (WARN_ON(roc->cookie == 0)) { - roc->cookie = 1; - local->roc_cookie_counter++; - } - *cookie = roc->cookie; - } else { - *cookie = (unsigned long)txskb; - } - return 0; } -- cgit From f12cb2893069495726c21a4b0178705dacfecfe0 Mon Sep 17 00:00:00 2001 From: Pontus Fuchs Date: Thu, 16 Jan 2014 15:00:40 +0100 Subject: nl80211: Reset split_start when netlink skb is exhausted When the netlink skb is exhausted split_start is left set. In the subsequent retry, with a larger buffer, the dump is continued from the failing point instead of from the beginning. This was causing my rt28xx based USB dongle to now show up when running "iw list" with an old iw version without split dump support. Cc: stable@vger.kernel.org Fixes: 3713b4e364ef ("nl80211: allow splitting wiphy information in dumps") Signed-off-by: Pontus Fuchs [avoid the entire workaround when state->split is set] Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 7a742594916e..6ea960b1a8eb 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1719,9 +1719,10 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) * We can then retry with the larger buffer. */ if ((ret == -ENOBUFS || ret == -EMSGSIZE) && - !skb->len && + !skb->len && !state->split && cb->min_dump_alloc < 4096) { cb->min_dump_alloc = 4096; + state->split_start = 0; rtnl_unlock(); return 1; } -- cgit From 5a6aa705ffdd97552ff756bbfa7d5a3b62a6c690 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 23 Jan 2014 16:32:29 +0100 Subject: cfg80211: re-enable 5/10 MHz support Unfortunately I forgot this during the merge window, but the patch seems small enough to go in as a fix. The userspace API bug that was the reason for disabling it has long been fixed. 
Signed-off-by: Johannes Berg --- net/wireless/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index d89dee2259b5..77fe4c791269 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -440,9 +440,6 @@ int wiphy_register(struct wiphy *wiphy) int i; u16 ifmodes = wiphy->interface_modes; - /* support for 5/10 MHz is broken due to nl80211 API mess - disable */ - wiphy->flags &= ~WIPHY_FLAG_SUPPORTS_5_10_MHZ; - /* * There are major locking problems in nl80211/mac80211 for CSA, * disable for all drivers until this has been reworked. -- cgit From 8ffcc704c963b4157391bd87a4544cdfd18b574d Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 23 Jan 2014 14:28:16 +0200 Subject: mac80211: avoid deadlock revealed by lockdep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sdata->u.ap.request_smps_work can’t be flushed synchronously under wdev_lock(wdev) since ieee80211_request_smps_ap_work itself locks the same lock. While at it, reset the driver_smps_mode when the ap is stopped to its default: OFF. This solves: ====================================================== [ INFO: possible circular locking dependency detected ] 3.12.0-ipeer+ #2 Tainted: G O ------------------------------------------------------- rmmod/2867 is trying to acquire lock: ((&sdata->u.ap.request_smps_work)){+.+...}, at: [] flush_work+0x0/0x90 but task is already holding lock: (&wdev->mtx){+.+.+.}, at: [] cfg80211_stop_ap+0x26/0x230 [cfg80211] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&wdev->mtx){+.+.+.}: [] lock_acquire+0x79/0xe0 [] mutex_lock_nested+0x4a/0x360 [] ieee80211_request_smps_ap_work+0x2b/0x50 [mac80211] [] process_one_work+0x198/0x450 [] worker_thread+0xf9/0x320 [] kthread+0x9f/0xb0 [] ret_from_kernel_thread+0x1b/0x28 -> #0 ((&sdata->u.ap.request_smps_work)){+.+...}: [] __lock_acquire+0x183f/0x1910 [] lock_acquire+0x79/0xe0 [] flush_work+0x47/0x90 [] __cancel_work_timer+0x67/0xe0 [] cancel_work_sync+0xf/0x20 [] ieee80211_stop_ap+0x8c/0x340 [mac80211] [] cfg80211_stop_ap+0x8c/0x230 [cfg80211] [] cfg80211_leave+0x79/0x100 [cfg80211] [] cfg80211_netdev_notifier_call+0xf2/0x4f0 [cfg80211] [] notifier_call_chain+0x59/0x130 [] __raw_notifier_call_chain+0x1e/0x30 [] raw_notifier_call_chain+0x1f/0x30 [] call_netdevice_notifiers_info+0x33/0x70 [] call_netdevice_notifiers+0x13/0x20 [] __dev_close_many+0x34/0xb0 [] dev_close_many+0x6e/0xc0 [] rollback_registered_many+0xa7/0x1f0 [] unregister_netdevice_many+0x14/0x60 [] ieee80211_remove_interfaces+0xe9/0x170 [mac80211] [] ieee80211_unregister_hw+0x56/0x110 [mac80211] [] iwl_op_mode_mvm_stop+0x26/0xe0 [iwlmvm] [] _iwl_op_mode_stop+0x3a/0x70 [iwlwifi] [] iwl_opmode_deregister+0x6f/0x90 [iwlwifi] [] __exit_compat+0xd/0x19 [iwlmvm] [] SyS_delete_module+0x179/0x2b0 [] sysenter_do_call+0x12/0x32 Fixes: 687da132234f ("mac80211: implement SMPS for AP") Cc: [3.13] Reported-by: Ilan Peer Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 +-- net/mac80211/ht.c | 4 +++- net/mac80211/iface.c | 15 +++++++++++---- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 94b4acb5aabb..33acdca4a1df 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1090,8 +1090,6 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree(sdata->u.ap.next_beacon); sdata->u.ap.next_beacon = 
NULL; - cancel_work_sync(&sdata->u.ap.request_smps_work); - /* turn off carrier for this interface and dependent VLANs */ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) netif_carrier_off(vlan->dev); @@ -1103,6 +1101,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); + sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF; __sta_info_flush(sdata, true); ieee80211_free_keys(sdata, true); diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index fab7b91923e0..70dd013de836 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -466,7 +466,9 @@ void ieee80211_request_smps_ap_work(struct work_struct *work) u.ap.request_smps_work); sdata_lock(sdata); - __ieee80211_request_smps_ap(sdata, sdata->u.ap.driver_smps_mode); + if (sdata_dereference(sdata->u.ap.beacon, sdata)) + __ieee80211_request_smps_ap(sdata, + sdata->u.ap.driver_smps_mode); sdata_unlock(sdata); } diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 3dfd20a453ab..ae2eb148a028 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -770,12 +770,19 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, ieee80211_roc_purge(local, sdata); - if (sdata->vif.type == NL80211_IFTYPE_STATION) + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: ieee80211_mgd_stop(sdata); - - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) + break; + case NL80211_IFTYPE_ADHOC: ieee80211_ibss_stop(sdata); - + break; + case NL80211_IFTYPE_AP: + cancel_work_sync(&sdata->u.ap.request_smps_work); + break; + default: + break; + } /* * Remove all stations associated with this interface. -- cgit From a617302c531eaf497ccd02a61d380efc119ba999 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Jan 2014 11:14:18 +0200 Subject: cfg80211: fix scan done race When an interface/wdev is removed, any ongoing scan should be cancelled by the driver. This will make it call cfg80211, which only queues a work struct. If interface/wdev removal is quick enough, this can leave the scan request pending and processed only after the interface is gone, causing a use-after-free. Fix this by making sure the scan request is not pending after the interface is destroyed. We can't flush or cancel the work item due to locking concerns, but when it'll run it shouldn't find anything to do. This leaves a potential issue, if a new scan gets requested before the work runs, it prematurely stops the running scan, potentially causing another crash. I'll fix that in the next patch. This was particularly observed with P2P_DEVICE wdevs, likely because freeing them is quicker than freeing netdevs. 
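A condensed sketch of the teardown rule the diff below adds in both cfg80211_stop_p2p_device() and the NETDEV_DOWN path: the pending scan cannot be flushed or cancelled here for locking reasons, so it is marked aborted and completed synchronously under RTNL before the wdev can go away.

    if (rdev->scan_req && rdev->scan_req->wdev == wdev) {
            if (WARN_ON(!rdev->scan_req->notified))
                    rdev->scan_req->aborted = true;
            ___cfg80211_scan_done(rdev);    /* complete now, not from the queued work */
    }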
Reported-by: Andrei Otcheretianski Fixes: 4a58e7c38443 ("cfg80211: don't "leak" uncompleted scans") Signed-off-by: Johannes Berg --- net/wireless/core.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 77fe4c791269..02ed00dbf2df 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -203,8 +203,11 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, rdev->opencount--; - WARN_ON(rdev->scan_req && rdev->scan_req->wdev == wdev && - !rdev->scan_req->notified); + if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (WARN_ON(!rdev->scan_req->notified)) + rdev->scan_req->aborted = true; + ___cfg80211_scan_done(rdev); + } } static int cfg80211_rfkill_set_block(void *data, bool blocked) @@ -856,8 +859,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, break; case NETDEV_DOWN: cfg80211_update_iface_num(rdev, wdev->iftype, -1); - WARN_ON(rdev->scan_req && rdev->scan_req->wdev == wdev && - !rdev->scan_req->notified); + if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (WARN_ON(!rdev->scan_req->notified)) + rdev->scan_req->aborted = true; + ___cfg80211_scan_done(rdev); + } if (WARN_ON(rdev->sched_scan_req && rdev->sched_scan_req->dev == wdev->netdev)) { -- cgit From f9d15d162b3acf28f85b3ac05c4883e5ed588d28 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Jan 2014 11:14:19 +0200 Subject: cfg80211: send scan results from work queue Due to the previous commit, when a scan finishes, it is in theory possible to hit the following sequence: 1. interface starts being removed 2. scan is cancelled by driver and cfg80211 is notified 3. scan done work is scheduled 4. interface is removed completely, rdev->scan_req is freed, event sent to userspace but scan done work remains pending 5. new scan is requested on another virtual interface 6. scan done work runs, freeing the still-running scan To fix this situation, hang on to the scan done message and block new scans while that is the case, and only send the message from the work function, regardless of whether the scan_req is already freed from interface removal. This makes step 5 above impossible and changes step 6 to be 5. scan done work runs, sending the scan done message As this can't work for wext, so we send the message immediately, but this shouldn't be an issue since we still return -EBUSY. 
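A minimal sketch of the resulting rules (rdev->scan_msg is the field introduced below; the full call sites are in the diff): the scan-done message is built when the scan completes but may be parked in rdev->scan_msg, the work function delivers it, and new scans are refused until it has gone out.

    /* every new-scan entry point now checks both fields */
    if (rdev->scan_req || rdev->scan_msg)
            return -EBUSY;      /* scan still running, or result not yet delivered */

    /* ...and ___cfg80211_scan_done() either parks or sends the message */
    if (!send_message)
            rdev->scan_msg = msg;               /* deliver later from the work item */
    else
            nl80211_send_scan_result(rdev, msg);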
Signed-off-by: Johannes Berg --- net/wireless/core.c | 4 ++-- net/wireless/core.h | 4 +++- net/wireless/nl80211.c | 29 ++++++++++------------------- net/wireless/nl80211.h | 8 ++++---- net/wireless/scan.c | 40 +++++++++++++++++++++++++--------------- net/wireless/sme.c | 2 +- 6 files changed, 45 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 02ed00dbf2df..010892b81a06 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -206,7 +206,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) rdev->scan_req->aborted = true; - ___cfg80211_scan_done(rdev); + ___cfg80211_scan_done(rdev, false); } } @@ -862,7 +862,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, if (rdev->scan_req && rdev->scan_req->wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified)) rdev->scan_req->aborted = true; - ___cfg80211_scan_done(rdev); + ___cfg80211_scan_done(rdev, false); } if (WARN_ON(rdev->sched_scan_req && diff --git a/net/wireless/core.h b/net/wireless/core.h index 37ec16d7bb1a..f1d193b557b6 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -62,6 +62,7 @@ struct cfg80211_registered_device { struct rb_root bss_tree; u32 bss_generation; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ + struct sk_buff *scan_msg; struct cfg80211_sched_scan_request *sched_scan_req; unsigned long suspend_at; struct work_struct scan_done_wk; @@ -361,7 +362,8 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, bool pairwise, const u8 *mac_addr); void __cfg80211_scan_done(struct work_struct *wk); -void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev); +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, + bool send_message); void __cfg80211_sched_scan_results(struct work_struct *wk); int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, bool driver_initiated); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6ea960b1a8eb..4fe2e6e2bc76 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5245,7 +5245,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->scan) return -EOPNOTSUPP; - if (rdev->scan_req) { + if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto unlock; } @@ -10012,40 +10012,31 @@ void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, NL80211_MCGRP_SCAN, GFP_KERNEL); } -void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, bool aborted) { struct sk_buff *msg; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return NULL; if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, - NL80211_CMD_NEW_SCAN_RESULTS) < 0) { + aborted ? 
NL80211_CMD_SCAN_ABORTED : + NL80211_CMD_NEW_SCAN_RESULTS) < 0) { nlmsg_free(msg); - return; + return NULL; } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, - NL80211_MCGRP_SCAN, GFP_KERNEL); + return msg; } -void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) +void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, + struct sk_buff *msg) { - struct sk_buff *msg; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - if (nl80211_send_scan_msg(msg, rdev, wdev, 0, 0, 0, - NL80211_CMD_SCAN_ABORTED) < 0) { - nlmsg_free(msg); - return; - } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, NL80211_MCGRP_SCAN, GFP_KERNEL); } diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index b1b231324e10..75799746d845 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -8,10 +8,10 @@ void nl80211_exit(void); void nl80211_notify_dev_rename(struct cfg80211_registered_device *rdev); void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); -void nl80211_send_scan_done(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); -void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); +struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, bool aborted); +void nl80211_send_scan_result(struct cfg80211_registered_device *rdev, + struct sk_buff *msg); void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, struct net_device *netdev, u32 cmd); void nl80211_send_sched_scan_results(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index b528e31da2cf..d1ed4aebbbb7 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -161,18 +161,25 @@ static void __cfg80211_bss_expire(struct cfg80211_registered_device *dev, dev->bss_generation++; } -void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) +void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, + bool send_message) { struct cfg80211_scan_request *request; struct wireless_dev *wdev; + struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif ASSERT_RTNL(); - request = rdev->scan_req; + if (rdev->scan_msg) { + nl80211_send_scan_result(rdev, rdev->scan_msg); + rdev->scan_msg = NULL; + return; + } + request = rdev->scan_req; if (!request) return; @@ -186,18 +193,16 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); - if (request->aborted) { - nl80211_send_scan_aborted(rdev, wdev); - } else { - if (request->flags & NL80211_SCAN_FLAG_FLUSH) { - /* flush entries from previous scans */ - spin_lock_bh(&rdev->bss_lock); - __cfg80211_bss_expire(rdev, request->scan_start); - spin_unlock_bh(&rdev->bss_lock); - } - nl80211_send_scan_done(rdev, wdev); + if (!request->aborted && + request->flags & NL80211_SCAN_FLAG_FLUSH) { + /* flush entries from previous scans */ + spin_lock_bh(&rdev->bss_lock); + __cfg80211_bss_expire(rdev, request->scan_start); + spin_unlock_bh(&rdev->bss_lock); } + msg = nl80211_build_scan_msg(rdev, wdev, request->aborted); + #ifdef CONFIG_CFG80211_WEXT if (wdev->netdev && !request->aborted) { memset(&wrqu, 0, sizeof(wrqu)); @@ -211,6 +216,11 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev) rdev->scan_req = NULL; kfree(request); + + if (!send_message) + 
rdev->scan_msg = msg; + else + nl80211_send_scan_result(rdev, msg); } void __cfg80211_scan_done(struct work_struct *wk) @@ -221,7 +231,7 @@ void __cfg80211_scan_done(struct work_struct *wk) scan_done_wk); rtnl_lock(); - ___cfg80211_scan_done(rdev); + ___cfg80211_scan_done(rdev, true); rtnl_unlock(); } @@ -1079,7 +1089,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req) { + if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto out; } @@ -1481,7 +1491,7 @@ int cfg80211_wext_giwscan(struct net_device *dev, if (IS_ERR(rdev)) return PTR_ERR(rdev); - if (rdev->scan_req) + if (rdev->scan_req || rdev->scan_msg) return -EAGAIN; res = ieee80211_scan_results(rdev, info, extra, data->length); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index a63509118508..f04d4c32e96e 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -67,7 +67,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) ASSERT_RDEV_LOCK(rdev); ASSERT_WDEV_LOCK(wdev); - if (rdev->scan_req) + if (rdev->scan_req || rdev->scan_msg) return -EBUSY; if (wdev->conn->params.channel) -- cgit From 0297ea17bf7879fb5846fafd1be4c0471e72848d Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 27 Jan 2014 11:07:42 +0200 Subject: mac80211: release the channel in error path in start_ap When the driver cannot start the AP or when the assignement of the beacon goes wrong, we need to unassign the vif. Cc: stable@vger.kernel.org Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 33acdca4a1df..453e974287d1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1021,8 +1021,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, IEEE80211_P2P_OPPPS_ENABLE_BIT; err = ieee80211_assign_beacon(sdata, ¶ms->beacon); - if (err < 0) + if (err < 0) { + ieee80211_vif_release_channel(sdata); return err; + } changed |= err; err = drv_start_ap(sdata->local, sdata); @@ -1032,6 +1034,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (old) kfree_rcu(old, rcu_head); RCU_INIT_POINTER(sdata->u.ap.beacon, NULL); + ieee80211_vif_release_channel(sdata); return err; } -- cgit From d4c80d9df6d1e4473b1409e4d220ca3d1612125c Mon Sep 17 00:00:00 2001 From: Sujith Manoharan Date: Thu, 30 Jan 2014 14:17:28 +0530 Subject: mac80211: Fix IBSS disconnect Currently, when a station leaves an IBSS network, the corresponding BSS is not dropped from cfg80211 if there are other active stations in the network. But, the small window that is present when trying to determine a station's status based on IEEE80211_IBSS_MERGE_INTERVAL introduces a race. Instead of trying to keep the BSS, always remove it when leaving an IBSS network. There is not much benefit to retain the BSS entry since it will be added with a subsequent join operation. This fixes an issue where a dangling BSS entry causes ath9k to wait for a beacon indefinitely. 
Cc: Reported-by: Simon Wunderlich Signed-off-by: Sujith Manoharan Signed-off-by: Johannes Berg --- net/mac80211/ibss.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 771080ec7212..2796a198728f 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -695,12 +695,9 @@ static void ieee80211_ibss_disconnect(struct ieee80211_sub_if_data *sdata) struct cfg80211_bss *cbss; struct beacon_data *presp; struct sta_info *sta; - int active_ibss; u16 capability; - active_ibss = ieee80211_sta_active_ibss(sdata); - - if (!active_ibss && !is_zero_ether_addr(ifibss->bssid)) { + if (!is_zero_ether_addr(ifibss->bssid)) { capability = WLAN_CAPABILITY_IBSS; if (ifibss->privacy) -- cgit From 338f977f4eb441e69bb9a46eaa0ac715c931a67f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 1 Feb 2014 00:16:23 +0100 Subject: mac80211: fix fragmentation code, particularly for encryption The "new" fragmentation code (since my rewrite almost 5 years ago) erroneously sets skb->len rather than using skb_trim() to adjust the length of the first fragment after copying out all the others. This leaves the skb tail pointer pointing to after where the data originally ended, and thus causes the encryption MIC to be written at that point, rather than where it belongs: immediately after the data. The impact of this is that if software encryption is done, then a) encryption doesn't work for the first fragment, the connection becomes unusable as the first fragment will never be properly verified at the receiver, the MIC is practically guaranteed to be wrong b) we leak up to 8 bytes of plaintext (!) of the packet out into the air This is only mitigated by the fact that many devices are capable of doing encryption in hardware, in which case this can't happen as the tail pointer is irrelevant in that case. Additionally, fragmentation is not used very frequently and would normally have to be configured manually. Fix this by using skb_trim() properly. Cc: stable@vger.kernel.org Fixes: 2de8e0d999b8 ("mac80211: rewrite fragmentation") Reported-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 27c990bf2320..97a02d3f7d87 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -878,7 +878,7 @@ static int ieee80211_fragment(struct ieee80211_tx_data *tx, } /* adjust first fragment's length */ - skb->len = hdrlen + per_fragm; + skb_trim(skb, hdrlen + per_fragm); return 0; } -- cgit From fab57a6cc227468ca9e6a4c7ff8d3b10727785ee Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 29 Jan 2014 13:28:02 +0100 Subject: mac80211: fix virtual monitor interface iteration During channel context assignment, the interface should be found by interface iteration, so we need to assign the pointer before the channel context. 
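Condensed from the iface.c hunk below, the ordering (and the rollback on failure) that the fix establishes:

    mutex_lock(&local->iflist_mtx);
    rcu_assign_pointer(local->monitor_sdata, sdata);    /* publish before channel use */
    mutex_unlock(&local->iflist_mtx);

    ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef,
                                    IEEE80211_CHANCTX_EXCLUSIVE);
    if (ret) {
            mutex_lock(&local->iflist_mtx);
            rcu_assign_pointer(local->monitor_sdata, NULL);   /* roll back on error */
            mutex_unlock(&local->iflist_mtx);
            synchronize_net();
            drv_remove_interface(local, sdata);
            kfree(sdata);
            return ret;
    }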
Reported-by: Emmanuel Grumbach Tested-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index ae2eb148a028..d6d1f1df9119 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -418,20 +418,24 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) return ret; } + mutex_lock(&local->iflist_mtx); + rcu_assign_pointer(local->monitor_sdata, sdata); + mutex_unlock(&local->iflist_mtx); + mutex_lock(&local->mtx); ret = ieee80211_vif_use_channel(sdata, &local->monitor_chandef, IEEE80211_CHANCTX_EXCLUSIVE); mutex_unlock(&local->mtx); if (ret) { + mutex_lock(&local->iflist_mtx); + rcu_assign_pointer(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + synchronize_net(); drv_remove_interface(local, sdata); kfree(sdata); return ret; } - mutex_lock(&local->iflist_mtx); - rcu_assign_pointer(local->monitor_sdata, sdata); - mutex_unlock(&local->iflist_mtx); - return 0; } -- cgit From 0bbfdfe8d25fcc1d5c2edb6b060fb0c5cf66aff9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 31 Jan 2014 19:33:39 +0200 Subject: libceph: factor out logic from ceph_osdc_start_request() Factor out logic from ceph_osdc_start_request() into a new helper, __ceph_osdc_start_request(). ceph_osdc_start_request() now amounts to taking locks and calling __ceph_osdc_start_request(). Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 62 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 166d4c7ba065..2aa82b6bb305 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1426,6 +1426,40 @@ static void __send_queued(struct ceph_osd_client *osdc) __send_request(osdc, req); } +/* + * Caller should hold map_sem for read and request_mutex. + */ +static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc; + + __register_request(osdc, req); + req->r_sent = 0; + req->r_got_reply = 0; + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } + return rc; + } + + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_queued(osdc); + } + + return 0; +} + /* * Timeout callback, called every N seconds when 1 or more osd * requests has been active for more than N seconds. 
When this @@ -2351,34 +2385,16 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail) { - int rc = 0; + int rc; down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - __register_request(osdc, req); - req->r_sent = 0; - req->r_got_reply = 0; - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } else { - __unregister_request(osdc, req); - } - goto out_unlock; - } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } else { - __send_queued(osdc); - } - rc = 0; -out_unlock: + + rc = __ceph_osdc_start_request(osdc, req, nofail); + mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); + return rc; } EXPORT_SYMBOL(ceph_osdc_start_request); -- cgit From ff513ace9b772e75e337f8e058cc7f12816843fe Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 3 Feb 2014 13:56:33 +0200 Subject: libceph: take map_sem for read in handle_reply() Handling redirect replies requires both map_sem and request_mutex. Taking map_sem unconditionally near the top of handle_reply() avoids possible race conditions that arise from releasing request_mutex to be able to acquire map_sem in redirect reply case. (Lock ordering is: map_sem, request_mutex, crush_mutex.) Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2aa82b6bb305..0676f2b199d6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1687,6 +1687,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, osdmap_epoch = ceph_decode_32(&p); /* lookup */ + down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); if (req == NULL) { @@ -1743,7 +1744,6 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, dout("redirect pool %lld\n", redir.oloc.pool); __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); req->r_target_oloc = redir.oloc; /* struct */ @@ -1755,10 +1755,10 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, * successfully. In the future we might want to follow * original request's nofail setting here. 
*/ - err = ceph_osdc_start_request(osdc, req, true); + err = __ceph_osdc_start_request(osdc, req, true); BUG_ON(err); - goto done; + goto out_unlock; } already_completed = req->r_got_reply; @@ -1776,8 +1776,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { dout("handle_reply tid %llu dup ack\n", tid); - mutex_unlock(&osdc->request_mutex); - goto done; + goto out_unlock; } dout("handle_reply tid %llu flags %d\n", tid, flags); @@ -1792,6 +1791,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, __unregister_request(osdc, req); mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); if (!already_completed) { if (req->r_unsafe_callback && @@ -1809,10 +1809,14 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, complete_request(req); } -done: +out: dout("req=%p req->r_linger=%d\n", req, req->r_linger); ceph_osdc_put_request(req); return; +out_unlock: + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + goto out; bad_put: req->r_result = -EIO; @@ -1825,6 +1829,7 @@ bad_put: ceph_osdc_put_request(req); bad_mutex: mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); bad: pr_err("corrupt osd_op_reply got %d %d\n", (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); -- cgit From 0ec1d15ec6ed513ab2cc86c67d94761d71228a32 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 5 Feb 2014 15:19:55 +0200 Subject: libceph: do not dereference a NULL bio pointer Commit f38a5181d9f3 ("ceph: Convert to immutable biovecs") introduced a NULL pointer dereference, which broke rbd in -rc1. Fix it. Cc: Kent Overstreet Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/messenger.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0e478a0f4204..30efc5c18622 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -840,9 +840,13 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->bvec_iter.bi_size) { bio = bio->bi_next; - cursor->bvec_iter = bio->bi_iter; + cursor->bio = bio; + if (bio) + cursor->bvec_iter = bio->bi_iter; + else + memset(&cursor->bvec_iter, 0, + sizeof(cursor->bvec_iter)); } - cursor->bio = bio; if (!cursor->last_piece) { BUG_ON(!cursor->resid); -- cgit
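For reference, a condensed view of the guarded chain advance that the final fix above introduces in ceph_msg_data_bio_advance() (same logic as the messenger.c hunk): bio->bi_next may be NULL at the end of the chain, so the iterator is only copied when a next bio actually exists.

    if (!cursor->bvec_iter.bi_size) {
            bio = bio->bi_next;             /* may be NULL at the end of the chain */
            cursor->bio = bio;
            if (bio)
                    cursor->bvec_iter = bio->bi_iter;
            else
                    memset(&cursor->bvec_iter, 0, sizeof(cursor->bvec_iter));
    }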