From a70e65df8812c52252fa07a2eb92a46451a4427f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2008 14:37:28 -0800 Subject: [PATCH] merge open_namei() and do_filp_open() open_namei() will, in the future, need to take mount write counts over its creation and truncation (via may_open()) operations. It needs to keep these write counts until any potential filp that is created gets __fput()'d. This gets complicated in the error handling and becomes very murky as to how far open_namei() actually got, and whether or not that mount write count was taken. That makes it a bad interface. All that the current do_filp_open() really does is allocate the nameidata on the stack, then call open_namei(). So, this merges those two functions and moves filp_open() over to namei.c so it can be close to its buddy: do_filp_open(). It also gets a kerneldoc comment in the process. Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Dave Hansen Signed-off-by: Al Viro --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b84b848431f2..013b9c2b88e6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1735,7 +1735,8 @@ extern struct file *create_read_pipe(struct file *f); extern struct file *create_write_pipe(void); extern void free_write_pipe(struct file *); -extern int open_namei(int dfd, const char *, int, int, struct nameidata *); +extern struct file *do_filp_open(int dfd, const char *pathname, + int open_flag, int mode); extern int may_open(struct nameidata *, int, int); extern int kernel_read(struct file *, unsigned long, char *, unsigned long); -- cgit From 8366025eb80dfa0d8d94b286d53027081c280ef1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:37:30 -0800 Subject: [PATCH] r/o bind mounts: stub functions This patch adds two functions, mnt_want_write() and mnt_drop_write(). These are used like a lock pair around any fs operations that might cause a write to the filesystem. Before these can become useful, we must first cover each place in the VFS where writes are performed with a want/drop pair. When that is complete, we can actually introduce code that will safely check the counts before allowing r/w<->r/o transitions to occur. Acked-by: Serge Hallyn Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Dave Hansen Signed-off-by: Al Viro --- fs/namespace.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mount.h | 3 +++ 2 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 94f026ec990a..066b393578c1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -80,6 +80,60 @@ struct vfsmount *alloc_vfsmnt(const char *name) return mnt; } +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/** + * mnt_want_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This tells the low-level filesystem that a write is + * about to be performed to it, and makes sure that + * writes are allowed before returning success. When + * the write operation is finished, mnt_drop_write() + * must be called. This is effectively a refcount.
+ */ +int mnt_want_write(struct vfsmount *mnt) +{ + if (__mnt_is_readonly(mnt)) + return -EROFS; + return 0; +} +EXPORT_SYMBOL_GPL(mnt_want_write); + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done + * performing writes to it. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ +} +EXPORT_SYMBOL_GPL(mnt_drop_write); + +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly ouside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This can not and + * should not be used in place of IS_RDONLY(inode). + */ +int __mnt_is_readonly(struct vfsmount *mnt) +{ + return (mnt->mnt_sb->s_flags & MS_RDONLY); +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) { mnt->mnt_sb = sb; diff --git a/include/linux/mount.h b/include/linux/mount.h index 5ee2df217cdf..2eecd2c8c760 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -71,9 +71,12 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt) return mnt; } +extern int mnt_want_write(struct vfsmount *mnt); +extern void mnt_drop_write(struct vfsmount *mnt); extern void mntput_no_expire(struct vfsmount *mnt); extern void mnt_pin(struct vfsmount *mnt); extern void mnt_unpin(struct vfsmount *mnt); +extern int __mnt_is_readonly(struct vfsmount *mnt); static inline void mntput(struct vfsmount *mnt) { -- cgit From aceaf78da92a53f5e1b105649a1b8c0afdb2135c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:37:31 -0800 Subject: [PATCH] r/o bind mounts: create helper to drop file write access If someone decides to demote a file from r/w to just r/o, they can use this same code as __fput(). NFS does just that, and will use this in the next patch. AV: drop write access in __fput() only after we evict from file list. Signed-off-by: Dave Hansen Cc: Erez Zadok Cc: Trond Myklebust Cc: "J Bruce Fields" Acked-by: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/file_table.c | 21 +++++++++++++++++++-- fs/nfsd/nfs4state.c | 3 ++- include/linux/file.h | 1 + 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index 986ff4ed0a7c..3f73eb1f195a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -211,6 +211,23 @@ void fput(struct file *file) EXPORT_SYMBOL(fput); +/** + * drop_file_write_access - give up ability to write to a file + * @file: the file to which we will stop writing + * + * This is a central place which will give up the ability + * to write to @file, along with access to write through + * its vfsmount. + */ +void drop_file_write_access(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + put_write_access(inode); +} +EXPORT_SYMBOL_GPL(drop_file_write_access); + /* __fput is called from task context when aio completion releases the last * last use of a struct file *. Do not use otherwise. 
*/ @@ -236,10 +253,10 @@ void __fput(struct file *file) if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); put_pid(file->f_owner.pid); file_kill(file); + if (file->f_mode & FMODE_WRITE) + drop_file_write_access(file); file->f_path.dentry = NULL; file->f_path.mnt = NULL; file_free(file); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bcb97d8e8b8b..81a75f3081f4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1239,7 +1240,7 @@ static inline void nfs4_file_downgrade(struct file *filp, unsigned int share_access) { if (share_access & NFS4_SHARE_ACCESS_WRITE) { - put_write_access(filp->f_path.dentry->d_inode); + drop_file_write_access(filp); filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; } } diff --git a/include/linux/file.h b/include/linux/file.h index 7239baac81a9..653477021e4c 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -61,6 +61,7 @@ extern struct kmem_cache *filp_cachep; extern void __fput(struct file *); extern void fput(struct file *); +extern void drop_file_write_access(struct file *file); struct file_operations; struct vfsmount; -- cgit From 3d733633a633065729c9e4e254b2e5442c00ef7e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:37:59 -0800 Subject: [PATCH] r/o bind mounts: track numbers of writers to mounts This is the real meat of the entire series. It actually implements the tracking of the number of writers to a mount. However, it causes scalability problems because there can be hundreds of cpus doing open()/close() on files on the same mnt at the same time. Even an atomic_t in the mnt has massive scaling problems because the cacheline gets so terribly contended. This uses a statically-allocated percpu variable. All want/drop operations are local to a cpu as long as that cpu operates on the same mount, and there are no writer count imbalances. Writer count imbalances happen when a write is taken on one cpu, and released on another, like when an open/close pair is performed on two different cpus. Upon a remount,ro request, all of the data from the percpu variables is collected (expensive, but very rare) and we determine if there are any outstanding writers to the mount. I've written a little benchmark to sit in a loop for a couple of seconds on several cpus in parallel doing open/write/close loops. http://sr71.net/~dave/linux/openbench.c The code in here is a worst-possible case for this patch. It does opens on a _pair_ of files in two different mounts in parallel. This should cause my code to lose its "operate on the same mount" optimization completely. This worst-case scenario causes a 3% degradation in the benchmark. I could probably get rid of even this 3%, but it would be more complex than what I have here, and I think this is getting into acceptable territory. In practice, I expect writing more than 3 bytes to a file, as well as disk I/O, to mask any effects that this has. (To get rid of that 3%, we could have an #defined number of mounts in the percpu variable. So, instead of a CPU getting to operate only on percpu data when it accesses only one mount, it could stay on percpu data when it only accesses N or fewer mounts.)
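(Editorial aside, not part of the patch and not the actual openbench.c linked from the commit message: the worst-case open/write/close loop it describes would look roughly like the sketch below, where the two hypothetical paths are assumed to live on two different mounts so that every iteration switches mounts and defeats the "operate on the same mount" optimization.)

#include <fcntl.h>
#include <unistd.h>

/* One worker; the benchmark runs several of these in parallel, one per cpu. */
static void open_write_close_loop(const char *path_on_mnt_a, const char *path_on_mnt_b)
{
	char byte = 'x';

	for (;;) {	/* the real benchmark bounds this by time, not forever */
		/* open/write/close on the first mount ... */
		int fd = open(path_on_mnt_a, O_WRONLY | O_CREAT, 0644);
		if (fd >= 0) {
			write(fd, &byte, 1);
			close(fd);
		}
		/* ... then immediately on the second mount */
		fd = open(path_on_mnt_b, O_WRONLY | O_CREAT, 0644);
		if (fd >= 0) {
			write(fd, &byte, 1);
			close(fd);
		}
	}
}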
[AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/namespace.c | 252 +++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/mount.h | 7 ++ 2 files changed, 244 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 066b393578c1..e3ce18d91aad 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) return tmp & (HASH_SIZE - 1); } +#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) + struct vfsmount *alloc_vfsmnt(const char *name) { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + atomic_set(&mnt->__mnt_writers, 0); if (name) { int size = strlen(name) + 1; char *newname = kmalloc(size, GFP_KERNEL); @@ -80,6 +84,92 @@ struct vfsmount *alloc_vfsmnt(const char *name) return mnt; } +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly ouside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This can not and + * should not be used in place of IS_RDONLY(inode). + * mnt_want/drop_write() will _keep_ the filesystem + * r/w. + */ +int __mnt_is_readonly(struct vfsmount *mnt) +{ + return (mnt->mnt_sb->s_flags & MS_RDONLY); +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + +struct mnt_writer { + /* + * If holding multiple instances of this lock, they + * must be ordered by cpu number. + */ + spinlock_t lock; + struct lock_class_key lock_class; /* compiles out with !lockdep */ + unsigned long count; + struct vfsmount *mnt; +} ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); + +static int __init init_mnt_writers(void) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); + spin_lock_init(&writer->lock); + lockdep_set_class(&writer->lock, &writer->lock_class); + writer->count = 0; + } + return 0; +} +fs_initcall(init_mnt_writers); + +static void unlock_mnt_writers(void) +{ + int cpu; + struct mnt_writer *cpu_writer; + + for_each_possible_cpu(cpu) { + cpu_writer = &per_cpu(mnt_writers, cpu); + spin_unlock(&cpu_writer->lock); + } +} + +static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) +{ + if (!cpu_writer->mnt) + return; + /* + * This is in case anyone ever leaves an invalid, + * old ->mnt and a count of 0. 
+ */ + if (!cpu_writer->count) + return; + atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers); + cpu_writer->count = 0; +} + /* + * must hold cpu_writer->lock + */ +static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, + struct vfsmount *mnt) +{ + if (cpu_writer->mnt == mnt) + return; + __clear_mnt_count(cpu_writer); + cpu_writer->mnt = mnt; +} + /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). @@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name) */ int mnt_want_write(struct vfsmount *mnt) { - if (__mnt_is_readonly(mnt)) - return -EROFS; - return 0; + int ret = 0; + struct mnt_writer *cpu_writer; + + cpu_writer = &get_cpu_var(mnt_writers); + spin_lock(&cpu_writer->lock); + if (__mnt_is_readonly(mnt)) { + ret = -EROFS; + goto out; + } + use_cpu_writer_for_mount(cpu_writer, mnt); + cpu_writer->count++; +out: + spin_unlock(&cpu_writer->lock); + put_cpu_var(mnt_writers); + return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); +static void lock_mnt_writers(void) +{ + int cpu; + struct mnt_writer *cpu_writer; + + for_each_possible_cpu(cpu) { + cpu_writer = &per_cpu(mnt_writers, cpu); + spin_lock(&cpu_writer->lock); + __clear_mnt_count(cpu_writer); + cpu_writer->mnt = NULL; + } +} + +/* + * These per-cpu write counts are not guaranteed to have + * matched increments and decrements on any given cpu. + * A file open()ed for write on one cpu and close()d on + * another cpu will imbalance this count. Make sure it + * does not get too far out of whack. + */ +static void handle_write_count_underflow(struct vfsmount *mnt) +{ + if (atomic_read(&mnt->__mnt_writers) >= + MNT_WRITER_UNDERFLOW_LIMIT) + return; + /* + * It isn't necessary to hold all of the locks + * at the same time, but doing it this way makes + * us share a lot more code. + */ + lock_mnt_writers(); + /* + * vfsmount_lock is for mnt_flags. + */ + spin_lock(&vfsmount_lock); + /* + * If coalescing the per-cpu writer counts did not + * get us back to a positive writer count, we have + * a bug. + */ + if ((atomic_read(&mnt->__mnt_writers) < 0) && + !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { + printk(KERN_DEBUG "leak detected on mount(%p) writers " + "count: %d\n", + mnt, atomic_read(&mnt->__mnt_writers)); + WARN_ON(1); + /* use the flag to keep the dmesg spam down */ + mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; + } + spin_unlock(&vfsmount_lock); + unlock_mnt_writers(); +} + /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access @@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write); */ void mnt_drop_write(struct vfsmount *mnt) { + int must_check_underflow = 0; + struct mnt_writer *cpu_writer; + + cpu_writer = &get_cpu_var(mnt_writers); + spin_lock(&cpu_writer->lock); + + use_cpu_writer_for_mount(cpu_writer, mnt); + if (cpu_writer->count > 0) { + cpu_writer->count--; + } else { + must_check_underflow = 1; + atomic_dec(&mnt->__mnt_writers); + } + + spin_unlock(&cpu_writer->lock); + /* + * Logically, we could call this each time, + * but the __mnt_writers cacheline tends to + * be cold, and makes this expensive. + */ + if (must_check_underflow) + handle_write_count_underflow(mnt); + /* + * This could be done right after the spinlock + * is taken because the spinlock keeps us on + * the cpu, and disables preemption. However, + * putting it here bounds the amount that + * __mnt_writers can underflow. Without it, + * we could theoretically wrap __mnt_writers. 
+ */ + put_cpu_var(mnt_writers); } EXPORT_SYMBOL_GPL(mnt_drop_write); -/* - * __mnt_is_readonly: check whether a mount is read-only - * @mnt: the mount to check for its write status - * - * This shouldn't be used directly ouside of the VFS. - * It does not guarantee that the filesystem will stay - * r/w, just that it is right *now*. This can not and - * should not be used in place of IS_RDONLY(inode). - */ -int __mnt_is_readonly(struct vfsmount *mnt) +int mnt_make_readonly(struct vfsmount *mnt) { - return (mnt->mnt_sb->s_flags & MS_RDONLY); + int ret = 0; + + lock_mnt_writers(); + /* + * With all the locks held, this value is stable + */ + if (atomic_read(&mnt->__mnt_writers) > 0) { + ret = -EBUSY; + goto out; + } + /* + * actually set mount's r/o flag here to make + * __mnt_is_readonly() true, which keeps anyone + * from doing a successful mnt_want_write(). + */ +out: + unlock_mnt_writers(); + return ret; } -EXPORT_SYMBOL_GPL(__mnt_is_readonly); int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) { @@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, static inline void __mntput(struct vfsmount *mnt) { + int cpu; struct super_block *sb = mnt->mnt_sb; + /* + * We don't have to hold all of the locks at the + * same time here because we know that we're the + * last reference to mnt and that no new writers + * can come in. + */ + for_each_possible_cpu(cpu) { + struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); + if (cpu_writer->mnt != mnt) + continue; + spin_lock(&cpu_writer->lock); + atomic_add(cpu_writer->count, &mnt->__mnt_writers); + cpu_writer->count = 0; + /* + * Might as well do this so that no one + * ever sees the pointer and expects + * it to be valid. + */ + cpu_writer->mnt = NULL; + spin_unlock(&cpu_writer->lock); + } + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + WARN_ON(atomic_read(&mnt->__mnt_writers)); dput(mnt->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); diff --git a/include/linux/mount.h b/include/linux/mount.h index 2eecd2c8c760..8c8e94369ac8 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -30,6 +31,7 @@ struct mnt_namespace; #define MNT_RELATIME 0x20 #define MNT_SHRINKABLE 0x100 +#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ @@ -62,6 +64,11 @@ struct vfsmount { int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; int mnt_ghosts; + /* + * This value is not stable unless all of the mnt_writers[] spinlocks + * are held, and all mnt_writer[]s on this mount have 0 as their ->count + */ + atomic_t __mnt_writers; }; static inline struct vfsmount *mntget(struct vfsmount *mnt) -- cgit From 2e4b7fcd926006531935a4c79a5e9349fe51125b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:38:00 -0800 Subject: [PATCH] r/o bind mounts: honor mount writer counts at remount Originally from: Herbert Poetzl This is the core of the read-only bind mount patch set. Note that this does _not_ add a "ro" option directly to the bind mount operation. 
If you require such a mount, you must first do the bind, then follow it up with a 'mount -o remount,ro' operation: If you wish to have a r/o bind mount of /foo on bar: mount --bind /foo /bar mount -o remount,ro /bar Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/namespace.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- include/linux/mount.h | 1 + 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index e3ce18d91aad..678f7ce060f2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -105,7 +105,11 @@ struct vfsmount *alloc_vfsmnt(const char *name) */ int __mnt_is_readonly(struct vfsmount *mnt) { - return (mnt->mnt_sb->s_flags & MS_RDONLY); + if (mnt->mnt_flags & MNT_READONLY) + return 1; + if (mnt->mnt_sb->s_flags & MS_RDONLY) + return 1; + return 0; } EXPORT_SYMBOL_GPL(__mnt_is_readonly); @@ -305,7 +309,7 @@ void mnt_drop_write(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(mnt_drop_write); -int mnt_make_readonly(struct vfsmount *mnt) +static int mnt_make_readonly(struct vfsmount *mnt) { int ret = 0; @@ -318,15 +322,25 @@ int mnt_make_readonly(struct vfsmount *mnt) goto out; } /* - * actually set mount's r/o flag here to make - * __mnt_is_readonly() true, which keeps anyone - * from doing a successful mnt_want_write(). + * nobody can do a successful mnt_want_write() with all + * of the counts in MNT_DENIED_WRITE and the locks held. */ + spin_lock(&vfsmount_lock); + if (!ret) + mnt->mnt_flags |= MNT_READONLY; + spin_unlock(&vfsmount_lock); out: unlock_mnt_writers(); return ret; } +static void __mnt_unmake_readonly(struct vfsmount *mnt) +{ + spin_lock(&vfsmount_lock); + mnt->mnt_flags &= ~MNT_READONLY; + spin_unlock(&vfsmount_lock); +} + int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) { mnt->mnt_sb = sb; @@ -693,7 +707,7 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_putc(m, '.'); mangle(m, mnt->mnt_sb->s_subtype); } - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw"); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { if (mnt->mnt_sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); @@ -1295,6 +1309,23 @@ out: return err; } +static int change_mount_flags(struct vfsmount *mnt, int ms_flags) +{ + int error = 0; + int readonly_request = 0; + + if (ms_flags & MS_RDONLY) + readonly_request = 1; + if (readonly_request == __mnt_is_readonly(mnt)) + return 0; + + if (readonly_request) + error = mnt_make_readonly(mnt); + else + __mnt_unmake_readonly(mnt); + return error; +} + /* * change filesystem flags. dir should be a physical root of filesystem. 
* If you've mounted a non-root directory somewhere and want to do remount @@ -1317,7 +1348,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, return -EINVAL; down_write(&sb->s_umount); - err = do_remount_sb(sb, flags, data, 0); + if (flags & MS_BIND) + err = change_mount_flags(nd->path.mnt, flags); + else + err = do_remount_sb(sb, flags, data, 0); if (!err) nd->path.mnt->mnt_flags = mnt_flags; up_write(&sb->s_umount); @@ -1701,6 +1735,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NODIRATIME; if (flags & MS_RELATIME) mnt_flags |= MNT_RELATIME; + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); diff --git a/include/linux/mount.h b/include/linux/mount.h index 8c8e94369ac8..d6600e3f7e45 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -29,6 +29,7 @@ struct mnt_namespace; #define MNT_NOATIME 0x08 #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 +#define MNT_READONLY 0x40 /* does the user want this to be r/o? */ #define MNT_SHRINKABLE 0x100 #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ -- cgit From ad775f5a8faa5845377f093ca11caf577404add9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:38:01 -0800 Subject: [PATCH] r/o bind mounts: debugging for missed calls There have been a few oopses caused by 'struct file's with NULL f_vfsmnts. There was also a set of potentially missed mnt_want_write()s from dentry_open() calls. This patch provides a very simple debugging framework to catch these kinds of bugs. It will WARN_ON() them, but should stop us from having any oopses or mnt_writer count imbalances. I'm quite convinced that this is a good thing because it found bugs in the stuff I was working on as soon as I wrote it. [hch: made it conditional on a debug option. 
But it's still a little bit too ugly] [hch: merged forced remount r/o fix from Dave and akpm's fix for the fix] Signed-off-by: Dave Hansen Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/file_table.c | 11 +++++++++-- fs/open.c | 12 +++++++++++- fs/super.c | 3 +++ include/linux/fs.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 10 ++++++++++ 5 files changed, 82 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index 71efc7000226..7a0a9b872251 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -42,6 +42,7 @@ static inline void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); + file_check_state(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -207,6 +208,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, * that we can do debugging checks at __fput() */ if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { + file_take_write(file); error = mnt_want_write(mnt); WARN_ON(error); } @@ -237,8 +239,13 @@ void drop_file_write_access(struct file *file) struct inode *inode = dentry->d_inode; put_write_access(inode); - if (!special_file(inode->i_mode)) - mnt_drop_write(mnt); + + if (special_file(inode->i_mode)) + return; + if (file_check_writeable(file) != 0) + return; + mnt_drop_write(mnt); + file_release_write(file); } EXPORT_SYMBOL_GPL(drop_file_write_access); diff --git a/fs/open.c b/fs/open.c index e58382d57e72..b70e7666bb2c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -806,6 +806,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, error = __get_file_write_access(inode, mnt); if (error) goto cleanup_file; + if (!special_file(inode->i_mode)) + file_take_write(f); } f->f_mapping = inode->i_mapping; @@ -847,8 +849,16 @@ cleanup_all: fops_put(f->f_op); if (f->f_mode & FMODE_WRITE) { put_write_access(inode); - if (!special_file(inode->i_mode)) + if (!special_file(inode->i_mode)) { + /* + * We don't consider this a real + * mnt_want/drop_write() pair + * because it all happenend right + * here, so just reset the state. 
+ */ + file_reset_write(f); mnt_drop_write(mnt); + } } file_kill(f); f->f_path.dentry = NULL; diff --git a/fs/super.c b/fs/super.c index 01d5c40e9119..1f8f05ede437 100644 --- a/fs/super.c +++ b/fs/super.c @@ -579,6 +579,9 @@ retry: if (!(f->f_mode & FMODE_WRITE)) continue; f->f_mode &= ~FMODE_WRITE; + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); mnt = mntget(f->f_path.mnt); file_list_unlock(); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 013b9c2b88e6..d1eeea669d2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -776,6 +776,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } +#define FILE_MNT_WRITE_TAKEN 1 +#define FILE_MNT_WRITE_RELEASED 2 + struct file { /* * fu_list becomes invalid after file_free is called and queued via @@ -810,6 +813,9 @@ struct file { spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; +#ifdef CONFIG_DEBUG_WRITECOUNT + unsigned long f_mnt_write_state; +#endif }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); @@ -818,6 +824,49 @@ extern spinlock_t files_lock; #define get_file(x) atomic_inc(&(x)->f_count) #define file_count(x) atomic_read(&(x)->f_count) +#ifdef CONFIG_DEBUG_WRITECOUNT +static inline void file_take_write(struct file *f) +{ + WARN_ON(f->f_mnt_write_state != 0); + f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; +} +static inline void file_release_write(struct file *f) +{ + f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; +} +static inline void file_reset_write(struct file *f) +{ + f->f_mnt_write_state = 0; +} +static inline void file_check_state(struct file *f) +{ + /* + * At this point, either both or neither of these bits + * should be set. + */ + WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); + WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); +} +static inline int file_check_writeable(struct file *f) +{ + if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) + return 0; + printk(KERN_WARNING "writeable file with no " + "mnt_want_write()\n"); + WARN_ON(1); + return -EINVAL; +} +#else /* !CONFIG_DEBUG_WRITECOUNT */ +static inline void file_take_write(struct file *filp) {} +static inline void file_release_write(struct file *filp) {} +static inline void file_reset_write(struct file *filp) {} +static inline void file_check_state(struct file *filp) {} +static inline int file_check_writeable(struct file *filp) +{ + return 0; +} +#endif /* CONFIG_DEBUG_WRITECOUNT */ + #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 95de3102bc87..623ef24c2381 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -427,6 +427,16 @@ config DEBUG_VM If unsure, say N. +config DEBUG_WRITECOUNT + bool "Debug filesystem writers count" + depends on DEBUG_KERNEL + help + Enable this to catch wrong use of the writers count in struct + vfsmount. This will increase the size of each file struct by + 32 bits. + + If unsure, say N. + config DEBUG_LIST bool "Debug linked list manipulation" depends on DEBUG_KERNEL -- cgit
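(Editorial addendum: taken together, these patches expect every VFS path that writes to a mount to be bracketed by the new helpers. A minimal sketch of that calling pattern follows; only mnt_want_write()/mnt_drop_write() come from the patches above, while the surrounding function and its arguments are hypothetical.)

#include <linux/mount.h>

static int example_write_path(struct vfsmount *mnt)
{
	int err;

	/* Fails with -EROFS if the mount (or its superblock) is read-only. */
	err = mnt_want_write(mnt);
	if (err)
		return err;

	/* ... perform the operation that dirties the filesystem ... */

	/*
	 * Every successful mnt_want_write() must be balanced by exactly one
	 * mnt_drop_write().  A missing drop shows up as an outstanding writer
	 * (-EBUSY at remount-r/o, or the WARN_ON() in __mntput()); an extra
	 * drop trips handle_write_count_underflow().
	 */
	mnt_drop_write(mnt);
	return 0;
}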