From 2436f039d26a91e5404974ee0cb789b17db46168 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 10 Apr 2006 00:17:20 -0700 Subject: [PATCH] Fix block device symlink name As noted further on in this file, some block devices have a / in their name, so fix the "block:..." symlink name the same way as the /sys/block name. Signed-off-by: Stephen Rothwell Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f3b6af071722..45ae7dd3c650 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -372,6 +372,7 @@ static char *make_block_name(struct gendisk *disk) char *name; static char *block_str = "block:"; int size; + char *s; size = strlen(block_str) + strlen(disk->disk_name) + 1; name = kmalloc(size, GFP_KERNEL); @@ -379,6 +380,10 @@ static char *make_block_name(struct gendisk *disk) return NULL; strcpy(name, block_str); strcat(name, disk->disk_name); + /* ewww... some of these buggers have / in name... */ + s = strchr(name, '/'); + if (s) + *s = '!'; return name; } -- cgit From 75616cf9854b83eb83a968b1338ae0ee11c9673c Mon Sep 17 00:00:00 2001 From: "Ananiev, Leonid I" Date: Mon, 10 Apr 2006 22:54:38 -0700 Subject: [PATCH] ext3: Fix missed mutex unlock A missing unlock_super() call is added in the error condition code path.
Signed-off-by: Leonid Ananiev Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ext3/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 14f5f6ea3e72..c5ffa8523968 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_journal; } -- cgit From 0a489cb3b6a7b277030cdbc97c2c65905db94536 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2006 13:02:48 -0700 Subject: x86: don't allow tail-calls in sys_ftruncate[64]() Gcc thinks it owns the incoming argument stack, but that's not true for "asmlinkage" functions, and it corrupts the caller-set-up argument stack when it pushes the third argument onto the stack. Which can result in %ebx getting corrupted in user space. Now, normally nobody sane would ever notice, since libc will save and restore %ebx anyway over the system call, but it's still wrong. I'd much rather have "asmlinkage" tell gcc directly that it doesn't own the stack, but no such attribute exists, so we're stuck with our hacky manual "prevent_tail_call()" macro once more (we've had the same issue before with sys_waitpid() and sys_wait4()). Thanks to Hans-Werner Hilse for reporting the issue and testing the fix. 
Signed-off-by: Linus Torvalds --- fs/open.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index c32c89d6d8db..8279c65d3bef 100644 --- a/fs/open.c +++ b/fs/open.c @@ -331,7 +331,9 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { - return do_sys_ftruncate(fd, length, 1); + long ret = do_sys_ftruncate(fd, length, 1); + prevent_tail_call(ret); + return ret; } /* LFS versions of truncate are only needed on 32 bit machines */ @@ -343,7 +345,9 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { - return do_sys_ftruncate(fd, length, 0); + long ret = do_sys_ftruncate(fd, length, 0); + prevent_tail_call(ret); + return ret; } #endif -- cgit From 385910f2b275a636238f70844f1b6da9fda6f2da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2006 13:22:59 -0700 Subject: x86: be careful about tailcall breakage for sys_open[at] too Came up through a quick grep for other cases similar to the ftruncate() one in commit 0a489cb3b6a7b277030cdbc97c2c65905db94536. Also, add a comment, so that people who read the code understand why we do what looks like a no-op. 
(Again, this won't actually matter to any sane user, since libc will save and restore the register gcc stomps on, but it's still wrong to stomp on it) Signed-off-by: Linus Torvalds --- fs/open.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 8279c65d3bef..53ec28c36777 100644 --- a/fs/open.c +++ b/fs/open.c @@ -332,6 +332,7 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } @@ -346,6 +347,7 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } @@ -1097,20 +1099,30 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) asmlinkage long sys_open(const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_openat); -- cgit From 91ad66ef4469cb631ec0ccd131b07f16770773f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:55:10 +0200 Subject: [PATCH] splice: close i_size truncate races on read We need to check i_size after doing a blocking readpage. 
Signed-off-by: Jens Axboe --- fs/splice.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8d57e89924a6..7e8585574726 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -145,8 +145,8 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long offset, - unsigned long len, unsigned int flags) + int nr_pages, unsigned long len, + unsigned int offset, unsigned int flags) { int ret, do_wakeup, i; @@ -243,14 +243,16 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int offset, nr_pages; + unsigned int loff, offset, nr_pages; struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index; + pgoff_t index, end_index; + loff_t isize; + size_t bytes; int i, error; index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + loff = offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -268,6 +270,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * Now fill in the holes: */ error = 0; + bytes = 0; for (i = 0; i < nr_pages; i++, index++) { find_page: /* @@ -336,13 +339,41 @@ readpage: goto find_page; break; } + + /* + * i_size must be checked after ->readpage(). 
+ */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + break; + } + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); + if (bytes + loff > isize) { + page_cache_release(page); + break; + } + /* + * force quit after adding this page + */ + nr_pages = i; + } } fill_it: pages[i] = page; + bytes += PAGE_CACHE_SIZE - loff; + loff = 0; } if (i) - return move_to_pipe(pipe, pages, i, offset, len, flags); + return move_to_pipe(pipe, pages, i, bytes, offset, flags); return error; } -- cgit From c4f895cbe1e95aab633207fb19c650b7c984c01a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:12 +0200 Subject: [PATCH] splice: cleanup the SPLICE_F_NONBLOCK handling - generic_file_splice_read() more readable and correct - Don't bail on page allocation with NONBLOCK set, just don't allow direct blocking on IO (eg lock_page). 
Signed-off-by: Jens Axboe --- fs/splice.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 7e8585574726..78cd264340f2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -278,14 +278,6 @@ find_page: */ page = find_get_page(mapping, index); if (!page) { - /* - * If in nonblock mode then dont block on - * readpage (we've kicked readahead so there - * will be asynchronous progress): - */ - if (flags & SPLICE_F_NONBLOCK) - break; - /* * page didn't exist, allocate one */ @@ -307,6 +299,13 @@ find_page: * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { + /* + * If in nonblock mode then dont block on waiting + * for an in-flight io page + */ + if (flags & SPLICE_F_NONBLOCK) + break; + lock_page(page); /* @@ -400,17 +399,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, while (len) { ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret <= 0) + if (ret < 0) break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } *ppos += ret; len -= ret; spliced += ret; - - if (!(flags & SPLICE_F_NONBLOCK)) - continue; - ret = -EAGAIN; - break; } if (spliced) -- cgit From 2a27250e6cf47ca1ea3bea0a55e4b7889c097627 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:40 +0200 Subject: [PATCH] tee: link_pipe() must be careful when dropping one of the pipe locks We need to ensure that we only drop a lock that is ordered last, to avoid ABBA deadlocks with competing processes. 
Signed-off-by: Jens Axboe --- fs/splice.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 78cd264340f2..4f5e6b09fb26 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1012,7 +1012,9 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, do_wakeup = 0, i; + int ret, do_wakeup, i, ipipe_first; + + ret = do_wakeup = ipipe_first = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1020,6 +1022,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * could deadlock (one doing tee from A -> B, the other from B -> A). */ if (ipipe->inode < opipe->inode) { + ipipe_first = 1; mutex_lock(&ipipe->inode->i_mutex); mutex_lock(&opipe->inode->i_mutex); } else { @@ -1068,9 +1071,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * We have input available, but no output room. - * If we already copied data, return that. + * If we already copied data, return that. If we + * need to drop the opipe lock, it must be ordered + * last to avoid deadlocks. */ - if (flags & SPLICE_F_NONBLOCK) { + if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { if (!ret) ret = -EAGAIN; break; @@ -1104,7 +1109,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (ret) break; } - if (flags & SPLICE_F_NONBLOCK) { + /* + * pipe_wait() drops the ipipe mutex. To avoid deadlocks + * with another process, we can only safely do that if + * the ipipe lock is ordered last. + */ + if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { if (!ret) ret = -EAGAIN; break; -- cgit From a4514ebd8e12c63c09ab02be518db545bd1d24af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:05 +0200 Subject: [PATCH] splice: offset fixes - We need to adjust *ppos for writes as well. - Copy back modified offset value if one was passed in, similar to what sendfile does. 
Signed-off-by: Jens Axboe --- fs/splice.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 4f5e6b09fb26..27d6408ff490 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -720,22 +720,26 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); - - /* - * If file or inode is SYNC and we actually wrote some data, sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) - && ret > 0) { + if (ret > 0) { struct inode *inode = mapping->host; - int err; - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); + *ppos += ret; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); - if (err) - ret = err; + if (err) + ret = err; + } } return ret; @@ -937,6 +941,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; loff_t offset, *off; + long ret; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { @@ -951,7 +956,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - return do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(pipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } pipe = out->f_dentry->d_inode->i_pipe; @@ -967,7 +977,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - return do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, pipe, len, flags); + + if (off_in && 
copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } return -EINVAL; -- cgit From 9e0267c26e237f84f608a68e579bf4eb89dad819 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:31 +0200 Subject: [PATCH] splice: fixup writeout path after ->map changes Since ->map() no longer locks the page, we need to adjust the handling of those pages (and stealing) a little. This now passes full regressions again. Signed-off-by: Jens Axboe --- fs/splice.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 27d6408ff490..22fac87e90b3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -50,7 +50,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct page *page = buf->page; struct address_space *mapping = page_mapping(page); - WARN_ON(!PageLocked(page)); + lock_page(page); + WARN_ON(!PageUptodate(page)); /* @@ -65,8 +66,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); - if (!remove_mapping(mapping, page)) + if (!remove_mapping(mapping, page)) { + unlock_page(page); return 1; + } buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; return 0; @@ -507,14 +510,12 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (sd->flags & SPLICE_F_MOVE) { /* * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. + * side (LRU and page cache) and we can reuse it. The page + * will also be looked on successful return. 
*/ if (buf->ops->steal(info, buf)) goto find_page; - /* - * this will also set the page locked - */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; @@ -523,15 +524,27 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, lru_cache_add(page); } else { find_page: - ret = -ENOMEM; - page = find_or_create_page(mapping, index, gfp_mask); - if (!page) - goto out_nomem; + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_nomem; + + /* + * This will also lock the page + */ + ret = add_to_page_cache_lru(page, mapping, index, + gfp_mask); + if (unlikely(ret)) + goto out; + } /* - * If the page is uptodate, it is also locked. If it isn't - * uptodate, we can mark it uptodate if we are filling the - * full page. Otherwise we need to read it in first... + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. */ if (!PageUptodate(page)) { if (sd->len < PAGE_CACHE_SIZE) { @@ -553,10 +566,8 @@ find_page: ret = -EIO; goto out; } - } else { - WARN_ON(!PageLocked(page)); + } else SetPageUptodate(page); - } } } @@ -585,10 +596,10 @@ find_page: mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) page_cache_release(page); - unlock_page(page); - } + + unlock_page(page); out_nomem: buf->ops->unmap(info, buf); return ret; -- cgit From 5e85d4abe3f43bb5362f384bab0e20ef082ce0b5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 18 Apr 2006 22:20:16 -0700 Subject: [PATCH] task: Make task list manipulations RCU safe While we can currently walk through thread groups, process groups, and sessions with just the rcu_read_lock, this opens the door to walking the entire task list. 
We already have all of the other RCU guarantees so there is no cost in doing this, this should be enough so that proc can stop taking the tasklist lock during readdir. prev_task was killed because it has no users, and using it will miss new tasks when doing an rcu traversal. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/sched.h | 3 +-- kernel/exit.c | 2 +- kernel/fork.c | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4121bb559739..3a79d97ac234 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -712,7 +712,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail(&current->tasks, &init_task.tasks); + list_add_tail_rcu(&current->tasks, &init_task.tasks); current->group_leader = current; leader->group_leader = current; diff --git a/include/linux/sched.h b/include/linux/sched.h index b7d31e2e1729..29b7d4f87d20 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1192,8 +1192,7 @@ extern void wait_task_inactive(task_t * p); #define remove_parent(p) list_del_init(&(p)->sibling) #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) -#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) -#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) +#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) diff --git a/kernel/exit.c b/kernel/exit.c index 1a9787ac6173..f86434d7b3d1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,7 +56,7 @@ static void __unhash_process(struct task_struct *p) detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); - list_del_init(&p->tasks); +
list_del_rcu(&p->tasks); __get_cpu_var(process_counts)--; } list_del_rcu(&p->thread_group); diff --git a/kernel/fork.c b/kernel/fork.c index 54b15f8cda53..34515772611e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1204,7 +1204,7 @@ static task_t *copy_process(unsigned long clone_flags, attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->signal->session); - list_add_tail(&p->tasks, &init_task.tasks); + list_add_tail_rcu(&p->tasks, &init_task.tasks); __get_cpu_var(process_counts)++; } attach_pid(p, PIDTYPE_PID, p->pid); -- cgit From dda27d1a55e185b0c5fd184b86ac26c66846f095 Mon Sep 17 00:00:00 2001 From: Arthur Othieno Date: Tue, 18 Apr 2006 22:20:57 -0700 Subject: [PATCH] hugetlbfs: add Kconfig help text In kernel bugzilla #6248 (http://bugzilla.kernel.org/show_bug.cgi?id=6248), Adrian Bunk notes that CONFIG_HUGETLBFS is missing Kconfig help text. Signed-off-by: Arthur Othieno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2524629dc835..f9b5842c8d2d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -842,6 +842,12 @@ config TMPFS config HUGETLBFS bool "HugeTLB file system support" depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + <file:Documentation/vm/hugetlbpage.txt> for details. + + If unsure, say N. config HUGETLB_PAGE def_bool HUGETLBFS -- cgit From ca99c1da080345e227cfb083c330a184d42e27f3 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 18 Apr 2006 22:21:46 -0700 Subject: [PATCH] Fix file lookup without ref There are places in the kernel where we look up files in fd tables and access the file structure without holding references to the file. So, we need special care to avoid the race between looking up files in the fd table and tearing down of the file on another CPU.
Otherwise, one might see a NULL f_dentry or such torn down version of the file. This patch fixes those special places where such a race may happen. Signed-off-by: Dipankar Sarma Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 8 ++++++-- fs/locks.c | 9 +++++++-- fs/proc/base.c | 21 +++++++++++++++------ 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 841f0bd3eaaf..f07637a8f88f 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2723,7 +2723,11 @@ static void __do_SAK(void *arg) } task_lock(p); if (p->files) { - rcu_read_lock(); + /* + * We don't take a ref to the file, so we must + * hold ->file_lock instead. + */ + spin_lock(&p->files->file_lock); fdt = files_fdtable(p->files); for (i=0; i < fdt->max_fds; i++) { filp = fcheck_files(p->files, i); @@ -2738,7 +2742,7 @@ static void __do_SAK(void *arg) break; } } - rcu_read_unlock(); + spin_unlock(&p->files->file_lock); } task_unlock(p); } while_each_thread(g, p); diff --git a/fs/locks.c b/fs/locks.c index dda83d6cd48b..efad798824dc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2230,7 +2230,12 @@ void steal_locks(fl_owner_t from) lock_kernel(); j = 0; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structures, so + * we need to acquire ->file_lock. 
+ */ + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (;;) { unsigned long set; @@ -2248,7 +2253,7 @@ void steal_locks(fl_owner_t from) set >>= 1; } } - rcu_read_unlock(); + spin_unlock(&files->file_lock); unlock_kernel(); } EXPORT_SYMBOL(steal_locks); diff --git a/fs/proc/base.c b/fs/proc/base.c index a3a3eecef689..6cc77dc3f3ff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm files = get_files_struct(task); if (files) { - rcu_read_lock(); + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { *mnt = mntget(file->f_vfsmnt); *dentry = dget(file->f_dentry); - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); return 0; } - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); } return -ENOENT; @@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, if (!files) goto out_unlock; inode->i_mode = S_IFLNK; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) goto out_unlock2; @@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, return NULL; out_unlock2: - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -- cgit From 95cf959b245832ad49bb333bf88f9805244b225d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Apr 2006 13:14:06 -0400 Subject: VFS: Fix another open intent Oops If the call to nfs_intent_set_file() fails to open a file in nfs4_proc_create(), we should return an error. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47ece1dd3c67..d86c0db7b1e8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1218,7 +1218,7 @@ out: return status; } -static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) { struct file *filp; @@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st struct nfs_open_context *ctx; ctx = (struct nfs_open_context *)filp->private_data; ctx->state = state; - } else - nfs4_close_state(state, nd->intent.open.flags); + return 0; + } + nfs4_close_state(state, nd->intent.open.flags); + return PTR_ERR(filp); } struct dentry * @@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 
nfs_setattr_update_inode(state->inode, sattr); } if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) - nfs4_intent_set_file(nd, dentry, state); + status = nfs4_intent_set_file(nd, dentry, state); else nfs4_close_state(state, flags); out: -- cgit From e99170ff3b799a9fd43d538932a9231fac1de9d4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Apr 2006 13:21:42 -0400 Subject: NFS,SUNRPC: Fix compiler warnings if CONFIG_PROC_FS & CONFIG_SYSCTL are unset Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 8 +++----- fs/nfs/file.c | 5 ++--- include/linux/sunrpc/xprt.h | 1 + 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0f583cb16ddb..3c72b0c07283 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { - struct dentry *dentry = iocb->ki_filp->f_dentry; - dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", - dentry->d_name.name, (long long) pos, nr_segs); + iocb->ki_filp->f_dentry->d_name.name, + (long long) pos, nr_segs); return -EINVAL; } @@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = { static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task = &data->task; data->inode = dreq->inode; data->cred = dreq->ctx->cred; @@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ dreq->commit_data = NULL; - dprintk("NFS: %5u initiated commit call\n", task->tk_pid); + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); lock_kernel(); rpc_execute(&data->task); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f1df2c8d9259..fade02c15e6e 100644 --- 
a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - inode->i_sb->s_id, inode->i_ino, + filp->f_dentry->d_inode->i_sb->s_id, + filp->f_dentry->d_inode->i_ino, fl->fl_type, fl->fl_flags); /* diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 7eebbab7160b..e8bbe8118de8 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -53,6 +53,7 @@ struct rpc_timeout { struct rpc_task; struct rpc_xprt; +struct seq_file; /* * This describes a complete RPC request -- cgit From ec535ce154f2eaad3d97f2f20a76a6d8bdac33e5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 18 Apr 2006 13:21:50 -0400 Subject: NFS: make 2 functions static Signed-off-by: Adrian Bunk Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 2 +- net/sunrpc/stats.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d2b66bad7d50..3ef739120dff 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) svc_wake_up(block->b_daemon); } -void nlmsvc_grant_release(void *data) +static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index dea529666d69..15c2db26767b 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -176,7 +176,8 @@ void rpc_count_iostats(struct rpc_task *task) op_metrics->om_execute += execute; } -void _print_name(struct seq_file *seq, unsigned int op, struct rpc_procinfo *procs) +static void _print_name(struct seq_file *seq, unsigned int op, + struct rpc_procinfo *procs) { if (procs[op].p_name) 
seq_printf(seq, "\t%12s: ", procs[op].p_name); -- cgit From b9d9506d944865876e67281a4e4269d823ce5381 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Wed, 19 Apr 2006 13:06:20 -0400 Subject: NFS: nfs_show_stats; for_each_possible_cpu(), not NR_CPUS Convert a for-loop that explicitly references "NR_CPUS" into the potentially more efficient for_each_possible_cpu() construct. Signed-off-by: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2f7656b911b6..d0b991a92327 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Display superblock I/O counters */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { struct nfs_iostats *stats; - if (!cpu_possible(cpu)) - continue; - preempt_disable(); stats = per_cpu_ptr(nfss->io_stats, cpu); -- cgit From 7451c4f0ee53e36fd74168af8df75b28fd04a2aa Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 19 Apr 2006 13:06:37 -0400 Subject: NFS: remove needless check in nfs_opendir() Local variable res was initialized to 0 - no check needed here. 
Signed-off-by: Carsten Otte Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a23f34894167..cae74dd4c7f5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = { static int nfs_opendir(struct inode *inode, struct file *filp) { - int res = 0; + int res; dfprintk(VFS, "NFS: opendir(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); lock_kernel(); /* Call generic open code in order to cache credentials */ - if (!res) - res = nfs_open(inode, filp); + res = nfs_open(inode, filp); unlock_kernel(); return res; } -- cgit From 82aa5d6183667aa2a5f3c61e390934b0273d2ad7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Apr 2006 13:05:48 +0200 Subject: [PATCH] splice: fix smaller sized splice reads Signed-off-by: Jens Axboe --- fs/splice.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 22fac87e90b3..0559e7577a04 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -275,6 +275,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = 0; bytes = 0; for (i = 0; i < nr_pages; i++, index++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min(len, PAGE_CACHE_SIZE - loff); find_page: /* * lookup the page for this index @@ -366,11 +375,13 @@ readpage: * force quit after adding this page */ nr_pages = i; + this_len = min(this_len, loff); } } fill_it: pages[i] = page; - bytes += PAGE_CACHE_SIZE - loff; + bytes += this_len; + len -= this_len; loff = 0; } -- cgit From 0bd4fa977f81c914eb8bada00284d0933825900e Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 21 Apr 2006 18:17:42 +0000 Subject: [CIFS] Do not take rename sem on most path based calls (during building of full path) to
avoid rename/readdir hang Reported by Alan Tyson Signed-off-by: Steve French --- fs/cifs/dir.c | 4 ---- fs/cifs/fcntl.c | 2 -- fs/cifs/file.c | 2 -- fs/cifs/inode.c | 6 ------ fs/cifs/link.c | 6 ------ fs/cifs/readdir.c | 2 -- fs/cifs/xattr.c | 8 -------- 7 files changed, 30 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1d0ca3eaaca5..3830dfeb31cf 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -139,9 +139,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -316,9 +314,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) rc = -ENOMEM; else if (pTcon->ses->capabilities & CAP_UNIX) { diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c index ec4dfe9bf5ef..633a93811328 100644 --- a/fs/cifs/fcntl.c +++ b/fs/cifs/fcntl.c @@ -86,9 +86,7 @@ int cifs_dir_notify(struct file * file, unsigned long arg) cifs_sb = CIFS_SB(file->f_dentry->d_sb); pTcon = cifs_sb->tcon; - mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5c497c529772..1476725e6051 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -203,9 +203,7 @@ int cifs_open(struct inode *inode, struct file *file) } } - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if
(full_path == NULL) { FreeXid(xid); return -ENOMEM; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 957ddd1571c6..4093764ef461 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -722,9 +722,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -807,9 +805,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -1141,9 +1137,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) rc = 0; } - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 9562f5bba65c..2ec99f833142 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -48,10 +48,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, /* No need to check for cross device links since server will do that BB note DFS case in future though (when we may have to check) */ - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); fromName = build_path_from_dentry(old_file); toName = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if((fromName == NULL) || (toName == NULL)) { rc = -ENOMEM; goto cifs_hl_exit; @@ -103,9 +101,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) xid = GetXid(); - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - 
mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if (!full_path) goto out_no_free; @@ -164,9 +160,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 2f6e2825571e..7b8591acc5ad 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -404,9 +404,7 @@ static int initiate_cifs_search(const int xid, struct file *file) if(pTcon == NULL) return -EINVAL; - mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { return -ENOMEM; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 3938444d87b2..7754d641775e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -62,9 +62,7 @@ int cifs_removexattr(struct dentry * direntry, const char * ea_name) cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -116,9 +114,7 @@ int cifs_setxattr(struct dentry * direntry, const char * ea_name, cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -223,9 +219,7 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name, cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -341,9 +335,7 @@ ssize_t 
cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size) cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; -- cgit From 296034f7de8bdf111984ce1630ac598a9c94a253 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 21 Apr 2006 18:18:37 +0000 Subject: [CIFS] Don't allow a backslash in a path component Unless Posix paths have been negotiated, the backslash, "\", is not a valid character in a path component. Signed-off-by: Dave Kleikamp Signed-off-by: Steve French --- fs/cifs/dir.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3830dfeb31cf..82315edc77d7 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -436,6 +436,20 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name cifs_sb = CIFS_SB(parent_dir_inode->i_sb); pTcon = cifs_sb->tcon; + /* + * Don't allow the separator character in a path component. + * The VFS will not allow "/", but "\" is allowed by posix. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + int i; + for (i = 0; i < direntry->d_name.len; i++) + if (direntry->d_name.name[i] == '\\') { + cFYI(1, ("Invalid file name")); + FreeXid(xid); + return ERR_PTR(-EINVAL); + } + } + /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ -- cgit From 45af7a0f2ebad1304cab956e15f0b37318226fcd Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 21 Apr 2006 22:52:25 +0000 Subject: [CIFS] Use the kthread_ API instead of opencoding lots of hairy code for kernel thread creation and teardown. 
It does not move the cifsd thread handling to kthread due to problems found in testing with wakeup of threads blocked in the socket peek api, but the other cifs kernel threads now use kthread. Also cleanup cifs_init to properly unwind when thread creation fails. Signed-off-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 99 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d4b713e5affb..c262d8874ce9 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "cifsfs.h" #include "cifspdu.h" #define DECLARE_GLOBALS_HERE @@ -75,9 +76,6 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, int, 0); MODULE_PARM_DESC(cifs_max_pending,"Simultaneous requests to server. Default: 50 Range: 2 to 256"); -static DECLARE_COMPLETION(cifs_oplock_exited); -static DECLARE_COMPLETION(cifs_dnotify_exited); - extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -841,10 +839,6 @@ static int cifs_oplock_thread(void * dummyarg) __u16 netfid; int rc; - daemonize("cifsoplockd"); - allow_signal(SIGTERM); - - oplockThread = current; do { if (try_to_freeze()) continue; @@ -900,9 +894,9 @@ static int cifs_oplock_thread(void * dummyarg) set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); /* yield in case q were corrupt */ } - } while(!signal_pending(current)); - oplockThread = NULL; - complete_and_exit (&cifs_oplock_exited, 0); + } while (!kthread_should_stop()); + + return 0; } static int cifs_dnotify_thread(void * dummyarg) @@ -910,10 +904,6 @@ static int cifs_dnotify_thread(void * dummyarg) struct list_head *tmp; struct cifsSesInfo *ses; - daemonize("cifsdnotifyd"); - allow_signal(SIGTERM); - - dnotifyThread = current; do { if(try_to_freeze()) continue; @@ -931,8 +921,9 @@ static int 
cifs_dnotify_thread(void * dummyarg) wake_up_all(&ses->server->response_q); } read_unlock(&GlobalSMBSeslock); - } while(!signal_pending(current)); - complete_and_exit (&cifs_dnotify_exited, 0); + } while (!kthread_should_stop()); + + return 0; } static int __init @@ -982,32 +973,48 @@ init_cifs(void) } rc = cifs_init_inodecache(); - if (!rc) { - rc = cifs_init_mids(); - if (!rc) { - rc = cifs_init_request_bufs(); - if (!rc) { - rc = register_filesystem(&cifs_fs_type); - if (!rc) { - rc = (int)kernel_thread(cifs_oplock_thread, NULL, - CLONE_FS | CLONE_FILES | CLONE_VM); - if(rc > 0) { - rc = (int)kernel_thread(cifs_dnotify_thread, NULL, - CLONE_FS | CLONE_FILES | CLONE_VM); - if(rc > 0) - return 0; - else - cERROR(1,("error %d create dnotify thread", rc)); - } else { - cERROR(1,("error %d create oplock thread",rc)); - } - } - cifs_destroy_request_bufs(); - } - cifs_destroy_mids(); - } - cifs_destroy_inodecache(); + if (rc) + goto out_clean_proc; + + rc = cifs_init_mids(); + if (rc) + goto out_destroy_inodecache; + + rc = cifs_init_request_bufs(); + if (rc) + goto out_destroy_mids; + + rc = register_filesystem(&cifs_fs_type); + if (rc) + goto out_destroy_request_bufs; + + oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd"); + if (IS_ERR(oplockThread)) { + rc = PTR_ERR(oplockThread); + cERROR(1,("error %d create oplock thread", rc)); + goto out_unregister_filesystem; } + + dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd"); + if (IS_ERR(dnotifyThread)) { + rc = PTR_ERR(dnotifyThread); + cERROR(1,("error %d create dnotify thread", rc)); + goto out_stop_oplock_thread; + } + + return 0; + + out_stop_oplock_thread: + kthread_stop(oplockThread); + out_unregister_filesystem: + unregister_filesystem(&cifs_fs_type); + out_destroy_request_bufs: + cifs_destroy_request_bufs(); + out_destroy_mids: + cifs_destroy_mids(); + out_destroy_inodecache: + cifs_destroy_inodecache(); + out_clean_proc: #ifdef CONFIG_PROC_FS cifs_proc_clean(); #endif @@ 
-1025,14 +1032,8 @@ exit_cifs(void) cifs_destroy_inodecache(); cifs_destroy_mids(); cifs_destroy_request_bufs(); - if(oplockThread) { - send_sig(SIGTERM, oplockThread, 1); - wait_for_completion(&cifs_oplock_exited); - } - if(dnotifyThread) { - send_sig(SIGTERM, dnotifyThread, 1); - wait_for_completion(&cifs_dnotify_exited); - } + kthread_stop(oplockThread); + kthread_stop(dnotifyThread); } MODULE_AUTHOR("Steve French "); -- cgit From 60808233f374aebba26488d06a5f25443f6763c3 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 22 Apr 2006 15:53:05 +0000 Subject: [CIFS] Readdir fixes to allow search to start at arbitrary position in directory Also includes first part of fix to compensate for servers which forget to return . and .. as well as updates to changelog and cifs readme. Signed-off-by: Steve French --- fs/cifs/CHANGES | 6 +++++- fs/cifs/README | 8 ++++++++ fs/cifs/cifssmb.c | 2 +- fs/cifs/connect.c | 5 ++++- fs/cifs/file.c | 32 ++++++++++++++++++++------------ fs/cifs/ntlmssp.c | 14 ++++++++++++++ fs/cifs/readdir.c | 43 ++++++++++++++++++++++--------------------- 7 files changed, 74 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 8a2de038882e..1a27ecb46c9a 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,7 +1,11 @@ Version 1.42 ------------ Fix slow oplock break when mounted to different servers at the same time and -the tids match and we try to find matching fid on wrong server. +the tids match and we try to find matching fid on wrong server. Fix read +looping when signing required by server (2.6.16 kernel only). Fix readdir +vs. rename race which could cause each to hang. Return . and .. even +if server does not. Allow searches to skip first three entries and +begin at any location. Fix oops in find_writeable_file. 
Version 1.41 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index b2b4d0803761..0355003f4f0a 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -511,6 +511,14 @@ LinuxExtensionsEnabled If set to one then the client will attempt to support and want to map the uid and gid fields to values supplied at mount (rather than the actual values, then set this to zero. (default 1) +Experimental When set to 1 used to enable certain experimental + features (currently enables multipage writes + when signing is enabled, the multipage write + performance enhancement was disabled when + signing turned on in case buffer was modified + just before it was sent, also this flag will + be used to use the new experimental sessionsetup + code). These experimental features and tracing can be enabled by changing flags in /proc/fs/cifs (after the cifs module has been installed or built into the diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d705500aa283..fd36892eda55 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3119,7 +3119,7 @@ findFirstRetry: psrch_inf->endOfSearch = FALSE; psrch_inf->entries_in_buffer = le16_to_cpu(parms->SearchCount); - psrch_inf->index_of_last_entry = + psrch_inf->index_of_last_entry = 2 /* skip . and .. 
*/ + psrch_inf->entries_in_buffer; *pnetfid = parms->SearchHandle; } else { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0b86d5ca9014..aaf151cb5822 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3447,7 +3447,10 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, pSesInfo->server->secMode, pSesInfo->server->capabilities, pSesInfo->server->timeZone)); - if (extended_security + if(experimEnabled > 1) + rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */, + &ntlmv2_flag, nls_info); + else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == NTLMSSP)) { cFYI(1, ("New style sesssetup")); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1476725e6051..e152bf6afa60 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -904,8 +904,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, if (rc != 0) break; } - /* BB FIXME We can not sign across two buffers yet */ - if((pTcon->ses->server->secMode & + if(experimEnabled || (pTcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0) { struct kvec iov[2]; unsigned int len; @@ -921,13 +920,13 @@ static ssize_t cifs_write(struct file *file, const char *write_data, *poffset, &bytes_written, iov, 1, long_op); } else - /* BB FIXME fixup indentation of line below */ - rc = CIFSSMBWrite(xid, pTcon, - open_file->netfid, - min_t(const int, cifs_sb->wsize, - write_size - total_written), - *poffset, &bytes_written, - write_data + total_written, NULL, long_op); + rc = CIFSSMBWrite(xid, pTcon, + open_file->netfid, + min_t(const int, cifs_sb->wsize, + write_size - total_written), + *poffset, &bytes_written, + write_data + total_written, + NULL, long_op); } if (rc || (bytes_written == 0)) { if (total_written) @@ -966,6 +965,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) struct cifsFileInfo *open_file; int rc; + /* Having a null inode here (because mapping->host was 
set to zero by + the VFS or MM) should not happen but we had reports of on oops (due to + it being zero) during stress testcases so we need to check for it */ + + if(cifs_inode == NULL) { + cERROR(1,("Null inode passed to cifs_writeable_file")); + dump_stack(); + return NULL; + } + read_lock(&GlobalSMBSeslock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (open_file->closePend) @@ -1091,12 +1100,11 @@ static int cifs_writepages(struct address_space *mapping, if (cifs_sb->wsize < PAGE_CACHE_SIZE) return generic_writepages(mapping, wbc); - /* BB FIXME we do not have code to sign across multiple buffers yet, - so go to older writepage style write which we can sign if needed */ if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server)) if(cifs_sb->tcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - return generic_writepages(mapping, wbc); + if(!experimEnabled) + return generic_writepages(mapping, wbc); /* * BB: Is this meaningful for a non-block-device file system? diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c index 78866f925747..115359cc7a32 100644 --- a/fs/cifs/ntlmssp.c +++ b/fs/cifs/ntlmssp.c @@ -121,6 +121,20 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type, } + /* copy session key */ + + /* if Unicode, align strings to two byte boundary */ + + /* copy user name */ /* BB Do we need to special case null user name? 
*/ + + /* copy domain name */ + + /* copy Linux version */ + + /* copy network operating system name */ + + /* update bcc and smb buffer length */ + /* rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */ /* SMB request buf freed in SendReceive2 */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 7b8591acc5ad..41c022e3c132 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -590,6 +590,13 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; + + /* if first entry in buf is zero then is first buffer + in search response data which means it is likely . and .. + will be in this buffer, although some servers do not return + . and .. for the root of a drive and for those we need + to start two entries earlier */ + /* dump_cifs_file_struct(file, "In fce ");*/ if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && is_dir_changed(file)) || @@ -632,23 +639,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, char * end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + smbCalcSize((struct smb_hdr *) cifsFile->srch_inf.ntwrk_buf_start); + + current_entry = cifsFile->srch_inf.srch_entries_start; first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; pos_in_buf = index_to_find - first_entry_in_buffer; cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); - current_entry = cifsFile->srch_inf.srch_entries_start; for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) { /* go entry by entry figuring out which is first */ - /* if( . or ..) - skip */ - rc = cifs_entry_is_dot(current_entry,cifsFile); - if(rc == 1) /* is . or .. 
so skip */ { - cFYI(1,("Entry is .")); /* BB removeme BB */ - /* continue; */ - } else if (rc == 2 ) { - cFYI(1,("Entry is ..")); /* BB removeme BB */ - /* continue; */ - } current_entry = nxt_dir_entry(current_entry,end_of_smb); } if((current_entry == NULL) && (i < pos_in_buf)) { @@ -768,6 +766,11 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if(file->f_dentry == NULL) return -ENOENT; + rc = cifs_entry_is_dot(pfindEntry,cifsF); + /* skip . and .. since we added them first */ + if(rc != 0) + return 0; + cifs_sb = CIFS_SB(file->f_dentry->d_sb); qstring.name = scratch_buf; @@ -896,22 +899,22 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) switch ((int) file->f_pos) { case 0: - /*if (filldir(direntry, ".", 1, file->f_pos, + if (filldir(direntry, ".", 1, file->f_pos, file->f_dentry->d_inode->i_ino, DT_DIR) < 0) { - cERROR(1, ("Filldir for current dir failed ")); + cERROR(1, ("Filldir for current dir failed")); rc = -ENOMEM; break; } - file->f_pos++; */ + file->f_pos++; case 1: - /* if (filldir(direntry, "..", 2, file->f_pos, + if (filldir(direntry, "..", 2, file->f_pos, file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { cERROR(1, ("Filldir for parent dir failed ")); rc = -ENOMEM; break; } - file->f_pos++; */ - case 2: + file->f_pos++; + default: /* 1) If search is active, is in current search buffer? if it before then restart search @@ -925,7 +928,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) return rc; } } - default: if(file->private_data == NULL) { rc = -EINVAL; FreeXid(xid); @@ -945,8 +947,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) kfree(cifsFile->search_resume_name); cifsFile->search_resume_name = NULL; */ - /* BB account for . and .. 
in f_pos as special case */ - rc = find_cifs_entry(xid,pTcon, file, &current_entry,&num_to_fill); if(rc) { @@ -975,7 +975,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) num_to_fill, i)); break; } - + /* if buggy server returns . and .. late do + we want to check for that here? */ rc = cifs_filldir(current_entry, file, filldir, direntry,tmp_buf); file->f_pos++; -- cgit From b9251b823b5e921c894eb135cb6c64abf483f50e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 22 Apr 2006 02:36:24 -0700 Subject: [PATCH] Fix reiserfs deadlock reiserfs_cache_default_acl() should return whether we successfully found the acl or not. We have to return correct value even if reiserfs_get_acl() returns error code and not just 0. Otherwise callers such as reiserfs_mkdir() can unnecessarily lock the xattrs and later functions such as reiserfs_new_inode() fail to notice that we have already taken the lock and try to take it again with obvious consequences. Signed-off-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr_acl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 58c418fbca2c..97ae1b92bc47 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -408,8 +408,9 @@ int reiserfs_cache_default_acl(struct inode *inode) acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); reiserfs_read_unlock_xattrs(inode->i_sb); reiserfs_read_unlock_xattr_i(inode); - ret = acl ?
1 : 0; - posix_acl_release(acl); + ret = (acl && !IS_ERR(acl)); + if (ret) + posix_acl_release(acl); } return ret; -- cgit From b66ac3ea21f81dea02cdb4e9de66ee6afdc540e4 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 23 Apr 2006 01:54:50 +0000 Subject: [CIFS] Fix typo in previous Signed-off-by: Steve French --- fs/cifs/readdir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 41c022e3c132..b689c5035124 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -766,7 +766,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if(file->f_dentry == NULL) return -ENOENT; - rc = cifs_entry_is_dot(pfindEntry,cifsF); + rc = cifs_entry_is_dot(pfindEntry,pCifsF); /* skip . and .. since we added them first */ if(rc != 0) return 0; -- cgit From 301dc3e6f6ea83703fa52919c00e60661da5a8fe Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 24 Apr 2006 16:24:54 +0000 Subject: [CIFS] Fix compile error when CONFIG_CIFS_EXPERIMENTAL is undefined Signed-off-by: Dave Kleikamp Signed-off-by: Steve French --- fs/cifs/connect.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index aaf151cb5822..d2ec806a4f32 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3447,10 +3447,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, pSesInfo->server->secMode, pSesInfo->server->capabilities, pSesInfo->server->timeZone)); +#ifdef CONFIG_CIFS_EXPERIMENTAL if(experimEnabled > 1) rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */, &ntlmv2_flag, nls_info); - else if (extended_security + else +#endif + if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == NTLMSSP)) { cFYI(1, ("New style sesssetup")); -- cgit From ba5f5d90c45a30e4e9a1bd136acf1b3973c905c8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 25 Apr 2006 15:33:34 +0200 
Subject: [PATCH] splice: fix min() warning Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 0559e7577a04..4aa67254740f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -283,7 +283,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, /* * this_len is the max we'll use from this page */ - this_len = min(len, PAGE_CACHE_SIZE - loff); + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); find_page: /* * lookup the page for this index -- cgit From 016b661e2f717168e600f3c85f29e1a49f88e004 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Apr 2006 15:42:00 +0200 Subject: [PATCH] splice: fix offset problems Make the move_from_pipe() actors return number of bytes processed, then move_from_pipe() can decide more cleverly when to move on to the next buffer. This fixes problems with pipe offset and differing file offset. Signed-off-by: Jens Axboe --- fs/splice.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 4aa67254740f..8c6030c762e2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -439,14 +439,13 @@ EXPORT_SYMBOL(generic_file_splice_read); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' - * using sendpage(). + * using sendpage(). Return the number of bytes sent. 
*/ static int pipe_to_sendpage(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; loff_t pos = sd->pos; - unsigned int offset; ssize_t ret; void *ptr; int more; @@ -461,16 +460,13 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, if (IS_ERR(ptr)) return PTR_ERR(ptr); - offset = pos & ~PAGE_CACHE_MASK; more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); + ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len, + &pos, more); buf->ops->unmap(info, buf); - if (ret == sd->len) - return 0; - - return -EIO; + return ret; } /* @@ -499,7 +495,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, struct file *file = sd->file; struct address_space *mapping = file->f_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); - unsigned int offset; + unsigned int offset, this_len; struct page *page; pgoff_t index; char *src; @@ -515,6 +511,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; + this_len = sd->len; + if (this_len + offset > PAGE_CACHE_SIZE) + this_len = PAGE_CACHE_SIZE - offset; + /* * Reuse buf page, if SPLICE_F_MOVE is set. */ @@ -558,7 +558,7 @@ find_page: * the full page. 
*/ if (!PageUptodate(page)) { - if (sd->len < PAGE_CACHE_SIZE) { + if (this_len < PAGE_CACHE_SIZE) { ret = mapping->a_ops->readpage(file, page); if (unlikely(ret)) goto out; @@ -582,7 +582,7 @@ find_page: } } - ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); + ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; @@ -592,18 +592,22 @@ find_page: if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { char *dst = kmap_atomic(page, KM_USER0); - memcpy(dst + offset, src + buf->offset, sd->len); + memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst, KM_USER0); } - ret = mapping->a_ops->commit_write(file, page, 0, sd->len); + ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; } else if (ret) goto out; + /* + * Return the number of bytes written. + */ + ret = this_len; mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: @@ -652,16 +656,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.len = sd.total_len; err = actor(pipe, buf, &sd); - if (err) { + if (err <= 0) { if (!ret && err != -ENODATA) ret = err; break; } - ret += sd.len; - buf->offset += sd.len; - buf->len -= sd.len; + ret += err; + buf->offset += err; + buf->len -= err; + + sd.len -= err; + sd.pos += err; + sd.total_len -= err; + if (sd.len) + continue; if (!buf->len) { buf->ops = NULL; @@ -672,8 +682,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, do_wakeup = 1; } - sd.pos += sd.len; - sd.total_len -= sd.len; if (!sd.total_len) break; } -- cgit From 5a5fb1ea74d8b82ca1461b885a1334fb21e037be Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 26 Apr 2006 10:48:55 +0200 Subject: Revert "[fuse] fix deadlock between fuse_put_super() and request_end()" This reverts 
73ce8355c243a434524a34c05cc417dd0467996e commit. It was wrong, because it didn't take into account the requirement, that iput() for background requests must be performed synchronously with ->put_super(), otherwise active inodes may remain after unmount. The right solution is to keep the sbput_sem and perform iput() within the locked region, but move fput() outside sbput_sem. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 28 ++++++++++++---------------- fs/fuse/fuse_i.h | 12 +++++++++--- fs/fuse/inode.c | 27 ++++++++++----------------- 3 files changed, 31 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cc750c68fe70..4967bd40b953 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -128,14 +128,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req) +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { - list_del_init(&req->bg_entry); + iput(req->inode); + iput(req->inode2); + if (req->file) + fput(req->file); + spin_lock(&fc->lock); + list_del(&req->bg_entry); if (fc->num_background == FUSE_MAX_BACKGROUND) { fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } fc->num_background--; + spin_unlock(&fc->lock); } /* @@ -165,27 +171,17 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); fuse_put_request(fc, req); } else { - struct inode *inode = req->inode; - struct inode *inode2 = req->inode2; - struct file *file = req->file; void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; - req->inode = NULL; - req->inode2 = NULL; - req->file = NULL; - if (!list_empty(&req->bg_entry)) - fuse_remove_background(fc, req); spin_unlock(&fc->lock); - + down_read(&fc->sbput_sem); + if (fc->mounted) + fuse_release_background(fc, req); + up_read(&fc->sbput_sem); if (end) end(fc, req); else fuse_put_request(fc, req); - - if (file) - fput(file); - iput(inode); - 
iput(inode2); } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 59661c481d9d..0474202cb5dc 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -258,9 +258,15 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; + /** RW semaphore for exclusion with fuse_put_super() */ + struct rw_semaphore sbput_sem; + /** The next unique request id */ u64 reqctr; + /** Mount is active */ + unsigned mounted; + /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -471,11 +477,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** - * Remove request from the the background list + * Release inodes and file associated with background request */ -void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); -/** Abort all requests */ +/* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); /** diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 43a6fc0db8a7..fd34037b0588 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -204,26 +204,17 @@ static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); + down_write(&fc->sbput_sem); + while (!list_empty(&fc->background)) + fuse_release_background(fc, + list_entry(fc->background.next, + struct fuse_req, bg_entry)); + spin_lock(&fc->lock); + fc->mounted = 0; fc->connected = 0; - while (!list_empty(&fc->background)) { - struct fuse_req *req = list_entry(fc->background.next, - struct fuse_req, bg_entry); - struct inode *inode = req->inode; - struct inode *inode2 = req->inode2; - - /* File would hold a reference to vfsmount */ - BUG_ON(req->file); - req->inode = NULL; - req->inode2 = NULL; - fuse_remove_background(fc, req); - - spin_unlock(&fc->lock); - iput(inode); - iput(inode2); - 
spin_lock(&fc->lock); - } spin_unlock(&fc->lock); + up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); @@ -395,6 +386,7 @@ static struct fuse_conn *new_conn(void) INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); INIT_LIST_HEAD(&fc->background); + init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); @@ -549,6 +541,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_free_req; sb->s_root = root_dentry; + fc->mounted = 1; fc->connected = 1; kobject_get(&fc->kobj); file->private_data = fc; -- cgit From 6dbbcb120570d747b00783820ee02d1e1bcf63de Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 26 Apr 2006 10:49:06 +0200 Subject: [fuse] fix deadlock between fuse_put_super() and request_end(), try #2 A deadlock was possible, when the last reference to the superblock was held due to a background request containing a file reference. Releasing the file would release the vfsmount which in turn would release the superblock. Since sbput_sem is held during the fput() and fuse_put_super() tries to acquire this same semaphore, a deadlock results. The solution is to move the fput() outside the region protected by sbput_sem. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4967bd40b953..104a62dadb94 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -128,12 +128,16 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } } +/* + * Called with sbput_sem held for read (request_end) or write + * (fuse_put_super). By the time fuse_put_super() is finished, all + * inodes belonging to background requests must be released, so the + * iputs have to be done within the locked region. 
+ */ void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); iput(req->inode2); - if (req->file) - fput(req->file); spin_lock(&fc->lock); list_del(&req->bg_entry); if (fc->num_background == FUSE_MAX_BACKGROUND) { @@ -178,6 +182,11 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (fc->mounted) fuse_release_background(fc, req); up_read(&fc->sbput_sem); + + /* fput must go outside sbput_sem, otherwise it can deadlock */ + if (req->file) + fput(req->file); + if (end) end(fc, req); else -- cgit From 8aa09a50b5d9dbdf627f79e19d72d82994348089 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 26 Apr 2006 10:49:16 +0200 Subject: [fuse] fix race between checking and setting file->private_data BKL does not protect against races if the task may sleep between checking and setting a value. So move checking of file->private_data near to setting it in fuse_fill_super(). Found by Al Viro. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd34037b0588..7627022446b2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -500,11 +500,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->f_op != &fuse_dev_operations) return -EINVAL; - /* Setting file->private_data can't race with other mount() - instances, since BKL is held for ->get_sb() */ - if (file->private_data) - return -EINVAL; - fc = new_conn(); if (!fc) return -ENOMEM; @@ -540,6 +535,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_free_req; + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + err = -EINVAL; + if (file->private_data) + goto err_kobject_del; + sb->s_root = root_dentry; fc->mounted = 1; fc->connected = 1; @@ -556,6 +557,8 @@ static int fuse_fill_super(struct 
super_block *sb, void *data, int silent) return 0; + err_kobject_del: + kobject_del(&fc->kobj); err_free_req: fuse_request_free(init_req); err_put_root: -- cgit From 912d35f86781e64d73be1ef358f703c08905ac37 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Apr 2006 10:59:21 +0200 Subject: [PATCH] Add support for the sys_vmsplice syscall sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice() moves data to a pipe, with the input being a user address range instead. This uses an approach suggested by Linus, where we can hold partial ranges inside the pages[] map. Hopefully this will be useful for network receive support as well. Signed-off-by: Jens Axboe --- arch/ia64/kernel/entry.S | 1 + arch/powerpc/kernel/systbl.S | 1 + arch/powerpc/platforms/cell/spu_callbacks.c | 1 + fs/splice.c | 292 ++++++++++++++++++++++++---- include/asm-i386/unistd.h | 3 +- include/asm-ia64/unistd.h | 3 +- include/asm-powerpc/unistd.h | 3 +- include/asm-x86_64/unistd.h | 4 +- include/linux/syscalls.h | 3 + 9 files changed, 268 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index e30798811216..bcb80ca5cf40 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1610,5 +1610,6 @@ sys_call_table: data8 sys_get_robust_list data8 sys_sync_file_range // 1300 data8 sys_tee + data8 sys_vmsplice .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 8d1522690501..0b98eea73c5e 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -324,6 +324,7 @@ COMPAT_SYS(ppoll) SYSCALL(unshare) SYSCALL(splice) SYSCALL(tee) +SYSCALL(vmsplice) /* * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c index deb3afb94484..b283380a2a18 100644 --- 
a/arch/powerpc/platforms/cell/spu_callbacks.c +++ b/arch/powerpc/platforms/cell/spu_callbacks.c @@ -318,6 +318,7 @@ void *spu_syscall_table[] = { [__NR_unshare] sys_unshare, [__NR_splice] sys_splice, [__NR_tee] sys_tee, + [__NR_vmsplice] sys_vmsplice, }; long spu_sys_callback(struct spu_syscall_block *s) diff --git a/fs/splice.c b/fs/splice.c index 8c6030c762e2..0b2c1f060cae 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Passed to the actors @@ -38,6 +39,22 @@ struct splice_desc { loff_t pos; /* file position */ }; +struct partial_page { + unsigned int offset; + unsigned int len; +}; + +/* + * Passed to move_to_pipe + */ +struct splice_pipe_desc { + struct page **pages; /* page map */ + struct partial_page *partial; /* pages[] may not be contig */ + int nr_pages; /* number of pages in map */ + unsigned int flags; /* splice flags */ + struct pipe_buf_operations *ops;/* ops associated with output pipe */ +}; + /* * Attempt to steal a page from a pipe buffer. 
This should perhaps go into * a vm helper function, it's already simplified quite a bit by the @@ -128,6 +145,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, kunmap(buf->page); } +static void *user_page_pipe_buf_map(struct file *file, + struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return kmap(buf->page); +} + +static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + kunmap(buf->page); +} + static void page_cache_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) { @@ -143,19 +173,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { .get = page_cache_pipe_buf_get, }; +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static struct pipe_buf_operations user_page_pipe_buf_ops = { + .can_merge = 0, + .map = user_page_pipe_buf_map, + .unmap = user_page_pipe_buf_unmap, + .release = page_cache_pipe_buf_release, + .steal = user_page_pipe_buf_steal, + .get = page_cache_pipe_buf_get, +}; + /* * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 
*/ -static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long len, - unsigned int offset, unsigned int flags) +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { - int ret, do_wakeup, i; + int ret, do_wakeup, page_nr; ret = 0; do_wakeup = 0; - i = 0; + page_nr = 0; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -171,27 +215,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, if (pipe->nrbufs < PIPE_BUFFERS) { int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; - struct page *page = pages[i++]; - unsigned long this_len; - this_len = PAGE_CACHE_SIZE - offset; - if (this_len > len) - this_len = len; - - buf->page = page; - buf->offset = offset; - buf->len = this_len; - buf->ops = &page_cache_pipe_buf_ops; + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->ops = spd->ops; pipe->nrbufs++; + page_nr++; + ret += buf->len; + if (pipe->inode) do_wakeup = 1; - ret += this_len; - len -= this_len; - offset = 0; - if (!--nr_pages) - break; - if (!len) + if (!--spd->nr_pages) break; if (pipe->nrbufs < PIPE_BUFFERS) continue; @@ -199,7 +235,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - if (flags & SPLICE_F_NONBLOCK) { + if (spd->flags & SPLICE_F_NONBLOCK) { if (!ret) ret = -EAGAIN; break; @@ -234,8 +270,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - while (i < nr_pages) - page_cache_release(pages[i++]); + while (page_nr < spd->nr_pages) + page_cache_release(spd->pages[page_nr++]); return ret; } @@ -246,17 +282,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int loff, offset, 
nr_pages; + unsigned int loff, nr_pages; struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; - size_t bytes; - int i, error; + size_t total_len; + int error; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + }; index = *ppos >> PAGE_CACHE_SHIFT; - loff = offset = *ppos & ~PAGE_CACHE_MASK; - nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) nr_pages = PIPE_BUFFERS; @@ -266,15 +309,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ - if (!offset || nr_pages > 1) - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!loff || spd.nr_pages > 1) + do_page_cache_readahead(mapping, in, index, spd.nr_pages); /* * Now fill in the holes: */ error = 0; - bytes = 0; - for (i = 0; i < nr_pages; i++, index++) { + total_len = 0; + for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) { unsigned int this_len; if (!len) @@ -367,26 +410,29 @@ readpage: */ if (end_index == index) { loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); - if (bytes + loff > isize) { + if (total_len + loff > isize) { page_cache_release(page); break; } /* * force quit after adding this page */ - nr_pages = i; + nr_pages = spd.nr_pages; this_len = min(this_len, loff); + loff = 0; } } fill_it: - pages[i] = page; - bytes += this_len; + pages[spd.nr_pages] = page; + partial[spd.nr_pages].offset = loff; + partial[spd.nr_pages].len = this_len; len -= this_len; + total_len += this_len; loff = 0; } - if (i) - return move_to_pipe(pipe, pages, i, bytes, offset, flags); + if (spd.nr_pages) + return move_to_pipe(pipe, &spd); return 
error; } @@ -1018,6 +1064,174 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } +/* + * Map an iov into an array of pages and offset/length tupples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our ones pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial) +{ + int buffers = 0, error = 0; + + /* + * It's ok to take the mmap_sem for reading, even + * across a "get_user()". + */ + down_read(&current->mm->mmap_sem); + + while (nr_vecs) { + unsigned long off, npages; + void __user *base; + size_t len; + int i; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + error = -EFAULT; + if (unlikely(!base)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > PIPE_BUFFERS - buffers) + npages = PIPE_BUFFERS - buffers; + + error = get_user_pages(current, current->mm, + (unsigned long) base, npages, 0, 0, + &pages[buffers], NULL); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. 
+ */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE) - off; + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. + */ + if (error < npages || buffers == PIPE_BUFFERS) + break; + + nr_vecs--; + iov++; + } + + up_read(&current->mm->mmap_sem); + + if (buffers) + return buffers; + + return error; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + * + * Note that vmsplice only supports splicing _from_ user memory to a pipe, + * not the other way around. Splicing from user memory is a simple operation + * that can be supported without any funky alignment restrictions or nasty + * vm tricks. We simply map in the user memory and fill them into a pipe. + * The reverse isn't quite as easy, though. There are two possible solutions + * for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Alas, it isn't here. 
+ * + */ +static long do_vmsplice(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + }; + + if (unlikely(!pipe)) + return -EBADF; + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial); + if (spd.nr_pages <= 0) + return spd.nr_pages; + + return move_to_pipe(pipe, &spd); +} + +asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct file *file; + long error; + int fput; + + error = -EBADF; + file = fget_light(fd, &fput); + if (file) { + if (file->f_mode & FMODE_WRITE) + error = do_vmsplice(file, iov, nr_segs, flags); + + fput_light(file, fput); + } + + return error; +} + asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags) diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index d81d6cfc1bb4..eb4b152c82fc 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -321,8 +321,9 @@ #define __NR_splice 313 #define __NR_sync_file_range 314 #define __NR_tee 315 +#define __NR_vmsplice 316 -#define NR_syscalls 316 +#define NR_syscalls 317 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index a40ebec6aeeb..7107763168bf 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -290,12 +290,13 @@ #define __NR_get_robust_list 1299 #define __NR_sync_file_range 1300 #define __NR_tee 1301 +#define __NR_vmsplice 1302 #ifdef __KERNEL__ #include -#define NR_syscalls 278 /* length of 
syscall table */ +#define NR_syscalls 279 /* length of syscall table */ #define __ARCH_WANT_SYS_RT_SIGACTION diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h index c612f1a62772..34325e292596 100644 --- a/include/asm-powerpc/unistd.h +++ b/include/asm-powerpc/unistd.h @@ -303,8 +303,9 @@ #define __NR_unshare 282 #define __NR_splice 283 #define __NR_tee 284 +#define __NR_vmsplice 285 -#define __NR_syscalls 285 +#define __NR_syscalls 286 #ifdef __KERNEL__ #define __NR__exit __NR_exit diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 98c36eae567c..feb77cb8c044 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -615,8 +615,10 @@ __SYSCALL(__NR_splice, sys_splice) __SYSCALL(__NR_tee, sys_tee) #define __NR_sync_file_range 277 __SYSCALL(__NR_sync_file_range, sys_sync_file_range) +#define __NR_vmsplice 278 +__SYSCALL(__NR_vmsplice, sys_vmsplice) -#define __NR_syscall_max __NR_sync_file_range +#define __NR_syscall_max __NR_vmsplice #ifndef __NO_STUBS diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d3ebc0e68b2b..3996960fc565 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -574,6 +574,9 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags); +asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags); + asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags); asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, -- cgit From 00522fb41a2a9bf0f98a007c0e2b516a3873148c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Apr 2006 14:39:29 +0200 Subject: [PATCH] splice: rearrange moving to/from pipe helpers We need these for people writing their own ->splice_read/write hooks. 
Signed-off-by: Jens Axboe --- fs/splice.c | 35 +++++++++++------------------------ include/linux/pipe_fs_i.h | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 0b2c1f060cae..447ebc0a37f3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -29,23 +29,13 @@ #include #include -/* - * Passed to the actors - */ -struct splice_desc { - unsigned int len, total_len; /* current and remaining length */ - unsigned int flags; /* splice flags */ - struct file *file; /* file to read/write */ - loff_t pos; /* file position */ -}; - struct partial_page { unsigned int offset; unsigned int len; }; /* - * Passed to move_to_pipe + * Passed to splice_to_pipe */ struct splice_pipe_desc { struct page **pages; /* page map */ @@ -192,8 +182,8 @@ static struct pipe_buf_operations user_page_pipe_buf_ops = { * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) +static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { int ret, do_wakeup, page_nr; @@ -432,7 +422,7 @@ fill_it: } if (spd.nr_pages) - return move_to_pipe(pipe, &spd); + return splice_to_pipe(pipe, &spd); return error; } @@ -666,17 +656,14 @@ out_nomem: return ret; } -typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, - struct splice_desc *); - /* * Pipe input worker. Most of this logic works like a regular pipe, the * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 
*/ -static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags, - splice_actor *actor) +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) { int ret, do_wakeup, err; struct splice_desc sd; @@ -795,7 +782,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, struct address_space *mapping = out->f_mapping; ssize_t ret; - ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); if (ret > 0) { struct inode *inode = mapping->host; @@ -837,7 +824,7 @@ EXPORT_SYMBOL(generic_file_splice_write); ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -924,7 +911,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, /* * We don't have an immediate reader, but we'll read the stuff - * out of the pipe right after the move_to_pipe(). So set + * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. 
*/ pipe->readers = 1; @@ -1210,7 +1197,7 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, if (spd.nr_pages <= 0) return spd.nr_pages; - return move_to_pipe(pipe, &spd); + return splice_to_pipe(pipe, &spd); } asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index ef7f33c0be19..0008d4bd4059 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -61,4 +61,21 @@ void __free_pipe_info(struct pipe_inode_info *); /* from/to, of course */ #define SPLICE_F_MORE (0x04) /* expect more data */ +/* + * Passed to the actors + */ +struct splice_desc { + unsigned int len, total_len; /* current and remaining length */ + unsigned int flags; /* splice flags */ + struct file *file; /* file to read/write */ + loff_t pos; /* file position */ +}; + +typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, + struct splice_desc *); + +extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int, + splice_actor *); + #endif -- cgit From de0bb97aff6743f71abb8ec581238e2bdae9cdd1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Apr 2006 07:26:09 +0100 Subject: [PATCH] forgotten ->b_data in memcpy() call in ext3/resize.c (oopsable) sbi->s_group_desc is an array of pointers to buffer_head. memcpy() of buffer size from address of buffer_head is a bad idea - it will generate junk in any case, may oops if buffer_head is close to the end of slab page and next page is not mapped and isn't what was intended there. IOW, ->b_data is missing in that call. Fortunately, result doesn't go into the primary on-disk data structures, so only backup ones get crap written to them; that had allowed this bug to remain unnoticed until now. 
Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index c5ffa8523968..8aac5334680d 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -213,7 +213,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; } lock_buffer(bh); - memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size); set_buffer_uptodate(gdb); unlock_buffer(bh); ext3_journal_dirty_metadata(handle, gdb); -- cgit From a090d9132c1e53e3517111123680c15afb25c0a4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Apr 2006 07:32:40 +0100 Subject: [PATCH] protect ext3 ioctl modifying append_only, immutable, etc. with i_mutex All modifications of ->i_flags in inodes that might be visible to somebody else must be under ->i_mutex. That patch fixes ext3 ioctl() setting S_APPEND and friends. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/ext3/ioctl.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index aaf1da17b6d4..8c22aa9a7fbb 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -48,6 +48,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if (!S_ISDIR(inode->i_mode)) flags &= ~EXT3_DIRSYNC_FL; + mutex_lock(&inode->i_mutex); oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -60,8 +61,10 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. 
Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); return -EPERM; + } } /* @@ -69,14 +72,18 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) + if (!capable(CAP_SYS_RESOURCE)) { + mutex_unlock(&inode->i_mutex); return -EPERM; + } } handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + mutex_unlock(&inode->i_mutex); return PTR_ERR(handle); + } if (IS_SYNC(inode)) handle->h_sync = 1; err = ext3_reserve_inode_write(handle, inode, &iloc); @@ -93,11 +100,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) + if (err) { + mutex_unlock(&inode->i_mutex); return err; + } if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); + mutex_unlock(&inode->i_mutex); return err; } case EXT3_IOC_GETVERSION: -- cgit From e7edf9cdeddc0cff125e8e658216efb2ff2b2219 Mon Sep 17 00:00:00 2001 From: James Morris Date: Wed, 26 Apr 2006 02:45:03 -0400 Subject: [PATCH] LSM: add missing hook to do_compat_readv_writev() This patch addresses a flaw in LSM, where there is no mediation of readv() and writev() for 32-bit compatible apps using a 64-bit kernel. This bug was discovered and fixed initially in the native readv/writev code [1], but was not fixed in the compat code. Thanks to Al for spotting this one. 
[1] http://lwn.net/Articles/154282/ Signed-off-by: James Morris Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/compat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 7f8e26ea427c..2e32bd340474 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1217,6 +1217,10 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (ret < 0) goto out; + ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE); + if (ret) + goto out; + fnv = NULL; if (type == READ) { fn = file->f_op->read; -- cgit From eb645a24de82496434cc81171d7f350edb327399 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Apr 2006 08:44:27 +0200 Subject: [PATCH] splice: switch to using page_cache_readahead() Avoids doing useless work, when the file is fully cached. Signed-off-by: Jens Axboe --- fs/splice.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 447ebc0a37f3..dc205f643090 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -299,8 +299,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ - if (!loff || spd.nr_pages > 1) - do_page_cache_readahead(mapping, in, index, spd.nr_pages); + if (!loff || nr_pages > 1) + page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); /* * Now fill in the holes: -- cgit From eb20796bf6fdb95ccf51440ba2a827149bdc037f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Apr 2006 11:05:22 +0200 Subject: [PATCH] splice: make the read-side do batched page lookups Use the new find_get_pages_contig() to potentially look up the entire splice range in one single call. This speeds up generic_file_splice_read() quite a bit. 
Signed-off-by: Jens Axboe --- fs/splice.c | 95 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index dc205f643090..a46ddd28561e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -279,7 +279,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, pgoff_t index, end_index; loff_t isize; size_t total_len; - int error; + int error, page_nr; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -307,39 +307,67 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, */ error = 0; total_len = 0; - for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) { - unsigned int this_len; - if (!len) - break; + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * allocate the rest. + */ + index += spd.nr_pages; + while (spd.nr_pages < nr_pages) { /* - * this_len is the max we'll use from this page - */ - this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); -find_page: - /* - * lookup the page for this index + * Page could be there, find_get_pages_contig() breaks on + * the first hole. */ page = find_get_page(mapping, index); if (!page) { /* - * page didn't exist, allocate one + * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); if (!page) break; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping)); + mapping_gfp_mask(mapping)); if (unlikely(error)) { page_cache_release(page); break; } - - goto readpage; + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); } + pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. 
+ */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = pages[page_nr]; + /* * If the page isn't uptodate, we may need to start io on it */ @@ -360,7 +388,6 @@ find_page: */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); break; } /* @@ -371,16 +398,20 @@ find_page: goto fill_it; } -readpage: /* * need to read in the page */ error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - page_cache_release(page); + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ if (error == AOP_TRUNCATED_PAGE) - goto find_page; + error = 0; + break; } @@ -389,10 +420,8 @@ readpage: */ isize = i_size_read(mapping->host); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) { - page_cache_release(page); + if (unlikely(!isize || index > end_index)) break; - } /* * if this is the last page, see if we need to shrink @@ -400,27 +429,33 @@ readpage: */ if (end_index == index) { loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); - if (total_len + loff > isize) { - page_cache_release(page); + if (total_len + loff > isize) break; - } /* * force quit after adding this page */ - nr_pages = spd.nr_pages; + len = this_len; this_len = min(this_len, loff); loff = 0; } } fill_it: - pages[spd.nr_pages] = page; - partial[spd.nr_pages].offset = loff; - partial[spd.nr_pages].len = this_len; + partial[page_nr].offset = loff; + partial[page_nr].len = this_len; len -= this_len; total_len += this_len; loff = 0; + spd.nr_pages++; + index++; } + /* + * Release any pages at the end, if we quit early. 
'i' is how far + * we got, 'nr_pages' is how many pages are in the map. + */ + while (page_nr < nr_pages) + page_cache_release(pages[page_nr++]); + if (spd.nr_pages) return splice_to_pipe(pipe, &spd); -- cgit From 2833c28aa0d0326780acfa61149a2a02dcb2c9b4 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Thu, 27 Apr 2006 15:46:42 +0200 Subject: [PATCH] powerpc: Wire up *at syscalls Wire up *at syscalls. This patch has been tested on ppc64 (using glibc's testsuite, both 32bit and 64bit), and compile-tested for ppc32 (I have currently no ppc32 system available, but I expect no problems). Signed-off-by: Andreas Schwab Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/systbl.S | 13 +++++++++++++ arch/powerpc/platforms/cell/spu_callbacks.c | 13 +++++++++++++ fs/stat.c | 2 +- include/asm-powerpc/unistd.h | 20 +++++++++++++++++++- 4 files changed, 46 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 0b98eea73c5e..cf56a1d499ff 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -325,6 +325,19 @@ SYSCALL(unshare) SYSCALL(splice) SYSCALL(tee) SYSCALL(vmsplice) +COMPAT_SYS(openat) +SYSCALL(mkdirat) +SYSCALL(mknodat) +SYSCALL(fchownat) +COMPAT_SYS(futimesat) +SYSX(sys_newfstatat, sys_fstatat64, sys_fstatat64) +SYSCALL(unlinkat) +SYSCALL(renameat) +SYSCALL(linkat) +SYSCALL(symlinkat) +SYSCALL(readlinkat) +SYSCALL(fchmodat) +SYSCALL(faccessat) /* * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c index b283380a2a18..95b36430aa0f 100644 --- a/arch/powerpc/platforms/cell/spu_callbacks.c +++ b/arch/powerpc/platforms/cell/spu_callbacks.c @@ -319,6 +319,19 @@ void *spu_syscall_table[] = { [__NR_splice] sys_splice, [__NR_tee] sys_tee, [__NR_vmsplice] sys_vmsplice, + [__NR_openat] sys_openat, + [__NR_mkdirat] sys_mkdirat, + [__NR_mknodat] sys_mknodat, + 
[__NR_fchownat] sys_fchownat, + [__NR_futimesat] sys_futimesat, + [__NR_newfstatat] sys_newfstatat, + [__NR_unlinkat] sys_unlinkat, + [__NR_renameat] sys_renameat, + [__NR_linkat] sys_linkat, + [__NR_symlinkat] sys_symlinkat, + [__NR_readlinkat] sys_readlinkat, + [__NR_fchmodat] sys_fchmodat, + [__NR_faccessat] sys_faccessat, }; long spu_sys_callback(struct spu_syscall_block *s) diff --git a/fs/stat.c b/fs/stat.c index 9948cc1685a4..0f282face322 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -261,7 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf) return error; } -#ifndef __ARCH_WANT_STAT64 +#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag) { diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h index 34325e292596..908acb44cb8a 100644 --- a/include/asm-powerpc/unistd.h +++ b/include/asm-powerpc/unistd.h @@ -304,8 +304,25 @@ #define __NR_splice 283 #define __NR_tee 284 #define __NR_vmsplice 285 +#define __NR_openat 286 +#define __NR_mkdirat 287 +#define __NR_mknodat 288 +#define __NR_fchownat 289 +#define __NR_futimesat 290 +#ifdef __powerpc64__ +#define __NR_newfstatat 291 +#else +#define __NR_fstatat64 291 +#endif +#define __NR_unlinkat 292 +#define __NR_renameat 293 +#define __NR_linkat 294 +#define __NR_symlinkat 295 +#define __NR_readlinkat 296 +#define __NR_fchmodat 297 +#define __NR_faccessat 298 -#define __NR_syscalls 286 +#define __NR_syscalls 299 #ifdef __KERNEL__ #define __NR__exit __NR_exit @@ -458,6 +475,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6 #ifdef CONFIG_PPC64 #define __ARCH_WANT_COMPAT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND +#define __ARCH_WANT_SYS_NEWFSTATAT #endif /* -- cgit From 46e678c96bbd775abd05d3ddbe2fd334794f9157 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 30 Apr 2006 16:36:32 +0200 Subject: 
[PATCH] splice: fix bugs with stealing regular pipe pages - Check that page has suitable count for stealing in the regular pipes. - pipe_to_file() assumes that the page is locked on successful steal, so do that in the pipe steal hook - Missing unlock_page() in add_to_page_cache() failure. Signed-off-by: Jens Axboe --- fs/pipe.c | 11 +++++++++-- fs/splice.c | 4 +++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 7fefb10db8d9..5a369273c51b 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -127,8 +127,15 @@ static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - buf->flags |= PIPE_BUF_FLAG_STOLEN; - return 0; + struct page *page = buf->page; + + if (page_count(page) == 1) { + buf->flags |= PIPE_BUF_FLAG_STOLEN; + lock_page(page); + return 0; + } + + return 1; } static void anon_pipe_buf_get(struct pipe_inode_info *info, diff --git a/fs/splice.c b/fs/splice.c index a46ddd28561e..9df28d30efa0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -599,8 +599,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, goto find_page; page = buf->page; - if (add_to_page_cache(page, mapping, index, gfp_mask)) + if (add_to_page_cache(page, mapping, index, gfp_mask)) { + unlock_page(page); goto find_page; + } if (!(buf->flags & PIPE_BUF_FLAG_LRU)) lru_cache_add(page); -- cgit From 0568b409c74f7a125d92a09a3f386785700ef688 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 19:50:48 +0200 Subject: [PATCH] splice: fix bugs in pipe_to_file() Found by Oleg Nesterov, fixed by me. - Only allow full pages to go to the page cache. - Check page != buf->page instead of using PIPE_BUF_FLAG_STOLEN. - Remember to clear 'stolen' if add_to_page_cache() fails. And as a cleanup on that: - Make the bottom fall-through logic a little less convoluted.
Also make the steal path hold an extra reference to the page, so we don't have to differentiate between stolen and non-stolen at the end. Signed-off-by: Jens Axboe --- fs/pipe.c | 3 --- fs/splice.c | 37 +++++++++++++++++++------------------ include/linux/pipe_fs_i.h | 3 +-- 3 files changed, 20 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 5a369273c51b..888f265011bf 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -99,8 +99,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, { struct page *page = buf->page; - buf->flags &= ~PIPE_BUF_FLAG_STOLEN; - /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep @@ -130,7 +128,6 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - buf->flags |= PIPE_BUF_FLAG_STOLEN; lock_page(page); return 0; } diff --git a/fs/splice.c b/fs/splice.c index 9df28d30efa0..1633778f3652 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -78,7 +78,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, return 1; } - buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; + buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } @@ -87,7 +87,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, { page_cache_release(buf->page); buf->page = NULL; - buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); + buf->flags &= ~PIPE_BUF_FLAG_LRU; } static void *page_cache_pipe_buf_map(struct file *file, @@ -587,9 +587,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, this_len = PAGE_CACHE_SIZE - offset; /* - * Reuse buf page, if SPLICE_F_MOVE is set. + * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full + * page. 
*/ - if (sd->flags & SPLICE_F_MOVE) { + if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) { /* * If steal succeeds, buf->page is now pruned from the vm * side (LRU and page cache) and we can reuse it. The page @@ -604,6 +605,8 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, goto find_page; } + page_cache_get(page); + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) lru_cache_add(page); } else { @@ -662,7 +665,7 @@ find_page: } else if (ret) goto out; - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + if (buf->page != page) { char *dst = kmap_atomic(page, KM_USER0); memcpy(dst + offset, src + buf->offset, this_len); @@ -671,22 +674,20 @@ find_page: } ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); - if (ret == AOP_TRUNCATED_PAGE) { + if (!ret) { + /* + * Return the number of bytes written and mark page as + * accessed, we are now done! + */ + ret = this_len; + mark_page_accessed(page); + balance_dirty_pages_ratelimited(mapping); + } else if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; - } else if (ret) - goto out; - - /* - * Return the number of bytes written. 
- */ - ret = this_len; - mark_page_accessed(page); - balance_dirty_pages_ratelimited(mapping); + } out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) - page_cache_release(page); - + page_cache_release(page); unlock_page(page); out_nomem: buf->ops->unmap(info, buf); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 0008d4bd4059..3130977fc6ab 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -5,8 +5,7 @@ #define PIPE_BUFFERS (16) -#define PIPE_BUF_FLAG_STOLEN 0x01 -#define PIPE_BUF_FLAG_LRU 0x02 +#define PIPE_BUF_FLAG_LRU 0x01 struct pipe_buffer { struct page *page; -- cgit From f84d751994441292593523c7069ed147176f6cab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 19:59:03 +0200 Subject: [PATCH] pipe: introduce ->pin() buffer operation The ->map() function is really expensive on highmem machines right now, since it has to use the slower kmap() instead of kmap_atomic(). Splice rarely needs to access the virtual address of a page, so it's a waste of time doing it. Introduce ->pin() to take over the responsibility of making sure the page data is valid. ->map() is then reduced to just kmap(). 
That way we can also share most of the pipe buffer ops between pipe.c and splice.c. Signed-off-by: Jens Axboe --- fs/pipe.c | 39 +++++++++++--------- fs/splice.c | 91 ++++++++++++++++------------------------------- include/linux/pipe_fs_i.h | 21 ++++++++++- 3 files changed, 73 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 888f265011bf..d9644fd9cc0d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -110,14 +110,14 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, page_cache_release(page); } -static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +void *generic_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { kunmap(buf->page); } @@ -135,19 +135,24 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, return 1; } -static void anon_pipe_buf_get(struct pipe_inode_info *info, - struct pipe_buffer *buf) +void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) { page_cache_get(buf->page); } +int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + return 0; +} + static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = anon_pipe_buf_map, - .unmap = anon_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, - .get = anon_pipe_buf_get, + .get = generic_pipe_buf_get, }; static ssize_t @@ -183,12 +188,14 @@ pipe_readv(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - addr = ops->map(filp, pipe, buf); - if (IS_ERR(addr)) { + error = ops->pin(pipe, buf); + if (error) { if (!ret) - ret = PTR_ERR(addr); +
error = ret; break; } + + addr = ops->map(pipe, buf); error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); ops->unmap(pipe, buf); if (unlikely(error)) { @@ -300,11 +307,11 @@ pipe_writev(struct file *filp, const struct iovec *_iov, void *addr; int error; - addr = ops->map(filp, pipe, buf); - if (IS_ERR(addr)) { - error = PTR_ERR(addr); + error = ops->pin(pipe, buf); + if (error) goto out; - } + + addr = ops->map(pipe, buf); error = pipe_iov_copy_from_user(offset + addr, iov, chars); ops->unmap(pipe, buf); diff --git a/fs/splice.c b/fs/splice.c index 1633778f3652..d7538d83c367 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -90,9 +90,8 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static void *page_cache_pipe_buf_map(struct file *file, - struct pipe_inode_info *info, - struct pipe_buffer *buf) +static int page_cache_pipe_buf_pin(struct pipe_inode_info *info, + struct pipe_buffer *buf) { struct page *page = buf->page; int err; @@ -118,49 +117,25 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * Page is ok afterall, fall through to mapping. + * Page is ok afterall, we are done. 
*/ unlock_page(page); } - return kmap(page); + return 0; error: unlock_page(page); - return ERR_PTR(err); -} - -static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, - struct pipe_buffer *buf) -{ - kunmap(buf->page); -} - -static void *user_page_pipe_buf_map(struct file *file, - struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return kmap(buf->page); -} - -static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - kunmap(buf->page); -} - -static void page_cache_pipe_buf_get(struct pipe_inode_info *info, - struct pipe_buffer *buf) -{ - page_cache_get(buf->page); + return err; } static struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, - .map = page_cache_pipe_buf_map, - .unmap = page_cache_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = page_cache_pipe_buf_pin, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, - .get = page_cache_pipe_buf_get, + .get = generic_pipe_buf_get, }; static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, @@ -171,11 +146,12 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, static struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = user_page_pipe_buf_map, - .unmap = user_page_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, - .get = page_cache_pipe_buf_get, + .get = generic_pipe_buf_get, }; /* @@ -517,26 +493,16 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, { struct file *file = sd->file; loff_t pos = sd->pos; - ssize_t ret; - void *ptr; - int more; + int ret, more; - /* - * Sub-optimal, but we are limited by the pipe ->map. We don't - * need a kmap'ed buffer here, we just want to make sure we - * have the page pinned if the pipe page originates from the - * page cache. 
- */ - ptr = buf->ops->map(file, info, buf); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; + ret = buf->ops->pin(info, buf); + if (!ret) { + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len, - &pos, more); + ret = file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); + } - buf->ops->unmap(info, buf); return ret; } @@ -569,15 +535,14 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, unsigned int offset, this_len; struct page *page; pgoff_t index; - char *src; int ret; /* * make sure the data in this buffer is uptodate */ - src = buf->ops->map(file, info, buf); - if (IS_ERR(src)) - return PTR_ERR(src); + ret = buf->ops->pin(info, buf); + if (unlikely(ret)) + return ret; index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; @@ -666,11 +631,16 @@ find_page: goto out; if (buf->page != page) { - char *dst = kmap_atomic(page, KM_USER0); + /* + * Careful, ->map() uses KM_USER0! + */ + char *src = buf->ops->map(info, buf); + char *dst = kmap_atomic(page, KM_USER1); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst, KM_USER1); + buf->ops->unmap(info, buf); } ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); @@ -690,7 +660,6 @@ out: page_cache_release(page); unlock_page(page); out_nomem: - buf->ops->unmap(info, buf); return ret; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 3130977fc6ab..b8aae1fc5185 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -14,10 +14,23 @@ struct pipe_buffer { unsigned int flags; }; +/* + * Note on the nesting of these functions: + * + * ->pin() + * ->steal() + * ... + * ->map() + * ... + * ->unmap() + * + * That is, ->map() must be called on a pinned buffer, same goes for ->steal(). 
+ */ struct pipe_buf_operations { int can_merge; - void * (*map)(struct file *, struct pipe_inode_info *, struct pipe_buffer *); + void * (*map)(struct pipe_inode_info *, struct pipe_buffer *); void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *); + int (*pin)(struct pipe_inode_info *, struct pipe_buffer *); void (*release)(struct pipe_inode_info *, struct pipe_buffer *); int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); void (*get)(struct pipe_inode_info *, struct pipe_buffer *); @@ -50,6 +63,12 @@ struct pipe_inode_info * alloc_pipe_info(struct inode * inode); void free_pipe_info(struct inode * inode); void __free_pipe_info(struct pipe_inode_info *); +/* Generic pipe buffer ops functions */ +void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *); +void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *); +void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); + /* * splice is tied to pipes as a transport (at least for now), so we'll just * add the splice flags here. -- cgit From 7f9c51f0d9783c78db5c2aa16806d0c256ac667f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 19:59:32 +0200 Subject: [PATCH] Add ->splice_read/splice_write to def_blk_fops It can use the generic handlers. 
Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index af88c43043d5..f5958f413bd1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1104,6 +1104,8 @@ const struct file_operations def_blk_fops = { .readv = generic_file_readv, .writev = generic_file_write_nolock, .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -- cgit From e27dedd84c119e2f7af54fcde3293be5ad812103 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 19:59:54 +0200 Subject: [PATCH] splice: call handle_ra_miss() on failure to lookup page Notify the readahead logic of the missing page. Suggested by Oleg Nesterov. Signed-off-by: Jens Axboe --- fs/splice.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index d7538d83c367..0a6916423e7d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -301,6 +301,12 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, */ page = find_get_page(mapping, index); if (!page) { + /* + * Make sure the read-ahead engine is notified + * about this failure. + */ + handle_ra_miss(mapping, &in->f_ra, index); + /* * page didn't exist, allocate one. */ -- cgit From f6762b7ad8edd6abc802542ce845d3bc8adcb92f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 20:02:05 +0200 Subject: [PATCH] pipe: enable atomic copying of pipe data to/from user space The pipe ->map() method uses kmap() to virtually map the pages, which is both slow and has known scalability issues on SMP. This patch enables atomic copying of pipe pages, by pre-faulting data and using kmap_atomic() instead. lmbench bw_pipe and lat_pipe measurements agree this is a Good Thing. 
Here are results from that on a UP machine with highmem (1.5GiB of RAM), running first a UP kernel, SMP kernel, and SMP kernel patched. Vanilla-UP: Pipe bandwidth: 1622.28 MB/sec Pipe bandwidth: 1610.59 MB/sec Pipe bandwidth: 1608.30 MB/sec Pipe latency: 7.3275 microseconds Pipe latency: 7.2995 microseconds Pipe latency: 7.3097 microseconds Vanilla-SMP: Pipe bandwidth: 1382.19 MB/sec Pipe bandwidth: 1317.27 MB/sec Pipe bandwidth: 1355.61 MB/sec Pipe latency: 9.6402 microseconds Pipe latency: 9.6696 microseconds Pipe latency: 9.6153 microseconds Patched-SMP: Pipe bandwidth: 1578.70 MB/sec Pipe bandwidth: 1579.95 MB/sec Pipe bandwidth: 1578.63 MB/sec Pipe latency: 9.1654 microseconds Pipe latency: 9.2266 microseconds Pipe latency: 9.1527 microseconds Signed-off-by: Jens Axboe --- fs/pipe.c | 141 ++++++++++++++++++++++++++++++++++++++-------- fs/splice.c | 4 +- include/linux/pipe_fs_i.h | 11 ++-- 3 files changed, 126 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index d9644fd9cc0d..3941a7f78b5d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -55,7 +55,8 @@ void pipe_wait(struct pipe_inode_info *pipe) } static int -pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) +pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, + int atomic) { unsigned long copy; @@ -64,8 +65,13 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) iov++; copy = min_t(unsigned long, len, iov->iov_len); - if (copy_from_user(to, iov->iov_base, copy)) - return -EFAULT; + if (atomic) { + if (__copy_from_user_inatomic(to, iov->iov_base, copy)) + return -EFAULT; + } else { + if (copy_from_user(to, iov->iov_base, copy)) + return -EFAULT; + } to += copy; len -= copy; iov->iov_base += copy; @@ -75,7 +81,8 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) } static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) +pipe_iov_copy_to_user(struct iovec *iov, 
const void *from, unsigned long len, + int atomic) { unsigned long copy; @@ -84,8 +91,13 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) iov++; copy = min_t(unsigned long, len, iov->iov_len); - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; + if (atomic) { + if (__copy_to_user_inatomic(iov->iov_base, from, copy)) + return -EFAULT; + } else { + if (copy_to_user(iov->iov_base, from, copy)) + return -EFAULT; + } from += copy; len -= copy; iov->iov_base += copy; @@ -94,6 +106,47 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } +/* + * Attempt to pre-fault in the user memory, so we can use atomic copies. + * Returns the number of bytes not faulted in. + */ +static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + if (fault_in_pages_writeable(iov->iov_base, this_len)) + break; + + len -= this_len; + iov++; + } + + return len; +} + +/* + * Pre-fault in the user memory, so we can use atomic copies. 
+ */ +static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + fault_in_pages_readable(iov->iov_base, this_len); + len -= this_len; + iov++; + } +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -111,15 +164,24 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, } void *generic_pipe_buf_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) + struct pipe_buffer *buf, int atomic) { + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); + } + return kmap(buf->page); } void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) + struct pipe_buffer *buf, void *map_data) { - kunmap(buf->page); + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); } static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, @@ -183,7 +245,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; - int error; + int error, atomic; if (chars > total_len) chars = total_len; @@ -195,12 +257,21 @@ pipe_readv(struct file *filp, const struct iovec *_iov, break; } - addr = ops->map(pipe, buf); - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(pipe, buf); + atomic = !iov_fault_in_pages_write(iov, chars); +redo: + addr = ops->map(pipe, buf, atomic); + error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); + ops->unmap(pipe, buf, addr); if (unlikely(error)) { + /* + * Just retry with the slow path if we failed. 
+ */ + if (atomic) { + atomic = 0; + goto redo; + } if (!ret) - ret = -EFAULT; + ret = error; break; } ret += chars; @@ -304,21 +375,28 @@ pipe_writev(struct file *filp, const struct iovec *_iov, int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { + int error, atomic = 1; void *addr; - int error; error = ops->pin(pipe, buf); if (error) goto out; - addr = ops->map(pipe, buf); + iov_fault_in_pages_read(iov, chars); +redo1: + addr = ops->map(pipe, buf, atomic); error = pipe_iov_copy_from_user(offset + addr, iov, - chars); - ops->unmap(pipe, buf); + chars, atomic); + ops->unmap(pipe, buf, addr); ret = error; do_wakeup = 1; - if (error) + if (error) { + if (atomic) { + atomic = 0; + goto redo1; + } goto out; + } buf->len += chars; total_len -= chars; ret = chars; @@ -341,7 +419,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; - int error; + char *src; + int error, atomic = 1; if (!page) { page = alloc_page(GFP_HIGHUSER); @@ -361,11 +440,27 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - error = pipe_iov_copy_from_user(kmap(page), iov, chars); - kunmap(page); + iov_fault_in_pages_read(iov, chars); +redo2: + if (atomic) + src = kmap_atomic(page, KM_USER0); + else + src = kmap(page); + + error = pipe_iov_copy_from_user(src, iov, chars, + atomic); + if (atomic) + kunmap_atomic(src, KM_USER0); + else + kunmap(page); + if (unlikely(error)) { + if (atomic) { + atomic = 0; + goto redo2; + } if (!ret) - ret = -EFAULT; + ret = error; break; } ret += chars; diff --git a/fs/splice.c b/fs/splice.c index 0a6916423e7d..d4664a297bab 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -640,13 +640,13 @@ find_page: /* * Careful, ->map() uses KM_USER0! 
*/ - char *src = buf->ops->map(info, buf); + char *src = buf->ops->map(info, buf, 1); char *dst = kmap_atomic(page, KM_USER1); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst, KM_USER1); - buf->ops->unmap(info, buf); + buf->ops->unmap(info, buf, src); } ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b8aae1fc5185..4c054491e38e 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -5,7 +5,8 @@ #define PIPE_BUFFERS (16) -#define PIPE_BUF_FLAG_LRU 0x01 +#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ +#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ struct pipe_buffer { struct page *page; @@ -28,8 +29,8 @@ struct pipe_buffer { */ struct pipe_buf_operations { int can_merge; - void * (*map)(struct pipe_inode_info *, struct pipe_buffer *); - void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *); + void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); + void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); int (*pin)(struct pipe_inode_info *, struct pipe_buffer *); void (*release)(struct pipe_inode_info *, struct pipe_buffer *); int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); @@ -64,8 +65,8 @@ void free_pipe_info(struct inode * inode); void __free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *); -void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *); +void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); +void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); -- cgit From 7afa6fd037e51e95d322990cb127bb2b1217251a 
Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 May 2006 20:02:33 +0200 Subject: [PATCH] vmsplice: allow user to pass in gift pages If SPLICE_F_GIFT is set, the user is basically giving these pages away to the kernel. That means we can steal them for e.g. page cache use instead of copying them. The data must be properly page aligned and also a multiple of the page size in length. Signed-off-by: Jens Axboe --- fs/splice.c | 28 +++++++++++++++++++++++++--- include/linux/pipe_fs_i.h | 2 ++ 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index d4664a297bab..b150493b6fc3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -141,7 +141,10 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - return 1; + if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) + return 1; + + return 0; } static struct pipe_buf_operations user_page_pipe_buf_ops = { @@ -186,6 +189,9 @@ static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; + pipe->nrbufs++; page_nr++; ret += buf->len; @@ -1073,7 +1079,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, */ static int get_iovec_page_array(const struct iovec __user *iov, unsigned int nr_vecs, struct page **pages, - struct partial_page *partial) + struct partial_page *partial, int aligned) { int buffers = 0, error = 0; @@ -1113,6 +1119,15 @@ static int get_iovec_page_array(const struct iovec __user *iov, * in the user pages. */ off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE.
+ */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; if (npages > PIPE_BUFFERS - buffers) npages = PIPE_BUFFERS - buffers; @@ -1206,7 +1221,8 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, else if (unlikely(!nr_segs)) return 0; - spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial); + spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, + flags & SPLICE_F_GIFT); if (spd.nr_pages <= 0) return spd.nr_pages; @@ -1314,6 +1330,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, obuf = opipe->bufs + nbuf; *obuf = *ibuf; + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + if (obuf->len > len) obuf->len = len; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 4c054491e38e..df4d3fa7d3dc 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -7,6 +7,7 @@ #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ +#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ struct pipe_buffer { struct page *page; @@ -79,6 +80,7 @@ int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); /* we may still block on the fd we splice */ /* from/to, of course */ #define SPLICE_F_MORE (0x04) /* expect more data */ +#define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */ /* * Passed to the actors -- cgit From d2610202290b4924b71747314a0f88f28807702e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 1 May 2006 12:15:48 -0700 Subject: [PATCH] x86_64: Add compat_sys_vmsplice and use it in x86-64 Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/ia32/ia32entry.S | 1 + fs/compat.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'fs') diff --git a/arch/x86_64/ia32/ia32entry.S 
b/arch/x86_64/ia32/ia32entry.S index 57fc37e0fb9c..5a92fed2d1d5 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -695,4 +695,5 @@ ia32_sys_call_table: .quad sys_splice .quad sys_sync_file_range .quad sys_tee + .quad compat_sys_vmsplice ia32_syscall_end: diff --git a/fs/compat.c b/fs/compat.c index 2e32bd340474..3f3e8f4d43d6 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1317,6 +1317,26 @@ out: return ret; } +asmlinkage long +compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, + unsigned int nr_segs, unsigned int flags) +{ + unsigned i; + struct iovec *iov; + if (nr_segs >= UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} + /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. -- cgit From 7591489a8fbee83f19bacc75756989a6a4d0389c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 May 2006 12:57:18 +0200 Subject: [PATCH] vmsplice: fix badly placed end paranthesis We need to use the minimum of {len, PAGE_SIZE-off}, not {len, PAGE_SIZE}-off. The latter doesn't make any sense, and could cause us to attempt negative length transfers... Signed-off-by: Jens Axboe --- fs/splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index b150493b6fc3..b0c157d76948 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1143,7 +1143,7 @@ static int get_iovec_page_array(const struct iovec __user *iov, * Fill this contiguous range into the partial page map.
*/ for (i = 0; i < error; i++) { - const int plen = min_t(size_t, len, PAGE_SIZE) - off; + const int plen = min_t(size_t, len, PAGE_SIZE - off); partial[buffers].offset = off; partial[buffers].len = plen; -- cgit From a893b99be71f1d669b74f840e3a683dd077d007b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 May 2006 15:03:27 +0200 Subject: [PATCH] splice: fix page LRU accounting Currently we rely on the PIPE_BUF_FLAG_LRU flag being set correctly to know whether we need to fiddle with page LRU state after stealing it, however for some origins we just don't know if the page is on the LRU list or not. So remove PIPE_BUF_FLAG_LRU and do this check/add manually in pipe_to_file() instead. Signed-off-by: Jens Axboe --- fs/splice.c | 31 +++++++++++++++++++++---------- include/linux/pipe_fs_i.h | 5 ++--- 2 files changed, 23 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index b0c157d76948..3318b965c10b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -78,7 +78,6 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, return 1; } - buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } @@ -86,8 +85,6 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) { page_cache_release(buf->page); - buf->page = NULL; - buf->flags &= ~PIPE_BUF_FLAG_LRU; } static int page_cache_pipe_buf_pin(struct pipe_inode_info *info, @@ -570,22 +567,36 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) { /* * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. The page - * will also be looked on successful return. + * side (page cache) and we can reuse it. The page will also + * be locked on successful return. */ if (buf->ops->steal(info, buf)) goto find_page; page = buf->page; + page_cache_get(page); + + /* + * page must be on the LRU for adding to the pagecache. 
+ * Check this without grabbing the zone lock, if it isn't + * the do grab the zone lock, recheck, and add if necessary. + */ + if (!PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (!PageLRU(page)) { + SetPageLRU(page); + add_page_to_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); + } + if (add_to_page_cache(page, mapping, index, gfp_mask)) { + page_cache_release(page); unlock_page(page); goto find_page; } - - page_cache_get(page); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(page); } else { find_page: page = find_lock_page(mapping, index); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index df4d3fa7d3dc..070954f05947 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -5,9 +5,8 @@ #define PIPE_BUFFERS (16) -#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ -#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ -#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ +#define PIPE_BUF_FLAG_ATOMIC 0x01 /* was atomically mapped */ +#define PIPE_BUF_FLAG_GIFT 0x02 /* page is a gift */ struct pipe_buffer { struct page *page; -- cgit From 330ab71619bacc4d4494227a6cfc9b7f5500403d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 May 2006 15:29:57 +0200 Subject: [PATCH] vmsplice: restrict stealing a little more Apply the same rules as the anon pipe pages, only allow stealing if no one else is using the page. 
Signed-off-by: Jens Axboe --- fs/pipe.c | 6 +++--- fs/splice.c | 2 +- include/linux/pipe_fs_i.h | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 3941a7f78b5d..5acd8954aaa0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -184,8 +184,8 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, kunmap(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; @@ -213,7 +213,7 @@ static struct pipe_buf_operations anon_pipe_buf_ops = { .unmap = generic_pipe_buf_unmap, .pin = generic_pipe_buf_pin, .release = anon_pipe_buf_release, - .steal = anon_pipe_buf_steal, + .steal = generic_pipe_buf_steal, .get = generic_pipe_buf_get, }; diff --git a/fs/splice.c b/fs/splice.c index 3318b965c10b..7fb04970c72d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -141,7 +141,7 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return 1; - return 0; + return generic_pipe_buf_steal(pipe, buf); } static struct pipe_buf_operations user_page_pipe_buf_ops = { diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 070954f05947..ba73108cbf8b 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -69,6 +69,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); +int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); /* * splice is tied to pipes as a transport (at least for now), so we'll just -- cgit