diff options
Diffstat (limited to 'fs')
92 files changed, 2261 insertions, 1162 deletions
diff --git a/fs/Makefile b/fs/Makefile index da0bbb456d3f..bedff48e8fdc 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o \ - stack.o fs_struct.o statfs.o fs_pin.o + stack.o fs_struct.o statfs.o fs_pin.o nsfs.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o @@ -286,12 +286,37 @@ static void aio_free_ring(struct kioctx *ctx) static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { + vma->vm_flags |= VM_DONTEXPAND; vma->vm_ops = &generic_file_vm_ops; return 0; } +static void aio_ring_remap(struct file *file, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + struct kioctx_table *table; + int i; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + for (i = 0; i < table->nr; i++) { + struct kioctx *ctx; + + ctx = table->table[i]; + if (ctx && ctx->aio_ring_file == file) { + ctx->user_id = ctx->mmap_base = vma->vm_start; + break; + } + } + + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); +} + static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, + .mremap = aio_ring_remap, }; #if IS_ENABLED(CONFIG_MIGRATION) @@ -1228,8 +1253,12 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ - wait_event_interruptible_hrtimeout(ctx->wait, - aio_read_events(ctx, min_nr, nr, event, &ret), until); + if (until.tv64 == 0) + aio_read_events(ctx, min_nr, nr, event, &ret); + else + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), + until); if (!ret && signal_pending(current)) ret = -EINTR; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index c04ef1d4f18a..97aff2879cda 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -254,6 +254,7 @@ static char *scanarg(char *s, char del) return NULL; } } + s[-1] ='\0'; return s; } @@ -378,8 +379,7 @@ static Node *create_entry(const char __user *buffer, size_t count) p = scanarg(p, del); if (!p) goto einval; - p[-1] = '\0'; - if (p == e->magic) + if (!e->magic[0]) goto einval; if (USE_DEBUG) print_hex_dump_bytes( @@ -391,8 +391,7 @@ static Node *create_entry(const char __user *buffer, size_t count) p = scanarg(p, del); if (!p) goto einval; - p[-1] = '\0'; - if (p == e->mask) { + if (!e->mask[0]) { e->mask = NULL; pr_debug("register: mask[raw]: none\n"); } else if (USE_DEBUG) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e6fbbd74b716..7e607416755a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3481,8 +3481,8 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end); -int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); +int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 type); int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 30965120772b..8c63419a7f70 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4121,12 +4121,6 @@ again: if (ret) break; - /* opt_discard */ - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_error_discard_extent(root, start, - end + 1 - start, - NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); btrfs_error_unpin_extent_range(root, start, end); cond_resched(); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 222d6aea4a8a..a80b97100d90 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1889,8 +1889,8 @@ static int btrfs_issue_discard(struct block_device *bdev, return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); } -static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) +int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) { int ret; u64 discarded_bytes = 0; @@ -5727,7 +5727,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } -static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, + const bool return_free_space) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; @@ -5751,7 +5752,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) if (start < cache->last_byte_to_unpin) { len = min(len, cache->last_byte_to_unpin - start); - btrfs_add_free_space(cache, start, len); + if (return_free_space) + btrfs_add_free_space(cache, start, len); } start += len; @@ -5815,7 +5817,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); - unpin_extent_range(root, start, end); + unpin_extent_range(root, start, end, true); cond_resched(); } @@ -8872,6 +8874,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) cache_node); rb_erase(&block_group->cache_node, &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); spin_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -9130,6 +9133,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) spin_lock(&info->block_group_cache_lock); rb_erase(&cache->cache_node, &info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); spin_unlock(&info->block_group_cache_lock); btrfs_put_block_group(cache); goto error; @@ -9271,6 +9275,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&cache->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_put_block_group(cache); return ret; @@ -9690,13 +9695,7 @@ out: int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { - return unpin_extent_range(root, start, end); -} - -int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) -{ - return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); + return unpin_extent_range(root, start, end, false); } int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 030847bf7cec..d6c03f7f136b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2966,8 +2966,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, bytes, &trimmed); + ret = btrfs_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); if (!ret) *total_trimmed += trimmed; @@ -3185,16 +3185,18 @@ out: spin_unlock(&block_group->lock); + lock_chunks(block_group->fs_info->chunk_root); em_tree = &block_group->fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, block_group->key.objectid, 1); BUG_ON(!em); /* logic error, can't happen */ + /* + * remove_extent_mapping() will delete us from the pinned_chunks + * list, which is protected by the chunk mutex. + */ remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - - lock_chunks(block_group->fs_info->chunk_root); - list_del_init(&em->list); unlock_chunks(block_group->fs_info->chunk_root); /* once for us and once for the tree */ diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0144790e296e..50c5a8762aed 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1485,7 +1485,7 @@ static void update_dev_time(char *path_name) struct file *filp; filp = filp_open(path_name, O_RDWR, 0); - if (!filp) + if (IS_ERR(filp)) return; file_update_time(filp); filp_close(filp, NULL); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 18c06bbaf136..f5013d92a7e6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -192,17 +192,30 @@ static int readpage_nounlock(struct file *filp, struct page *page) struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; int err = 0; + u64 off = page_offset(page); u64 len = PAGE_CACHE_SIZE; - err = ceph_readpage_from_fscache(inode, page); + if (off >= i_size_read(inode)) { + zero_user_segment(page, err, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + /* + * Uptodate inline data should have been added into page cache + * while getting Fcr caps. + */ + if (ci->i_inline_version != CEPH_INLINE_NONE) + return -EINVAL; + + err = ceph_readpage_from_fscache(inode, page); if (err == 0) goto out; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - (u64) page_offset(page), &len, + off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -319,7 +332,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) off, len); vino = ceph_vino(inode); req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 1, CEPH_OSD_OP_READ, + 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -384,6 +397,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, int rc = 0; int max = 0; + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) + return -EINVAL; + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, &nr_pages); @@ -673,7 +689,7 @@ static int ceph_writepages_start(struct address_space *mapping, int rc = 0; unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; - int do_sync; + int do_sync = 0; u64 truncate_size, snap_size; u32 truncate_seq; @@ -750,7 +766,6 @@ retry: last_snapc = snapc; while (!done && index <= end) { - int num_ops = do_sync ? 2 : 1; unsigned i; int first; pgoff_t next; @@ -850,7 +865,8 @@ get_more_pages: len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, - offset, &len, num_ops, + offset, &len, 0, + do_sync ? 2 : 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, @@ -862,6 +878,9 @@ get_more_pages: break; } + if (do_sync) + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + req->r_callback = writepages_finish; req->r_inode = inode; @@ -1204,6 +1223,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; + struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; int want, got, ret; @@ -1215,7 +1235,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; while (1) { got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, + -1, &got, &pinned_page); if (ret == 0) break; if (ret != -ERESTARTSYS) { @@ -1226,12 +1247,54 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); - ret = filemap_fault(vma, vmf); + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || + ci->i_inline_version == CEPH_INLINE_NONE) + ret = filemap_fault(vma, vmf); + else + ret = -EAGAIN; dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + if (pinned_page) + page_cache_release(pinned_page); ceph_put_cap_refs(ci, got); + if (ret != -EAGAIN) + return ret; + + /* read inline data */ + if (off >= PAGE_CACHE_SIZE) { + /* does not support inline data > PAGE_SIZE */ + ret = VM_FAULT_SIGBUS; + } else { + int ret1; + struct address_space *mapping = inode->i_mapping; + struct page *page = find_or_create_page(mapping, 0, + mapping_gfp_mask(mapping) & + ~__GFP_FS); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + ret1 = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret1 < 0 || off >= i_size_read(inode)) { + unlock_page(page); + page_cache_release(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + if (ret1 < PAGE_CACHE_SIZE) + zero_user_segment(page, ret1, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + SetPageUptodate(page); + vmf->page = page; + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; + } +out: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ret); return ret; } @@ -1250,6 +1313,19 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) size_t len; int want, got, ret; + if (ci->i_inline_version != CEPH_INLINE_NONE) { + struct page *locked_page = NULL; + if (off == 0) { + lock_page(page); + locked_page = page; + } + ret = ceph_uninline_data(vma->vm_file, locked_page); + if (locked_page) + unlock_page(locked_page); + if (ret < 0) + return VM_FAULT_SIGBUS; + } + if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else @@ -1263,7 +1339,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; while (1) { got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); if (ret == 0) break; if (ret != -ERESTARTSYS) { @@ -1297,11 +1374,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - if (ret != VM_FAULT_LOCKED) { + if (ret != VM_FAULT_LOCKED) unlock_page(page); - } else { + if (ret == VM_FAULT_LOCKED || + ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) @@ -1315,6 +1394,178 @@ out: return ret; } +void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (locked_page) { + page = locked_page; + } else { + if (i_size_read(inode) == 0) + return; + page = find_or_create_page(mapping, 0, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return; + if (PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return; + } + } + + dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n", + inode, ceph_vinop(inode), len, locked_page); + + if (len > 0) { + void *kaddr = kmap_atomic(page); + memcpy(kaddr, data, len); + kunmap_atomic(kaddr); + } + + if (page != locked_page) { + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } +} + +int ceph_uninline_data(struct file *filp, struct page *locked_page) +{ + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct page *page = NULL; + u64 len, inline_version; + int err = 0; + bool from_pagecache = false; + + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + dout("uninline_data %p %llx.%llx inline_version %llu\n", + inode, ceph_vinop(inode), inline_version); + + if (inline_version == 1 || /* initial version, no data */ + inline_version == CEPH_INLINE_NONE) + goto out; + + if (locked_page) { + page = locked_page; + WARN_ON(!PageUptodate(page)); + } else if (ceph_caps_issued(ci) & + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { + page = find_get_page(inode->i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + from_pagecache = true; + lock_page(page); + } else { + page_cache_release(page); + page = NULL; + } + } + } + + if (page) { + len = i_size_read(inode); + if (len > PAGE_CACHE_SIZE) + len = PAGE_CACHE_SIZE; + } else { + page = __page_cache_alloc(GFP_NOFS); + if (!page) { + err = -ENOMEM; + goto out; + } + err = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (err < 0) { + /* no inline data */ + if (err == -ENODATA) + err = 0; + goto out; + } + len = err; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 0, 1, + CEPH_OSD_OP_CREATE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + ci->i_snap_realm->cached_context, + 0, 0, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_put_request(req); + if (err < 0) + goto out; + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 1, 3, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + ci->i_snap_realm->cached_context, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &inline_version, + sizeof(inline_version), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", &inline_version, + sizeof(inline_version), 0, 0); + if (err) + goto out_put; + + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); +out_put: + ceph_osdc_put_request(req); + if (err == -ECANCELED) + err = 0; +out: + if (page && page != locked_page) { + if (from_pagecache) { + unlock_page(page); + page_cache_release(page); + } else + __free_pages(page, 0); + } + + dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", + inode, ceph_vinop(inode), inline_version, err); + return err; +} + static struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cefca661464b..b93c631c6c87 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -975,10 +975,12 @@ static int send_cap_msg(struct ceph_mds_session *session, kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, - u64 follows) + u64 follows, bool inline_data) { struct ceph_mds_caps *fc; struct ceph_msg *msg; + void *p; + size_t extra_len; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" " seq %u/%u mseq %u follows %lld size %llu/%llu" @@ -988,7 +990,10 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); + /* flock buffer size + inline version + inline data size */ + extra_len = 4 + 8 + 4; + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, + GFP_NOFS, false); if (!msg) return -ENOMEM; @@ -1020,6 +1025,14 @@ static int send_cap_msg(struct ceph_mds_session *session, fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); + p = fc + 1; + /* flock buffer size */ + ceph_encode_32(&p, 0); + /* inline version */ + ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); + /* inline data size */ + ceph_encode_32(&p, 0); + fc->xattr_version = cpu_to_le64(xattr_version); if (xattrs_buf) { msg->middle = ceph_buffer_get(xattrs_buf); @@ -1126,6 +1139,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 flush_tid = 0; int i; int ret; + bool inline_data; held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; @@ -1209,13 +1223,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, xattr_version = ci->i_xattrs.version; } + inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, uid, gid, mode, xattr_version, xattr_blob, - follows); + follows, inline_data); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); delayed = 1; @@ -1336,7 +1352,7 @@ retry: capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, capsnap->xattr_version, capsnap->xattr_blob, - capsnap->follows); + capsnap->follows, capsnap->inline_data); next_follows = capsnap->follows + 1; ceph_put_cap_snap(capsnap); @@ -2057,15 +2073,17 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - int *got, loff_t endoff, int *check_max, int *err) + loff_t endoff, int *got, struct page **pinned_page, + int *check_max, int *err) { struct inode *inode = &ci->vfs_inode; int ret = 0; - int have, implemented; + int have, implemented, _got = 0; int file_wanted; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); +again: spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2075,7 +2093,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; ret = 1; - goto out; + goto out_unlock; } /* finish pending truncate */ @@ -2095,7 +2113,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, *check_max = 1; ret = 1; } - goto out; + goto out_unlock; } /* * If a sync write is in progress, we must wait, so that we @@ -2103,7 +2121,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, */ if (__ceph_have_pending_cap_snap(ci)) { dout("get_cap_refs %p cap_snap_pending\n", inode); - goto out; + goto out_unlock; } } @@ -2120,18 +2138,50 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { - *got = need | (have & want); - __take_cap_refs(ci, *got); + _got = need | (have & want); + __take_cap_refs(ci, _got); ret = 1; } } else { dout("get_cap_refs %p have %s needed %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } -out: +out_unlock: spin_unlock(&ci->i_ceph_lock); + + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(inode) > 0) { + int ret1; + struct page *page = find_get_page(inode->i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + goto out; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while holding + * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* getattr request will bring inline data into page cache */ + ret1 = __ceph_do_getattr(inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret1 >= 0) { + ret = 0; + goto again; + } + *err = ret1; + ret = 1; + } +out: dout("get_cap_refs %p ret %d got %s\n", inode, - ret, ceph_cap_string(*got)); + ret, ceph_cap_string(_got)); + *got = _got; return ret; } @@ -2168,8 +2218,8 @@ static void check_max_size(struct inode *inode, loff_t endoff) * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, - loff_t endoff) +int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, + loff_t endoff, int *got, struct page **pinned_page) { int check_max, ret, err; @@ -2179,8 +2229,8 @@ retry: check_max = 0; err = 0; ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, - got, endoff, + try_get_cap_refs(ci, need, want, endoff, + got, pinned_page, &check_max, &err)); if (err) ret = err; @@ -2383,6 +2433,8 @@ static void invalidate_aliases(struct inode *inode) static void handle_cap_grant(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *grant, void *snaptrace, int snaptrace_len, + u64 inline_version, + void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, struct ceph_cap *cap, int issued) @@ -2403,6 +2455,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool queue_invalidate = false; bool queue_revalidate = false; bool deleted_inode = false; + bool fill_inline = false; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); @@ -2576,6 +2629,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } BUG_ON(cap->issued & ~cap->implemented); + if (inline_version > 0 && inline_version >= ci->i_inline_version) { + ci->i_inline_version = inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) + fill_inline = true; + } + spin_unlock(&ci->i_ceph_lock); if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { @@ -2589,6 +2649,9 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, wake = true; } + if (fill_inline) + ceph_fill_inline_data(inode, NULL, inline_data, inline_len); + if (queue_trunc) { ceph_queue_vmtruncate(inode); ceph_queue_revalidate(inode); @@ -2996,11 +3059,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 cap_id; u64 size, max_size; u64 tid; + u64 inline_version = 0; + void *inline_data = NULL; + u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; - void *flock; - void *end; - u32 flock_len; + void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3021,30 +3085,37 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); + p = snaptrace + snaptrace_len; if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p = snaptrace + snaptrace_len; + u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); if (p + flock_len > end) goto bad; - flock = p; - } else { - flock = NULL; - flock_len = 0; + p += flock_len; } if (le16_to_cpu(msg->hdr.version) >= 3) { if (op == CEPH_CAP_OP_IMPORT) { - void *p = flock + flock_len; if (p + sizeof(*peer) > end) goto bad; peer = p; + p += sizeof(*peer); } else if (op == CEPH_CAP_OP_EXPORT) { /* recorded in unused fields */ peer = (void *)&h->size; } } + if (le16_to_cpu(msg->hdr.version) >= 4) { + ceph_decode_64_safe(&p, end, inline_version, bad); + ceph_decode_32_safe(&p, end, inline_len, bad); + if (p + inline_len > end) + goto bad; + inline_data = p; + p += inline_len; + } + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -3085,6 +3156,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, handle_cap_import(mdsc, inode, h, peer, session, &cap, &issued); handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + inline_version, inline_data, inline_len, msg->middle, session, cap, issued); goto done_unlocked; } @@ -3105,8 +3177,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, - session, cap, issued); + handle_cap_grant(mdsc, inode, h, NULL, 0, + inline_version, inline_data, inline_len, + msg->middle, session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -3137,8 +3210,7 @@ flush_cap_releases: done: mutex_unlock(&session->s_mutex); done_unlocked: - if (inode) - iput(inode); + iput(inode); return; bad: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 681a8537b64f..c241603764fd 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -183,7 +183,7 @@ more: spin_unlock(&parent->d_lock); /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { + if (!ceph_dir_is_complete_ordered(dir)) { dout(" lost dir complete on %p; falling back to mds\n", dir); dput(dentry); err = -EAGAIN; @@ -261,10 +261,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* always start with . and .. */ if (ctx->pos == 0) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - fi->dir_release_count = atomic_read(&ci->i_release_count); - dout("readdir off 0 -> '.'\n"); if (!dir_emit(ctx, ".", 1, ceph_translate_ino(inode->i_sb, inode->i_ino), @@ -289,7 +285,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if ((ctx->pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - __ceph_dir_is_complete(ci) && + __ceph_dir_is_complete_ordered(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { u32 shared_gen = ci->i_shared_gen; spin_unlock(&ci->i_ceph_lock); @@ -312,6 +308,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ + if (ctx->pos == 2) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + fi->dir_release_count = atomic_read(&ci->i_release_count); + fi->dir_ordered_count = ci->i_ordered_count; + } + more: /* do we have the correct frag content buffered? */ if (fi->frag != frag || fi->last_readdir == NULL) { @@ -446,8 +449,12 @@ more: */ spin_lock(&ci->i_ceph_lock); if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - __ceph_dir_set_complete(ci, fi->dir_release_count); + if (ci->i_ordered_count == fi->dir_ordered_count) + dout(" marking %p complete and ordered\n", inode); + else + dout(" marking %p complete\n", inode); + __ceph_dir_set_complete(ci, fi->dir_release_count, + fi->dir_ordered_count); } spin_unlock(&ci->i_ceph_lock); @@ -805,7 +812,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) acls.pagelist = NULL; } err = ceph_mdsc_do_request(mdsc, dir, req); - if (!err && !req->r_reply_info.head->is_dentry) + if (!err && + !req->r_reply_info.head->is_target && + !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); out: diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9f8e3572040e..ce74b394b49d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -333,6 +333,11 @@ int ceph_release(struct inode *inode, struct file *file) return 0; } +enum { + CHECK_EOF = 1, + READ_INLINE = 2, +}; + /* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) @@ -412,7 +417,7 @@ more: ret = read; /* did we bounce off eof? */ if (pos + left > inode->i_size) - *checkeof = 1; + *checkeof = CHECK_EOF; } dout("striped_read returns %d\n", ret); @@ -598,7 +603,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, + vino, pos, &len, 0, 2,/*include a 'startsync' command*/ CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, @@ -609,6 +614,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) break; } + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + n = iov_iter_get_pages_alloc(from, &pages, len, &start); if (unlikely(n < 0)) { ret = n; @@ -713,7 +720,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 1, + vino, pos, &len, 0, 1, CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, @@ -803,9 +810,10 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) size_t len = iocb->ki_nbytes; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); + struct page *pinned_page = NULL; ssize_t ret; int want, got = 0; - int checkeof = 0, read = 0; + int retry_op = 0, read = 0; again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", @@ -815,7 +823,7 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); if (ret < 0) return ret; @@ -827,8 +835,12 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - /* hmm, this isn't really async... */ - ret = ceph_sync_read(iocb, to, &checkeof); + if (ci->i_inline_version == CEPH_INLINE_NONE) { + /* hmm, this isn't really async... */ + ret = ceph_sync_read(iocb, to, &retry_op); + } else { + retry_op = READ_INLINE; + } } else { dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, @@ -838,13 +850,55 @@ again: } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + if (pinned_page) { + page_cache_release(pinned_page); + pinned_page = NULL; + } ceph_put_cap_refs(ci, got); + if (retry_op && ret >= 0) { + int statret; + struct page *page = NULL; + loff_t i_size; + if (retry_op == READ_INLINE) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + } - if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + statret = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, !!page); + if (statret < 0) { + __free_page(page); + if (statret == -ENODATA) { + BUG_ON(retry_op != READ_INLINE); + goto again; + } + return statret; + } + + i_size = i_size_read(inode); + if (retry_op == READ_INLINE) { + /* does not support inline data > PAGE_SIZE */ + if (i_size > PAGE_CACHE_SIZE) { + ret = -EIO; + } else if (iocb->ki_pos < i_size) { + loff_t end = min_t(loff_t, i_size, + iocb->ki_pos + len); + if (statret < end) + zero_user_segment(page, statret, end); + ret = copy_page_to_iter(page, + iocb->ki_pos & ~PAGE_MASK, + end - iocb->ki_pos, to); + iocb->ki_pos += ret; + } else { + ret = 0; + } + __free_pages(page, 0); + return ret; + } /* hit EOF or hole? */ - if (statret == 0 && iocb->ki_pos < inode->i_size && + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" ", reading more\n", iocb->ki_pos, @@ -852,7 +906,7 @@ again: read += ret; len -= ret; - checkeof = 0; + retry_op = 0; goto again; } } @@ -909,6 +963,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; + if (ci->i_inline_version != CEPH_INLINE_NONE) { + err = ceph_uninline_data(file, NULL); + if (err < 0) + goto out; + } + retry_snap: if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { err = -ENOSPC; @@ -922,7 +982,8 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + &got, NULL); if (err < 0) goto out; @@ -969,6 +1030,7 @@ retry_snap: if (written >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) @@ -1111,7 +1173,7 @@ static int ceph_zero_partial_object(struct inode *inode, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), offset, length, - 1, op, + 0, 1, op, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, NULL, 0, 0, false); @@ -1214,6 +1276,12 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } + if (ci->i_inline_version != CEPH_INLINE_NONE) { + ret = ceph_uninline_data(file, NULL); + if (ret < 0) + goto unlock; + } + size = i_size_read(inode); if (!(mode & FALLOC_FL_KEEP_SIZE)) endoff = offset + length; @@ -1223,7 +1291,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); if (ret < 0) goto unlock; @@ -1240,6 +1308,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a5593d51d035..f61a74115beb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -387,8 +387,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) spin_lock_init(&ci->i_ceph_lock); ci->i_version = 0; + ci->i_inline_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; + ci->i_ordered_count = 0; atomic_set(&ci->i_release_count, 1); atomic_set(&ci->i_complete_count, 0); ci->i_symlink = NULL; @@ -657,7 +659,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, * Populate an inode based on info from mds. May be called on new or * existing inodes. */ -static int fill_inode(struct inode *inode, +static int fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_reply_info_in *iinfo, struct ceph_mds_reply_dirfrag *dirinfo, struct ceph_mds_session *session, @@ -675,6 +677,7 @@ static int fill_inode(struct inode *inode, bool wake = false; bool queue_trunc = false; bool new_version = false; + bool fill_inline = false; dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), le64_to_cpu(info->version), @@ -845,7 +848,8 @@ static int fill_inode(struct inode *inode, (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count), + ci->i_ordered_count); } /* were we issued a capability? */ @@ -873,8 +877,23 @@ static int fill_inode(struct inode *inode, ceph_vinop(inode)); __ceph_get_fmode(ci, cap_fmode); } + + if (iinfo->inline_version > 0 && + iinfo->inline_version >= ci->i_inline_version) { + int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + ci->i_inline_version = iinfo->inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (locked_page || + (le32_to_cpu(info->cap.caps) & cache_caps))) + fill_inline = true; + } + spin_unlock(&ci->i_ceph_lock); + if (fill_inline) + ceph_fill_inline_data(inode, locked_page, + iinfo->inline_data, iinfo->inline_len); + if (wake) wake_up_all(&ci->i_cap_wq); @@ -1062,7 +1081,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct inode *dir = req->r_locked_dir; if (dir) { - err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, + err = fill_inode(dir, NULL, + &rinfo->diri, rinfo->dirfrag, session, req->r_request_started, -1, &req->r_caps_reservation); if (err < 0) @@ -1132,7 +1152,7 @@ retry_lookup: } req->r_target_inode = in; - err = fill_inode(in, &rinfo->targeti, NULL, + err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, req->r_request_started, (!req->r_aborted && rinfo->head->result == 0) ? req->r_fmode : -1, @@ -1204,8 +1224,8 @@ retry_lookup: ceph_invalidate_dentry_lease(dn); /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_complete(dir); - ceph_dir_clear_complete(olddir); + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1217,6 +1237,7 @@ retry_lookup: if (!rinfo->head->is_target) { dout("fill_trace null dentry\n"); if (dn->d_inode) { + ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); } else { @@ -1233,7 +1254,7 @@ retry_lookup: /* attach proper inode */ if (!dn->d_inode) { - ceph_dir_clear_complete(dir); + ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { @@ -1263,7 +1284,7 @@ retry_lookup: BUG_ON(!dir); BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); - ceph_dir_clear_complete(dir); + ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { @@ -1300,7 +1321,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { @@ -1416,7 +1437,7 @@ retry_lookup: } } - if (fill_inode(in, &rinfo->dir_in[i], NULL, session, + if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); @@ -1899,7 +1920,8 @@ out_put: * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. */ -int ceph_do_getattr(struct inode *inode, int mask, bool force) +int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force) { struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1911,7 +1933,8 @@ int ceph_do_getattr(struct inode *inode, int mask, bool force) return 0; } - dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); + dout("do_getattr inode %p mask %s mode 0%o\n", + inode, ceph_cap_string(mask), inode->i_mode); if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; @@ -1922,7 +1945,19 @@ int ceph_do_getattr(struct inode *inode, int mask, bool force) ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_locked_page = locked_page; err = ceph_mdsc_do_request(mdsc, NULL, req); + if (locked_page && err == 0) { + u64 inline_version = req->r_reply_info.targeti.inline_version; + if (inline_version == 0) { + /* the reply is supposed to contain inline data */ + err = -EINVAL; + } else if (inline_version == CEPH_INLINE_NONE) { + err = -ENODATA; + } else { + err = req->r_reply_info.targeti.inline_len; + } + } ceph_mdsc_put_request(req); dout("do_getattr result=%d\n", err); return err; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index fbc39c47bacd..c35c5c614e38 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -9,6 +9,8 @@ #include <linux/ceph/pagelist.h> static u64 lock_secret; +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); static inline u64 secure_addr(void *addr) { @@ -40,6 +42,9 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, u64 length = 0; u64 owner; + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) + wait = 0; + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); @@ -68,6 +73,9 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; + if (wait) + req->r_wait_for_completion = ceph_lock_wait_for_completion; + err = ceph_mdsc_do_request(mdsc, inode, req); if (operation == CEPH_MDS_OP_GETFILELOCK) { @@ -96,6 +104,52 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, return err; } +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_request *intr_req; + struct inode *inode = req->r_inode; + int err, lock_type; + + BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK); + if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL) + lock_type = CEPH_LOCK_FCNTL_INTR; + else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK) + lock_type = CEPH_LOCK_FLOCK_INTR; + else + BUG_ON(1); + BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK); + + err = wait_for_completion_interruptible(&req->r_completion); + if (!err) + return 0; + + dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", + req->r_tid); + + intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, + USE_AUTH_MDS); + if (IS_ERR(intr_req)) + return PTR_ERR(intr_req); + + intr_req->r_inode = inode; + ihold(inode); + intr_req->r_num_caps = 1; + + intr_req->r_args.filelock_change = req->r_args.filelock_change; + intr_req->r_args.filelock_change.rule = lock_type; + intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK; + + err = ceph_mdsc_do_request(mdsc, inode, intr_req); + ceph_mdsc_put_request(intr_req); + + if (err && err != -ERESTARTSYS) + return err; + + wait_for_completion(&req->r_completion); + return 0; +} + /** * Attempt to set an fcntl lock. * For now, this just goes away to the server. Later it may be more awesome. @@ -143,11 +197,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err); } } - - } else if (err == -ERESTARTSYS) { - dout("undoing lock\n"); - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - CEPH_LOCK_UNLOCK, 0, fl); } return err; } @@ -186,11 +235,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } - } else if (err == -ERESTARTSYS) { - dout("undoing lock\n"); - ceph_lock_message(CEPH_LOCK_FLOCK, - CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); } return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a92d3f5c6c12..d2171f4a6980 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -89,6 +89,16 @@ static int parse_reply_info_in(void **p, void *end, ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; *p += info->xattr_len; + + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + return 0; bad: return err; @@ -524,8 +534,7 @@ void ceph_mdsc_release_request(struct kref *kref) } if (req->r_locked_dir) ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_target_inode) - iput(req->r_target_inode); + iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) @@ -861,8 +870,11 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 /* * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> + * + * ClientSession messages with metadata are v2 */ - msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */ + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ p = msg->front.iov_base + sizeof(*h); @@ -1066,8 +1078,7 @@ out: session->s_cap_iterator = NULL; spin_unlock(&session->s_cap_lock); - if (last_inode) - iput(last_inode); + iput(last_inode); if (old_cap) ceph_put_cap(session->s_mdsc, old_cap); @@ -1874,7 +1885,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } - msg->hdr.version = 2; + msg->hdr.version = cpu_to_le16(2); msg->hdr.tid = cpu_to_le64(req->r_tid); head = msg->front.iov_base; @@ -2208,6 +2219,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, &req->r_completion, req->r_timeout); if (err == 0) err = -EIO; + } else if (req->r_wait_for_completion) { + err = req->r_wait_for_completion(mdsc, req); } else { err = wait_for_completion_killable(&req->r_completion); } @@ -3744,6 +3757,20 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, return msg; } +static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_sign_message(auth, msg); +} + +static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_check_message_signature(auth, msg); +} + static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, @@ -3753,6 +3780,8 @@ static const struct ceph_connection_operations mds_con_ops = { .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, .alloc_msg = mds_alloc_msg, + .sign_message = sign_message, + .check_message_signature = check_message_signature, }; /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3288359353e9..e2817d00f7d9 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -41,6 +41,9 @@ struct ceph_mds_reply_info_in { char *symlink; u32 xattr_len; char *xattr_data; + u64 inline_version; + u32 inline_len; + char *inline_data; }; /* @@ -166,6 +169,11 @@ struct ceph_mds_client; */ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request *req); +/* + * wait for request completion callback + */ +typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); /* * an in-flight mds request @@ -215,6 +223,7 @@ struct ceph_mds_request { int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; + struct page *r_locked_page; int r_err; bool r_aborted; @@ -239,6 +248,7 @@ struct ceph_mds_request { struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; + ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ bool r_got_unsafe, r_got_safe, r_got_result; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f01645a27752..ce35fbd4ba5d 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -288,6 +288,9 @@ static int cmpu64_rev(const void *a, const void *b) return 0; } + +static struct ceph_snap_context *empty_snapc; + /* * build the snap context for a given realm. */ @@ -328,6 +331,12 @@ static int build_snap_context(struct ceph_snap_realm *realm) return 0; } + if (num == 0 && realm->seq == empty_snapc->seq) { + ceph_get_snap_context(empty_snapc); + snapc = empty_snapc; + goto done; + } + /* alloc new snap context */ err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) @@ -365,8 +374,8 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - if (realm->cached_context) - ceph_put_snap_context(realm->cached_context); +done: + ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; return 0; @@ -466,6 +475,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); + } else if (ci->i_snap_realm->cached_context == empty_snapc) { + dout("queue_cap_snap %p empty snapc\n", inode); + kfree(capsnap); } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { struct ceph_snap_context *snapc = ci->i_head_snapc; @@ -504,6 +516,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + /* dirty page count moved from _head to this cap_snap; all subsequent writes page dirties occur _after_ this snapshot. */ @@ -590,15 +604,13 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); lastinode = inode; ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); list_for_each_entry(child, &realm->children, child_item) { dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", @@ -928,5 +940,16 @@ out: return; } +int __init ceph_snap_init(void) +{ + empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!empty_snapc) + return -ENOMEM; + empty_snapc->seq = 1; + return 0; +} - +void ceph_snap_exit(void) +{ + ceph_put_snap_context(empty_snapc); +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f6e12377335c..50f06cddc94b 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -515,7 +515,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_fs_client *fsc; const u64 supported_features = CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; + CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; @@ -1017,9 +1018,6 @@ static struct file_system_type ceph_fs_type = { }; MODULE_ALIAS_FS("ceph"); -#define _STRINGIFY(x) #x -#define STRINGIFY(x) _STRINGIFY(x) - static int __init init_ceph(void) { int ret = init_caches(); @@ -1028,15 +1026,20 @@ static int __init init_ceph(void) ceph_flock_init(); ceph_xattr_init(); + ret = ceph_snap_init(); + if (ret) + goto out_xattr; ret = register_filesystem(&ceph_fs_type); if (ret) - goto out_icache; + goto out_snap; pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); return 0; -out_icache: +out_snap: + ceph_snap_exit(); +out_xattr: ceph_xattr_exit(); destroy_caches(); out: @@ -1047,6 +1050,7 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); + ceph_snap_exit(); ceph_xattr_exit(); destroy_caches(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b82f507979b8..e1aa32d0759d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -161,6 +161,7 @@ struct ceph_cap_snap { u64 time_warp_seq; int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ + bool inline_data; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) @@ -253,9 +254,11 @@ struct ceph_inode_info { spinlock_t i_ceph_lock; u64 i_version; + u64 i_inline_version; u32 i_time_warp_seq; unsigned i_ceph_flags; + int i_ordered_count; atomic_t i_release_count; atomic_t i_complete_count; @@ -434,14 +437,19 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ +#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, - int release_count) + int release_count, int ordered_count) { atomic_set(&ci->i_complete_count, release_count); + if (ci->i_ordered_count == ordered_count) + ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; + else + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) @@ -455,16 +463,35 @@ static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) atomic_read(&ci->i_release_count); } +static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) +{ + return __ceph_dir_is_complete(ci) && + (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); +} + static inline void ceph_dir_clear_complete(struct inode *inode) { __ceph_dir_clear_complete(ceph_inode(inode)); } -static inline bool ceph_dir_is_complete(struct inode *inode) +static inline void ceph_dir_clear_ordered(struct inode *inode) { - return __ceph_dir_is_complete(ceph_inode(inode)); + struct ceph_inode_info *ci = ceph_inode(inode); + spin_lock(&ci->i_ceph_lock); + ci->i_ordered_count++; + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; + spin_unlock(&ci->i_ceph_lock); } +static inline bool ceph_dir_is_complete_ordered(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool ret; + spin_lock(&ci->i_ceph_lock); + ret = __ceph_dir_is_complete_ordered(ci); + spin_unlock(&ci->i_ceph_lock); + return ret; +} /* find a specific frag @f */ extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, @@ -580,6 +607,7 @@ struct ceph_file_info { char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ int dir_release_count; + int dir_ordered_count; /* used for -o dirstat read() on directory thing */ char *dir_info; @@ -673,6 +701,8 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern int ceph_snap_init(void); +extern void ceph_snap_exit(void); /* * a cap_snap is "pending" if it is still awaiting an in-progress @@ -715,7 +745,12 @@ extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); extern void ceph_queue_writeback(struct inode *inode); -extern int ceph_do_getattr(struct inode *inode, int mask, bool force); +extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force); +static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) +{ + return __ceph_do_getattr(inode, NULL, mask, force); +} extern int ceph_permission(struct inode *inode, int mask); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -830,7 +865,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, int mds, int drop, int unless); extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, - int *got, loff_t endoff); + loff_t endoff, int *got, struct page **pinned_page); /* for counting open files by mode */ static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) @@ -852,7 +887,9 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, int *opened); extern int ceph_release(struct inode *inode, struct file *filp); - +extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len); +int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct inode_operations ceph_dir_iops; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 678b0d2bbbc4..5a492caf34cb 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -854,7 +854,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_pagelist *pagelist = NULL; int err; - if (value) { + if (size > 0) { /* copy value into pagelist */ pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); if (!pagelist) @@ -864,7 +864,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = ceph_pagelist_append(pagelist, value, size); if (err) goto out; - } else { + } else if (!value) { flags |= CEPH_XATTR_REMOVE; } @@ -1001,6 +1001,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, size, flags); + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __ceph_setxattr(dentry, name, value, size, flags); } diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 7ff025966e4f..86c893884eb9 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -426,7 +426,6 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) struct coda_file_info *cfi; struct coda_inode_info *cii; struct file *host_file; - struct dentry *de; struct venus_dirent *vdir; unsigned long vdir_size = offsetof(struct venus_dirent, d_name); unsigned int type; @@ -438,8 +437,7 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - de = coda_file->f_path.dentry; - cii = ITOC(de->d_inode); + cii = ITOC(file_inode(coda_file)); vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); if (!vdir) return -ENOMEM; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 8e0f2f410189..517e64938438 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -22,6 +22,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/atomic.h> +#include <linux/device.h> static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -762,3 +763,56 @@ struct dentry *debugfs_create_regset32(const char *name, umode_t mode, EXPORT_SYMBOL_GPL(debugfs_create_regset32); #endif /* CONFIG_HAS_IOMEM */ + +struct debugfs_devm_entry { + int (*read)(struct seq_file *seq, void *data); + struct device *dev; +}; + +static int debugfs_devm_entry_open(struct inode *inode, struct file *f) +{ + struct debugfs_devm_entry *entry = inode->i_private; + + return single_open(f, entry->read, entry->dev); +} + +static const struct file_operations debugfs_devm_entry_ops = { + .owner = THIS_MODULE, + .open = debugfs_devm_entry_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek +}; + +/** + * debugfs_create_devm_seqfile - create a debugfs file that is bound to device. + * + * @dev: device related to this debugfs file. + * @name: name of the debugfs file. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @read_fn: function pointer called to print the seq_file content. + */ +struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, + void *data)) +{ + struct debugfs_devm_entry *entry; + + if (IS_ERR(parent)) + return ERR_PTR(-ENOENT); + + entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + entry->read = read_fn; + entry->dev = dev; + + return debugfs_create_file(name, S_IRUGO, parent, entry, + &debugfs_devm_entry_ops); +} +EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); + diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c2d6604667b0..719e1ce1c609 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1917,7 +1917,6 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, break; case 2: dst[dst_byte_offset++] |= (src_byte); - dst[dst_byte_offset] = 0; current_bit_offset = 0; break; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 80154ec4f8c2..6f4e659f508f 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -190,23 +190,11 @@ static int ecryptfs_open(struct inode *inode, struct file *file) { int rc = 0; struct ecryptfs_crypt_stat *crypt_stat = NULL; - struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ struct ecryptfs_file_info *file_info; - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; - if ((mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - && ((file->f_flags & O_WRONLY) || (file->f_flags & O_RDWR) - || (file->f_flags & O_CREAT) || (file->f_flags & O_TRUNC) - || (file->f_flags & O_APPEND))) { - printk(KERN_WARNING "Mount has encrypted view enabled; " - "files may only be read\n"); - rc = -EPERM; - goto out; - } /* Released in ecryptfs_release or end of function if failure */ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); ecryptfs_set_file_private(file, file_info); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 635e8e16a5b7..917bd5c9776a 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -100,12 +100,12 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, (*size) = 0; if (data[0] < 192) { /* One-byte length */ - (*size) = (unsigned char)data[0]; + (*size) = data[0]; (*length_size) = 1; } else if (data[0] < 224) { /* Two-byte length */ - (*size) = (((unsigned char)(data[0]) - 192) * 256); - (*size) += ((unsigned char)(data[1]) + 192); + (*size) = (data[0] - 192) * 256; + (*size) += data[1] + 192; (*length_size) = 2; } else if (data[0] == 255) { /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */ diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index c4cd1fd86cc2..d9eb84bda559 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -493,6 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags { struct super_block *s; struct ecryptfs_sb_info *sbi; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; @@ -511,6 +512,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags err = "Error parsing options"; goto out; } + mount_crypt_stat = &sbi->mount_crypt_stat; s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { @@ -557,11 +559,19 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags /** * Set the POSIX ACL flag based on whether they're enabled in the lower - * mount. Force a read-only eCryptfs mount if the lower mount is ro. - * Allow a ro eCryptfs mount even when the lower mount is rw. + * mount. */ s->s_flags = flags & ~MS_POSIXACL; - s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL); + s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; + + /** + * Force a read-only eCryptfs mount when: + * 1) The lower mount is ro + * 2) The ecryptfs_encrypted_view mount option is specified + */ + if (path.dentry->d_sb->s_flags & MS_RDONLY || + mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + s->s_flags |= MS_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 503ea15dc5db..370420bfae8d 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -267,7 +267,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; - unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; int err2, jblocks, retries = 0; int replaced_count = 0; @@ -288,9 +287,6 @@ again: return 0; } - if (segment_eq(get_fs(), KERNEL_DS)) - w_flags |= AOP_FLAG_UNINTERRUPTIBLE; - orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 966ace8b243f..28d0c7abba1c 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -415,7 +415,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_conn_kill(fc); + fuse_abort_conn(fc); goto out; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ca887314aba9..ba1107977f2e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -511,6 +511,35 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_request_send); +ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) +{ + struct fuse_req *req; + ssize_t ret; + + req = fuse_get_req(fc, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = args->in.h.opcode; + req->in.h.nodeid = args->in.h.nodeid; + req->in.numargs = args->in.numargs; + memcpy(req->in.args, args->in.args, + args->in.numargs * sizeof(struct fuse_in_arg)); + req->out.argvar = args->out.argvar; + req->out.numargs = args->out.numargs; + memcpy(req->out.args, args->out.args, + args->out.numargs * sizeof(struct fuse_arg)); + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret && args->out.argvar) { + BUG_ON(args->out.numargs != 1); + ret = req->out.args[0].size; + } + fuse_put_request(fc, req); + + return ret; +} + static void fuse_request_send_nowait_locked(struct fuse_conn *fc, struct fuse_req *req) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index df562cc87763..252b8a5de8b5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -145,22 +145,22 @@ static void fuse_invalidate_entry(struct dentry *entry) fuse_invalidate_entry_cache(entry); } -static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, u64 nodeid, struct qstr *name, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); - req->in.h.opcode = FUSE_LOOKUP; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = name->len + 1; - req->in.args[0].value = name->name; - req->out.numargs = 1; + args->in.h.opcode = FUSE_LOOKUP; + args->in.h.nodeid = nodeid; + args->in.numargs = 1; + args->in.args[0].size = name->len + 1; + args->in.args[0].value = name->name; + args->out.numargs = 1; if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; else - req->out.args[0].size = sizeof(struct fuse_entry_out); - req->out.args[0].value = outarg; + args->out.args[0].size = sizeof(struct fuse_entry_out); + args->out.args[0].value = outarg; } u64 fuse_get_attr_version(struct fuse_conn *fc) @@ -200,9 +200,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || (flags & LOOKUP_REVAL)) { - int err; struct fuse_entry_out outarg; - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; @@ -215,31 +214,23 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto out; fc = get_fuse_conn(inode); - req = fuse_get_req_nopages(fc); - ret = PTR_ERR(req); - if (IS_ERR(req)) - goto out; forget = fuse_alloc_forget(); - if (!forget) { - fuse_put_request(fc, req); - ret = -ENOMEM; + ret = -ENOMEM; + if (!forget) goto out; - } attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); - fuse_lookup_init(fc, req, get_node_id(parent->d_inode), + fuse_lookup_init(fc, &args, get_node_id(parent->d_inode), &entry->d_name, &outarg); - fuse_request_send(fc, req); + ret = fuse_simple_request(fc, &args); dput(parent); - err = req->out.h.error; - fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT */ - if (!err && !outarg.nodeid) - err = -ENOENT; - if (!err) { + if (!ret && !outarg.nodeid) + ret = -ENOENT; + if (!ret) { fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { fuse_queue_forget(fc, forget, outarg.nodeid, 1); @@ -250,7 +241,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) spin_unlock(&fc->lock); } kfree(forget); - if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + if (ret == -ENOMEM) + goto out; + if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; fuse_change_attributes(inode, &outarg.attr, @@ -296,7 +289,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { struct fuse_conn *fc = get_fuse_conn_super(sb); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; int err; @@ -306,24 +299,16 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, if (name->len > FUSE_NAME_MAX) goto out; - req = fuse_get_req_nopages(fc); - err = PTR_ERR(req); - if (IS_ERR(req)) - goto out; forget = fuse_alloc_forget(); err = -ENOMEM; - if (!forget) { - fuse_put_request(fc, req); + if (!forget) goto out; - } attr_version = fuse_get_attr_version(fc); - fuse_lookup_init(fc, req, nodeid, name, outarg); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + fuse_lookup_init(fc, &args, nodeid, name, outarg); + err = fuse_simple_request(fc, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; @@ -405,7 +390,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int err; struct inode *inode; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; struct fuse_open_out outopen; @@ -420,15 +405,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!forget) goto out_err; - req = fuse_get_req_nopages(fc); - err = PTR_ERR(req); - if (IS_ERR(req)) - goto out_put_forget_req; - err = -ENOMEM; ff = fuse_file_alloc(fc); if (!ff) - goto out_put_request; + goto out_put_forget_req; if (!fc->dont_mask) mode &= ~current_umask(); @@ -439,24 +419,23 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); - req->in.h.opcode = FUSE_CREATE; - req->in.h.nodeid = get_node_id(dir); - req->in.numargs = 2; - req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : + args.in.h.opcode = FUSE_CREATE; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 2; + args.in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = entry->d_name.len + 1; - req->in.args[1].value = entry->d_name.name; - req->out.numargs = 2; + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + args.out.numargs = 2; if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + args.out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; else - req->out.args[0].size = sizeof(outentry); - req->out.args[0].value = &outentry; - req->out.args[1].size = sizeof(outopen); - req->out.args[1].value = &outopen; - fuse_request_send(fc, req); - err = req->out.h.error; + args.out.args[0].size = sizeof(outentry); + args.out.args[0].value = &outentry; + args.out.args[1].size = sizeof(outopen); + args.out.args[1].value = &outopen; + err = fuse_simple_request(fc, &args); if (err) goto out_free_ff; @@ -464,7 +443,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) goto out_free_ff; - fuse_put_request(fc, req); ff->fh = outopen.fh; ff->nodeid = outentry.nodeid; ff->open_flags = outopen.open_flags; @@ -492,8 +470,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, out_free_ff: fuse_file_free(ff); -out_put_request: - fuse_put_request(fc, req); out_put_forget_req: kfree(forget); out_err: @@ -547,7 +523,7 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, +static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -557,22 +533,18 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, struct fuse_forget_link *forget; forget = fuse_alloc_forget(); - if (!forget) { - fuse_put_request(fc, req); + if (!forget) return -ENOMEM; - } memset(&outarg, 0, sizeof(outarg)); - req->in.h.nodeid = get_node_id(dir); - req->out.numargs = 1; + args->in.h.nodeid = get_node_id(dir); + args->out.numargs = 1; if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; else - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args->out.args[0].size = sizeof(outarg); + args->out.args[0].value = &outarg; + err = fuse_simple_request(fc, args); if (err) goto out_put_forget_req; @@ -609,9 +581,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); if (!fc->dont_mask) mode &= ~current_umask(); @@ -620,14 +590,14 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); - req->in.h.opcode = FUSE_MKNOD; - req->in.numargs = 2; - req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : + args.in.h.opcode = FUSE_MKNOD; + args.in.numargs = 2; + args.in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = entry->d_name.len + 1; - req->in.args[1].value = entry->d_name.name; - return create_new_entry(fc, req, dir, entry, mode); + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + return create_new_entry(fc, &args, dir, entry, mode); } static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, @@ -640,9 +610,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); if (!fc->dont_mask) mode &= ~current_umask(); @@ -650,13 +618,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); - req->in.h.opcode = FUSE_MKDIR; - req->in.numargs = 2; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = entry->d_name.len + 1; - req->in.args[1].value = entry->d_name.name; - return create_new_entry(fc, req, dir, entry, S_IFDIR); + args.in.h.opcode = FUSE_MKDIR; + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + return create_new_entry(fc, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct inode *dir, struct dentry *entry, @@ -664,17 +632,15 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); - req->in.h.opcode = FUSE_SYMLINK; - req->in.numargs = 2; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; - req->in.args[1].size = len; - req->in.args[1].value = link; - return create_new_entry(fc, req, dir, entry, S_IFLNK); + args.in.h.opcode = FUSE_SYMLINK; + args.in.numargs = 2; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + args.in.args[1].size = len; + args.in.args[1].value = link; + return create_new_entry(fc, &args, dir, entry, S_IFLNK); } static inline void fuse_update_ctime(struct inode *inode) @@ -689,18 +655,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->in.h.opcode = FUSE_UNLINK; - req->in.h.nodeid = get_node_id(dir); - req->in.numargs = 1; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + FUSE_ARGS(args); + + args.in.h.opcode = FUSE_UNLINK; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 1; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + err = fuse_simple_request(fc, &args); if (!err) { struct inode *inode = entry->d_inode; struct fuse_inode *fi = get_fuse_inode(inode); @@ -729,18 +691,14 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->in.h.opcode = FUSE_RMDIR; - req->in.h.nodeid = get_node_id(dir); - req->in.numargs = 1; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + FUSE_ARGS(args); + + args.in.h.opcode = FUSE_RMDIR; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 1; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(entry->d_inode); fuse_invalidate_attr(dir); @@ -757,27 +715,21 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req; - - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = get_node_id(olddir); - req->in.numargs = 3; - req->in.args[0].size = argsize; - req->in.args[0].value = &inarg; - req->in.args[1].size = oldent->d_name.len + 1; - req->in.args[1].value = oldent->d_name.name; - req->in.args[2].size = newent->d_name.len + 1; - req->in.args[2].value = newent->d_name.name; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = opcode; + args.in.h.nodeid = get_node_id(olddir); + args.in.numargs = 3; + args.in.args[0].size = argsize; + args.in.args[0].value = &inarg; + args.in.args[1].size = oldent->d_name.len + 1; + args.in.args[1].value = oldent->d_name.name; + args.in.args[2].size = newent->d_name.len + 1; + args.in.args[2].value = newent->d_name.name; + err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); @@ -849,19 +801,17 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); - req->in.h.opcode = FUSE_LINK; - req->in.numargs = 2; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = newent->d_name.len + 1; - req->in.args[1].value = newent->d_name.name; - err = create_new_entry(fc, req, newdir, newent, inode->i_mode); + args.in.h.opcode = FUSE_LINK; + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = newent->d_name.len + 1; + args.in.args[1].value = newent->d_name.name; + err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" inode. We invalidate the attributes of the old one, so it @@ -929,13 +879,9 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct fuse_getattr_in inarg; struct fuse_attr_out outarg; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); u64 attr_version; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - attr_version = fuse_get_attr_version(fc); memset(&inarg, 0, sizeof(inarg)); @@ -947,20 +893,18 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - req->in.h.opcode = FUSE_GETATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; + args.in.h.opcode = FUSE_GETATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + args.out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; else - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (!err) { if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); @@ -1102,7 +1046,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) static int fuse_access(struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_access_in inarg; int err; @@ -1111,20 +1055,14 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); - req->in.h.opcode = FUSE_ACCESS; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_ACCESS; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_access = 1; err = 0; @@ -1445,31 +1383,27 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req_nopages(fc); + FUSE_ARGS(args); char *link; - - if (IS_ERR(req)) - return ERR_CAST(req); + ssize_t ret; link = (char *) __get_free_page(GFP_KERNEL); - if (!link) { - link = ERR_PTR(-ENOMEM); - goto out; - } - req->in.h.opcode = FUSE_READLINK; - req->in.h.nodeid = get_node_id(inode); - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = PAGE_SIZE - 1; - req->out.args[0].value = link; - fuse_request_send(fc, req); - if (req->out.h.error) { + if (!link) + return ERR_PTR(-ENOMEM); + + args.in.h.opcode = FUSE_READLINK; + args.in.h.nodeid = get_node_id(inode); + args.out.argvar = 1; + args.out.numargs = 1; + args.out.args[0].size = PAGE_SIZE - 1; + args.out.args[0].value = link; + ret = fuse_simple_request(fc, &args); + if (ret < 0) { free_page((unsigned long) link); - link = ERR_PTR(req->out.h.error); - } else - link[req->out.args[0].size] = '\0'; - out: - fuse_put_request(fc, req); + link = ERR_PTR(ret); + } else { + link[ret] = '\0'; + } fuse_invalidate_atime(inode); return link; } @@ -1629,22 +1563,22 @@ void fuse_release_nowrite(struct inode *inode) spin_unlock(&fc->lock); } -static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct inode *inode, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { - req->in.h.opcode = FUSE_SETATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(*inarg_p); - req->in.args[0].value = inarg_p; - req->out.numargs = 1; + args->in.h.opcode = FUSE_SETATTR; + args->in.h.nodeid = get_node_id(inode); + args->in.numargs = 1; + args->in.args[0].size = sizeof(*inarg_p); + args->in.args[0].value = inarg_p; + args->out.numargs = 1; if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; else - req->out.args[0].size = sizeof(*outarg_p); - req->out.args[0].value = outarg_p; + args->out.args[0].size = sizeof(*outarg_p); + args->out.args[0].value = outarg_p; } /* @@ -1653,14 +1587,9 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; - int err; - - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -1677,12 +1606,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } - fuse_setattr_fill(fc, req, inode, &inarg, &outarg); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); - return err; + return fuse_simple_request(fc, &args); } /* @@ -1698,7 +1624,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; @@ -1723,10 +1649,6 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (attr->ia_valid & ATTR_SIZE) is_truncate = true; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); @@ -1747,10 +1669,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } - fuse_setattr_fill(fc, req, inode, &inarg, &outarg); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + err = fuse_simple_request(fc, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); @@ -1837,32 +1757,26 @@ static int fuse_setxattr(struct dentry *entry, const char *name, { struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_setxattr_in inarg; int err; if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; - req->in.h.opcode = FUSE_SETXATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = strlen(name) + 1; - req->in.args[1].value = name; - req->in.args[2].size = size; - req->in.args[2].value = value; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_SETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 3; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + args.in.args[2].size = size; + args.in.args[2].value = value; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_setxattr = 1; err = -EOPNOTSUPP; @@ -1879,7 +1793,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, { struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; @@ -1887,40 +1801,32 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - req->in.h.opcode = FUSE_GETXATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 2; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = strlen(name) + 1; - req->in.args[1].value = name; + args.in.h.opcode = FUSE_GETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; /* This is really two different operations rolled into one */ - req->out.numargs = 1; + args.out.numargs = 1; if (size) { - req->out.argvar = 1; - req->out.args[0].size = size; - req->out.args[0].value = value; + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = value; } else { - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; } - fuse_request_send(fc, req); - ret = req->out.h.error; - if (!ret) - ret = size ? req->out.args[0].size : outarg.size; - else { - if (ret == -ENOSYS) { - fc->no_getxattr = 1; - ret = -EOPNOTSUPP; - } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = outarg.size; + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; } - fuse_put_request(fc, req); return ret; } @@ -1928,7 +1834,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; @@ -1939,38 +1845,30 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - req->in.h.opcode = FUSE_LISTXATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; + args.in.h.opcode = FUSE_LISTXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; /* This is really two different operations rolled into one */ - req->out.numargs = 1; + args.out.numargs = 1; if (size) { - req->out.argvar = 1; - req->out.args[0].size = size; - req->out.args[0].value = list; + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = list; } else { - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; } - fuse_request_send(fc, req); - ret = req->out.h.error; - if (!ret) - ret = size ? req->out.args[0].size : outarg.size; - else { - if (ret == -ENOSYS) { - fc->no_listxattr = 1; - ret = -EOPNOTSUPP; - } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = outarg.size; + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; } - fuse_put_request(fc, req); return ret; } @@ -1978,24 +1876,18 @@ static int fuse_removexattr(struct dentry *entry, const char *name) { struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); int err; if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->in.h.opcode = FUSE_REMOVEXATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = strlen(name) + 1; - req->in.args[0].value = name; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_REMOVEXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = strlen(name) + 1; + args.in.args[0].value = name; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_removexattr = 1; err = -EOPNOTSUPP; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bf50259012ab..760b2c552197 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -24,30 +24,22 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { struct fuse_open_in inarg; - struct fuse_req *req; - int err; - - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; - req->in.h.opcode = opcode; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - req->out.args[0].size = sizeof(*outargp); - req->out.args[0].value = outargp; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = opcode; + args.in.h.nodeid = nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(*outargp); + args.out.args[0].value = outargp; - return err; + return fuse_simple_request(fc, &args); } struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) @@ -89,37 +81,9 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_async(struct work_struct *work) -{ - struct fuse_req *req; - struct fuse_conn *fc; - struct path path; - - req = container_of(work, struct fuse_req, misc.release.work); - path = req->misc.release.path; - fc = get_fuse_conn(path.dentry->d_inode); - - fuse_put_request(fc, req); - path_put(&path); -} - static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - if (fc->destroy_req) { - /* - * If this is a fuseblk mount, then it's possible that - * releasing the path will result in releasing the - * super block and sending the DESTROY request. If - * the server is single threaded, this would hang. - * For this reason do the path_put() in a separate - * thread. - */ - atomic_inc(&req->count); - INIT_WORK(&req->misc.release.work, fuse_release_async); - schedule_work(&req->misc.release.work); - } else { - path_put(&req->misc.release.path); - } + iput(req->misc.release.inode); } static void fuse_file_put(struct fuse_file *ff, bool sync) @@ -133,12 +97,12 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) * implement 'open' */ req->background = 0; - path_put(&req->misc.release.path); + iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { req->background = 0; fuse_request_send(ff->fc, req); - path_put(&req->misc.release.path); + iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else { req->end = fuse_release_end; @@ -297,9 +261,8 @@ void fuse_release_common(struct file *file, int opcode) inarg->lock_owner = fuse_lock_owner_id(ff->fc, (fl_owner_t) file); } - /* Hold vfsmount and dentry until release is finished */ - path_get(&file->f_path); - req->misc.release.path = file->f_path; + /* Hold inode until release is finished */ + req->misc.release.inode = igrab(file_inode(file)); /* * Normally this will send the RELEASE request, however if @@ -480,7 +443,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_fsync_in inarg; int err; @@ -506,23 +469,15 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) goto out; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) { - err = PTR_ERR(req); - goto out; - } - memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.fsync_flags = datasync ? 1 : 0; - req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { if (isdir) fc->no_fsyncdir = 1; @@ -2156,49 +2111,44 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, return 0; } -static void fuse_lk_fill(struct fuse_req *req, struct file *file, +static void fuse_lk_fill(struct fuse_args *args, struct file *file, const struct file_lock *fl, int opcode, pid_t pid, - int flock) + int flock, struct fuse_lk_in *inarg) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; - struct fuse_lk_in *arg = &req->misc.lk_in; - - arg->fh = ff->fh; - arg->owner = fuse_lock_owner_id(fc, fl->fl_owner); - arg->lk.start = fl->fl_start; - arg->lk.end = fl->fl_end; - arg->lk.type = fl->fl_type; - arg->lk.pid = pid; + + memset(inarg, 0, sizeof(*inarg)); + inarg->fh = ff->fh; + inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->lk.start = fl->fl_start; + inarg->lk.end = fl->fl_end; + inarg->lk.type = fl->fl_type; + inarg->lk.pid = pid; if (flock) - arg->lk_flags |= FUSE_LK_FLOCK; - req->in.h.opcode = opcode; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(*arg); - req->in.args[0].value = arg; + inarg->lk_flags |= FUSE_LK_FLOCK; + args->in.h.opcode = opcode; + args->in.h.nodeid = get_node_id(inode); + args->in.numargs = 1; + args->in.args[0].size = sizeof(*inarg); + args->in.args[0].value = inarg; } static int fuse_getlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); + struct fuse_lk_in inarg; struct fuse_lk_out outarg; int err; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0); - req->out.numargs = 1; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (!err) err = convert_fuse_file_lock(&outarg.lk, fl); @@ -2209,7 +2159,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); + struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; int err; @@ -2223,17 +2174,13 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if (fl->fl_flags & FL_CLOSE) return 0; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg); + err = fuse_simple_request(fc, &args); - fuse_lk_fill(req, file, fl, opcode, pid, flock); - fuse_request_send(fc, req); - err = req->out.h.error; /* locking is restartable */ if (err == -EINTR) err = -ERESTARTSYS; - fuse_put_request(fc, req); + return err; } @@ -2283,7 +2230,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_bmap_in inarg; struct fuse_bmap_out outarg; int err; @@ -2291,24 +2238,18 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) if (!inode->i_sb->s_bdev || fc->no_bmap) return 0; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return 0; - memset(&inarg, 0, sizeof(inarg)); inarg.block = block; inarg.blocksize = inode->i_sb->s_blocksize; - req->in.h.opcode = FUSE_BMAP; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_BMAP; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) fc->no_bmap = 1; @@ -2776,7 +2717,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) struct fuse_conn *fc = ff->fc; struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; - struct fuse_req *req; + FUSE_ARGS(args); int err; if (fc->no_poll) @@ -2794,21 +2735,15 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return POLLERR; - - req->in.h.opcode = FUSE_POLL; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_POLL; + args.in.h.nodeid = ff->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (!err) return outarg.revents; @@ -2949,10 +2884,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct fuse_file *ff = file->private_data; - struct inode *inode = file->f_inode; + struct inode *inode = file_inode(file); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = ff->fc; - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_fallocate_in inarg = { .fh = ff->fh, .offset = offset, @@ -2985,25 +2920,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) { - err = PTR_ERR(req); - goto out; - } - - req->in.h.opcode = FUSE_FALLOCATE; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - fuse_request_send(fc, req); - err = req->out.h.error; + args.in.h.opcode = FUSE_FALLOCATE; + args.in.h.nodeid = ff->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_fallocate = 1; err = -EOPNOTSUPP; } - fuse_put_request(fc, req); - if (err) goto out; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e8e47a6ab518..e0fc6725d1d0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -213,7 +213,7 @@ struct fuse_out { unsigned numargs; /** Array of arguments */ - struct fuse_arg args[3]; + struct fuse_arg args[2]; }; /** FUSE page descriptor */ @@ -222,6 +222,25 @@ struct fuse_page_desc { unsigned int offset; }; +struct fuse_args { + struct { + struct { + uint32_t opcode; + uint64_t nodeid; + } h; + unsigned numargs; + struct fuse_in_arg args[3]; + + } in; + struct { + unsigned argvar:1; + unsigned numargs; + struct fuse_arg args[2]; + } out; +}; + +#define FUSE_ARGS(args) struct fuse_args args = {} + /** The request state */ enum fuse_req_state { FUSE_REQ_INIT = 0, @@ -305,11 +324,8 @@ struct fuse_req { /** Data for asynchronous requests */ union { struct { - union { - struct fuse_release_in in; - struct work_struct work; - }; - struct path path; + struct fuse_release_in in; + struct inode *inode; } release; struct fuse_init_in init_in; struct fuse_init_out init_out; @@ -324,7 +340,6 @@ struct fuse_req { struct fuse_req *next; } write; struct fuse_notify_retrieve_in retrieve_in; - struct fuse_lk_in lk_in; } misc; /** page vector */ @@ -754,15 +769,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, void __fuse_get_request(struct fuse_req *req); /** - * Get a request, may fail with -ENOMEM, - * useful for callers who doesn't use req->pages[] - */ -static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) -{ - return fuse_get_req(fc, 0); -} - -/** * Gets a requests for a file operation, always succeeds */ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, @@ -780,6 +786,11 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); /** + * Simple request sending that does request allocation and freeing + */ +ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); + +/** * Send a request in the background */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); @@ -804,8 +815,6 @@ void fuse_invalidate_atime(struct inode *inode); */ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); -void fuse_conn_kill(struct fuse_conn *fc); - /** * Initialize fuse_conn */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 03246cd9d47a..6749109f255d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -376,28 +376,13 @@ static void fuse_bdi_destroy(struct fuse_conn *fc) bdi_destroy(&fc->bdi); } -void fuse_conn_kill(struct fuse_conn *fc) -{ - spin_lock(&fc->lock); - fc->connected = 0; - fc->blocked = 0; - fc->initialized = 1; - spin_unlock(&fc->lock); - /* Flush all readers on this fs */ - kill_fasync(&fc->fasync, SIGIO, POLL_IN); - wake_up_all(&fc->waitq); - wake_up_all(&fc->blocked_waitq); - wake_up_all(&fc->reserved_req_waitq); -} -EXPORT_SYMBOL_GPL(fuse_conn_kill); - static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); fuse_send_destroy(fc); - fuse_conn_kill(fc); + fuse_abort_conn(fc); mutex_lock(&fuse_mutex); list_del(&fc->entry); fuse_ctl_remove_conn(fc); @@ -425,7 +410,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; @@ -434,23 +419,17 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&outarg, 0, sizeof(outarg)); - req->in.numargs = 0; - req->in.h.opcode = FUSE_STATFS; - req->in.h.nodeid = get_node_id(dentry->d_inode); - req->out.numargs = 1; - req->out.args[0].size = + args.in.numargs = 0; + args.in.h.opcode = FUSE_STATFS; + args.in.h.nodeid = get_node_id(dentry->d_inode); + args.out.numargs = 1; + args.out.args[0].size = fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); - fuse_put_request(fc, req); return err; } diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 32602c667b4a..7892e6fddb66 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -38,21 +38,30 @@ int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, return hfsplus_strcmp(&k1->cat.name, &k2->cat.name); } -void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, - u32 parent, struct qstr *str) +/* Generates key for catalog file/folders record. */ +int hfsplus_cat_build_key(struct super_block *sb, + hfsplus_btree_key *key, u32 parent, struct qstr *str) { - int len; + int len, err; key->cat.parent = cpu_to_be32(parent); - if (str) { - hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, - str->name, str->len); - len = be16_to_cpu(key->cat.name.length); - } else { - key->cat.name.length = 0; - len = 0; - } + err = hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, + str->name, str->len); + if (unlikely(err < 0)) + return err; + + len = be16_to_cpu(key->cat.name.length); key->key_len = cpu_to_be16(6 + 2 * len); + return 0; +} + +/* Generates key for catalog thread record. */ +void hfsplus_cat_build_key_with_cnid(struct super_block *sb, + hfsplus_btree_key *key, u32 parent) +{ + key->cat.parent = cpu_to_be32(parent); + key->cat.name.length = 0; + key->key_len = cpu_to_be16(6); } static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, @@ -167,11 +176,16 @@ static int hfsplus_fill_cat_thread(struct super_block *sb, hfsplus_cat_entry *entry, int type, u32 parentid, struct qstr *str) { + int err; + entry->type = cpu_to_be16(type); entry->thread.reserved = 0; entry->thread.parentID = cpu_to_be32(parentid); - hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, + err = hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, str->name, str->len); + if (unlikely(err < 0)) + return err; + return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2; } @@ -183,7 +197,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, int err; u16 type; - hfsplus_cat_build_key(sb, fd->search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd->search_key, cnid); err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry)); if (err) return err; @@ -250,11 +264,16 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, if (err) return err; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, dir->i_ino, str); + if (unlikely(entry_size < 0)) { + err = entry_size; + goto err2; + } + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) @@ -265,7 +284,10 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, if (err) goto err2; - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + if (unlikely(err)) + goto err1; + entry_size = hfsplus_cat_build_record(&entry, cnid, inode); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { @@ -288,7 +310,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, return 0; err1: - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); if (!hfs_brec_find(&fd, hfs_find_rec_by_key)) hfs_brec_remove(&fd); err2: @@ -313,7 +335,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) if (!str) { int len; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -329,7 +351,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) off + 2, len); fd.search_key->key_len = cpu_to_be16(6 + len); } else - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + if (unlikely(err)) + goto out; err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) @@ -360,7 +384,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) if (err) goto out; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -405,7 +429,11 @@ int hfsplus_rename_cat(u32 cnid, dst_fd = src_fd; /* find the old dir entry and read the data */ - hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfsplus_cat_build_key(sb, src_fd.search_key, + src_dir->i_ino, src_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -419,7 +447,11 @@ int hfsplus_rename_cat(u32 cnid, type = be16_to_cpu(entry.type); /* create new dir entry with the data from the old entry */ - hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); + err = hfsplus_cat_build_key(sb, dst_fd.search_key, + dst_dir->i_ino, dst_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) @@ -436,7 +468,11 @@ int hfsplus_rename_cat(u32 cnid, dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; /* finally remove the old entry */ - hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfsplus_cat_build_key(sb, src_fd.search_key, + src_dir->i_ino, src_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -449,7 +485,7 @@ int hfsplus_rename_cat(u32 cnid, src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; /* remove old thread entry */ - hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -459,9 +495,14 @@ int hfsplus_rename_cat(u32 cnid, goto out; /* create new thread entry */ - hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, dst_fd.search_key, cnid); entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); + if (unlikely(entry_size < 0)) { + err = entry_size; + goto out; + } + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 610a3260bef1..435bea231cc6 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -44,7 +44,10 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return ERR_PTR(err); - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, + &dentry->d_name); + if (unlikely(err < 0)) + goto fail; again: err = hfs_brec_read(&fd, &entry, sizeof(entry)); if (err) { @@ -97,9 +100,11 @@ again: be32_to_cpu(entry.file.permissions.dev); str.len = sprintf(name, "iNode%d", linkid); str.name = name; - hfsplus_cat_build_key(sb, fd.search_key, + err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); + if (unlikely(err < 0)) + goto fail; goto again; } } else if (!dentry->d_fsdata) @@ -145,7 +150,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) err = -ENOMEM; goto out; } - hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index eb5e059f481a..b0441d65fa54 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -443,8 +443,10 @@ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); -void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, +int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, u32 parent, struct qstr *str); +void hfsplus_cat_build_key_with_cnid(struct super_block *sb, + hfsplus_btree_key *key, u32 parent); void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); int hfsplus_find_cat(struct super_block *sb, u32 cnid, struct hfs_find_data *fd); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 4cf2024b87da..593af2fdcc2d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -515,7 +515,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = hfs_find_init(sbi->cat_tree, &fd); if (err) goto out_put_root; - hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + if (unlikely(err < 0)) + goto out_put_root; if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) diff --git a/fs/inode.c b/fs/inode.c index ad60555b4768..aa149e7262ac 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -114,6 +114,11 @@ int proc_nr_inodes(struct ctl_table *table, int write, } #endif +static int no_open(struct inode *inode, struct file *file) +{ + return -ENXIO; +} + /** * inode_init_always - perform inode structure intialisation * @sb: superblock inode belongs to @@ -125,7 +130,7 @@ int proc_nr_inodes(struct ctl_table *table, int write, int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct inode_operations empty_iops; - static const struct file_operations empty_fops; + static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; @@ -133,7 +138,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_flags = 0; atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; - inode->i_fop = &empty_fops; + inode->i_fop = &no_open_fops; inode->__i_nlink = 1; inode->i_opflags = 0; i_uid_write(inode, 0); @@ -1798,7 +1803,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) } else if (S_ISFIFO(mode)) inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) - inode->i_fop = &bad_sock_fops; + ; /* leave it no_open_fops */ else printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, diff --git a/fs/internal.h b/fs/internal.h index 757ba2abf21e..e9a61fe67575 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -147,3 +147,8 @@ extern const struct file_operations pipefifo_fops; */ extern void sb_pin_kill(struct super_block *sb); extern void mnt_pin_kill(struct mount *m); + +/* + * fs/nsfs.c + */ +extern struct dentry_operations ns_dentry_operations; diff --git a/fs/ioctl.c b/fs/ioctl.c index 77c9a7812542..214c3c11fbc2 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -443,7 +443,7 @@ int ioctl_preallocate(struct file *filp, void __user *argp) return -EINVAL; } - return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); + return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } static int file_ioctl(struct file *filp, unsigned int cmd, diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index f488bbae541a..735d7522a3a9 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -30,6 +30,7 @@ struct rock_state { int cont_size; int cont_extent; int cont_offset; + int cont_loops; struct inode *inode; }; @@ -73,6 +74,9 @@ static void init_rock_state(struct rock_state *rs, struct inode *inode) rs->inode = inode; } +/* Maximum number of Rock Ridge continuation entries */ +#define RR_MAX_CE_ENTRIES 32 + /* * Returns 0 if the caller should continue scanning, 1 if the scan must end * and -ve on error. @@ -105,6 +109,8 @@ static int rock_continue(struct rock_state *rs) goto out; } ret = -EIO; + if (++rs->cont_loops >= RR_MAX_CE_ENTRIES) + goto out; bh = sb_bread(rs->inode->i_sb, rs->cont_extent); if (bh) { memcpy(rs->buffer, bh->b_data + rs->cont_offset, @@ -356,6 +362,9 @@ repeat: rs.cont_size = isonum_733(rr->u.CE.size); break; case SIG('E', 'R'): + /* Invalid length of ER tag id? */ + if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len) + goto out; ISOFS_SB(inode->i_sb)->s_rock = 1; printk(KERN_DEBUG "ISO 9660 Extensions: "); { diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 386303dca382..dddbde4f56f4 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -224,7 +224,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); - /* If a node has zero dsize, we only have to keep if it if it might be the + /* If a node has zero dsize, we only have to keep it if it might be the node with highest version -- i.e. the one which will end up as f->metadata. Note that such nodes won't be REF_UNCHECKED since there are no data to check anyway. */ diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index c522d098bb4f..bc5385471a6e 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -844,6 +844,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock /* Write out summary information - called from jffs2_do_reserve_space */ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) + __must_hold(&c->erase_completion_block) { int datasize, infosize, padsize; struct jffs2_eraseblock *jeb; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 4429d6d9217f..ddc9f9612f16 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -106,7 +106,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) const struct kernfs_ops *ops; /* - * @of->mutex nests outside active ref and is just to ensure that + * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); @@ -189,13 +189,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, const struct kernfs_ops *ops; char *buf; - buf = kmalloc(len, GFP_KERNEL); + buf = of->prealloc_buf; + if (!buf) + buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* - * @of->mutex nests outside active ref and is just to ensure that - * the ops aren't called concurrently for the same open file. + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file, and + * to provide exclusive access to ->prealloc_buf (when that exists). */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -210,21 +213,22 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, else len = -EINVAL; - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); - if (len < 0) - goto out_free; + goto out_unlock; if (copy_to_user(user_buf, buf, len)) { len = -EFAULT; - goto out_free; + goto out_unlock; } *ppos += len; + out_unlock: + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); out_free: - kfree(buf); + if (buf != of->prealloc_buf) + kfree(buf); return len; } @@ -278,19 +282,16 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, len = min_t(size_t, count, PAGE_SIZE); } - buf = kmalloc(len + 1, GFP_KERNEL); + buf = of->prealloc_buf; + if (!buf) + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_free; - } - buf[len] = '\0'; /* guarantee string termination */ - /* - * @of->mutex nests outside active ref and is just to ensure that - * the ops aren't called concurrently for the same open file. + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file, and + * to provide exclusive access to ->prealloc_buf (when that exists). */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -299,19 +300,27 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, goto out_free; } + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_unlock; + } + buf[len] = '\0'; /* guarantee string termination */ + ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, *ppos); else len = -EINVAL; - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); - if (len > 0) *ppos += len; + +out_unlock: + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); out_free: - kfree(buf); + if (buf != of->prealloc_buf) + kfree(buf); return len; } @@ -439,27 +448,6 @@ static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, return pol; } -static int kernfs_vma_migrate(struct vm_area_struct *vma, - const nodemask_t *from, const nodemask_t *to, - unsigned long flags) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!kernfs_get_active(of->kn)) - return 0; - - ret = 0; - if (of->vm_ops->migrate) - ret = of->vm_ops->migrate(vma, from, to, flags); - - kernfs_put_active(of->kn); - return ret; -} #endif static const struct vm_operations_struct kernfs_vm_ops = { @@ -470,7 +458,6 @@ static const struct vm_operations_struct kernfs_vm_ops = { #ifdef CONFIG_NUMA .set_policy = kernfs_vma_set_policy, .get_policy = kernfs_vma_get_policy, - .migrate = kernfs_vma_migrate, #endif }; @@ -685,6 +672,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) */ of->atomic_write_len = ops->atomic_write_len; + error = -EINVAL; + /* + * ->seq_show is incompatible with ->prealloc, + * as seq_read does its own allocation. + * ->read must be used instead. + */ + if (ops->prealloc && ops->seq_show) + goto err_free; + if (ops->prealloc) { + int len = of->atomic_write_len ?: PAGE_SIZE; + of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); + error = -ENOMEM; + if (!of->prealloc_buf) + goto err_free; + } + /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access @@ -715,6 +718,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) err_close: seq_release(inode, file); err_free: + kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); @@ -728,6 +732,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) kernfs_put_open_node(kn, of); seq_release(inode, filp); + kfree(of->prealloc_buf); kfree(of); return 0; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 9106f42c472c..1cc6ec51e6b1 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -214,7 +214,7 @@ int nsm_monitor(const struct nlm_host *host) if (unlikely(res.status != 0)) status = -EIO; if (unlikely(status < 0)) { - printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); + pr_notice_ratelimited("lockd: cannot monitor %s\n", nsm->sm_name); return status; } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index d1bb7ecfd201..e94c887da2d7 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -350,7 +350,7 @@ static struct svc_serv *lockd_create_svc(void) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/fs/mount.h b/fs/mount.h index f82c62840905..0ad6f760ce52 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -1,10 +1,11 @@ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/poll.h> +#include <linux/ns_common.h> struct mnt_namespace { atomic_t count; - unsigned int proc_inum; + struct ns_common ns; struct mount * root; struct list_head list; struct user_namespace *user_ns; diff --git a/fs/namei.c b/fs/namei.c index ca814165d84c..bc35b02883bb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -487,6 +487,19 @@ void path_put(const struct path *path) } EXPORT_SYMBOL(path_put); +struct nameidata { + struct path path; + struct qstr last; + struct path root; + struct inode *inode; /* path.dentry.d_inode */ + unsigned int flags; + unsigned seq, m_seq; + int last_type; + unsigned depth; + struct file *base; + char *saved_names[MAX_NESTED_LINKS + 1]; +}; + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't @@ -695,6 +708,18 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->flags |= LOOKUP_JUMPED; } +void nd_set_link(struct nameidata *nd, char *path) +{ + nd->saved_names[nd->depth] = path; +} +EXPORT_SYMBOL(nd_set_link); + +char *nd_get_link(struct nameidata *nd) +{ + return nd->saved_names[nd->depth]; +} +EXPORT_SYMBOL(nd_get_link); + static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) { struct inode *inode = link->dentry->d_inode; @@ -1821,13 +1846,14 @@ static int link_path_walk(const char *name, struct nameidata *nd) } static int path_init(int dfd, const char *name, unsigned int flags, - struct nameidata *nd, struct file **fp) + struct nameidata *nd) { int retval = 0; nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_JUMPED; + nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; + nd->base = NULL; if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; @@ -1847,7 +1873,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, } else { path_get(&nd->path); } - return 0; + goto done; } nd->root.mnt = NULL; @@ -1897,7 +1923,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { if (f.flags & FDPUT_FPUT) - *fp = f.file; + nd->base = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); rcu_read_lock(); } else { @@ -1908,13 +1934,26 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->inode = nd->path.dentry->d_inode; if (!(flags & LOOKUP_RCU)) - return 0; + goto done; if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) - return 0; + goto done; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); return -ECHILD; +done: + current->total_link_count = 0; + return link_path_walk(name, nd); +} + +static void path_cleanup(struct nameidata *nd) +{ + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + if (unlikely(nd->base)) + fput(nd->base); } static inline int lookup_last(struct nameidata *nd, struct path *path) @@ -1930,7 +1969,6 @@ static inline int lookup_last(struct nameidata *nd, struct path *path) static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - struct file *base = NULL; struct path path; int err; @@ -1948,14 +1986,7 @@ static int path_lookupat(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); - - if (unlikely(err)) - goto out; - - current->total_link_count = 0; - err = link_path_walk(name, nd); - + err = path_init(dfd, name, flags, nd); if (!err && !(flags & LOOKUP_PARENT)) { err = lookup_last(nd, &path); while (err > 0) { @@ -1983,14 +2014,7 @@ static int path_lookupat(int dfd, const char *name, } } -out: - if (base) - fput(base); - - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - path_put(&nd->root); - nd->root.mnt = NULL; - } + path_cleanup(nd); return err; } @@ -2297,19 +2321,13 @@ out: static int path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) { - struct file *base = NULL; struct nameidata nd; int err; - err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base); + err = path_init(dfd, name, flags, &nd); if (unlikely(err)) goto out; - current->total_link_count = 0; - err = link_path_walk(name, &nd); - if (err) - goto out; - err = mountpoint_last(&nd, path); while (err > 0) { void *cookie; @@ -2325,12 +2343,7 @@ path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags put_link(&nd, &link, cookie); } out: - if (base) - fput(base); - - if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT)) - path_put(&nd.root); - + path_cleanup(&nd); return err; } @@ -3181,7 +3194,6 @@ out: static struct file *path_openat(int dfd, struct filename *pathname, struct nameidata *nd, const struct open_flags *op, int flags) { - struct file *base = NULL; struct file *file; struct path path; int opened = 0; @@ -3198,12 +3210,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, goto out; } - error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); - if (unlikely(error)) - goto out; - - current->total_link_count = 0; - error = link_path_walk(pathname->name, nd); + error = path_init(dfd, pathname->name, flags, nd); if (unlikely(error)) goto out; @@ -3229,10 +3236,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, put_link(nd, &link, cookie); } out: - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) - path_put(&nd->root); - if (base) - fput(base); + path_cleanup(nd); if (!(opened & FILE_OPENED)) { BUG_ON(!error); put_filp(file); diff --git a/fs/namespace.c b/fs/namespace.c index 5b66b2b3624d..cd1e9681a0cf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -963,7 +963,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } /* Don't allow unprivileged users to reveal what is under a mount */ - if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) + if ((flag & CL_UNPRIVILEGED) && + (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire))) mnt->mnt.mnt_flags |= MNT_LOCKED; atomic_inc(&sb->s_active); @@ -1369,6 +1370,8 @@ void umount_tree(struct mount *mnt, int how) } if (last) { last->mnt_hash.next = unmounted.first; + if (unmounted.first) + unmounted.first->pprev = &last->mnt_hash.next; unmounted.first = tmp_list.first; unmounted.first->pprev = &unmounted.first; } @@ -1544,6 +1547,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto dput_and_out; + retval = -EPERM; + if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + goto dput_and_out; retval = do_umount(mnt, flags); dput_and_out: @@ -1569,17 +1575,13 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static bool is_mnt_ns_file(struct dentry *dentry) { /* Is this a proxy for a mount namespace? */ - struct inode *inode = dentry->d_inode; - struct proc_ns *ei; - - if (!proc_ns_inode(inode)) - return false; - - ei = get_proc_ns(inode); - if (ei->ns_ops != &mntns_operations) - return false; + return dentry->d_op == &ns_dentry_operations && + dentry->d_fsdata == &mntns_operations; +} - return true; +struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +{ + return container_of(ns, struct mnt_namespace, ns); } static bool mnt_ns_loop(struct dentry *dentry) @@ -1591,7 +1593,7 @@ static bool mnt_ns_loop(struct dentry *dentry) if (!is_mnt_ns_file(dentry)) return false; - mnt_ns = get_proc_ns(dentry->d_inode)->ns; + mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } @@ -1610,7 +1612,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, if (IS_ERR(q)) return q; - q->mnt.mnt_flags &= ~MNT_LOCKED; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; @@ -2020,7 +2021,10 @@ static int do_loopback(struct path *path, const char *old_name, if (IS_MNT_UNBINDABLE(old)) goto out2; - if (!check_mnt(parent) || !check_mnt(old)) + if (!check_mnt(parent)) + goto out2; + + if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) goto out2; if (!recurse && has_locked_children(old, old_path.dentry)) @@ -2098,7 +2102,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) { - return -EPERM; + /* Was the nodev implicitly added in mount? */ + if ((mnt->mnt_ns->user_ns != &init_user_ns) && + !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { + mnt_flags |= MNT_NODEV; + } else { + return -EPERM; + } } if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) { @@ -2640,7 +2650,7 @@ dput_out: static void free_mnt_ns(struct mnt_namespace *ns) { - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); put_user_ns(ns->user_ns); kfree(ns); } @@ -2662,11 +2672,12 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = proc_alloc_inum(&new_ns->proc_inum); + ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); return ERR_PTR(ret); } + new_ns->ns.ops = &mntns_operations; new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; @@ -2958,6 +2969,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* mount new_root on / */ attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); + /* A moved mount should not expire automatically */ + list_del_init(&new_mnt->mnt_expire); unlock_mount_hash(); chroot_fs_refs(&root, &new); put_mountpoint(root_mp); @@ -3002,6 +3015,7 @@ static void __init init_mount_tree(void) root.mnt = mnt; root.dentry = mnt->mnt_root; + mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -3144,31 +3158,31 @@ found: return visible; } -static void *mntns_get(struct task_struct *task) +static struct ns_common *mntns_get(struct task_struct *task) { - struct mnt_namespace *ns = NULL; + struct ns_common *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { - ns = nsproxy->mnt_ns; - get_mnt_ns(ns); + ns = &nsproxy->mnt_ns->ns; + get_mnt_ns(to_mnt_ns(ns)); } task_unlock(task); return ns; } -static void mntns_put(void *ns) +static void mntns_put(struct ns_common *ns) { - put_mnt_ns(ns); + put_mnt_ns(to_mnt_ns(ns)); } -static int mntns_install(struct nsproxy *nsproxy, void *ns) +static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct fs_struct *fs = current->fs; - struct mnt_namespace *mnt_ns = ns; + struct mnt_namespace *mnt_ns = to_mnt_ns(ns); struct path root; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || @@ -3198,17 +3212,10 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) return 0; } -static unsigned int mntns_inum(void *ns) -{ - struct mnt_namespace *mnt_ns = ns; - return mnt_ns->proc_inum; -} - const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, - .inum = mntns_inum, }; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0beb023f25ac..ac71d13c69ef 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/file.h> +#include <linux/falloc.h> #include <linux/slab.h> #include "idmap.h" @@ -772,7 +773,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * the client wants us to do more in this compound: */ if (!nfsd4_last_compound_op(rqstp)) - rqstp->rq_splice_ok = false; + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), @@ -1014,6 +1015,44 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } static __be32 +nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate, int flags) +{ + __be32 status = nfserr_notsupp; + struct file *file; + + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + &fallocate->falloc_stateid, + WR_STATE, &file); + if (status != nfs_ok) { + dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + return status; + } + + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + fallocate->falloc_offset, + fallocate->falloc_length, + flags); + fput(file); + return status; +} + +static __be32 +nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate) +{ + return nfsd4_fallocate(rqstp, cstate, fallocate, 0); +} + +static __be32 +nfsd4_deallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate) +{ + return nfsd4_fallocate(rqstp, cstate, fallocate, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE); +} + +static __be32 nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek) { @@ -1331,7 +1370,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - rqstp->rq_usedeferral = false; + clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); /* * According to RFC3010, this takes precedence over all other errors. @@ -1447,7 +1486,7 @@ encode_op: BUG_ON(cstate->replay_owner); out: /* Reset deferral mechanism for RPC deferrals */ - rqstp->rq_usedeferral = true; + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } @@ -1929,6 +1968,18 @@ static struct nfsd4_operation nfsd4_ops[] = { }, /* NFSv4.2 operations */ + [OP_ALLOCATE] = { + .op_func = (nfsd4op_func)nfsd4_allocate, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_ALLOCATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + }, + [OP_DEALLOCATE] = { + .op_func = (nfsd4op_func)nfsd4_deallocate, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_DEALLOCATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4e1d7268b004..3550a9c87616 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -275,9 +275,11 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -static void nfsd4_free_file(struct nfs4_file *f) +static void nfsd4_free_file_rcu(struct rcu_head *rcu) { - kmem_cache_free(file_slab, f); + struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu); + + kmem_cache_free(file_slab, fp); } static inline void @@ -286,9 +288,10 @@ put_nfs4_file(struct nfs4_file *fi) might_lock(&state_lock); if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { - hlist_del(&fi->fi_hash); + hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); - nfsd4_free_file(fi); + WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); + call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); } } @@ -1440,7 +1443,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); - if (cses->flags & SESSION4_BACK_CHAN) { + { struct sockaddr *sa = svc_addr(rqstp); /* * This is a little silly; with sessions there's no real @@ -1711,15 +1714,14 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source) return 0; } -static long long +static int compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) { - long long res; - - res = o1->len - o2->len; - if (res) - return res; - return (long long)memcmp(o1->data, o2->data, o1->len); + if (o1->len < o2->len) + return -1; + if (o1->len > o2->len) + return 1; + return memcmp(o1->data, o2->data, o1->len); } static int same_name(const char *n1, const char *n2) @@ -1907,7 +1909,7 @@ add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) static struct nfs4_client * find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) { - long long cmp; + int cmp; struct rb_node *node = root->rb_node; struct nfs4_client *clp; @@ -3057,10 +3059,9 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh) +static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, + struct nfs4_file *fp) { - unsigned int hashval = file_hashval(fh); - lockdep_assert_held(&state_lock); atomic_set(&fp->fi_ref, 1); @@ -3073,7 +3074,7 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh) fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); - hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); + hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } void @@ -3294,17 +3295,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) /* search file_hashtbl[] for file */ static struct nfs4_file * -find_file_locked(struct knfsd_fh *fh) +find_file_locked(struct knfsd_fh *fh, unsigned int hashval) { - unsigned int hashval = file_hashval(fh); struct nfs4_file *fp; - lockdep_assert_held(&state_lock); - - hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { + hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { if (nfsd_fh_match(&fp->fi_fhandle, fh)) { - get_nfs4_file(fp); - return fp; + if (atomic_inc_not_zero(&fp->fi_ref)) + return fp; } } return NULL; @@ -3314,10 +3312,11 @@ static struct nfs4_file * find_file(struct knfsd_fh *fh) { struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); - spin_lock(&state_lock); - fp = find_file_locked(fh); - spin_unlock(&state_lock); + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); return fp; } @@ -3325,11 +3324,18 @@ static struct nfs4_file * find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) { struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + if (fp) + return fp; spin_lock(&state_lock); - fp = find_file_locked(fh); - if (fp == NULL) { - nfsd4_init_file(new, fh); + fp = find_file_locked(fh, hashval); + if (likely(fp == NULL)) { + nfsd4_init_file(fh, hashval, new); fp = new; } spin_unlock(&state_lock); @@ -4127,7 +4133,7 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, nfs4_put_stateowner(so); } if (open->op_file) - nfsd4_free_file(open->op_file); + kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b1eed4dd2eab..15f7b73e0c0f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1514,6 +1514,23 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str } static __be32 +nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, + struct nfsd4_fallocate *fallocate) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid); + if (status) + return status; + + READ_BUF(16); + p = xdr_decode_hyper(p, &fallocate->falloc_offset); + xdr_decode_hyper(p, &fallocate->falloc_length); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1604,10 +1621,10 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, /* new operations for NFSv4.2 */ - [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -1714,7 +1731,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - argp->rqstp->rq_splice_ok = false; + clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); DECODE_TAIL; } @@ -1795,9 +1812,12 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, } else end++; + if (found_esc) + end = next; + str = end; } - pathlen = htonl(xdr->buf->len - pathlen_offset); + pathlen = htonl(count); write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4); return 0; } @@ -3236,10 +3256,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { - WARN_ON_ONCE(resp->rqstp->rq_splice_ok); + WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); return nfserr_resource; } - if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) { + if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { WARN_ON_ONCE(1); return nfserr_resource; } @@ -3256,7 +3276,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, goto err_truncate; } - if (file->f_op->splice_read && resp->rqstp->rq_splice_ok) + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) err = nfsd4_encode_splice_read(resp, read, file, maxcount); else err = nfsd4_encode_readv(resp, read, file, maxcount); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 122f69185ef5..83a9694ec485 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -490,7 +490,7 @@ found_entry: /* From the hall of fame of impractical attacks: * Is this a user who tries to snoop on the cache? */ rtn = RC_DOIT; - if (!rqstp->rq_secure && rp->c_secure) + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure) goto out; /* Compose RPC reply header */ @@ -579,7 +579,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) spin_lock(&b->cache_lock); drc_mem_usage += bufsize; lru_put_end(b, rp); - rp->c_secure = rqstp->rq_secure; + rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); rp->c_type = cachetype; rp->c_state = RC_DONE; spin_unlock(&b->cache_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9506ea565610..19ace74d35f6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -608,7 +608,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) num); sep = " "; - if (len > remaining) + if (len >= remaining) break; remaining -= len; buf += len; @@ -623,7 +623,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) '+' : '-', minor); - if (len > remaining) + if (len >= remaining) break; remaining -= len; buf += len; @@ -631,7 +631,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) } len = snprintf(buf, remaining, "\n"); - if (len > remaining) + if (len >= remaining) return -EINVAL; return tlen + len; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 88026fc6a981..965b478d50fc 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -86,7 +86,7 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, int flags = nfsexp_flags(rqstp, exp); /* Check if the request originated from a secure port. */ - if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("nfsd: request from insecure port %s!\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 752d56bbe0ba..314f5c8f8f1a 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -692,7 +692,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit || rqstp->rq_dropme) { + if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) { dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 2712042a66b1..9d3be371240a 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -463,17 +463,24 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * - * These objects are global. nfsd only keeps one instance of a nfs4_file per - * inode (though it may keep multiple file descriptors open per inode). These - * are tracked in the file_hashtbl which is protected by the state_lock - * spinlock. + * These objects are global. nfsd keeps one instance of a nfs4_file per + * filehandle (though it may keep multiple file descriptors for each). Each + * inode can have multiple filehandles associated with it, so there is + * (potentially) a many to one relationship between this struct and struct + * inode. + * + * These are hashed by filehandle in the file_hashtbl, which is protected by + * the global state_lock spinlock. */ struct nfs4_file { atomic_t fi_ref; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash by "struct inode *" */ + struct hlist_node fi_hash; /* hash on fi_fhandle */ struct list_head fi_stateids; - struct list_head fi_delegations; + union { + struct list_head fi_delegations; + struct rcu_head fi_rcu; + }; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0a82e3c033ee..5685c679dd93 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -16,6 +16,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/splice.h> +#include <linux/falloc.h> #include <linux/fcntl.h> #include <linux/namei.h> #include <linux/delay.h> @@ -533,6 +534,26 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif +__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, loff_t len, + int flags) +{ + __be32 err; + int error; + + if (!S_ISREG(file_inode(file)->i_mode)) + return nfserr_inval; + + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE); + if (err) + return err; + + error = vfs_fallocate(file, flags, offset, len); + if (!error) + error = commit_metadata(fhp); + + return nfserrno(error); +} #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 @@ -881,7 +902,7 @@ static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - if (file->f_op->splice_read && rqstp->rq_splice_ok) + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) return nfsd_splice_read(rqstp, file, offset, count); else return nfsd_readv(file, offset, vec, vlen, count); @@ -937,9 +958,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; loff_t pos = offset; + loff_t end = LLONG_MAX; unsigned int pflags = current->flags; - if (rqstp->rq_local) + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* * We want less throttling in balance_dirty_pages() * and shrink_inactive_list() so that nfs to @@ -967,10 +989,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, fsnotify_modify(file); if (stable) { - if (use_wgather) + if (use_wgather) { host_err = wait_for_concurrent_writes(file); - else - host_err = vfs_fsync_range(file, offset, offset+*cnt, 0); + } else { + if (*cnt) + end = offset + *cnt - 1; + host_err = vfs_fsync_range(file, offset, end, 0); + } } out_nfserr: @@ -979,7 +1004,7 @@ out_nfserr: err = 0; else err = nfserrno(host_err); - if (rqstp->rq_local) + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index b1796d6ee538..2050cb016998 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -54,6 +54,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); +__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, + struct file *, loff_t, loff_t, int); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 5720e9457f33..90a5925bd6ab 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -428,6 +428,13 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_fallocate { + /* request */ + stateid_t falloc_stateid; + loff_t falloc_offset; + u64 falloc_length; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -486,6 +493,8 @@ struct nfsd4_op { struct nfsd4_free_stateid free_stateid; /* NFSv4.2 */ + struct nfsd4_fallocate allocate; + struct nfsd4_fallocate deallocate; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nsfs.c b/fs/nsfs.c new file mode 100644 index 000000000000..af1b24fa899d --- /dev/null +++ b/fs/nsfs.c @@ -0,0 +1,161 @@ +#include <linux/mount.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/proc_ns.h> +#include <linux/magic.h> +#include <linux/ktime.h> + +static struct vfsmount *nsfs_mnt; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +static void ns_prune_dentry(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + if (inode) { + struct ns_common *ns = inode->i_private; + atomic_long_set(&ns->stashed, 0); + } +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_prune = ns_prune_dentry, + .d_delete = always_delete_dentry, + .d_dname = ns_dname, +}; + +static void nsfs_evict(struct inode *inode) +{ + struct ns_common *ns = inode->i_private; + clear_inode(inode); + ns->ops->put(ns); +} + +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct vfsmount *mnt = mntget(nsfs_mnt); + struct qstr qname = { .name = "", }; + struct dentry *dentry; + struct inode *inode; + struct ns_common *ns; + unsigned long d; + +again: + ns = ns_ops->get(task); + if (!ns) { + mntput(mnt); + return ERR_PTR(-ENOENT); + } + rcu_read_lock(); + d = atomic_long_read(&ns->stashed); + if (!d) + goto slow; + dentry = (struct dentry *)d; + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto slow; + rcu_read_unlock(); + ns_ops->put(ns); +got_it: + path->mnt = mnt; + path->dentry = dentry; + return NULL; +slow: + rcu_read_unlock(); + inode = new_inode_pseudo(mnt->mnt_sb); + if (!inode) { + ns_ops->put(ns); + mntput(mnt); + return ERR_PTR(-ENOMEM); + } + inode->i_ino = ns->inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_flags |= S_IMMUTABLE; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + inode->i_private = ns; + + dentry = d_alloc_pseudo(mnt->mnt_sb, &qname); + if (!dentry) { + iput(inode); + mntput(mnt); + return ERR_PTR(-ENOMEM); + } + d_instantiate(dentry, inode); + dentry->d_fsdata = (void *)ns_ops; + d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); + if (d) { + d_delete(dentry); /* make sure ->d_prune() does nothing */ + dput(dentry); + cpu_relax(); + goto again; + } + goto got_it; +} + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_common *ns; + int res = -ENOENT; + ns = ns_ops->get(task); + if (ns) { + res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + ns_ops->put(ns); + } + return res; +} + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + +static const struct super_operations nsfs_ops = { + .statfs = simple_statfs, + .evict_inode = nsfs_evict, +}; +static struct dentry *nsfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "nsfs:", &nsfs_ops, + &ns_dentry_operations, NSFS_MAGIC); +} +static struct file_system_type nsfs = { + .name = "nsfs", + .mount = nsfs_mount, + .kill_sb = kill_anon_super, +}; + +void __init nsfs_init(void) +{ + nsfs_mnt = kern_mount(&nsfs); + if (IS_ERR(nsfs_mnt)) + panic("can't set nsfs up\n"); + nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; +} diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a93bf9892256..fcae9ef1a328 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5662,7 +5662,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, u32 cpos, u32 phys_cpos, u32 len, int flags, struct ocfs2_cached_dealloc_ctxt *dealloc, - u64 refcount_loc) + u64 refcount_loc, bool refcount_tree_locked) { int ret, credits = 0, extra_blocks = 0; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); @@ -5676,11 +5676,13 @@ int ocfs2_remove_btree_range(struct inode *inode, BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); - ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, - &ref_tree, NULL); - if (ret) { - mlog_errno(ret); - goto bail; + if (!refcount_tree_locked) { + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto bail; + } } ret = ocfs2_prepare_refcount_change_for_del(inode, @@ -7021,6 +7023,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); struct ocfs2_extent_tree et; struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree = NULL; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); @@ -7130,9 +7133,18 @@ start: phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) { + status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + } + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, phys_cpos, trunc_len, flags, &dealloc, - refcount_loc); + refcount_loc, true); if (status < 0) { mlog_errno(status); goto bail; @@ -7147,6 +7159,8 @@ start: goto start; bail: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); ocfs2_schedule_truncate_log_flush(osb, 1); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index ca381c584127..fb09b97db162 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -142,7 +142,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, u32 cpos, u32 phys_cpos, u32 len, int flags, struct ocfs2_cached_dealloc_ctxt *dealloc, - u64 refcount_loc); + u64 refcount_loc, bool refcount_tree_locked); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct ocfs2_extent_tree *et); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index d9f222987f24..46d93e941f3d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -894,7 +894,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) } } -static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) { int i; @@ -915,7 +915,11 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) page_cache_release(wc->w_target_page); } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); +} +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +{ + ocfs2_unlock_pages(wc); brelse(wc->w_di_bh); kfree(wc); } @@ -2042,11 +2046,19 @@ out_write_size: ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); + /* unlock pages before dealloc since it needs acquiring j_trans_barrier + * lock, or it will cause a deadlock since journal commit threads holds + * this lock and will ask for the page lock when flushing the data. + * put it here to preserve the unlock order. + */ + ocfs2_unlock_pages(wc); + ocfs2_commit_trans(osb, handle); ocfs2_run_deallocs(osb, &wc->w_dealloc); - ocfs2_free_write_ctxt(wc); + brelse(wc->w_di_bh); + kfree(wc); return copied; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 79d56dc981bc..319e786175af 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4479,7 +4479,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, - &dealloc, 0); + &dealloc, 0, false); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3689b3592042..a6944b25fd5b 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -695,14 +695,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, res->inflight_assert_workers); } -static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res) -{ - spin_lock(&res->spinlock); - __dlm_lockres_grab_inflight_worker(dlm, res); - spin_unlock(&res->spinlock); -} - static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { @@ -1646,6 +1638,7 @@ send_response: } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); + spin_lock(&res->spinlock); ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, DLM_ASSERT_MASTER_MLE_CLEANUP); if (ret < 0) { @@ -1653,7 +1646,8 @@ send_response: response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); } else - dlm_lockres_grab_inflight_worker(dlm, res); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); } else { if (res) dlm_lockres_put(res); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 69fb9f75b082..3950693dd0f6 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1803,7 +1803,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, phys_cpos, trunc_len, flags, - &dealloc, refcount_loc); + &dealloc, refcount_loc, false); if (ret < 0) { mlog_errno(ret); goto out; diff --git a/fs/open.c b/fs/open.c index d45bd905d418..813be037b412 100644 --- a/fs/open.c +++ b/fs/open.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) #endif /* BITS_PER_LONG == 32 */ -int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; @@ -309,6 +309,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) sb_end_write(inode->i_sb); return ret; } +EXPORT_SYMBOL_GPL(vfs_fallocate); SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { @@ -316,7 +317,7 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) int error = -EBADF; if (f.file) { - error = do_fallocate(f.file, mode, offset, len); + error = vfs_fallocate(f.file, mode, offset, len); fdput(f); } return error; diff --git a/fs/pnode.c b/fs/pnode.c index aae331a5d03b..260ac8f898a4 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -242,6 +242,7 @@ static int propagate_one(struct mount *m) child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); + child->mnt.mnt_flags &= ~MNT_LOCKED; mnt_set_mountpoint(m, mp, child); last_dest = m; last_source = child; diff --git a/fs/proc/base.c b/fs/proc/base.c index 590aeda5af12..3f3d7aeb0712 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2464,6 +2464,57 @@ static const struct file_operations proc_projid_map_operations = { .llseek = seq_lseek, .release = proc_id_map_release, }; + +static int proc_setgroups_open(struct inode *inode, struct file *file) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + int ret; + + ret = -ESRCH; + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + if (file->f_mode & FMODE_WRITE) { + ret = -EACCES; + if (!ns_capable(ns, CAP_SYS_ADMIN)) + goto err_put_ns; + } + + ret = single_open(file, &proc_setgroups_show, ns); + if (ret) + goto err_put_ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_setgroups_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + int ret = single_release(inode, file); + put_user_ns(ns); + return ret; +} + +static const struct file_operations proc_setgroups_operations = { + .open = proc_setgroups_open, + .write = proc_setgroups_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_setgroups_release, +}; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, @@ -2572,6 +2623,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), @@ -2916,6 +2968,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 333080d7a671..8420a2f80811 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -32,8 +32,6 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; - const struct proc_ns_operations *ns_ops; - void *ns; truncate_inode_pages_final(&inode->i_data); clear_inode(inode); @@ -50,11 +48,6 @@ static void proc_evict_inode(struct inode *inode) RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } - /* Release any associated namespace */ - ns_ops = PROC_I(inode)->ns.ns_ops; - ns = PROC_I(inode)->ns.ns; - if (ns_ops && ns) - ns_ops->put(ns); } static struct kmem_cache * proc_inode_cachep; @@ -73,8 +66,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; - ei->ns.ns = NULL; - ei->ns.ns_ops = NULL; + ei->ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7fb1a4869fd0..6fcdba573e0f 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -65,7 +65,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; - struct proc_ns ns; + const struct proc_ns_operations *ns_ops; struct inode vfs_inode; }; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index aa1eee06420f..d3ebf2e61853 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -12,6 +12,9 @@ #include <linux/vmstat.h> #include <linux/atomic.h> #include <linux/vmalloc.h> +#ifdef CONFIG_CMA +#include <linux/cma.h> +#endif #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" @@ -138,6 +141,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_TRANSPARENT_HUGEPAGE "AnonHugePages: %8lu kB\n" #endif +#ifdef CONFIG_CMA + "CmaTotal: %8lu kB\n" + "CmaFree: %8lu kB\n" +#endif , K(i.totalram), K(i.freeram), @@ -187,12 +194,16 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.used >> 10, vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE - ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) + , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * HPAGE_PMD_NR) #endif +#ifdef CONFIG_CMA + , K(totalcma_pages) + , K(global_page_state(NR_FREE_CMA_PAGES)) +#endif ); hugetlb_report_meminfo(m); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 89026095f2b5..c9eac4563fa8 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -1,10 +1,6 @@ #include <linux/proc_fs.h> #include <linux/nsproxy.h> -#include <linux/sched.h> #include <linux/ptrace.h> -#include <linux/fs_struct.h> -#include <linux/mount.h> -#include <linux/path.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/utsname.h> @@ -34,138 +30,45 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static const struct file_operations ns_file_operations = { - .llseek = no_llseek, -}; - -static const struct inode_operations ns_inode_operations = { - .setattr = proc_setattr, -}; - -static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) -{ - struct inode *inode = dentry->d_inode; - const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; - - return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", - ns_ops->name, inode->i_ino); -} - -const struct dentry_operations ns_dentry_operations = -{ - .d_delete = always_delete_dentry, - .d_dname = ns_dname, -}; - -static struct dentry *proc_ns_get_dentry(struct super_block *sb, - struct task_struct *task, const struct proc_ns_operations *ns_ops) -{ - struct dentry *dentry, *result; - struct inode *inode; - struct proc_inode *ei; - struct qstr qname = { .name = "", }; - void *ns; - - ns = ns_ops->get(task); - if (!ns) - return ERR_PTR(-ENOENT); - - dentry = d_alloc_pseudo(sb, &qname); - if (!dentry) { - ns_ops->put(ns); - return ERR_PTR(-ENOMEM); - } - - inode = iget_locked(sb, ns_ops->inum(ns)); - if (!inode) { - dput(dentry); - ns_ops->put(ns); - return ERR_PTR(-ENOMEM); - } - - ei = PROC_I(inode); - if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &ns_inode_operations; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - ei->ns.ns_ops = ns_ops; - ei->ns.ns = ns; - unlock_new_inode(inode); - } else { - ns_ops->put(ns); - } - - d_set_d_op(dentry, &ns_dentry_operations); - result = d_instantiate_unique(dentry, inode); - if (result) { - dput(dentry); - dentry = result; - } - - return dentry; -} - static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; void *error = ERR_PTR(-EACCES); task = get_proc_task(inode); if (!task) - goto out; + return error; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_put_task; - - ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); - if (IS_ERR(ns_path.dentry)) { - error = ERR_CAST(ns_path.dentry); - goto out_put_task; + if (ptrace_may_access(task, PTRACE_MODE_READ)) { + error = ns_get_path(&ns_path, task, ns_ops); + if (!error) + nd_jump_link(nd, &ns_path); } - - ns_path.mnt = mntget(nd->path.mnt); - nd_jump_link(nd, &ns_path); - error = NULL; - -out_put_task: put_task_struct(task); -out: return error; } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = dentry->d_inode; - struct proc_inode *ei = PROC_I(inode); - const struct proc_ns_operations *ns_ops = ei->ns.ns_ops; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; - void *ns; char name[50]; int res = -EACCES; task = get_proc_task(inode); if (!task) - goto out; - - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_put_task; + return res; - res = -ENOENT; - ns = ns_ops->get(task); - if (!ns) - goto out_put_task; - - snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - res = readlink_copy(buffer, buflen, name); - ns_ops->put(ns); -out_put_task: + if (ptrace_may_access(task, PTRACE_MODE_READ)) { + res = ns_get_name(name, sizeof(name), task, ns_ops); + if (res >= 0) + res = readlink_copy(buffer, buflen, name); + } put_task_struct(task); -out: return res; } @@ -189,7 +92,7 @@ static int proc_ns_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = S_IFLNK|S_IRWXUGO; inode->i_op = &proc_ns_link_inode_operations; - ei->ns.ns_ops = ns_ops; + ei->ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -267,31 +170,3 @@ const struct inode_operations proc_ns_dir_inode_operations = { .getattr = pid_getattr, .setattr = proc_setattr, }; - -struct file *proc_ns_fget(int fd) -{ - struct file *file; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &ns_file_operations) - goto out_invalid; - - return file; - -out_invalid: - fput(file); - return ERR_PTR(-EINVAL); -} - -struct proc_ns *get_proc_ns(struct inode *inode) -{ - return &PROC_I(inode)->ns; -} - -bool proc_ns_inode(struct inode *inode) -{ - return inode->i_fop == &ns_file_operations; -} diff --git a/fs/proc/stat.c b/fs/proc/stat.c index bf2d03f8fd3e..510413eb25b8 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -159,7 +159,7 @@ static int show_stat(struct seq_file *p, void *v) /* sum again ? it could be updated? */ for_each_irq_nr(j) - seq_put_decimal_ull(p, ' ', kstat_irqs(j)); + seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j)); seq_printf(p, "\nctxt %llu\n" diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 73ca1740d839..0f96f71ab32b 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -91,6 +91,7 @@ static void show_type(struct seq_file *m, struct super_block *sb) static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { + struct proc_mounts *p = proc_mounts(m); struct mount *r = real_mount(mnt); int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -104,7 +105,10 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) mangle(m, r->mnt_devname ? r->mnt_devname : "none"); } seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; seq_putc(m, ' '); show_type(m, sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); @@ -125,7 +129,6 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - struct path root = p->root; int err = 0; seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, @@ -139,7 +142,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ - err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; @@ -182,6 +185,7 @@ out: static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { + struct proc_mounts *p = proc_mounts(m); struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; @@ -201,7 +205,10 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) /* mount point */ seq_puts(m, " mounted on "); - seq_path(m, &mnt_path, " \t\n\\"); + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; seq_putc(m, ' '); /* file system type */ @@ -216,6 +223,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) } seq_putc(m, '\n'); +out: return err; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2f389ce5023c..8613e5b35c22 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -561,7 +561,6 @@ static struct platform_driver ramoops_driver = { .remove = __exit_p(ramoops_remove), .driver = { .name = "ramoops", - .owner = THIS_MODULE, }, }; diff --git a/fs/read_write.c b/fs/read_write.c index 7d9318c3d43c..c0805c93b6fa 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -412,6 +412,23 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p EXPORT_SYMBOL(new_sync_read); +ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, + loff_t *pos) +{ + ssize_t ret; + + if (file->f_op->read) + ret = file->f_op->read(file, buf, count, pos); + else if (file->f_op->aio_read) + ret = do_sync_read(file, buf, count, pos); + else if (file->f_op->read_iter) + ret = new_sync_read(file, buf, count, pos); + else + ret = -EINVAL; + + return ret; +} + ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; @@ -426,12 +443,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) ret = rw_verify_area(READ, file, pos, count); if (ret >= 0) { count = ret; - if (file->f_op->read) - ret = file->f_op->read(file, buf, count, pos); - else if (file->f_op->aio_read) - ret = do_sync_read(file, buf, count, pos); - else - ret = new_sync_read(file, buf, count, pos); + ret = __vfs_read(file, buf, count, pos); if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index ea63ab13ef92..71fbbe3e2dab 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2172,6 +2172,9 @@ error_unlocked: reiserfs_write_unlock(s); } + if (sbi->commit_wq) + destroy_workqueue(sbi->commit_wq); + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); reiserfs_free_bitmap_cache(s); diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index b6fa8657dcbc..ffb093e72b6c 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -120,6 +120,21 @@ config SQUASHFS_ZLIB If unsure, say Y. +config SQUASHFS_LZ4 + bool "Include support for LZ4 compressed file systems" + depends on SQUASHFS + select LZ4_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with LZ4 compression. LZ4 compression is mainly + aimed at embedded systems with slower CPUs where the overheads + of zlib are too high. + + LZ4 is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config SQUASHFS_LZO bool "Include support for LZO compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 4132520b4ff2..246a6f329d89 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -11,6 +11,7 @@ squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o +squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index ac22fe73b0ad..e9034bf6e5ae 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -41,6 +41,12 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; +#ifndef CONFIG_SQUASHFS_LZ4 +static const struct squashfs_decompressor squashfs_lz4_comp_ops = { + NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0 +}; +#endif + #ifndef CONFIG_SQUASHFS_LZO static const struct squashfs_decompressor squashfs_lzo_comp_ops = { NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 @@ -65,6 +71,7 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = { static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, + &squashfs_lz4_comp_ops, &squashfs_lzo_comp_ops, &squashfs_xz_comp_ops, &squashfs_lzma_unsupported_comp_ops, diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index af0985321808..a25713c031a5 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -46,6 +46,10 @@ static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk, extern const struct squashfs_decompressor squashfs_xz_comp_ops; #endif +#ifdef CONFIG_SQUASHFS_LZ4 +extern const struct squashfs_decompressor squashfs_lz4_comp_ops; +#endif + #ifdef CONFIG_SQUASHFS_LZO extern const struct squashfs_decompressor squashfs_lzo_comp_ops; #endif diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c new file mode 100644 index 000000000000..c31e2bc9c081 --- /dev/null +++ b/fs/squashfs/lz4_wrapper.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2013, 2014 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/buffer_head.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/lz4.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +#define LZ4_LEGACY 1 + +struct lz4_comp_opts { + __le32 version; + __le32 flags; +}; + +struct squashfs_lz4 { + void *input; + void *output; +}; + + +static void *lz4_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int len) +{ + struct lz4_comp_opts *comp_opts = buff; + + /* LZ4 compressed filesystems always have compression options */ + if (comp_opts == NULL || len < sizeof(*comp_opts)) + return ERR_PTR(-EIO); + + if (le32_to_cpu(comp_opts->version) != LZ4_LEGACY) { + /* LZ4 format currently used by the kernel is the 'legacy' + * format */ + ERROR("Unknown LZ4 version\n"); + return ERR_PTR(-EINVAL); + } + + return NULL; +} + + +static void *lz4_init(struct squashfs_sb_info *msblk, void *buff) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + struct squashfs_lz4 *stream; + + stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->input = vmalloc(block_size); + if (stream->input == NULL) + goto failed2; + stream->output = vmalloc(block_size); + if (stream->output == NULL) + goto failed3; + + return stream; + +failed3: + vfree(stream->input); +failed2: + kfree(stream); +failed: + ERROR("Failed to initialise LZ4 decompressor\n"); + return ERR_PTR(-ENOMEM); +} + + +static void lz4_free(void *strm) +{ + struct squashfs_lz4 *stream = strm; + + if (stream) { + vfree(stream->input); + vfree(stream->output); + } + kfree(stream); +} + + +static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) +{ + struct squashfs_lz4 *stream = strm; + void *buff = stream->input, *data; + int avail, i, bytes = length, res; + size_t dest_len = output->length; + + for (i = 0; i < b; i++) { + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } + + res = lz4_decompress_unknownoutputsize(stream->input, length, + stream->output, &dest_len); + if (res) + return -EIO; + + bytes = dest_len; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } + squashfs_finish_page(output); + + return dest_len; +} + +const struct squashfs_decompressor squashfs_lz4_comp_ops = { + .init = lz4_init, + .comp_opts = lz4_comp_opts, + .free = lz4_free, + .decompress = lz4_uncompress, + .id = LZ4_COMPRESSION, + .name = "lz4", + .supported = 1 +}; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 4b2beda49498..506f4ba5b983 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -240,6 +240,7 @@ struct meta_index { #define LZMA_COMPRESSION 2 #define LZO_COMPRESSION 3 #define XZ_COMPRESSION 4 +#define LZ4_COMPRESSION 5 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e9ef59b3abb1..dfe928a9540f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -102,6 +102,22 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, return battr->read(of->file, kobj, battr, buf, pos, count); } +/* kernfs read callback for regular sysfs files with pre-alloc */ +static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) +{ + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; + + /* + * If buf != of->prealloc_buf, we don't know how + * large it is, so cannot safely pass it to ->show + */ + if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + return 0; + return ops->show(kobj, of->kn->priv, buf); +} + /* kernfs write callback for regular sysfs files */ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) @@ -125,7 +141,7 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, if (size) { if (size <= pos) - return 0; + return -EFBIG; count = min_t(ssize_t, count, size - pos); } if (!count) @@ -184,6 +200,22 @@ static const struct kernfs_ops sysfs_file_kfops_rw = { .write = sysfs_kf_write, }; +static const struct kernfs_ops sysfs_prealloc_kfops_ro = { + .read = sysfs_kf_read, + .prealloc = true, +}; + +static const struct kernfs_ops sysfs_prealloc_kfops_wo = { + .write = sysfs_kf_write, + .prealloc = true, +}; + +static const struct kernfs_ops sysfs_prealloc_kfops_rw = { + .read = sysfs_kf_read, + .write = sysfs_kf_write, + .prealloc = true, +}; + static const struct kernfs_ops sysfs_bin_kfops_ro = { .read = sysfs_kf_bin_read, }; @@ -222,13 +254,22 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, kobject_name(kobj))) return -EINVAL; - if (sysfs_ops->show && sysfs_ops->store) - ops = &sysfs_file_kfops_rw; - else if (sysfs_ops->show) - ops = &sysfs_file_kfops_ro; - else if (sysfs_ops->store) - ops = &sysfs_file_kfops_wo; - else + if (sysfs_ops->show && sysfs_ops->store) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_rw; + else + ops = &sysfs_file_kfops_rw; + } else if (sysfs_ops->show) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_ro; + else + ops = &sysfs_file_kfops_ro; + } else if (sysfs_ops->store) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_wo; + else + ops = &sysfs_file_kfops_wo; + } else ops = &sysfs_file_kfops_empty; size = PAGE_SIZE; @@ -253,7 +294,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif - kn = __kernfs_create_file(parent, attr->name, mode, size, ops, + kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, (void *)attr, ns, true, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) diff --git a/fs/udf/dir.c b/fs/udf/dir.c index a012c51caffd..05e90edd1992 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -57,6 +57,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) sector_t offset; int i, num, ret = 0; struct extent_position epos = { NULL, 0, {0, 0} }; + struct super_block *sb = dir->i_sb; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -76,16 +77,16 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) if (nf_pos == 0) nf_pos = udf_ext0_offset(dir); - fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); + fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, + if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { ret = -ENOENT; goto out; } - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + block = udf_get_lb_pblock(sb, &eloc, offset); + if ((++offset << sb->s_blocksize_bits) < elen) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) epos.offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == @@ -95,18 +96,18 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) offset = 0; } - if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) { + if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) { ret = -EIO; goto out; } - if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) { - i = 16 >> (dir->i_sb->s_blocksize_bits - 9); - if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) - i = (elen >> dir->i_sb->s_blocksize_bits) - offset; + if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) { + i = 16 >> (sb->s_blocksize_bits - 9); + if (i + offset > (elen >> sb->s_blocksize_bits)) + i = (elen >> sb->s_blocksize_bits) - offset; for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i); - tmp = udf_tgetblk(dir->i_sb, block); + block = udf_get_lb_pblock(sb, &eloc, offset + i); + tmp = udf_tgetblk(sb, block); if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) bha[num++] = tmp; else @@ -152,12 +153,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { - if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } @@ -167,12 +168,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) continue; } - flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); if (!flen) continue; tloc = lelb_to_cpu(cfi.icb.extLocation); - iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); + iblock = udf_get_lb_pblock(sb, &tloc, 0); if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) goto out; } /* end while */ diff --git a/fs/udf/inode.c b/fs/udf/inode.c index c9b4df5810d5..5bc71d9a674a 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1489,6 +1489,20 @@ reread: } inode->i_generation = iinfo->i_unique; + /* Sanity checks for files in ICB so that we don't get confused later */ + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + /* + * For file in ICB data is stored in allocation descriptor + * so sizes should match + */ + if (iinfo->i_lenAlloc != inode->i_size) + goto out; + /* File in ICB has to fit in there... */ + if (inode->i_size > inode->i_sb->s_blocksize - + udf_file_entry_alloc_offset(inode)) + goto out; + } + switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: inode->i_op = &udf_dir_inode_operations; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c12e260fd6c4..33b246b82c98 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -159,18 +159,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, struct udf_inode_info *dinfo = UDF_I(dir); int isdotdot = child->len == 2 && child->name[0] == '.' && child->name[1] == '.'; + struct super_block *sb = dir->i_sb; size = udf_ext0_offset(dir) + dir->i_size; f_pos = udf_ext0_offset(dir); fibh->sbh = fibh->ebh = NULL; - fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1); if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) goto out_err; - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + block = udf_get_lb_pblock(sb, &eloc, offset); + if ((++offset << sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) @@ -178,7 +179,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } else offset = 0; - fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + fibh->sbh = fibh->ebh = udf_tread(sb, block); if (!fibh->sbh) goto out_err; } @@ -217,12 +218,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { - if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } @@ -233,7 +234,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, if (!lfi) continue; - flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); if (flen && udf_match(flen, fname, child->len, child->name)) goto out_ok; } diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 6fb7945c1e6e..ac10ca939f26 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -30,49 +30,73 @@ #include <linux/buffer_head.h> #include "udf_i.h" -static void udf_pc_to_char(struct super_block *sb, unsigned char *from, - int fromlen, unsigned char *to) +static int udf_pc_to_char(struct super_block *sb, unsigned char *from, + int fromlen, unsigned char *to, int tolen) { struct pathComponent *pc; int elen = 0; + int comp_len; unsigned char *p = to; + /* Reserve one byte for terminating \0 */ + tolen--; while (elen < fromlen) { pc = (struct pathComponent *)(from + elen); + elen += sizeof(struct pathComponent); switch (pc->componentType) { case 1: /* * Symlink points to some place which should be agreed * upon between originator and receiver of the media. Ignore. */ - if (pc->lengthComponentIdent > 0) + if (pc->lengthComponentIdent > 0) { + elen += pc->lengthComponentIdent; break; + } /* Fall through */ case 2: + if (tolen == 0) + return -ENAMETOOLONG; p = to; *p++ = '/'; + tolen--; break; case 3: + if (tolen < 3) + return -ENAMETOOLONG; memcpy(p, "../", 3); p += 3; + tolen -= 3; break; case 4: + if (tolen < 2) + return -ENAMETOOLONG; memcpy(p, "./", 2); p += 2; + tolen -= 2; /* that would be . - just ignore */ break; case 5: - p += udf_get_filename(sb, pc->componentIdent, p, - pc->lengthComponentIdent); + elen += pc->lengthComponentIdent; + if (elen > fromlen) + return -EIO; + comp_len = udf_get_filename(sb, pc->componentIdent, + pc->lengthComponentIdent, + p, tolen); + p += comp_len; + tolen -= comp_len; + if (tolen == 0) + return -ENAMETOOLONG; *p++ = '/'; + tolen--; break; } - elen += sizeof(struct pathComponent) + pc->lengthComponentIdent; } if (p > to + 1) p[-1] = '\0'; else p[0] = '\0'; + return 0; } static int udf_symlink_filler(struct file *file, struct page *page) @@ -80,11 +104,17 @@ static int udf_symlink_filler(struct file *file, struct page *page) struct inode *inode = page->mapping->host; struct buffer_head *bh = NULL; unsigned char *symlink; - int err = -EIO; + int err; unsigned char *p = kmap(page); struct udf_inode_info *iinfo; uint32_t pos; + /* We don't support symlinks longer than one block */ + if (inode->i_size > inode->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto out_unmap; + } + iinfo = UDF_I(inode); pos = udf_block_map(inode, 0); @@ -94,14 +124,18 @@ static int udf_symlink_filler(struct file *file, struct page *page) } else { bh = sb_bread(inode->i_sb, pos); - if (!bh) - goto out; + if (!bh) { + err = -EIO; + goto out_unlock_inode; + } symlink = bh->b_data; } - udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); + err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE); brelse(bh); + if (err) + goto out_unlock_inode; up_read(&iinfo->i_data_sem); SetPageUptodate(page); @@ -109,9 +143,10 @@ static int udf_symlink_filler(struct file *file, struct page *page) unlock_page(page); return 0; -out: +out_unlock_inode: up_read(&iinfo->i_data_sem); SetPageError(page); +out_unmap: kunmap(page); unlock_page(page); return err; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 1cc3c993ebd0..47bb3f5ca360 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -211,7 +211,8 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc, } /* unicode.c */ -extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); +extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *, + int); extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, int); extern int udf_build_ustr(struct ustr *, dstring *, int); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index afd470e588ff..b84fee372734 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -28,7 +28,8 @@ #include "udf_sb.h" -static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int); +static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *, + int); static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) { @@ -333,8 +334,8 @@ try_again: return u_len + 1; } -int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, - int flen) +int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, + uint8_t *dname, int dlen) { struct ustr *filename, *unifilename; int len = 0; @@ -347,7 +348,7 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, if (!unifilename) goto out1; - if (udf_build_ustr_exact(unifilename, sname, flen)) + if (udf_build_ustr_exact(unifilename, sname, slen)) goto out2; if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { @@ -366,7 +367,8 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, } else goto out2; - len = udf_translate_to_linux(dname, filename->u_name, filename->u_len, + len = udf_translate_to_linux(dname, dlen, + filename->u_name, filename->u_len, unifilename->u_name, unifilename->u_len); out2: kfree(unifilename); @@ -403,10 +405,12 @@ int udf_put_filename(struct super_block *sb, const uint8_t *sname, #define EXT_MARK '.' #define CRC_MARK '#' #define EXT_SIZE 5 +/* Number of chars we need to store generated CRC to make filename unique */ +#define CRC_LEN 5 -static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, - int udfLen, uint8_t *fidName, - int fidNameLen) +static int udf_translate_to_linux(uint8_t *newName, int newLen, + uint8_t *udfName, int udfLen, + uint8_t *fidName, int fidNameLen) { int index, newIndex = 0, needsCRC = 0; int extIndex = 0, newExtIndex = 0, hasExt = 0; @@ -439,7 +443,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, newExtIndex = newIndex; } } - if (newIndex < 256) + if (newIndex < newLen) newName[newIndex++] = curr; else needsCRC = 1; @@ -467,13 +471,13 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, } ext[localExtIndex++] = curr; } - maxFilenameLen = 250 - localExtIndex; + maxFilenameLen = newLen - CRC_LEN - localExtIndex; if (newIndex > maxFilenameLen) newIndex = maxFilenameLen; else newIndex = newExtIndex; - } else if (newIndex > 250) - newIndex = 250; + } else if (newIndex > newLen - CRC_LEN) + newIndex = newLen - CRC_LEN; newName[newIndex++] = CRC_MARK; valueCRC = crc_itu_t(0, fidName, fidNameLen); newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8); |