diff options
Diffstat (limited to 'fs')
133 files changed, 4489 insertions, 1223 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 995e332eee5c..eb2151fb6049 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -51,6 +51,8 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) if (!v9ses->cachetag) { if (v9fs_random_cachetag(v9ses) < 0) { v9ses->fscache = NULL; + kfree(v9ses->cachetag); + v9ses->cachetag = NULL; return; } } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 4cc966a31cb3..fe7f0bd2048e 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -513,6 +513,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); if (!v9inode->writeback_fid && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) { /* * clone a fid and add it to writeback_fid @@ -614,6 +615,8 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) (vma->vm_end - vma->vm_start - 1), }; + if (!(vma->vm_flags & VM_SHARED)) + return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index ca243e658d71..74df32be4c6a 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -58,7 +58,7 @@ static int v9fs_set_super(struct super_block *s, void *data) static int v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, - int flags, void *data) + int flags) { int ret; @@ -132,7 +132,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto clunk_fid; } - retval = v9fs_fill_super(sb, v9ses, flags, data); + retval = v9fs_fill_super(sb, v9ses, flags); if (retval) goto release_sb; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index bcd1bafb0278..4150280509ff 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,13 +10,6 @@ #include <linux/dns_resolver.h> #include "internal.h" -const struct file_operations afs_dynroot_file_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - /* * Probe to see if a cell may exist. This prevents positive dentries from * being created unnecessarily. diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 7b1c18c32f48..46d2d7cb461d 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -443,7 +443,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &afs_dynroot_file_operations; + inode->i_fop = &simple_dir_operations; } else { inode->i_op = &afs_autocell_inode_operations; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9cdfabaeaa0b..759e0578012c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -910,7 +910,6 @@ extern int afs_silly_iput(struct dentry *, struct inode *); /* * dynroot.c */ -extern const struct file_operations afs_dynroot_file_operations; extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cec3b4146440..c5642bcb6b46 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -879,7 +879,7 @@ out_free_interp: the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; + int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; @@ -911,13 +911,6 @@ out_free_interp: */ } } - - /* - * Some binaries have overlapping elf segments and then - * we have to forcefully map over an existing mapping - * e.g. over this newly established brk mapping. - */ - elf_fixed = MAP_FIXED; } elf_prot = make_prot(elf_ppnt->p_flags); @@ -930,7 +923,7 @@ out_free_interp: * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= elf_fixed; + elf_flags |= MAP_FIXED; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -966,7 +959,7 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= elf_fixed; + elf_flags |= MAP_FIXED; } else load_bias = 0; @@ -1121,7 +1114,8 @@ out_free_interp: * (since it grows up, and may collide early with the stack * growing down), and into the unused ELF_ET_DYN_BASE region. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter) + if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + loc->elf_ex.e_type == ET_DYN && !interpreter) current->mm->brk = current->mm->start_brk = ELF_ET_DYN_BASE; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7b32b6af322d..cceaf05aada2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3745,12 +3745,21 @@ err_unlock: static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_fs_info *fs_info; SetPageError(page); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; /* + * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. + */ + fs_info = eb->fs_info; + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + eb->len, fs_info->dirty_metadata_batch); + + /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. * We do this because while the transaction is running and before it's @@ -3986,6 +3995,10 @@ retry: if (!ret) { free_extent_buffer(eb); continue; + } else if (ret < 0) { + done = 1; + free_extent_buffer(eb); + break; } ret = write_one_eb(eb, wbc, &epd); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8fe4eb7e5045..27e5b269e729 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1591,7 +1591,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; - struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; @@ -1611,6 +1610,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { + struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), @@ -1758,9 +1758,20 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, &cached_state); + + /* + * If we have not locked the extent range, because the range's + * start offset is >= i_size, we might still have a non-NULL + * cached extent state, acquired while marking the extent range + * as delalloc through btrfs_dirty_pages(). Therefore free any + * possible cached extent state to avoid a memory leak. + */ if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true); if (ret) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0546401bc0a..0f2754eaa05b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6305,13 +6305,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; int nitems = name ? 2 : 1; unsigned long ptr; + unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); + nofs_flag = memalloc_nofs_save(); inode = new_inode(fs_info->sb); + memalloc_nofs_restore(nofs_flag); if (!inode) { btrfs_free_path(path); return ERR_PTR(-ENOMEM); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8d3bd799ac7d..c4bb69941c77 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3166,9 +3166,6 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (!btrfs_fs_closing(fs_info)) - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -3184,16 +3181,30 @@ out: trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", err); - goto done; } - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", err); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (!btrfs_fs_closing(fs_info)) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (trans) { + ret = update_qgroup_status_item(trans); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d", + err); + } } + fs_info->qgroup_rescan_running = false; + complete_all(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!trans) + return; + btrfs_end_transaction(trans); if (btrfs_fs_closing(fs_info)) { @@ -3204,12 +3215,6 @@ out: } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } - -done: - mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = false; - mutex_unlock(&fs_info->qgroup_rescan_lock); - complete_all(&fs_info->qgroup_rescan_completion); } /* @@ -3437,6 +3442,9 @@ cleanup: while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); + /* Also free data bytes of already reserved one */ + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, + orig_reserved, BTRFS_QGROUP_RSV_DATA); extent_changeset_release(reserved); return ret; } @@ -3481,7 +3489,7 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, free_start, free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e87cbdad02a3..b57f3618e58e 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -500,7 +500,7 @@ static int process_leaf(struct btrfs_root *root, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret; + int i = 0, tree_block_level = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2f0e25afa486..00504657b602 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1435,6 +1435,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + /* + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return 0; + if (root->reloc_root) { reloc_root = root->reloc_root; reloc_root->last_trans = trans->transid; @@ -2187,7 +2194,6 @@ static int clean_dirty_subvols(struct reloc_control *rc) /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; - clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; if (reloc_root) { @@ -2196,6 +2202,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) if (ret2 < 0 && !ret) ret = ret2; } + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_fs_root(root); } else { /* Orphan reloc tree, just clean it up */ diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f3215028235c..123ac54af071 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5085,7 +5085,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *path; struct btrfs_key key; int ret; - u64 clone_src_i_size; + u64 clone_src_i_size = 0; /* * Prevent cloning from a zero offset with a length matching the sector diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b5e80563efaa..99fe9bf3fdac 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -52,7 +52,13 @@ static struct file_system_type test_type = { struct inode *btrfs_new_test_inode(void) { - return new_inode(test_mnt->mnt_sb); + struct inode *inode; + + inode = new_inode(test_mnt->mnt_sb); + if (inode) + inode_init_owner(inode, NULL, S_IFREG); + + return inode; } static int btrfs_init_test_fs(void) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 29b82a795522..8a6cc600bf18 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2932,7 +2932,8 @@ out: * in the tree of log roots */ static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) + struct btrfs_root *log, + struct btrfs_root_item *root_item) { struct btrfs_fs_info *fs_info = log->fs_info; int ret; @@ -2940,10 +2941,10 @@ static int update_log_root(struct btrfs_trans_handle *trans, if (log->log_transid == 1) { /* insert root item on the first sync */ ret = btrfs_insert_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } else { ret = btrfs_update_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } return ret; } @@ -3041,6 +3042,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = fs_info->log_root_tree; + struct btrfs_root_item new_root_item; int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; @@ -3104,18 +3106,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } + /* + * We _must_ update under the root->log_mutex in order to make sure we + * have a consistent view of the log root we are trying to commit at + * this moment. + * + * We _must_ copy this into a local copy, because we are not holding the + * log_root_tree->log_mutex yet. This is important because when we + * commit the log_root_tree we must have a consistent view of the + * log_root_tree when we update the super block to point at the + * log_root_tree bytenr. If we update the log_root_tree here we'll race + * with the commit and possibly point at the new block which we may not + * have written out. + */ btrfs_set_root_node(&log->root_item, log->node); + memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; /* - * Update or create log root item under the root's log_mutex to prevent - * races with concurrent log syncs that can lead to failure to update - * log root item because it was not created yet. - */ - ret = update_log_root(trans, log); - /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to * new positions. so it's safe to allow log writers to go in. @@ -3135,6 +3145,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); mutex_lock(&log_root_tree->log_mutex); + + /* + * Now we are safe to update the log_root_tree because we're under the + * log_mutex, and we're a current writer so we're holding the commit + * open until we drop the log_mutex. + */ + ret = update_log_root(trans, log, &new_root_item); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { /* atomic_dec_and_test implies a barrier */ cond_wake_up_nomb(&log_root_tree->log_writer_wait); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a324480bc88b..bdfe4493e43a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3845,7 +3845,11 @@ static int alloc_profile_is_valid(u64 flags, int extended) return !extended; /* "0" is valid for usual profiles */ /* true if exactly one bit set */ - return is_power_of_2(flags); + /* + * Don't use is_power_of_2(unsigned long) because it won't work + * for the single profile (1ULL << 48) on 32-bit CPUs. + */ + return flags != 0 && (flags & (flags - 1)) == 0; } static inline int balance_need_close(struct btrfs_fs_info *fs_info) @@ -4063,7 +4067,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } num_devices = btrfs_num_devices(fs_info); - allowed = 0; + + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a + * special bit for it, to make it easier to distinguish. Thus we need + * to set it manually, or balance would refuse the profile. + */ + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 6c3bd07868d7..0f0dc1c1fe41 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -57,9 +57,18 @@ struct smb_query_info { /* char buffer[]; */ } __packed; +struct smb3_key_debug_info { + __u64 Suid; + __u16 cipher_type; + __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) +#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index eb428349f29a..439b99cefeb0 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -90,8 +90,39 @@ struct cifs_acl { __le32 num_aces; } __attribute__((packed)); +/* ACE types - see MS-DTYP 2.4.4.1 */ +#define ACCESS_ALLOWED_ACE_TYPE 0x00 +#define ACCESS_DENIED_ACE_TYPE 0x01 +#define SYSTEM_AUDIT_ACE_TYPE 0x02 +#define SYSTEM_ALARM_ACE_TYPE 0x03 +#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ +#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ +#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 +#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 +#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 + +/* ACE flags */ +#define OBJECT_INHERIT_ACE 0x01 +#define CONTAINER_INHERIT_ACE 0x02 +#define NO_PROPAGATE_INHERIT_ACE 0x04 +#define INHERIT_ONLY_ACE 0x08 +#define INHERITED_ACE 0x10 +#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 +#define FAILED_ACCESS_ACE_FLAG 0x80 + struct cifs_ace { - __u8 type; + __u8 type; /* see above and MS-DTYP 2.4.4.1 */ __u8 flags; __le16 size; __le32 access_req; @@ -99,6 +130,54 @@ struct cifs_ace { } __attribute__((packed)); /* + * The current SMB3 form of security descriptor is similar to what was used for + * cifs (see above) but some fields are split, and fields in the struct below + * matches names of fields to the the spec, MS-DTYP (see sections 2.4.5 and + * 2.4.6). Note that "CamelCase" fields are used in this struct in order to + * match the MS-DTYP and MS-SMB2 specs which define the wire format. + */ +struct smb3_sd { + __u8 Revision; /* revision level, MUST be one */ + __u8 Sbz1; /* only meaningful if 'RM' flag set below */ + __le16 Control; + __le32 OffsetOwner; + __le32 OffsetGroup; + __le32 OffsetSacl; + __le32 OffsetDacl; +} __packed; + +/* Meaning of 'Control' field flags */ +#define ACL_CONTROL_SR 0x0001 /* Self relative */ +#define ACL_CONTROL_RM 0x0002 /* Resource manager control bits */ +#define ACL_CONTROL_PS 0x0004 /* SACL protected from inherits */ +#define ACL_CONTROL_PD 0x0008 /* DACL protected from inherits */ +#define ACL_CONTROL_SI 0x0010 /* SACL Auto-Inherited */ +#define ACL_CONTROL_DI 0x0020 /* DACL Auto-Inherited */ +#define ACL_CONTROL_SC 0x0040 /* SACL computed through inheritance */ +#define ACL_CONTROL_DC 0x0080 /* DACL computed through inheritence */ +#define ACL_CONTROL_SS 0x0100 /* Create server ACL */ +#define ACL_CONTROL_DT 0x0200 /* DACL provided by trusteed source */ +#define ACL_CONTROL_SD 0x0400 /* SACL defaulted */ +#define ACL_CONTROL_SP 0x0800 /* SACL is present on object */ +#define ACL_CONTROL_DD 0x1000 /* DACL defaulted */ +#define ACL_CONTROL_DP 0x2000 /* DACL is present on object */ +#define ACL_CONTROL_GD 0x4000 /* Group was defaulted */ +#define ACL_CONTROL_OD 0x8000 /* User was defaulted */ + +/* Meaning of AclRevision flags */ +#define ACL_REVISION 0x02 /* See section 2.4.4.1 of MS-DTYP */ +#define ACL_REVISION_DS 0x04 /* Additional AceTypes allowed */ + +struct smb3_acl { + u8 AclRevision; /* revision level */ + u8 Sbz1; /* MBZ */ + __le16 AclSize; + __le16 AceCount; + __le16 Sbz2; /* MBZ */ +} __packed; + + +/* * Minimum security identifier can be one for system defined Users * and Groups such as NULL SID and World or Built-in accounts such * as Administrator and Guest and consists of diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2e9c7f493f99..c049c7b3aa87 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -169,18 +169,26 @@ cifs_read_super(struct super_block *sb) else sb->s_maxbytes = MAX_NON_LFS; - /* BB FIXME fix time_gran to be larger for LANMAN sessions */ - sb->s_time_gran = 100; - - if (tcon->unix_ext) { - ts = cifs_NTtimeToUnix(0); + /* Some very old servers like DOS and OS/2 used 2 second granularity */ + if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) && + ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) && + !tcon->unix_ext) { + sb->s_time_gran = 1000000000; /* 1 second is max allowed gran */ + ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0); sb->s_time_min = ts.tv_sec; - ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX)); + ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), + cpu_to_le16(SMB_TIME_MAX), 0); sb->s_time_max = ts.tv_sec; } else { - ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0); + /* + * Almost every server, including all SMB2+, uses DCE TIME + * ie 100 nanosecond units, since 1601. See MS-DTYP and MS-FSCC + */ + sb->s_time_gran = 100; + ts = cifs_NTtimeToUnix(0); sb->s_time_min = ts.tv_sec; - ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), cpu_to_le16(SMB_TIME_MAX), 0); + ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX)); sb->s_time_max = ts.tv_sec; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 54e204589cb9..50dfd9049370 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -331,8 +331,9 @@ struct smb_version_operations { umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); - int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, - struct cifs_sb_info *); + int (*mkdir)(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *sb); /* set info on created directory */ void (*mkdir_setinfo)(struct inode *, const char *, struct cifs_sb_info *, struct cifs_tcon *, @@ -1209,6 +1210,7 @@ struct cifs_search_info { bool smallBuf:1; /* so we know which buf_release function to call */ }; +#define ACL_NO_MODE ((umode_t)(-1)) struct cifs_open_parms { struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 99b1b1ef558c..e53e9f62b87b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -372,7 +372,8 @@ extern int CIFSSMBUnixSetPathInfo(const unsigned int xid, const struct nls_table *nls_codepage, int remap); -extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, +extern int CIFSSMBMkDir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index dbee2132e419..4f554f019a98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1078,7 +1078,8 @@ RmDirRetry: } int -CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +CIFSSMBMkDir(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { int rc = 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2850c3ce4391..a64dfa95a925 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4264,7 +4264,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, server->ops->qfs_tcon(*xid, tcon); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) { if (tcon->fsDevInfo.DeviceCharacteristics & - FILE_READ_ONLY_DEVICE) + cpu_to_le32(FILE_READ_ONLY_DEVICE)) cifs_dbg(VFS, "mounted to read only share\n"); else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE) == 0) @@ -4445,7 +4445,7 @@ static int setup_dfs_tgt_conn(const char *path, int rc; struct dfs_info3_param ref = {0}; char *mdata = NULL, *fake_devname = NULL; - struct smb_vol fake_vol = {0}; + struct smb_vol fake_vol = {NULL}; cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index dd5ac841aefa..7ce689d31aa2 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -738,10 +738,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, static int cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { + struct inode *inode; + if (flags & LOOKUP_RCU) return -ECHILD; if (d_really_is_positive(direntry)) { + inode = d_inode(direntry); + if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode))) + CIFS_I(inode)->time = 0; /* force reval */ + if (cifs_revalidate_dentry(direntry)) return 0; else { @@ -752,7 +758,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) * attributes will have been updated by * cifs_revalidate_dentry(). */ - if (IS_AUTOMOUNT(d_inode(direntry)) && + if (IS_AUTOMOUNT(inode) && !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) { spin_lock(&direntry->d_lock); direntry->d_flags |= DCACHE_NEED_AUTOMOUNT; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4b95700c507c..5ad15de2bb4f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -253,6 +253,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, fid); + if (rc) { + server->ops->close(xid, tcon, fid); + if (rc == -ESTALE) + rc = -EOPENSTALE; + } + out: kfree(buf); return rc; @@ -1840,13 +1846,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1858,7 +1863,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1866,7 +1871,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return NULL; } @@ -1877,7 +1882,6 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; bool any_available = false; int rc = -EBADF; unsigned int refind = 0; @@ -1897,16 +1901,15 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); - tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return rc; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1918,7 +1921,7 @@ refind_writable: if (!open_file->invalidHandle) { /* found a good writable file */ cifsFileInfo_get(open_file); - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); *ret_file = open_file; return 0; } else { @@ -1938,7 +1941,7 @@ refind_writable: cifsFileInfo_get(inv_file); } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); @@ -1953,7 +1956,7 @@ refind_writable: cifsFileInfo_put(inv_file); ++refind; inv_file = NULL; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); goto refind_writable; } @@ -4461,17 +4464,15 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; - struct cifs_tcon *tcon = - cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return 1; } } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return 0; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 26cdfbf1e164..5dcc95b38310 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -414,6 +414,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* if uniqueid is different, return error */ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { + CIFS_I(*pinode)->time = 0; /* force reval */ rc = -ESTALE; goto cgiiu_exit; } @@ -421,6 +422,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* if filetype is different, return error */ if (unlikely(((*pinode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { + CIFS_I(*pinode)->time = 0; /* force reval */ rc = -ESTALE; goto cgiiu_exit; } @@ -933,6 +935,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* if uniqueid is different, return error */ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; goto cgii_exit; } @@ -940,6 +943,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { + CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; goto cgii_exit; } @@ -1622,13 +1626,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) } /* BB add setting the equivalent of mode via CreateX w/ACLs */ - rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); + rc = server->ops->mkdir(xid, inode, mode, tcon, full_path, cifs_sb); if (rc) { cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc); d_drop(direntry); goto mkdir_out; } + /* TODO: skip this for smb2/smb3 */ rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon, xid); mkdir_out: diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 76ddd98b6298..1a01e108d75e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -164,6 +164,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); + struct smb3_key_debug_info pkey_inf; int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; @@ -270,6 +271,34 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = -EOPNOTSUPP; break; + case CIFS_DUMP_KEY: + if (pSMBFile == NULL) + break; + if (!capable(CAP_SYS_ADMIN)) { + rc = -EACCES; + break; + } + + tcon = tlink_tcon(pSMBFile->tlink); + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + break; + } + pkey_inf.cipher_type = + le16_to_cpu(tcon->ses->server->cipher_type); + pkey_inf.Suid = tcon->ses->Suid; + memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response, + 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + memcpy(pkey_inf.smb3decryptionkey, + tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + memcpy(pkey_inf.smb3encryptionkey, + tcon->ses->smb3encryptionkey, SMB3_SIGN_KEY_SIZE); + if (copy_to_user((void __user *)arg, &pkey_inf, + sizeof(struct smb3_key_debug_info))) + rc = -EFAULT; + else + rc = 0; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 49c17ee18254..9b41436fb8db 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -117,10 +117,6 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = { {0, 0} }; -static const struct smb_to_posix_error mapping_table_ERRHRD[] = { - {0, 0} -}; - /* * Convert a string containing text IPv4 or IPv6 address to binary form. * diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4c764ff7edd2..85bd644f9773 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -698,7 +698,6 @@ sess_auth_lanman(struct sess_data *sess_data) char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; char lnm_session_key[CIFS_AUTH_RESP_SIZE]; - __u32 capabilities; __u16 bytes_remaining; /* lanman 2 style sessionsetup */ @@ -709,7 +708,7 @@ sess_auth_lanman(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + (void)cifs_ssetup_hdr(ses, pSMB); pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index d2a3fb7e5c8d..4121ac1163ca 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -51,7 +51,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, - __u32 create_options, void *ptr, int command, + __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile) { int rc; @@ -103,6 +103,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; oparms.fid = &fid; oparms.reconnect = false; + oparms.mode = mode; memset(&open_iov, 0, sizeof(open_iov)); rqst[num_rqst].rq_iov = open_iov; @@ -478,7 +479,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { *symlink = true; create_options |= OPEN_REPARSE_POINT; @@ -486,8 +487,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* Failed on a symbolic link - query a reparse point info */ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, smb2_data, - SMB2_OP_QUERY_INFO, NULL); + create_options, ACL_NO_MODE, + smb2_data, SMB2_OP_QUERY_INFO, NULL); } if (rc) goto out; @@ -499,12 +500,14 @@ out: } int -smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR, NULL); + CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, + NULL); } void @@ -525,8 +528,8 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_get_writable_path(tcon, name, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO, - cfile); + CREATE_NOT_FILE, ACL_NO_MODE, + &data, SMB2_OP_SET_INFO, cfile); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -536,7 +539,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, + CREATE_NOT_FILE, ACL_NO_MODE, NULL, SMB2_OP_RMDIR, NULL); } @@ -546,7 +549,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); } static int @@ -564,7 +567,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, goto smb2_rename_path; } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, smb2_to_name, command, cfile); + FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, + command, cfile); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -601,8 +605,8 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, __le64 eof = cpu_to_le64(size); return smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, &eof, - SMB2_OP_SET_EOF, NULL); + FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, + &eof, SMB2_OP_SET_EOF, NULL); } int @@ -623,8 +627,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, return PTR_ERR(tlink); rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path, - FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, - SMB2_OP_SET_INFO, NULL); + FILE_WRITE_ATTRIBUTES, FILE_OPEN, + 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eaed18061314..4c0922596467 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -751,6 +751,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) goto oshr_exit; } + atomic_inc(&tcon->num_remote_opens); + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; oparms.fid->volatile_fid = o_rsp->VolatileFileId; @@ -1176,6 +1178,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, flags, 3, rqst, resp_buftype, rsp_iov); + /* no need to bump num_remote_opens because handle immediately closed */ sea_exit: kfree(ea); @@ -1518,6 +1521,8 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; + + /* No need to bump num_remote_opens since handle immediately closed */ if (qi.flags & PASSTHRU_FSCTL) { pqi = (struct smb_query_info __user *)arg; io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; @@ -3328,6 +3333,11 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; + /* Check if the server granted an oplock rather than a lease */ + if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE) + return smb2_set_oplock_level(cinode, oplock, epoch, + purge_cache); + if (oplock & SMB2_LEASE_READ_CACHING_HE) { new_oplock |= CIFS_CACHE_READ_FLG; strcat(message, "R"); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 87066f1af12c..05149862aea4 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -751,6 +751,8 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) unsigned int num = *num_iovec; iov[num].iov_base = create_posix_buf(mode); + if (mode == ACL_NO_MODE) + cifs_dbg(FYI, "illegal mode\n"); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2352,6 +2354,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rqst.rq_iov = iov; rqst.rq_nvec = n_iov; + /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); /* resource #4: response buffer */ @@ -2416,6 +2419,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, /* File attributes ignored on open (used in create though) */ req->FileAttributes = cpu_to_le32(file_attributes); req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(oparms->disposition); req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); @@ -2517,6 +2521,20 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, return rc; } + if ((oparms->disposition == FILE_CREATE) && + (oparms->mode != ACL_NO_MODE)) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + /* rc = add_sd_context(iov, &n_iov, oparms->mode); */ + if (rc) + return rc; + } + if (n_iov > 2) { struct create_context *ccontext = (struct create_context *)iov[n_iov-1].iov_base; @@ -3180,7 +3198,7 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, * See MS-SMB2 2.2.35 and 2.2.36 */ -int +static int SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 completion_filter, bool watch_tree) @@ -3196,7 +3214,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; - req->OutputBufferLength = SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE; + req->OutputBufferLength = + cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); req->CompletionFilter = cpu_to_le32(completion_filter); if (watch_tree) req->Flags = cpu_to_le16(SMB2_WATCH_TREE); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 67a91b11fd59..71b2930b8e0b 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -84,7 +84,8 @@ extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); -extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, +extern int smb2_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, struct cifs_sb_info *cifs_sb, @@ -149,6 +150,10 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, bool is_fsctl, char *in_data, u32 indatalen, __u32 max_response_size); extern void SMB2_ioctl_free(struct smb_rqst *rqst); +extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, bool watch_tree, + u32 completion_filter); + extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 08628e6a42ac..1ff28529cf4b 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -144,6 +144,17 @@ #define IO_REPARSE_APPXSTREAM 0xC0000014 /* NFS symlinks, Win 8/SMB3 and later */ #define IO_REPARSE_TAG_NFS 0x80000014 +/* + * AzureFileSync - see + * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering + */ +#define IO_REPARSE_TAG_AZ_FILE_SYNC 0x8000001e +/* WSL reparse tags */ +#define IO_REPARSE_TAG_LX_SYMLINK 0xA000001D +#define IO_REPARSE_TAG_AF_UNIX 0x80000023 +#define IO_REPARSE_TAG_LX_FIFO 0x80000024 +#define IO_REPARSE_TAG_LX_CHR 0x80000025 +#define IO_REPARSE_TAG_LX_BLK 0x80000026 /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 9076150758d8..db4ba8f6077e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -31,7 +31,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" -#define MAX_EA_VALUE_SIZE 65535 +#define MAX_EA_VALUE_SIZE CIFSMaxBufSize #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" #define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ #define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 93e4ca6b2ad7..87846aad594b 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -19,6 +19,7 @@ #include <linux/atomic.h> #include <linux/device.h> #include <linux/poll.h> +#include <linux/security.h> #include "internal.h" @@ -136,6 +137,25 @@ void debugfs_file_put(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_file_put); +/* + * Only permit access to world-readable files when the kernel is locked down. + * We also need to exclude any file that has ways to write or alter it as root + * can bypass the permissions check. + */ +static bool debugfs_is_locked_down(struct inode *inode, + struct file *filp, + const struct file_operations *real_fops) +{ + if ((inode->i_mode & 07777) == 0444 && + !(filp->f_mode & FMODE_WRITE) && + !real_fops->unlocked_ioctl && + !real_fops->compat_ioctl && + !real_fops->mmap) + return false; + + return security_locked_down(LOCKDOWN_DEBUGFS); +} + static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); @@ -147,6 +167,11 @@ static int open_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = debugfs_is_locked_down(inode, filp, real_fops); + if (r) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -272,6 +297,11 @@ static int full_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = debugfs_is_locked_down(inode, filp, real_fops); + if (r) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 042b688ed124..7b975dbb2bb4 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -26,6 +26,7 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/security.h> #include "internal.h" @@ -35,6 +36,32 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; +/* + * Don't allow access attributes to be changed whilst the kernel is locked down + * so that we can use the file mode as part of a heuristic to determine whether + * to lock down individual files. + */ +static int debugfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int ret = security_locked_down(LOCKDOWN_DEBUGFS); + + if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) + return ret; + return simple_setattr(dentry, ia); +} + +static const struct inode_operations debugfs_file_inode_operations = { + .setattr = debugfs_setattr, +}; +static const struct inode_operations debugfs_dir_inode_operations = { + .lookup = simple_lookup, + .setattr = debugfs_setattr, +}; +static const struct inode_operations debugfs_symlink_inode_operations = { + .get_link = simple_get_link, + .setattr = debugfs_setattr, +}; + static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -369,6 +396,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_mode = mode; inode->i_private = data; + inode->i_op = &debugfs_file_inode_operations; inode->i_fop = proxy_fops; dentry->d_fsdata = (void *)((unsigned long)real_fops | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); @@ -532,7 +560,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - inode->i_op = &simple_dir_inode_operations; + inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -632,7 +660,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; - inode->i_op = &simple_symlink_inode_operations; + inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8a9fcbd0e8ac..fc3a8d8064f8 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -34,11 +34,15 @@ static void erofs_readendio(struct bio *bio) struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) { - struct inode *const bd_inode = sb->s_bdev->bd_inode; - struct address_space *const mapping = bd_inode->i_mapping; + struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; + struct page *page; - return read_cache_page_gfp(mapping, blkaddr, + page = read_cache_page_gfp(mapping, blkaddr, mapping_gfp_constraint(mapping, ~__GFP_FS)); + /* should already be PageUptodate */ + if (!IS_ERR(page)) + lock_page(page); + return page; } static int erofs_map_blocks_flatmode(struct inode *inode, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index caf9a95173b0..0e369494f2f2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -105,9 +105,9 @@ static int erofs_read_superblock(struct super_block *sb) int ret; page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); - if (!page) { + if (IS_ERR(page)) { erofs_err(sb, "cannot read erofs superblock"); - return -EIO; + return PTR_ERR(page); } sbi = EROFS_SB(sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 96e34c90f814..fad80c97d247 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -575,7 +575,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; struct z_erofs_collector *const clt = &fe->clt; const loff_t offset = page_offset(page); - bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED); + bool tight = true; enum z_erofs_cache_alloctype cache_strategy; enum z_erofs_page_type page_type; @@ -628,8 +628,16 @@ restart_now: preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy, pagepool); - tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED); hitted: + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or pagevec (should be processed in strict order.) + */ + tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED && + clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); diff --git a/fs/exec.c b/fs/exec.c index f7f6a140856a..555e93c7dec8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm) } task_lock(tsk); active_mm = tsk->active_mm; + membarrier_exec_mmap(mm); tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); @@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - membarrier_execve(current); rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 123e3dee7733..516faa280ced 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4551,6 +4551,7 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; + struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4639,6 +4640,7 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ + blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4669,6 +4671,7 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); + blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 814ad2c2ba80..054acd9fd033 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -88,9 +88,7 @@ static int fat__get_entry(struct inode *dir, loff_t *pos, int err, offset; next: - if (*bh) - brelse(*bh); - + brelse(*bh); *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); diff --git a/fs/fhandle.c b/fs/fhandle.c index 0ee727485615..01263ffbc4c0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -246,7 +246,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened - * @flag: open flags. + * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative diff --git a/fs/file_table.c b/fs/file_table.c index b07b53f24ff5..30d55c9a1744 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -327,6 +327,7 @@ void flush_delayed_fput(void) { delayed_fput(NULL); } +EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8aaa7eec7b74..e88421d9a48d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -164,8 +164,13 @@ static void finish_writeback_work(struct bdi_writeback *wb, if (work->auto_free) kfree(work); - if (done && atomic_dec_and_test(&done->cnt)) - wake_up_all(done->waitq); + if (done) { + wait_queue_head_t *waitq = done->waitq; + + /* @done can't be accessed after the following dec */ + if (atomic_dec_and_test(&done->cnt)) + wake_up_all(waitq); + } } static void wb_queue_work(struct bdi_writeback *wb, diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 24fc5a5c1b97..0635cba19971 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -27,3 +27,14 @@ config CUSE If you want to develop or use a userspace character device based on CUSE, answer Y or M. + +config VIRTIO_FS + tristate "Virtio Filesystem" + depends on FUSE_FS + select VIRTIO + help + The Virtio Filesystem allows guests to mount file systems from the + host. + + If you want to share files between guests or with the host, answer Y + or M. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 9485019c2a14..6419a2b3510d 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o +obj-$(CONFIG_VIRTIO_FS) += virtio_fs.o fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fc89cb40e874..956aeaf961ae 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -353,6 +353,10 @@ struct fuse_req { /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; +#if IS_ENABLED(CONFIG_VIRTIO_FS) + /** virtio-fs's physically contiguous buffer for in and out args */ + void *argbuf; +#endif }; struct fuse_iqueue; @@ -383,6 +387,11 @@ struct fuse_iqueue_ops { */ void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) __releases(fiq->lock); + + /** + * Clean up when fuse_iqueue is destroyed + */ + void (*release)(struct fuse_iqueue *fiq); }; /** /dev/fuse input queue operations */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 51cb471f4dc3..e040e2a2b621 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -630,6 +630,10 @@ EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { + struct fuse_iqueue *fiq = &fc->iq; + + if (fiq->ops->release) + fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); fc->release(fc); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c new file mode 100644 index 000000000000..6af3f131e468 --- /dev/null +++ b/fs/fuse/virtio_fs.c @@ -0,0 +1,1195 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio-fs: Virtio Filesystem + * Copyright (C) 2018 Red Hat, Inc. + */ + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/virtio.h> +#include <linux/virtio_fs.h> +#include <linux/delay.h> +#include <linux/fs_context.h> +#include <linux/highmem.h> +#include "fuse_i.h" + +/* List of virtio-fs device instances and a lock for the list. Also provides + * mutual exclusion in device removal and mounting path + */ +static DEFINE_MUTEX(virtio_fs_mutex); +static LIST_HEAD(virtio_fs_instances); + +enum { + VQ_HIPRIO, + VQ_REQUEST +}; + +/* Per-virtqueue state */ +struct virtio_fs_vq { + spinlock_t lock; + struct virtqueue *vq; /* protected by ->lock */ + struct work_struct done_work; + struct list_head queued_reqs; + struct delayed_work dispatch_work; + struct fuse_dev *fud; + bool connected; + long in_flight; + char name[24]; +} ____cacheline_aligned_in_smp; + +/* A virtio-fs device instance */ +struct virtio_fs { + struct kref refcount; + struct list_head list; /* on virtio_fs_instances */ + char *tag; + struct virtio_fs_vq *vqs; + unsigned int nvqs; /* number of virtqueues */ + unsigned int num_request_queues; /* number of request queues */ +}; + +struct virtio_fs_forget { + struct fuse_in_header ih; + struct fuse_forget_in arg; + /* This request can be temporarily queued on virt queue */ + struct list_head list; +}; + +static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) +{ + struct virtio_fs *fs = vq->vdev->priv; + + return &fs->vqs[vq->index]; +} + +static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) +{ + return &vq_to_fsvq(vq)->fud->pq; +} + +static void release_virtio_fs_obj(struct kref *ref) +{ + struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); + + kfree(vfs->vqs); + kfree(vfs); +} + +/* Make sure virtiofs_mutex is held */ +static void virtio_fs_put(struct virtio_fs *fs) +{ + kref_put(&fs->refcount, release_virtio_fs_obj); +} + +static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) +{ + struct virtio_fs *vfs = fiq->priv; + + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(vfs); + mutex_unlock(&virtio_fs_mutex); +} + +static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) +{ + WARN_ON(fsvq->in_flight < 0); + + /* Wait for in flight requests to finish.*/ + while (1) { + spin_lock(&fsvq->lock); + if (!fsvq->in_flight) { + spin_unlock(&fsvq->lock); + break; + } + spin_unlock(&fsvq->lock); + /* TODO use completion instead of timeout */ + usleep_range(1000, 2000); + } + + flush_work(&fsvq->done_work); + flush_delayed_work(&fsvq->dispatch_work); +} + +static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq) +{ + struct virtio_fs_forget *forget; + + spin_lock(&fsvq->lock); + while (1) { + forget = list_first_entry_or_null(&fsvq->queued_reqs, + struct virtio_fs_forget, list); + if (!forget) + break; + list_del(&forget->list); + kfree(forget); + } + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + if (i == VQ_HIPRIO) + drain_hiprio_queued_reqs(fsvq); + + virtio_fs_drain_queue(fsvq); + } +} + +static void virtio_fs_start_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + spin_lock(&fsvq->lock); + fsvq->connected = true; + spin_unlock(&fsvq->lock); + } +} + +/* Add a new instance to the list or return -EEXIST if tag name exists*/ +static int virtio_fs_add_instance(struct virtio_fs *fs) +{ + struct virtio_fs *fs2; + bool duplicate = false; + + mutex_lock(&virtio_fs_mutex); + + list_for_each_entry(fs2, &virtio_fs_instances, list) { + if (strcmp(fs->tag, fs2->tag) == 0) + duplicate = true; + } + + if (!duplicate) + list_add_tail(&fs->list, &virtio_fs_instances); + + mutex_unlock(&virtio_fs_mutex); + + if (duplicate) + return -EEXIST; + return 0; +} + +/* Return the virtio_fs with a given tag, or NULL */ +static struct virtio_fs *virtio_fs_find_instance(const char *tag) +{ + struct virtio_fs *fs; + + mutex_lock(&virtio_fs_mutex); + + list_for_each_entry(fs, &virtio_fs_instances, list) { + if (strcmp(fs->tag, tag) == 0) { + kref_get(&fs->refcount); + goto found; + } + } + + fs = NULL; /* not found */ + +found: + mutex_unlock(&virtio_fs_mutex); + + return fs; +} + +static void virtio_fs_free_devs(struct virtio_fs *fs) +{ + unsigned int i; + + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + if (!fsvq->fud) + continue; + + fuse_dev_free(fsvq->fud); + fsvq->fud = NULL; + } +} + +/* Read filesystem name from virtio config into fs->tag (must kfree()). */ +static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) +{ + char tag_buf[sizeof_field(struct virtio_fs_config, tag)]; + char *end; + size_t len; + + virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag), + &tag_buf, sizeof(tag_buf)); + end = memchr(tag_buf, '\0', sizeof(tag_buf)); + if (end == tag_buf) + return -EINVAL; /* empty tag */ + if (!end) + end = &tag_buf[sizeof(tag_buf)]; + + len = end - tag_buf; + fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL); + if (!fs->tag) + return -ENOMEM; + memcpy(fs->tag, tag_buf, len); + fs->tag[len] = '\0'; + return 0; +} + +/* Work function for hiprio completion */ +static void virtio_fs_hiprio_done_work(struct work_struct *work) +{ + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + done_work); + struct virtqueue *vq = fsvq->vq; + + /* Free completed FUSE_FORGET requests */ + spin_lock(&fsvq->lock); + do { + unsigned int len; + void *req; + + virtqueue_disable_cb(vq); + + while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + kfree(req); + fsvq->in_flight--; + } + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_dummy_dispatch_work(struct work_struct *work) +{ +} + +static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) +{ + struct virtio_fs_forget *forget; + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + dispatch_work.work); + struct virtqueue *vq = fsvq->vq; + struct scatterlist sg; + struct scatterlist *sgs[] = {&sg}; + bool notify; + int ret; + + pr_debug("virtio-fs: worker %s called.\n", __func__); + while (1) { + spin_lock(&fsvq->lock); + forget = list_first_entry_or_null(&fsvq->queued_reqs, + struct virtio_fs_forget, list); + if (!forget) { + spin_unlock(&fsvq->lock); + return; + } + + list_del(&forget->list); + if (!fsvq->connected) { + spin_unlock(&fsvq->lock); + kfree(forget); + continue; + } + + sg_init_one(&sg, forget, sizeof(*forget)); + + /* Enqueue the request */ + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", + ret); + list_add_tail(&forget->list, + &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + } + spin_unlock(&fsvq->lock); + return; + } + + fsvq->in_flight++; + notify = virtqueue_kick_prepare(vq); + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + pr_debug("virtio-fs: worker %s dispatched one forget request.\n", + __func__); + } +} + +/* Allocate and copy args into req->argbuf */ +static int copy_args_to_argbuf(struct fuse_req *req) +{ + struct fuse_args *args = req->args; + unsigned int offset = 0; + unsigned int num_in; + unsigned int num_out; + unsigned int len; + unsigned int i; + + num_in = args->in_numargs - args->in_pages; + num_out = args->out_numargs - args->out_pages; + len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + + fuse_len_args(num_out, args->out_args); + + req->argbuf = kmalloc(len, GFP_ATOMIC); + if (!req->argbuf) + return -ENOMEM; + + for (i = 0; i < num_in; i++) { + memcpy(req->argbuf + offset, + args->in_args[i].value, + args->in_args[i].size); + offset += args->in_args[i].size; + } + + return 0; +} + +/* Copy args out of and free req->argbuf */ +static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) +{ + unsigned int remaining; + unsigned int offset; + unsigned int num_in; + unsigned int num_out; + unsigned int i; + + remaining = req->out.h.len - sizeof(req->out.h); + num_in = args->in_numargs - args->in_pages; + num_out = args->out_numargs - args->out_pages; + offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args); + + for (i = 0; i < num_out; i++) { + unsigned int argsize = args->out_args[i].size; + + if (args->out_argvar && + i == args->out_numargs - 1 && + argsize > remaining) { + argsize = remaining; + } + + memcpy(args->out_args[i].value, req->argbuf + offset, argsize); + offset += argsize; + + if (i != args->out_numargs - 1) + remaining -= argsize; + } + + /* Store the actual size of the variable-length arg */ + if (args->out_argvar) + args->out_args[args->out_numargs - 1].size = remaining; + + kfree(req->argbuf); + req->argbuf = NULL; +} + +/* Work function for request completion */ +static void virtio_fs_requests_done_work(struct work_struct *work) +{ + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + done_work); + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct fuse_conn *fc = fsvq->fud->fc; + struct virtqueue *vq = fsvq->vq; + struct fuse_req *req; + struct fuse_args_pages *ap; + struct fuse_req *next; + struct fuse_args *args; + unsigned int len, i, thislen; + struct page *page; + LIST_HEAD(reqs); + + /* Collect completed requests off the virtqueue */ + spin_lock(&fsvq->lock); + do { + virtqueue_disable_cb(vq); + + while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + spin_lock(&fpq->lock); + list_move_tail(&req->list, &reqs); + spin_unlock(&fpq->lock); + } + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + spin_unlock(&fsvq->lock); + + /* End requests */ + list_for_each_entry_safe(req, next, &reqs, list) { + /* + * TODO verify that server properly follows FUSE protocol + * (oh.uniq, oh.len) + */ + args = req->args; + copy_args_from_argbuf(args, req); + + if (args->out_pages && args->page_zeroing) { + len = args->out_args[args->out_numargs - 1].size; + ap = container_of(args, typeof(*ap), args); + for (i = 0; i < ap->num_pages; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + page = ap->pages[i]; + zero_user_segment(page, len, thislen); + len = 0; + } else { + len -= thislen; + } + } + } + + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + list_del_init(&req->list); + spin_unlock(&fpq->lock); + + fuse_request_end(fc, req); + spin_lock(&fsvq->lock); + fsvq->in_flight--; + spin_unlock(&fsvq->lock); + } +} + +/* Virtqueue interrupt handler */ +static void virtio_fs_vq_done(struct virtqueue *vq) +{ + struct virtio_fs_vq *fsvq = vq_to_fsvq(vq); + + dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); + + schedule_work(&fsvq->done_work); +} + +/* Initialize virtqueues */ +static int virtio_fs_setup_vqs(struct virtio_device *vdev, + struct virtio_fs *fs) +{ + struct virtqueue **vqs; + vq_callback_t **callbacks; + const char **names; + unsigned int i; + int ret = 0; + + virtio_cread(vdev, struct virtio_fs_config, num_request_queues, + &fs->num_request_queues); + if (fs->num_request_queues == 0) + return -EINVAL; + + fs->nvqs = 1 + fs->num_request_queues; + fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); + if (!fs->vqs) + return -ENOMEM; + + vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); + callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), + GFP_KERNEL); + names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); + if (!vqs || !callbacks || !names) { + ret = -ENOMEM; + goto out; + } + + callbacks[VQ_HIPRIO] = virtio_fs_vq_done; + snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), + "hiprio"); + names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; + INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); + INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); + INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, + virtio_fs_hiprio_dispatch_work); + spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); + + /* Initialize the requests virtqueues */ + for (i = VQ_REQUEST; i < fs->nvqs; i++) { + spin_lock_init(&fs->vqs[i].lock); + INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); + INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, + virtio_fs_dummy_dispatch_work); + INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); + snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), + "requests.%u", i - VQ_REQUEST); + callbacks[i] = virtio_fs_vq_done; + names[i] = fs->vqs[i].name; + } + + ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); + if (ret < 0) + goto out; + + for (i = 0; i < fs->nvqs; i++) + fs->vqs[i].vq = vqs[i]; + + virtio_fs_start_all_queues(fs); +out: + kfree(names); + kfree(callbacks); + kfree(vqs); + if (ret) + kfree(fs->vqs); + return ret; +} + +/* Free virtqueues (device must already be reset) */ +static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, + struct virtio_fs *fs) +{ + vdev->config->del_vqs(vdev); +} + +static int virtio_fs_probe(struct virtio_device *vdev) +{ + struct virtio_fs *fs; + int ret; + + fs = kzalloc(sizeof(*fs), GFP_KERNEL); + if (!fs) + return -ENOMEM; + kref_init(&fs->refcount); + vdev->priv = fs; + + ret = virtio_fs_read_tag(vdev, fs); + if (ret < 0) + goto out; + + ret = virtio_fs_setup_vqs(vdev, fs); + if (ret < 0) + goto out; + + /* TODO vq affinity */ + + /* Bring the device online in case the filesystem is mounted and + * requests need to be sent before we return. + */ + virtio_device_ready(vdev); + + ret = virtio_fs_add_instance(fs); + if (ret < 0) + goto out_vqs; + + return 0; + +out_vqs: + vdev->config->reset(vdev); + virtio_fs_cleanup_vqs(vdev, fs); + +out: + vdev->priv = NULL; + kfree(fs); + return ret; +} + +static void virtio_fs_stop_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + spin_lock(&fsvq->lock); + fsvq->connected = false; + spin_unlock(&fsvq->lock); + } +} + +static void virtio_fs_remove(struct virtio_device *vdev) +{ + struct virtio_fs *fs = vdev->priv; + + mutex_lock(&virtio_fs_mutex); + /* This device is going away. No one should get new reference */ + list_del_init(&fs->list); + virtio_fs_stop_all_queues(fs); + virtio_fs_drain_all_queues(fs); + vdev->config->reset(vdev); + virtio_fs_cleanup_vqs(vdev, fs); + + vdev->priv = NULL; + /* Put device reference on virtio_fs object */ + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); +} + +#ifdef CONFIG_PM_SLEEP +static int virtio_fs_freeze(struct virtio_device *vdev) +{ + /* TODO need to save state here */ + pr_warn("virtio-fs: suspend/resume not yet supported\n"); + return -EOPNOTSUPP; +} + +static int virtio_fs_restore(struct virtio_device *vdev) +{ + /* TODO need to restore state here */ + return 0; +} +#endif /* CONFIG_PM_SLEEP */ + +const static struct virtio_device_id id_table[] = { + { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, + {}, +}; + +const static unsigned int feature_table[] = {}; + +static struct virtio_driver virtio_fs_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .feature_table = feature_table, + .feature_table_size = ARRAY_SIZE(feature_table), + .probe = virtio_fs_probe, + .remove = virtio_fs_remove, +#ifdef CONFIG_PM_SLEEP + .freeze = virtio_fs_freeze, + .restore = virtio_fs_restore, +#endif +}; + +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + struct fuse_forget_link *link; + struct virtio_fs_forget *forget; + struct scatterlist sg; + struct scatterlist *sgs[] = {&sg}; + struct virtio_fs *fs; + struct virtqueue *vq; + struct virtio_fs_vq *fsvq; + bool notify; + u64 unique; + int ret; + + link = fuse_dequeue_forget(fiq, 1, NULL); + unique = fuse_get_unique(fiq); + + fs = fiq->priv; + fsvq = &fs->vqs[VQ_HIPRIO]; + spin_unlock(&fiq->lock); + + /* Allocate a buffer for the request */ + forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); + + forget->ih = (struct fuse_in_header){ + .opcode = FUSE_FORGET, + .nodeid = link->forget_one.nodeid, + .unique = unique, + .len = sizeof(*forget), + }; + forget->arg = (struct fuse_forget_in){ + .nlookup = link->forget_one.nlookup, + }; + + sg_init_one(&sg, forget, sizeof(*forget)); + + /* Enqueue the request */ + spin_lock(&fsvq->lock); + + if (!fsvq->connected) { + kfree(forget); + spin_unlock(&fsvq->lock); + goto out; + } + + vq = fsvq->vq; + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n", + ret); + list_add_tail(&forget->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + } + spin_unlock(&fsvq->lock); + goto out; + } + + fsvq->in_flight++; + notify = virtqueue_kick_prepare(vq); + + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); +out: + kfree(link); +} + +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + /* + * TODO interrupts. + * + * Normal fs operations on a local filesystems aren't interruptible. + * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) + * with shared lock between host and guest. + */ + spin_unlock(&fiq->lock); +} + +/* Return the number of scatter-gather list elements required */ +static unsigned int sg_count_fuse_req(struct fuse_req *req) +{ + struct fuse_args *args = req->args; + struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); + unsigned int total_sgs = 1 /* fuse_in_header */; + + if (args->in_numargs - args->in_pages) + total_sgs += 1; + + if (args->in_pages) + total_sgs += ap->num_pages; + + if (!test_bit(FR_ISREPLY, &req->flags)) + return total_sgs; + + total_sgs += 1 /* fuse_out_header */; + + if (args->out_numargs - args->out_pages) + total_sgs += 1; + + if (args->out_pages) + total_sgs += ap->num_pages; + + return total_sgs; +} + +/* Add pages to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_pages(struct scatterlist *sg, + struct page **pages, + struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + sg_init_table(&sg[i], 1); + this_len = min(page_descs[i].length, total_len); + sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset); + total_len -= this_len; + } + + return i; +} + +/* Add args to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_args(struct scatterlist *sg, + struct fuse_req *req, + struct fuse_arg *args, + unsigned int numargs, + bool argpages, + void *argbuf, + unsigned int *len_used) +{ + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); + unsigned int total_sgs = 0; + unsigned int len; + + len = fuse_len_args(numargs - argpages, args); + if (len) + sg_init_one(&sg[total_sgs++], argbuf, len); + + if (argpages) + total_sgs += sg_init_fuse_pages(&sg[total_sgs], + ap->pages, ap->descs, + ap->num_pages, + args[numargs - 1].size); + + if (len_used) + *len_used = len; + + return total_sgs; +} + +/* Add a request to a virtqueue and kick the device */ +static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, + struct fuse_req *req) +{ + /* requests need at least 4 elements */ + struct scatterlist *stack_sgs[6]; + struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; + struct scatterlist **sgs = stack_sgs; + struct scatterlist *sg = stack_sg; + struct virtqueue *vq; + struct fuse_args *args = req->args; + unsigned int argbuf_used = 0; + unsigned int out_sgs = 0; + unsigned int in_sgs = 0; + unsigned int total_sgs; + unsigned int i; + int ret; + bool notify; + + /* Does the sglist fit on the stack? */ + total_sgs = sg_count_fuse_req(req); + if (total_sgs > ARRAY_SIZE(stack_sgs)) { + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); + sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); + if (!sgs || !sg) { + ret = -ENOMEM; + goto out; + } + } + + /* Use a bounce buffer since stack args cannot be mapped */ + ret = copy_args_to_argbuf(req); + if (ret < 0) + goto out; + + /* Request elements */ + sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); + out_sgs += sg_init_fuse_args(&sg[out_sgs], req, + (struct fuse_arg *)args->in_args, + args->in_numargs, args->in_pages, + req->argbuf, &argbuf_used); + + /* Reply elements */ + if (test_bit(FR_ISREPLY, &req->flags)) { + sg_init_one(&sg[out_sgs + in_sgs++], + &req->out.h, sizeof(req->out.h)); + in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, + args->out_args, args->out_numargs, + args->out_pages, + req->argbuf + argbuf_used, NULL); + } + + WARN_ON(out_sgs + in_sgs != total_sgs); + + for (i = 0; i < total_sgs; i++) + sgs[i] = &sg[i]; + + spin_lock(&fsvq->lock); + + if (!fsvq->connected) { + spin_unlock(&fsvq->lock); + ret = -ENOTCONN; + goto out; + } + + vq = fsvq->vq; + ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&fsvq->lock); + goto out; + } + + fsvq->in_flight++; + notify = virtqueue_kick_prepare(vq); + + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + +out: + if (ret < 0 && req->argbuf) { + kfree(req->argbuf); + req->argbuf = NULL; + } + if (sgs != stack_sgs) { + kfree(sgs); + kfree(sg); + } + + return ret; +} + +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ + struct virtio_fs *fs; + struct fuse_conn *fc; + struct fuse_req *req; + struct fuse_pqueue *fpq; + int ret; + + WARN_ON(list_empty(&fiq->pending)); + req = list_last_entry(&fiq->pending, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + WARN_ON(!list_empty(&fiq->pending)); + spin_unlock(&fiq->lock); + + fs = fiq->priv; + fc = fs->vqs[queue_id].fud->fc; + + pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", + __func__, req->in.h.opcode, req->in.h.unique, + req->in.h.nodeid, req->in.h.len, + fuse_len_args(req->args->out_numargs, req->args->out_args)); + + fpq = &fs->vqs[queue_id].fud->pq; + spin_lock(&fpq->lock); + if (!fpq->connected) { + spin_unlock(&fpq->lock); + req->out.h.error = -ENODEV; + pr_err("virtio-fs: %s disconnected\n", __func__); + fuse_request_end(fc, req); + return; + } + list_add_tail(&req->list, fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + +retry: + ret = virtio_fs_enqueue_req(&fs->vqs[queue_id], req); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + /* Virtqueue full. Retry submission */ + /* TODO use completion instead of timeout */ + usleep_range(20, 30); + goto retry; + } + req->out.h.error = ret; + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + list_del_init(&req->list); + spin_unlock(&fpq->lock); + fuse_request_end(fc, req); + return; + } +} + +const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { + .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, + .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, + .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, + .release = virtio_fs_fiq_release, +}; + +static int virtio_fs_fill_super(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct virtio_fs *fs = fc->iq.priv; + unsigned int i; + int err; + struct fuse_fs_context ctx = { + .rootmode = S_IFDIR, + .default_permissions = 1, + .allow_other = 1, + .max_read = UINT_MAX, + .blksize = 512, + .destroy = true, + .no_control = true, + .no_force_umount = true, + }; + + mutex_lock(&virtio_fs_mutex); + + /* After holding mutex, make sure virtiofs device is still there. + * Though we are holding a reference to it, drive ->remove might + * still have cleaned up virtual queues. In that case bail out. + */ + err = -EINVAL; + if (list_empty(&fs->list)) { + pr_info("virtio-fs: tag <%s> not found\n", fs->tag); + goto err; + } + + err = -ENOMEM; + /* Allocate fuse_dev for hiprio and notification queues */ + for (i = 0; i < VQ_REQUEST; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + fsvq->fud = fuse_dev_alloc(); + if (!fsvq->fud) + goto err_free_fuse_devs; + } + + ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud; + err = fuse_fill_super_common(sb, &ctx); + if (err < 0) + goto err_free_fuse_devs; + + fc = fs->vqs[VQ_REQUEST].fud->fc; + + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + if (i == VQ_REQUEST) + continue; /* already initialized */ + fuse_dev_install(fsvq->fud, fc); + } + + /* Previous unmount will stop all queues. Start these again */ + virtio_fs_start_all_queues(fs); + fuse_send_init(fc); + mutex_unlock(&virtio_fs_mutex); + return 0; + +err_free_fuse_devs: + virtio_fs_free_devs(fs); +err: + mutex_unlock(&virtio_fs_mutex); + return err; +} + +static void virtio_kill_sb(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct virtio_fs *vfs; + struct virtio_fs_vq *fsvq; + + /* If mount failed, we can still be called without any fc */ + if (!fc) + return fuse_kill_sb_anon(sb); + + vfs = fc->iq.priv; + fsvq = &vfs->vqs[VQ_HIPRIO]; + + /* Stop forget queue. Soon destroy will be sent */ + spin_lock(&fsvq->lock); + fsvq->connected = false; + spin_unlock(&fsvq->lock); + virtio_fs_drain_all_queues(vfs); + + fuse_kill_sb_anon(sb); + + /* fuse_kill_sb_anon() must have sent destroy. Stop all queues + * and drain one more time and free fuse devices. Freeing fuse + * devices will drop their reference on fuse_conn and that in + * turn will drop its reference on virtio_fs object. + */ + virtio_fs_stop_all_queues(vfs); + virtio_fs_drain_all_queues(vfs); + virtio_fs_free_devs(vfs); +} + +static int virtio_fs_test_super(struct super_block *sb, + struct fs_context *fsc) +{ + struct fuse_conn *fc = fsc->s_fs_info; + + return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv; +} + +static int virtio_fs_set_super(struct super_block *sb, + struct fs_context *fsc) +{ + int err; + + err = get_anon_bdev(&sb->s_dev); + if (!err) + fuse_conn_get(fsc->s_fs_info); + + return err; +} + +static int virtio_fs_get_tree(struct fs_context *fsc) +{ + struct virtio_fs *fs; + struct super_block *sb; + struct fuse_conn *fc; + int err; + + /* This gets a reference on virtio_fs object. This ptr gets installed + * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() + * to drop the reference to this object. + */ + fs = virtio_fs_find_instance(fsc->source); + if (!fs) { + pr_info("virtio-fs: tag <%s> not found\n", fsc->source); + return -EINVAL; + } + + fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); + if (!fc) { + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + return -ENOMEM; + } + + fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops, + fs); + fc->release = fuse_free_conn; + fc->delete_stale = true; + + fsc->s_fs_info = fc; + sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); + fuse_conn_put(fc); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + err = virtio_fs_fill_super(sb); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= SB_ACTIVE; + } + + WARN_ON(fsc->root); + fsc->root = dget(sb->s_root); + return 0; +} + +static const struct fs_context_operations virtio_fs_context_ops = { + .get_tree = virtio_fs_get_tree, +}; + +static int virtio_fs_init_fs_context(struct fs_context *fsc) +{ + fsc->ops = &virtio_fs_context_ops; + return 0; +} + +static struct file_system_type virtio_fs_type = { + .owner = THIS_MODULE, + .name = "virtiofs", + .init_fs_context = virtio_fs_init_fs_context, + .kill_sb = virtio_kill_sb, +}; + +static int __init virtio_fs_init(void) +{ + int ret; + + ret = register_virtio_driver(&virtio_fs_driver); + if (ret < 0) + return ret; + + ret = register_filesystem(&virtio_fs_type); + if (ret < 0) { + unregister_virtio_driver(&virtio_fs_driver); + return ret; + } + + return 0; +} +module_init(virtio_fs_init); + +static void __exit virtio_fs_exit(void) +{ + unregister_filesystem(&virtio_fs_type); + unregister_virtio_driver(&virtio_fs_driver); +} +module_exit(virtio_fs_exit); + +MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>"); +MODULE_DESCRIPTION("Virtio Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS(KBUILD_MODNAME); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/fs/io_uring.c b/fs/io_uring.c index 38d274fc0f25..76fdbe84aff5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2763,6 +2763,38 @@ out: return submit; } +struct io_wait_queue { + struct wait_queue_entry wq; + struct io_ring_ctx *ctx; + unsigned to_wait; + unsigned nr_timeouts; +}; + +static inline bool io_should_wake(struct io_wait_queue *iowq) +{ + struct io_ring_ctx *ctx = iowq->ctx; + + /* + * Wake up if we have enough events, or if a timeout occured since we + * started waiting. For timeouts, we always want to return to userspace, + * regardless of event count. + */ + return io_cqring_events(ctx->rings) >= iowq->to_wait || + atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; +} + +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, + int wake_flags, void *key) +{ + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, + wq); + + if (!io_should_wake(iowq)) + return -1; + + return autoremove_wake_function(curr, mode, wake_flags, key); +} + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -2770,8 +2802,16 @@ out: static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { + struct io_wait_queue iowq = { + .wq = { + .private = current, + .func = io_wake_function, + .entry = LIST_HEAD_INIT(iowq.wq.entry), + }, + .ctx = ctx, + .to_wait = min_events, + }; struct io_rings *rings = ctx->rings; - unsigned nr_timeouts; int ret; if (io_cqring_events(rings) >= min_events) @@ -2790,15 +2830,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - nr_timeouts = atomic_read(&ctx->cq_timeouts); - /* - * Return if we have enough events, or if a timeout occured since - * we started waiting. For timeouts, we always want to return to - * userspace. - */ - ret = wait_event_interruptible(ctx->wait, - io_cqring_events(rings) >= min_events || - atomic_read(&ctx->cq_timeouts) != nr_timeouts); + ret = 0; + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + do { + prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, + TASK_INTERRUPTIBLE); + if (io_should_wake(&iowq)) + break; + schedule(); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + } while (1); + finish_wait(&ctx->wait, &iowq.wq); + restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; @@ -3454,7 +3500,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail) + if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index cbe70637c117..0e6406c4f362 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -163,13 +163,11 @@ static const struct export_operations jffs2_export_ops = { * Opt_rp_size: size of reserved pool in KiB */ enum { - Opt_source, Opt_override_compr, Opt_rp_size, }; static const struct fs_parameter_spec jffs2_param_specs[] = { - fsparam_string ("source", Opt_source), fsparam_enum ("compr", Opt_override_compr), fsparam_u32 ("rp_size", Opt_rp_size), {} diff --git a/fs/libfs.c b/fs/libfs.c index c9b2850c0f7c..540611b99b9a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,58 +89,45 @@ int dcache_dir_close(struct inode *inode, struct file *file) EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ -static struct dentry *next_positive(struct dentry *parent, - struct list_head *from, - int count) +/* + * Returns an element of siblings' list. + * We are looking for <count>th positive after <p>; if + * found, dentry is grabbed and returned to caller. + * If no such element exists, NULL is returned. + */ +static struct dentry *scan_positives(struct dentry *cursor, + struct list_head *p, + loff_t count, + struct dentry *last) { - unsigned *seq = &parent->d_inode->i_dir_seq, n; - struct dentry *res; - struct list_head *p; - bool skipped; - int i; + struct dentry *dentry = cursor->d_parent, *found = NULL; -retry: - i = count; - skipped = false; - n = smp_load_acquire(seq) & ~1; - res = NULL; - rcu_read_lock(); - for (p = from->next; p != &parent->d_subdirs; p = p->next) { + spin_lock(&dentry->d_lock); + while ((p = p->next) != &dentry->d_subdirs) { struct dentry *d = list_entry(p, struct dentry, d_child); - if (!simple_positive(d)) { - skipped = true; - } else if (!--i) { - res = d; - break; + // we must at least skip cursors, to avoid livelocks + if (d->d_flags & DCACHE_DENTRY_CURSOR) + continue; + if (simple_positive(d) && !--count) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + found = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(found)) + break; + count = 1; + } + if (need_resched()) { + list_move(&cursor->d_child, p); + p = &cursor->d_child; + spin_unlock(&dentry->d_lock); + cond_resched(); + spin_lock(&dentry->d_lock); } } - rcu_read_unlock(); - if (skipped) { - smp_rmb(); - if (unlikely(*seq != n)) - goto retry; - } - return res; -} - -static void move_cursor(struct dentry *cursor, struct list_head *after) -{ - struct dentry *parent = cursor->d_parent; - unsigned n, *seq = &parent->d_inode->i_dir_seq; - spin_lock(&parent->d_lock); - for (;;) { - n = *seq; - if (!(n & 1) && cmpxchg(seq, n, n + 1) == n) - break; - cpu_relax(); - } - __list_del(cursor->d_child.prev, cursor->d_child.next); - if (after) - list_add(&cursor->d_child, after); - else - list_add_tail(&cursor->d_child, &parent->d_subdirs); - smp_store_release(seq, n + 2); - spin_unlock(&parent->d_lock); + spin_unlock(&dentry->d_lock); + dput(last); + return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) @@ -158,17 +145,25 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) return -EINVAL; } if (offset != file->f_pos) { + struct dentry *cursor = file->private_data; + struct dentry *to = NULL; + + inode_lock_shared(dentry->d_inode); + + if (offset > 2) + to = scan_positives(cursor, &dentry->d_subdirs, + offset - 2, NULL); + spin_lock(&dentry->d_lock); + if (to) + list_move(&cursor->d_child, &to->d_child); + else + list_del_init(&cursor->d_child); + spin_unlock(&dentry->d_lock); + dput(to); + file->f_pos = offset; - if (file->f_pos >= 2) { - struct dentry *cursor = file->private_data; - struct dentry *to; - loff_t n = file->f_pos - 2; - - inode_lock_shared(dentry->d_inode); - to = next_positive(dentry, &dentry->d_subdirs, n); - move_cursor(cursor, to ? &to->d_child : NULL); - inode_unlock_shared(dentry->d_inode); - } + + inode_unlock_shared(dentry->d_inode); } return offset; } @@ -190,25 +185,35 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *p = &cursor->d_child; - struct dentry *next; - bool moved = false; + struct list_head *anchor = &dentry->d_subdirs; + struct dentry *next = NULL; + struct list_head *p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) - p = &dentry->d_subdirs; - while ((next = next_positive(dentry, p, 1)) != NULL) { + p = anchor; + else if (!list_empty(&cursor->d_child)) + p = &cursor->d_child; + else + return 0; + + while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) break; - moved = true; - p = &next->d_child; ctx->pos++; + p = &next->d_child; } - if (moved) - move_cursor(cursor, p); + spin_lock(&dentry->d_lock); + if (next) + list_move_tail(&cursor->d_child, &next->d_child); + else + list_del_init(&cursor->d_child); + spin_unlock(&dentry->d_lock); + dput(next); + return 0; } EXPORT_SYMBOL(dcache_readdir); diff --git a/fs/locks.c b/fs/locks.c index a364ebc5cec3..6970f55daf54 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -212,6 +212,7 @@ struct file_lock_list_struct { static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); + /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. * It is protected by blocked_lock_lock. @@ -1991,6 +1992,64 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(generic_setlease); +#if IS_ENABLED(CONFIG_SRCU) +/* + * Kernel subsystems can register to be notified on any attempt to set + * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd + * to close files that it may have cached when there is an attempt to set a + * conflicting lease. + */ +static struct srcu_notifier_head lease_notifier_chain; + +static inline void +lease_notifier_chain_init(void) +{ + srcu_init_notifier_head(&lease_notifier_chain); +} + +static inline void +setlease_notifier(long arg, struct file_lock *lease) +{ + if (arg != F_UNLCK) + srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); +} + +int lease_register_notifier(struct notifier_block *nb) +{ + return srcu_notifier_chain_register(&lease_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(lease_register_notifier); + +void lease_unregister_notifier(struct notifier_block *nb) +{ + srcu_notifier_chain_unregister(&lease_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +#else /* !IS_ENABLED(CONFIG_SRCU) */ +static inline void +lease_notifier_chain_init(void) +{ +} + +static inline void +setlease_notifier(long arg, struct file_lock *lease) +{ +} + +int lease_register_notifier(struct notifier_block *nb) +{ + return 0; +} +EXPORT_SYMBOL_GPL(lease_register_notifier); + +void lease_unregister_notifier(struct notifier_block *nb) +{ +} +EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +#endif /* IS_ENABLED(CONFIG_SRCU) */ + /** * vfs_setlease - sets a lease on an open file * @filp: file pointer @@ -2011,6 +2070,8 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { + if (lease) + setlease_notifier(arg, *lease); if (filp->f_op->setlease) return filp->f_op->setlease(filp, arg, lease, priv); else @@ -2924,6 +2985,7 @@ static int __init filelock_init(void) INIT_HLIST_HEAD(&fll->hlist); } + lease_notifier_chain_init(); return 0; } core_initcall(filelock_init); diff --git a/fs/namespace.c b/fs/namespace.c index b75d458f817d..fe0e9e1410fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3026,7 +3026,7 @@ void *copy_mount_options(const void __user * data) * the remainder of the page. */ /* copy_from_user cannot cross TASK_SIZE ! */ - size = TASK_SIZE - (unsigned long)data; + size = TASK_SIZE - (unsigned long)untagged_addr(data); if (size > PAGE_SIZE) size = PAGE_SIZE; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0adfd8840110..e180033e35cf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1669,10 +1669,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) #endif /* CONFIG_NFSV4 */ -/* - * Code common to create, mkdir, and mknod. - */ -int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, +struct dentry * +nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -1680,13 +1678,10 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct inode *dir = d_inode(parent); struct inode *inode; struct dentry *d; - int error = -EACCES; + int error; d_drop(dentry); - /* We may have been initialized further down */ - if (d_really_is_positive(dentry)) - goto out; if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) @@ -1702,18 +1697,32 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); d = d_splice_alias(inode, dentry); - if (IS_ERR(d)) { - error = PTR_ERR(d); - goto out_error; - } - dput(d); out: dput(parent); - return 0; + return d; out_error: nfs_mark_for_revalidate(dir); - dput(parent); - return error; + d = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_add_or_obtain); + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + struct dentry *d; + + d = nfs_add_or_obtain(dentry, fhandle, fattr, label); + if (IS_ERR(d)) + return PTR_ERR(d); + + /* Callers don't care */ + dput(d); + return 0; } EXPORT_SYMBOL_GPL(nfs_instantiate); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 222d7115db71..040a50fd9bf3 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -64,13 +64,6 @@ static struct kmem_cache *nfs_direct_cachep; -/* - * This represents a set of asynchronous requests that we're waiting on - */ -struct nfs_direct_mirror { - ssize_t count; -}; - struct nfs_direct_req { struct kref kref; /* release manager */ @@ -84,9 +77,6 @@ struct nfs_direct_req { atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ - struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; - int mirror_count; - loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ @@ -123,32 +113,42 @@ static inline int put_dreq(struct nfs_direct_req *dreq) } static void -nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) +nfs_direct_handle_truncated(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr, + ssize_t dreq_len) { - int i; - ssize_t count; + if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) || + test_bit(NFS_IOHDR_EOF, &hdr->flags))) + return; + if (dreq->max_count >= dreq_len) { + dreq->max_count = dreq_len; + if (dreq->count > dreq_len) + dreq->count = dreq_len; + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) + dreq->error = hdr->error; + else /* Clear outstanding error if this is EOF */ + dreq->error = 0; + } +} - WARN_ON_ONCE(dreq->count >= dreq->max_count); +static void +nfs_direct_count_bytes(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr) +{ + loff_t hdr_end = hdr->io_start + hdr->good_bytes; + ssize_t dreq_len = 0; - if (dreq->mirror_count == 1) { - dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; - dreq->count += hdr->good_bytes; - } else { - /* mirrored writes */ - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; + if (hdr_end > dreq->io_start) + dreq_len = hdr_end - dreq->io_start; - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); + nfs_direct_handle_truncated(dreq, hdr, dreq_len); - dreq->count = count; - } + if (dreq_len > dreq->max_count) + dreq_len = dreq->max_count; + + if (dreq->count < dreq_len) + dreq->count = dreq_len; } /* @@ -293,18 +293,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, cinfo->completion_ops = &nfs_direct_commit_completion_ops; } -static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq, - struct nfs_pageio_descriptor *pgio, - struct nfs_page *req) -{ - int mirror_count = 1; - - if (pgio->pg_ops->pg_get_mirror_count) - mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); - - dreq->mirror_count = mirror_count; -} - static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { struct nfs_direct_req *dreq; @@ -319,7 +307,6 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) INIT_LIST_HEAD(&dreq->mds_cinfo.list); dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); - dreq->mirror_count = 1; spin_lock_init(&dreq->lock); return dreq; @@ -402,20 +389,12 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) struct nfs_direct_req *dreq = hdr->dreq; spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { spin_unlock(&dreq->lock); goto out_put; } - if (hdr->good_bytes != 0) - nfs_direct_good_bytes(dreq, hdr); - - if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) - dreq->error = 0; - + nfs_direct_count_bytes(dreq, hdr); spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { @@ -646,29 +625,22 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) LIST_HEAD(reqs); struct nfs_commit_info cinfo; LIST_HEAD(failed); - int i; nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + dreq->max_count = 0; + list_for_each_entry(req, &reqs, wb_list) + dreq->max_count += req->wb_bytes; dreq->verf.committed = NFS_INVALID_STABLE_HOW; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); - for (i = 0; i < dreq->mirror_count; i++) - dreq->mirrors[i].count = 0; get_dreq(dreq); nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - req = nfs_list_entry(reqs.next); - nfs_direct_setup_mirroring(dreq, &desc, req); - if (desc.pg_error < 0) { - list_splice_init(&reqs, &failed); - goto out_failed; - } - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { /* Bump the transmission count */ req->wb_nio++; @@ -686,7 +658,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) } nfs_pageio_complete(&desc); -out_failed: while (!list_empty(&failed)) { req = nfs_list_entry(failed.next); nfs_list_remove_request(req); @@ -791,17 +762,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { spin_unlock(&dreq->lock); goto out_put; } + nfs_direct_count_bytes(dreq, hdr); if (hdr->good_bytes != 0) { - nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) request_commit = true; @@ -923,7 +890,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_direct_setup_mirroring(dreq, &desc, req); if (desc.pg_error < 0) { nfs_free_request(req); result = desc.pg_error; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3cb073c50fa6..c9b605f6c9cb 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTGET_ON_OPEN, .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e64f810223be..447a3c17fa8e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops; struct nfs_string; -/* Maximum number of readahead requests - * FIXME: this should really be a sysctl so that users may tune it to suit - * their needs. People that do NFS over a slow network, might for - * instance want to reduce it to something closer to 1 for improved - * interactive response. - */ -#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) - static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index a3ad2d46fd42..9eb2f1a503ab 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -279,15 +279,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void) return data; } -static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +static struct dentry * +nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) { int status; status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); nfs_post_op_update_inode(dir, data->res.dir_attr); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); - return status; + if (status != 0) + return ERR_PTR(status); + + return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL); } static void nfs3_free_createdata(struct nfs3_createdata *data) @@ -304,6 +306,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -330,7 +333,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; for (;;) { - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != -ENOTSUPP) break; @@ -355,6 +359,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { @@ -372,11 +379,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out_release_acls; + goto out_dput; } status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); +out_dput: + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -504,6 +513,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) @@ -522,7 +532,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, data->arg.symlink.pathlen = len; data->arg.symlink.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + + if (status == 0) + dput(d_alias); nfs3_free_createdata(data); out: @@ -535,6 +549,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -553,12 +568,18 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -660,6 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, @@ -698,12 +720,17 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; } - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3564da1ba8a1..16b2e5cc3e94 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -491,8 +491,6 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, const struct cred **); -extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst, - struct nfs4_state *state); extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state); @@ -574,6 +572,15 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1) +{ + u32 seqid = be32_to_cpu(s1->seqid); + + if (++seqid == 0) + ++seqid; + s1->seqid = cpu_to_be32(seqid); +} + static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1406858bae6c..ab8ca20fd579 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1073,14 +1073,26 @@ static const struct rpc_call_ops nfs40_call_sync_ops = { .rpc_call_done = nfs40_call_sync_done, }; +static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup) +{ + int ret; + struct rpc_task *task; + + task = rpc_run_task(task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - int ret; - struct rpc_task *task; struct nfs_client *clp = server->nfs_client; struct nfs4_call_sync_data data = { .seq_server = server, @@ -1094,14 +1106,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - task = rpc_run_task(&task_setup); - if (IS_ERR(task)) - ret = PTR_ERR(task); - else { - ret = task->tk_status; - rpc_put_task(task); - } - return ret; + return nfs4_call_sync_custom(&task_setup); } int nfs4_call_sync(struct rpc_clnt *clnt, @@ -3308,6 +3313,75 @@ nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task) return pnfs_wait_on_layoutreturn(inode, task); } +/* + * Update the seqid of an open stateid + */ +static void nfs4_sync_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + int seq; + + for (;;) { + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + nfs4_stateid_copy(dst, &state->open_stateid); + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0) + dst->seqid = seqid_open; + break; + } +} + +/* + * Update the seqid of an open stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + bool ret; + int seq; + + for (;;) { + ret = false; + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0) + dst->seqid = cpu_to_be32(dst_seqid + 1); + else + dst->seqid = seqid_open; + ret = true; + break; + } + + return ret; +} + struct nfs4_closedata { struct inode *inode; struct nfs4_state *state; @@ -3358,32 +3432,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); /* Handle Layoutreturn errors */ - if (calldata->arg.lr_args && task->tk_status != 0) { - switch (calldata->res.lr_ret) { - default: - calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid, - &calldata->arg.lr_args->range, - calldata->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, calldata->inode, + &calldata->arg.lr_args, + &calldata->res.lr_res, + &calldata->res.lr_ret) == -EAGAIN) + goto out_restart; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors @@ -3403,7 +3456,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_OLD_STATEID: /* Did we race with OPEN? */ - if (nfs4_refresh_open_stateid(&calldata->arg.stateid, + if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid, state)) goto out_restart; goto out_release; @@ -3415,7 +3468,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - break; + if (calldata->arg.fmode == 0) + break; + /* Fallthrough */ default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); @@ -3430,8 +3485,6 @@ out_release: nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); return; -lr_restart: - calldata->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); @@ -3472,8 +3525,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (!nfs4_valid_open_stateid(state) || - !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) + nfs4_sync_open_stateid(&calldata->arg.stateid, state); + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -6018,7 +6071,6 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; - struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, @@ -6051,17 +6103,13 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = task->tk_status; + + status = nfs4_call_sync_custom(&task_setup_data); if (setclientid.sc_cred) { + kfree(clp->cl_acceptor); clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); put_rpccred(setclientid.sc_cred); } - rpc_put_task(task); out: trace_nfs4_setclientid(clp, status); dprintk("NFS reply setclientid: %d\n", status); @@ -6129,32 +6177,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); /* Handle Layoutreturn errors */ - if (data->args.lr_args && task->tk_status != 0) { - switch(data->res.lr_ret) { - default: - data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid, - &data->args.lr_args->range, - data->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, data->inode, + &data->args.lr_args, + &data->res.lr_res, + &data->res.lr_ret) == -EAGAIN) + goto out_restart; switch (task->tk_status) { case 0: @@ -6192,8 +6219,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } data->rpc_status = task->tk_status; return; -lr_restart: - data->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); @@ -6386,6 +6411,42 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +/* + * Update the seqid of a lock stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret = false; + + spin_lock(&state->state_lock); + if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid)) + goto out; + if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst)) + nfs4_stateid_seqid_inc(dst); + else + dst->seqid = lsp->ls_stateid.seqid; + ret = true; +out: + spin_unlock(&state->state_lock); + return ret; +} + +static bool nfs4_sync_lock_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret; + + spin_lock(&state->state_lock); + ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid); + nfs4_stateid_copy(dst, &lsp->ls_stateid); + spin_unlock(&state->state_lock); + return ret; +} + struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -6403,7 +6464,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs_seqid *seqid) { struct nfs4_unlockdata *p; - struct inode *inode = lsp->ls_state->inode; + struct nfs4_state *state = lsp->ls_state; + struct inode *inode = state->inode; p = kzalloc(sizeof(*p), GFP_NOFS); if (p == NULL) @@ -6419,6 +6481,9 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); + spin_lock(&state->state_lock); + nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid); + spin_unlock(&state->state_lock); return p; } @@ -6457,10 +6522,14 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fall through */ case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &calldata->lsp->ls_stateid)) + if (nfs4_sync_lock_stateid(&calldata->arg.stateid, + calldata->lsp)) + rpc_restart_call_prepare(task); + break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid, + calldata->lsp)) rpc_restart_call_prepare(task); break; default: @@ -6483,7 +6552,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; - nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ goto out_no_action; @@ -7645,6 +7713,8 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) { int status; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs_client *clp = NFS_SERVER(dir)->nfs_client; struct nfs4_secinfo_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -7657,26 +7727,37 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct .rpc_argp = &args, .rpc_resp = &res, }; - struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs4_call_sync_data data = { + .seq_server = NFS_SERVER(dir), + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = clp->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; if (use_integrity) { - clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; - cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + clnt = clp->cl_rpcclient; + task_setup.rpc_client = clnt; + + cred = nfs4_get_clid_cred(clp); msg.rpc_cred = cred; } dprintk("NFS call secinfo %s\n", name->name); - nfs4_state_protect(NFS_SERVER(dir)->nfs_client, - NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); - status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("NFS reply secinfo: %d\n", status); put_cred(cred); - return status; } @@ -8344,7 +8425,6 @@ static const struct rpc_call_ops nfs4_get_lease_time_ops = { int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) { - struct rpc_task *task; struct nfs4_get_lease_time_args args; struct nfs4_get_lease_time_res res = { .lr_fsinfo = fsinfo, @@ -8366,17 +8446,9 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) .callback_data = &data, .flags = RPC_TASK_TIMEOUT, }; - int status; nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1); - task = rpc_run_task(&task_setup); - - if (IS_ERR(task)) - return PTR_ERR(task); - - status = task->tk_status; - rpc_put_task(task); - return status; + return nfs4_call_sync_custom(&task_setup); } #ifdef CONFIG_NFS_V4_1 @@ -8845,7 +8917,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, const struct cred *cred) { struct nfs4_reclaim_complete_data *calldata; - struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], .rpc_cred = cred, @@ -8854,7 +8925,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_reclaim_complete_call_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN, + .flags = RPC_TASK_NO_ROUND_ROBIN, }; int status = -ENOMEM; @@ -8869,15 +8940,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = rpc_wait_for_completion_task(task); - if (status == 0) - status = task->tk_status; - rpc_put_task(task); + status = nfs4_call_sync_custom(&task_setup_data); out: dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -9103,10 +9166,19 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; + /* + * Was there an RPC level error? Assume the call succeeded, + * and that we need to release the layout + */ + if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) { + lrp->res.lrs_present = 0; + return; + } + server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid, + if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid, &lrp->args.range, lrp->args.inode)) goto out_restart; @@ -9362,18 +9434,32 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; struct rpc_clnt *clnt = server->client; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; int status; if (use_integrity) { clnt = server->nfs_client->cl_rpcclient; + task_setup.rpc_client = clnt; + cred = nfs4_get_clid_cred(server->nfs_client); msg.rpc_cred = cred; } dprintk("--> %s\n", __func__); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); dprintk("<-- %s status=%d\n", __func__, status); put_cred(cred); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cad4e064b328..0c6d53dc3672 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1015,22 +1015,6 @@ out: return ret; } -bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) -{ - bool ret; - int seq; - - do { - ret = false; - seq = read_seqbegin(&state->seqlock); - if (nfs4_state_match_open_stateid_other(state, dst)) { - dst->seqid = state->open_stateid.seqid; - ret = true; - } - } while (read_seqretry(&state->seqlock, seq)); - return ret; -} - bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { bool ret; @@ -2095,8 +2079,10 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } status = nfs4_begin_drain_session(clp); - if (status != 0) - return status; + if (status != 0) { + result = status; + goto out; + } status = nfs4_replace_transport(server, locations); if (status != 0) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 46a8d636d151..ab07db0f07cd 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1174,7 +1174,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) { *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4525d5acae38..bb80034a7661 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -359,9 +359,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* - * Update the seqid of a layout stateid + * Update the seqid of a layout stateid after receiving + * NFS4ERR_OLD_STATEID */ -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { @@ -377,7 +378,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + if (lo && pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + /* Is our call using the most recent seqid? If so, bump it */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) { + nfs4_stateid_seqid_inc(dst); + ret = true; + goto out; + } + /* Try to update the seqid to the most recent */ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); if (err != -EBUSY) { dst->seqid = lo->plh_stateid.seqid; @@ -385,6 +394,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, ret = true; } } +out: spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); return ret; @@ -1440,6 +1450,52 @@ out_noroc: return false; } +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + struct nfs4_layoutreturn_args *arg = *argpp; + int retval = -EAGAIN; + + if (!arg) + return 0; + /* Handle Layoutreturn errors */ + switch (*ret) { + case 0: + retval = 0; + break; + case -NFS4ERR_NOMATCHING_LAYOUT: + /* Was there an RPC level error? If not, retry */ + if (task->tk_rpc_status == 0) + break; + /* If the call was not sent, let caller handle it */ + if (!RPC_WAS_SENT(task)) + return 0; + /* + * Otherwise, assume the call succeeded and + * that we need to release the layout + */ + *ret = 0; + (*respp)->lrs_present = 0; + retval = 0; + break; + case -NFS4ERR_DELAY: + /* Let the caller handle the retry */ + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return 0; + case -NFS4ERR_OLD_STATEID: + if (!nfs4_layout_refresh_old_stateid(&arg->stateid, + &arg->range, inode)) + break; + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return -EAGAIN; + } + *argpp = NULL; + *respp = NULL; + return retval; +} + void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret) @@ -1449,10 +1505,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; - if (ret == 0) { - arg_stateid = &args->stateid; + switch (ret) { + case -NFS4ERR_NOMATCHING_LAYOUT: + break; + case 0: if (res->lrs_present) res_stateid = &res->stateid; + /* Fallthrough */ + default: + arg_stateid = &args->stateid; } pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, res_stateid); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f15609c003d8..f8a38065c7e4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -261,7 +261,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, bool is_recall); int pnfs_destroy_layouts_byclid(struct nfs_client *clp, bool is_recall); -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); @@ -282,6 +282,10 @@ bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, const struct cred *cred); +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret); void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret); @@ -701,6 +705,15 @@ pnfs_roc(struct inode *ino, return false; } +static inline int +pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + return 0; +} + static inline void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, @@ -785,7 +798,7 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } -static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 19a76cfa8b1f..a84df7d63403 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2645,6 +2645,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); +static void nfs_set_readahead(struct backing_dev_info *bdi, + unsigned long iomax_pages) +{ + bdi->ra_pages = VM_READAHEAD_PAGES; + bdi->io_pages = iomax_pages; +} + struct dentry *nfs_fs_mount_common(struct nfs_server *server, int flags, const char *dev_name, struct nfs_mount_info *mount_info, @@ -2687,7 +2694,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, mntroot = ERR_PTR(error); goto error_splat_super; } - s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; + nfs_set_readahead(s->s_bdi, server->rpages); server->super = s; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 85ca49549b39..52cab65f91cf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -786,7 +786,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; - atomic_long_dec(&nfsi->nrequests); if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { head = req->wb_head; @@ -799,8 +798,10 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_unlock(&mapping->private_lock); } - if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { nfs_release_request(req); + atomic_long_dec(&nfsi->nrequests); + } } static void diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index d25f6bbe7006..10cefb0c07c7 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -3,6 +3,7 @@ config NFSD tristate "NFS server support" depends on INET depends on FILE_LOCKING + depends on FSNOTIFY select LOCKD select SUNRPC select EXPORTFS @@ -147,7 +148,7 @@ config NFSD_V4_SECURITY_LABEL config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS + depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN help This option enables support for manually injecting faults into the NFS server. This is intended to be used for diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2bfb58eefad1..6a40b1afe703 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -11,7 +11,8 @@ obj-$(CONFIG_NFSD) += nfsd.o nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o + export.o auth.o lockd.o nfscache.o nfsxdr.o \ + stats.o filecache.o nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 4cd7c69a6cb9..ba14d2f4b64f 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -39,14 +39,6 @@ struct nfs4_acl; struct svc_fh; struct svc_rqst; -/* - * Maximum ACL we'll accept from a client; chosen (somewhat - * arbitrarily) so that kmalloc'ing the ACL shouldn't require a - * high-order allocation. This allows 204 ACEs on x86_64: - */ -#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ - / sizeof(struct nfs4_ace)) - int nfs4_acl_bytes(int entries); int nfs4_acl_get_whotype(char *, u32); __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 66d4c55eb48e..9bbaa671c079 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -15,6 +15,7 @@ #include "blocklayoutxdr.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS @@ -404,7 +405,7 @@ static void nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), 0, true); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index baa01956a5b3..15422c951fd1 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -22,6 +22,7 @@ #include "nfsfh.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -232,6 +233,17 @@ static struct cache_head *expkey_alloc(void) return NULL; } +static void expkey_flush(void) +{ + /* + * Take the nfsd_mutex here to ensure that the file cache is not + * destroyed while we're in the middle of flushing. + */ + mutex_lock(&nfsd_mutex); + nfsd_file_cache_purge(current->nsproxy->net_ns); + mutex_unlock(&nfsd_mutex); +} + static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, @@ -244,6 +256,7 @@ static const struct cache_detail svc_expkey_cache_template = { .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, + .flush = expkey_flush, }; static int diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c new file mode 100644 index 000000000000..ef55e9b1cd4e --- /dev/null +++ b/fs/nfsd/filecache.c @@ -0,0 +1,934 @@ +/* + * Open file cache. + * + * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com> + */ + +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/sched.h> +#include <linux/list_lru.h> +#include <linux/fsnotify_backend.h> +#include <linux/fsnotify.h> +#include <linux/seq_file.h> + +#include "vfs.h" +#include "nfsd.h" +#include "nfsfh.h" +#include "netns.h" +#include "filecache.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + +/* FIXME: dynamically size this for the machine somehow? */ +#define NFSD_FILE_HASH_BITS 12 +#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS) +#define NFSD_LAUNDRETTE_DELAY (2 * HZ) + +#define NFSD_FILE_LRU_RESCAN (0) +#define NFSD_FILE_SHUTDOWN (1) +#define NFSD_FILE_LRU_THRESHOLD (4096UL) +#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2) + +/* We only care about NFSD_MAY_READ/WRITE for this cache */ +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) + +struct nfsd_fcache_bucket { + struct hlist_head nfb_head; + spinlock_t nfb_lock; + unsigned int nfb_count; + unsigned int nfb_maxcount; +}; + +static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); + +static struct kmem_cache *nfsd_file_slab; +static struct kmem_cache *nfsd_file_mark_slab; +static struct nfsd_fcache_bucket *nfsd_file_hashtbl; +static struct list_lru nfsd_file_lru; +static long nfsd_file_lru_flags; +static struct fsnotify_group *nfsd_file_fsnotify_group; +static atomic_long_t nfsd_filecache_count; +static struct delayed_work nfsd_filecache_laundrette; + +enum nfsd_file_laundrette_ctl { + NFSD_FILE_LAUNDRETTE_NOFLUSH = 0, + NFSD_FILE_LAUNDRETTE_MAY_FLUSH +}; + +static void +nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl) +{ + long count = atomic_long_read(&nfsd_filecache_count); + + if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags)) + return; + + /* Be more aggressive about scanning if over the threshold */ + if (count > NFSD_FILE_LRU_THRESHOLD) + mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0); + else + schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); + + if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH) + return; + + /* ...and don't delay flushing if we're out of control */ + if (count >= NFSD_FILE_LRU_LIMIT) + flush_delayed_work(&nfsd_filecache_laundrette); +} + +static void +nfsd_file_slab_free(struct rcu_head *rcu) +{ + struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu); + + put_cred(nf->nf_cred); + kmem_cache_free(nfsd_file_slab, nf); +} + +static void +nfsd_file_mark_free(struct fsnotify_mark *mark) +{ + struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark, + nfm_mark); + + kmem_cache_free(nfsd_file_mark_slab, nfm); +} + +static struct nfsd_file_mark * +nfsd_file_mark_get(struct nfsd_file_mark *nfm) +{ + if (!atomic_inc_not_zero(&nfm->nfm_ref)) + return NULL; + return nfm; +} + +static void +nfsd_file_mark_put(struct nfsd_file_mark *nfm) +{ + if (atomic_dec_and_test(&nfm->nfm_ref)) { + + fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group); + fsnotify_put_mark(&nfm->nfm_mark); + } +} + +static struct nfsd_file_mark * +nfsd_file_mark_find_or_create(struct nfsd_file *nf) +{ + int err; + struct fsnotify_mark *mark; + struct nfsd_file_mark *nfm = NULL, *new; + struct inode *inode = nf->nf_inode; + + do { + mutex_lock(&nfsd_file_fsnotify_group->mark_mutex); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, + nfsd_file_fsnotify_group); + if (mark) { + nfm = nfsd_file_mark_get(container_of(mark, + struct nfsd_file_mark, + nfm_mark)); + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + fsnotify_put_mark(mark); + if (likely(nfm)) + break; + } else + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + + /* allocate a new nfm */ + new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL); + if (!new) + return NULL; + fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group); + new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF; + atomic_set(&new->nfm_ref, 1); + + err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0); + + /* + * If the add was successful, then return the object. + * Otherwise, we need to put the reference we hold on the + * nfm_mark. The fsnotify code will take a reference and put + * it on failure, so we can't just free it directly. It's also + * not safe to call fsnotify_destroy_mark on it as the + * mark->group will be NULL. Thus, we can't let the nfm_ref + * counter drive the destruction at this point. + */ + if (likely(!err)) + nfm = new; + else + fsnotify_put_mark(&new->nfm_mark); + } while (unlikely(err == -EEXIST)); + + return nfm; +} + +static struct nfsd_file * +nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, + struct net *net) +{ + struct nfsd_file *nf; + + nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); + if (nf) { + INIT_HLIST_NODE(&nf->nf_node); + INIT_LIST_HEAD(&nf->nf_lru); + nf->nf_file = NULL; + nf->nf_cred = get_current_cred(); + nf->nf_net = net; + nf->nf_flags = 0; + nf->nf_inode = inode; + nf->nf_hashval = hashval; + atomic_set(&nf->nf_ref, 1); + nf->nf_may = may & NFSD_FILE_MAY_MASK; + if (may & NFSD_MAY_NOT_BREAK_LEASE) { + if (may & NFSD_MAY_WRITE) + __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags); + if (may & NFSD_MAY_READ) + __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + } + nf->nf_mark = NULL; + trace_nfsd_file_alloc(nf); + } + return nf; +} + +static bool +nfsd_file_free(struct nfsd_file *nf) +{ + bool flush = false; + + trace_nfsd_file_put_final(nf); + if (nf->nf_mark) + nfsd_file_mark_put(nf->nf_mark); + if (nf->nf_file) { + get_file(nf->nf_file); + filp_close(nf->nf_file, NULL); + fput(nf->nf_file); + flush = true; + } + call_rcu(&nf->nf_rcu, nfsd_file_slab_free); + return flush; +} + +static bool +nfsd_file_check_writeback(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + struct address_space *mapping; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return false; + mapping = file->f_mapping; + return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); +} + +static int +nfsd_file_check_write_error(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return 0; + return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); +} + +static bool +nfsd_file_in_use(struct nfsd_file *nf) +{ + return nfsd_file_check_writeback(nf) || + nfsd_file_check_write_error(nf); +} + +static void +nfsd_file_do_unhash(struct nfsd_file *nf) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash(nf); + + if (nfsd_file_check_write_error(nf)) + nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); + --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; + hlist_del_rcu(&nf->nf_node); + if (!list_empty(&nf->nf_lru)) + list_lru_del(&nfsd_file_lru, &nf->nf_lru); + atomic_long_dec(&nfsd_filecache_count); +} + +static bool +nfsd_file_unhash(struct nfsd_file *nf) +{ + if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_do_unhash(nf); + return true; + } + return false; +} + +/* + * Return true if the file was unhashed. + */ +static bool +nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash_and_release_locked(nf); + if (!nfsd_file_unhash(nf)) + return false; + /* keep final reference for nfsd_file_lru_dispose */ + if (atomic_add_unless(&nf->nf_ref, -1, 1)) + return true; + + list_add(&nf->nf_lru, dispose); + return true; +} + +static int +nfsd_file_put_noref(struct nfsd_file *nf) +{ + int count; + trace_nfsd_file_put(nf); + + count = atomic_dec_return(&nf->nf_ref); + if (!count) { + WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags)); + nfsd_file_free(nf); + } + return count; +} + +void +nfsd_file_put(struct nfsd_file *nf) +{ + bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; + bool unused = !nfsd_file_in_use(nf); + + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused) + nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH); +} + +struct nfsd_file * +nfsd_file_get(struct nfsd_file *nf) +{ + if (likely(atomic_inc_not_zero(&nf->nf_ref))) + return nf; + return NULL; +} + +static void +nfsd_file_dispose_list(struct list_head *dispose) +{ + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + nfsd_file_put_noref(nf); + } +} + +static void +nfsd_file_dispose_list_sync(struct list_head *dispose) +{ + bool flush = false; + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + if (!atomic_dec_and_test(&nf->nf_ref)) + continue; + if (nfsd_file_free(nf)) + flush = true; + } + if (flush) + flush_delayed_fput(); +} + +/* + * Note this can deadlock with nfsd_file_cache_purge. + */ +static enum lru_status +nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, + spinlock_t *lock, void *arg) + __releases(lock) + __acquires(lock) +{ + struct list_head *head = arg; + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + /* + * Do a lockless refcount check. The hashtable holds one reference, so + * we look to see if anything else has a reference, or if any have + * been put since the shrinker last ran. Those don't get unhashed and + * released. + * + * Note that in the put path, we set the flag and then decrement the + * counter. Here we check the counter and then test and clear the flag. + * That order is deliberate to ensure that we can do this locklessly. + */ + if (atomic_read(&nf->nf_ref) > 1) + goto out_skip; + + /* + * Don't throw out files that are still undergoing I/O or + * that have uncleared errors pending. + */ + if (nfsd_file_check_writeback(nf)) + goto out_skip; + + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) + goto out_rescan; + + if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) + goto out_skip; + + list_lru_isolate_move(lru, &nf->nf_lru, head); + return LRU_REMOVED; +out_rescan: + set_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags); +out_skip: + return LRU_SKIP; +} + +static void +nfsd_file_lru_dispose(struct list_head *head) +{ + while(!list_empty(head)) { + struct nfsd_file *nf = list_first_entry(head, + struct nfsd_file, nf_lru); + list_del_init(&nf->nf_lru); + spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_do_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_put_noref(nf); + } +} + +static unsigned long +nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) +{ + return list_lru_count(&nfsd_file_lru); +} + +static unsigned long +nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) +{ + LIST_HEAD(head); + unsigned long ret; + + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head); + nfsd_file_lru_dispose(&head); + return ret; +} + +static struct shrinker nfsd_file_shrinker = { + .scan_objects = nfsd_file_lru_scan, + .count_objects = nfsd_file_lru_count, + .seeks = 1, +}; + +static void +__nfsd_file_close_inode(struct inode *inode, unsigned int hashval, + struct list_head *dispose) +{ + struct nfsd_file *nf; + struct hlist_node *tmp; + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) { + if (inode == nf->nf_inode) + nfsd_file_unhash_and_release_locked(nf, dispose); + } + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. Also ensure that any of the + * fputs also have their final __fput done as well. + */ +void +nfsd_file_close_inode_sync(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list_sync(&dispose); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. + */ +static void +nfsd_file_close_inode(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list(&dispose); +} + +/** + * nfsd_file_delayed_close - close unused nfsd_files + * @work: dummy + * + * Walk the LRU list and close any entries that have not been used since + * the last scan. + * + * Note this can deadlock with nfsd_file_cache_purge. + */ +static void +nfsd_file_delayed_close(struct work_struct *work) +{ + LIST_HEAD(head); + + list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX); + + if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags)) + nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH); + + if (!list_empty(&head)) { + nfsd_file_lru_dispose(&head); + flush_delayed_fput(); + } +} + +static int +nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, + void *data) +{ + struct file_lock *fl = data; + + /* Only close files for F_SETLEASE leases */ + if (fl->fl_flags & FL_LEASE) + nfsd_file_close_inode_sync(file_inode(fl->fl_file)); + return 0; +} + +static struct notifier_block nfsd_file_lease_notifier = { + .notifier_call = nfsd_file_lease_notifier_call, +}; + +static int +nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + u32 mask, const void *data, int data_type, + const struct qstr *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) +{ + trace_nfsd_file_fsnotify_handle_event(inode, mask); + + /* Should be no marks on non-regular files */ + if (!S_ISREG(inode->i_mode)) { + WARN_ON_ONCE(1); + return 0; + } + + /* don't close files if this was not the last link */ + if (mask & FS_ATTRIB) { + if (inode->i_nlink) + return 0; + } + + nfsd_file_close_inode(inode); + return 0; +} + + +static const struct fsnotify_ops nfsd_file_fsnotify_ops = { + .handle_event = nfsd_file_fsnotify_handle_event, + .free_mark = nfsd_file_mark_free, +}; + +int +nfsd_file_cache_init(void) +{ + int ret = -ENOMEM; + unsigned int i; + + clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + if (nfsd_file_hashtbl) + return 0; + + nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE, + sizeof(*nfsd_file_hashtbl), GFP_KERNEL); + if (!nfsd_file_hashtbl) { + pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n"); + goto out_err; + } + + nfsd_file_slab = kmem_cache_create("nfsd_file", + sizeof(struct nfsd_file), 0, 0, NULL); + if (!nfsd_file_slab) { + pr_err("nfsd: unable to create nfsd_file_slab\n"); + goto out_err; + } + + nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", + sizeof(struct nfsd_file_mark), 0, 0, NULL); + if (!nfsd_file_mark_slab) { + pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); + goto out_err; + } + + + ret = list_lru_init(&nfsd_file_lru); + if (ret) { + pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); + goto out_err; + } + + ret = register_shrinker(&nfsd_file_shrinker); + if (ret) { + pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + goto out_lru; + } + + ret = lease_register_notifier(&nfsd_file_lease_notifier); + if (ret) { + pr_err("nfsd: unable to register lease notifier: %d\n", ret); + goto out_shrinker; + } + + nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops); + if (IS_ERR(nfsd_file_fsnotify_group)) { + pr_err("nfsd: unable to create fsnotify group: %ld\n", + PTR_ERR(nfsd_file_fsnotify_group)); + nfsd_file_fsnotify_group = NULL; + goto out_notifier; + } + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head); + spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock); + } + + INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close); +out: + return ret; +out_notifier: + lease_unregister_notifier(&nfsd_file_lease_notifier); +out_shrinker: + unregister_shrinker(&nfsd_file_shrinker); +out_lru: + list_lru_destroy(&nfsd_file_lru); +out_err: + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; + goto out; +} + +/* + * Note this can deadlock with nfsd_file_lru_cb. + */ +void +nfsd_file_cache_purge(struct net *net) +{ + unsigned int i; + struct nfsd_file *nf; + struct hlist_node *next; + LIST_HEAD(dispose); + bool del; + + if (!nfsd_file_hashtbl) + return; + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i]; + + spin_lock(&nfb->nfb_lock); + hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) { + if (net && nf->nf_net != net) + continue; + del = nfsd_file_unhash_and_release_locked(nf, &dispose); + + /* + * Deadlock detected! Something marked this entry as + * unhased, but hasn't removed it from the hash list. + */ + WARN_ON_ONCE(!del); + } + spin_unlock(&nfb->nfb_lock); + nfsd_file_dispose_list(&dispose); + } +} + +void +nfsd_file_cache_shutdown(void) +{ + LIST_HEAD(dispose); + + set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + lease_unregister_notifier(&nfsd_file_lease_notifier); + unregister_shrinker(&nfsd_file_shrinker); + /* + * make sure all callers of nfsd_file_lru_cb are done before + * calling nfsd_file_cache_purge + */ + cancel_delayed_work_sync(&nfsd_filecache_laundrette); + nfsd_file_cache_purge(NULL); + list_lru_destroy(&nfsd_file_lru); + rcu_barrier(); + fsnotify_put_group(nfsd_file_fsnotify_group); + nfsd_file_fsnotify_group = NULL; + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + fsnotify_wait_marks_destroyed(); + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; +} + +static bool +nfsd_match_cred(const struct cred *c1, const struct cred *c2) +{ + int i; + + if (!uid_eq(c1->fsuid, c2->fsuid)) + return false; + if (!gid_eq(c1->fsgid, c2->fsgid)) + return false; + if (c1->group_info == NULL || c2->group_info == NULL) + return c1->group_info == c2->group_info; + if (c1->group_info->ngroups != c2->group_info->ngroups) + return false; + for (i = 0; i < c1->group_info->ngroups; i++) { + if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i])) + return false; + } + return true; +} + +static struct nfsd_file * +nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, + unsigned int hashval, struct net *net) +{ + struct nfsd_file *nf; + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; + + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if ((need & nf->nf_may) != need) + continue; + if (nf->nf_inode != inode) + continue; + if (nf->nf_net != net) + continue; + if (!nfsd_match_cred(nf->nf_cred, current_cred())) + continue; + if (nfsd_file_get(nf) != NULL) + return nf; + } + return NULL; +} + +/** + * nfsd_file_is_cached - are there any cached open files for this fh? + * @inode: inode of the file to check + * + * Scan the hashtable for open files that match this fh. Returns true if there + * are any, and false if not. + */ +bool +nfsd_file_is_cached(struct inode *inode) +{ + bool ret = false; + struct nfsd_file *nf; + unsigned int hashval; + + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); + + rcu_read_lock(); + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if (inode == nf->nf_inode) { + ret = true; + break; + } + } + rcu_read_unlock(); + trace_nfsd_file_is_cached(inode, hashval, (int)ret); + return ret; +} + +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + __be32 status; + struct net *net = SVC_NET(rqstp); + struct nfsd_file *nf, *new; + struct inode *inode; + unsigned int hashval; + + /* FIXME: skip this if fh_dentry is already set? */ + status = fh_verify(rqstp, fhp, S_IFREG, + may_flags|NFSD_MAY_OWNER_OVERRIDE); + if (status != nfs_ok) + return status; + + inode = d_inode(fhp->fh_dentry); + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); +retry: + rcu_read_lock(); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); + rcu_read_unlock(); + if (nf) + goto wait_for_construction; + + new = nfsd_file_alloc(inode, may_flags, hashval, net); + if (!new) { + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, + NULL, nfserr_jukebox); + return nfserr_jukebox; + } + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); + if (nf == NULL) + goto open_file; + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + nfsd_file_slab_free(&new->nf_rcu); + +wait_for_construction: + wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); + + /* Did construction of this file fail? */ + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_put_noref(nf); + goto retry; + } + + this_cpu_inc(nfsd_file_cache_hits); + + if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) { + bool write = (may_flags & NFSD_MAY_WRITE); + + if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) || + (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) { + status = nfserrno(nfsd_open_break_lease( + file_inode(nf->nf_file), may_flags)); + if (status == nfs_ok) { + clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + if (write) + clear_bit(NFSD_FILE_BREAK_WRITE, + &nf->nf_flags); + } + } + } +out: + if (status == nfs_ok) { + *pnf = nf; + } else { + nfsd_file_put(nf); + nf = NULL; + } + + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status); + return status; +open_file: + nf = new; + /* Take reference for the hashtable */ + atomic_inc(&nf->nf_ref); + __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); + __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + list_lru_add(&nfsd_file_lru, &nf->nf_lru); + hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head); + ++nfsd_file_hashtbl[hashval].nfb_count; + nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, + nfsd_file_hashtbl[hashval].nfb_count); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + atomic_long_inc(&nfsd_filecache_count); + + nf->nf_mark = nfsd_file_mark_find_or_create(nf); + if (nf->nf_mark) + status = nfsd_open_verified(rqstp, fhp, S_IFREG, + may_flags, &nf->nf_file); + else + status = nfserr_jukebox; + /* + * If construction failed, or we raced with a call to unlink() + * then unhash. + */ + if (status != nfs_ok || inode->i_nlink == 0) { + bool do_free; + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + do_free = nfsd_file_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + if (do_free) + nfsd_file_put_noref(nf); + } + clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); + smp_mb__after_atomic(); + wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); + goto out; +} + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) +{ + unsigned int i, count = 0, longest = 0; + unsigned long hits = 0; + + /* + * No need for spinlocks here since we're not terribly interested in + * accuracy. We do take the nfsd_mutex simply to ensure that we + * don't end up racing with server shutdown + */ + mutex_lock(&nfsd_mutex); + if (nfsd_file_hashtbl) { + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + count += nfsd_file_hashtbl[i].nfb_count; + longest = max(longest, nfsd_file_hashtbl[i].nfb_count); + } + } + mutex_unlock(&nfsd_mutex); + + for_each_possible_cpu(i) + hits += per_cpu(nfsd_file_cache_hits, i); + + seq_printf(m, "total entries: %u\n", count); + seq_printf(m, "longest chain: %u\n", longest); + seq_printf(m, "cache hits: %lu\n", hits); + return 0; +} + +int nfsd_file_cache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_file_cache_stats_show, NULL); +} diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h new file mode 100644 index 000000000000..851d9abf54c2 --- /dev/null +++ b/fs/nfsd/filecache.h @@ -0,0 +1,61 @@ +#ifndef _FS_NFSD_FILECACHE_H +#define _FS_NFSD_FILECACHE_H + +#include <linux/fsnotify_backend.h> + +/* + * This is the fsnotify_mark container that nfsd attaches to the files that it + * is holding open. Note that we have a separate refcount here aside from the + * one in the fsnotify_mark. We only want a single fsnotify_mark attached to + * the inode, and for each nfsd_file to hold a reference to it. + * + * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us + * how to put that reference. If there are still outstanding nfsd_files that + * reference the mark, then we would want to call fsnotify_put_mark on it. + * If there were not, then we'd need to call fsnotify_destroy_mark. Since we + * can't really tell the difference, we use the nfm_mark to keep track of how + * many nfsd_files hold references to the mark. When that counter goes to zero + * then we know to call fsnotify_destroy_mark on it. + */ +struct nfsd_file_mark { + struct fsnotify_mark nfm_mark; + atomic_t nfm_ref; +}; + +/* + * A representation of a file that has been opened by knfsd. These are hashed + * in the hashtable by inode pointer value. Note that this object doesn't + * hold a reference to the inode by itself, so the nf_inode pointer should + * never be dereferenced, only used for comparison. + */ +struct nfsd_file { + struct hlist_node nf_node; + struct list_head nf_lru; + struct rcu_head nf_rcu; + struct file *nf_file; + const struct cred *nf_cred; + struct net *nf_net; +#define NFSD_FILE_HASHED (0) +#define NFSD_FILE_PENDING (1) +#define NFSD_FILE_BREAK_READ (2) +#define NFSD_FILE_BREAK_WRITE (3) +#define NFSD_FILE_REFERENCED (4) + unsigned long nf_flags; + struct inode *nf_inode; + unsigned int nf_hashval; + atomic_t nf_ref; + unsigned char nf_may; + struct nfsd_file_mark *nf_mark; +}; + +int nfsd_file_cache_init(void); +void nfsd_file_cache_purge(struct net *); +void nfsd_file_cache_shutdown(void); +void nfsd_file_put(struct nfsd_file *nf); +struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); +void nfsd_file_close_inode_sync(struct inode *inode); +bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); +int nfsd_file_cache_stats_open(struct inode *, struct file *); +#endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index bdfe5bcb3dcd..9a4ef815fb8c 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -104,6 +104,7 @@ struct nfsd_net { /* Time of server startup */ struct timespec64 nfssvc_boot; + seqlock_t boot_lock; /* * Max number of connections this nfsd container will allow. Defaults @@ -179,4 +180,7 @@ struct nfsd_net { extern void nfsd_netns_free_versions(struct nfsd_net *nn); extern unsigned int nfsd_net_id; + +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_boot_verifier(struct nfsd_net *nn); #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9bc32af4e2da..cea68d8411ac 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -172,13 +172,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); - if (nfserr == 0) { - struct inode *inode = d_inode(resp->fh.fh_dentry); - resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset, - inode->i_size); - } - + &resp->count, + &resp->eof); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index fcf31822c74c..86e5658651f1 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -27,6 +27,7 @@ static u32 nfs3_ftypes[] = { NF3SOCK, NF3BAD, NF3LNK, NF3BAD, }; + /* * XDR functions for basic NFS types */ @@ -751,14 +752,16 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_writeres *resp = rqstp->rq_resp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 verf[2]; p = encode_wcc_data(rqstp, p, &resp->fh); if (resp->status == 0) { *p++ = htonl(resp->count); *p++ = htonl(resp->committed); /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + nfsd_copy_boot_verifier(verf, nn); + *p++ = verf[0]; + *p++ = verf[1]; } return xdr_ressize_check(rqstp, p); } @@ -1125,13 +1128,15 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_commitres *resp = rqstp->rq_resp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 verf[2]; p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + nfsd_copy_boot_verifier(verf, nn); + *p++ = verf[0]; + *p++ = verf[1]; } return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 397eb7820929..524111420b48 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -512,11 +512,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb != NULL) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } @@ -604,11 +602,10 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -663,11 +660,10 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); } @@ -759,11 +755,10 @@ static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status); } /* diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index a79e24b79095..2681c70283ce 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -169,8 +169,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) spin_unlock(&fp->fi_lock); if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); - fput(ls->ls_file); + vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); + nfsd_file_put(ls->ls_file); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -197,7 +197,7 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) fl->fl_end = OFFSET_MAX; fl->fl_owner = ls; fl->fl_pid = current->tgid; - fl->fl_file = ls->ls_file; + fl->fl_file = ls->ls_file->nf_file; status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); if (status) { @@ -236,13 +236,13 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, NFSPROC4_CLNT_CB_LAYOUT); if (parent->sc_type == NFS4_DELEG_STID) - ls->ls_file = get_file(fp->fi_deleg_file); + ls->ls_file = nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { - fput(ls->ls_file); + nfsd_file_put(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; @@ -626,7 +626,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; + argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(nfsd_recall_failed, argv, envp, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8beda999e134..4e3e77b76411 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -568,17 +568,11 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) { - __be32 verf[2]; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + __be32 *verf = (__be32 *)verifier->data; - /* - * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy. y2038 time_t overflow is - * irrelevant in this usage. - */ - verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; - memcpy(verifier->data, verf, sizeof(verifier->data)); + BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); + + nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id)); } static __be32 @@ -761,7 +755,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_read *read = &u->read; __be32 status; - read->rd_filp = NULL; + read->rd_nf = NULL; if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; @@ -782,7 +776,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, - &read->rd_filp, &read->rd_tmp_file); + &read->rd_nf); if (status) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -798,8 +792,8 @@ out: static void nfsd4_read_release(union nfsd4_op_u *u) { - if (u->read.rd_filp) - fput(u->read.rd_filp); + if (u->read.rd_nf) + nfsd_file_put(u->read.rd_nf); trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, u->read.rd_offset, u->read.rd_length); } @@ -954,7 +948,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, - WR_STATE, NULL, NULL); + WR_STATE, NULL); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -993,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_write *write = &u->write; stateid_t *stateid = &write->wr_stateid; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; int nvecs; @@ -1005,7 +999,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, trace_nfsd_write_start(rqstp, &cstate->current_fh, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - stateid, WR_STATE, &filp, NULL); + stateid, WR_STATE, &nf); if (status) { dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; @@ -1018,10 +1012,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &write->wr_head, write->wr_buflen); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, + status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf->nf_file, write->wr_offset, rqstp->rq_vec, nvecs, &cnt, write->wr_how_written); - fput(filp); + nfsd_file_put(nf); write->wr_bytes_written = cnt; trace_nfsd_write_done(rqstp, &cstate->current_fh, @@ -1031,8 +1025,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static __be32 nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - stateid_t *src_stateid, struct file **src, - stateid_t *dst_stateid, struct file **dst) + stateid_t *src_stateid, struct nfsd_file **src, + stateid_t *dst_stateid, struct nfsd_file **dst) { __be32 status; @@ -1040,22 +1034,22 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_nofilehandle; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, - src_stateid, RD_STATE, src, NULL); + src_stateid, RD_STATE, src); if (status) { dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); goto out; } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - dst_stateid, WR_STATE, dst, NULL); + dst_stateid, WR_STATE, dst); if (status) { dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); goto out_put_src; } /* fix up for NFS-specific error code */ - if (!S_ISREG(file_inode(*src)->i_mode) || - !S_ISREG(file_inode(*dst)->i_mode)) { + if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || + !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } @@ -1063,9 +1057,9 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: return status; out_put_dst: - fput(*dst); + nfsd_file_put(*dst); out_put_src: - fput(*src); + nfsd_file_put(*src); goto out; } @@ -1074,7 +1068,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_clone *clone = &u->clone; - struct file *src, *dst; + struct nfsd_file *src, *dst; __be32 status; status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, @@ -1082,11 +1076,11 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd4_clone_file_range(src, clone->cl_src_pos, - dst, clone->cl_dst_pos, clone->cl_count); + status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos, + dst->nf_file, clone->cl_dst_pos, clone->cl_count); - fput(dst); - fput(src); + nfsd_file_put(dst); + nfsd_file_put(src); out: return status; } @@ -1176,8 +1170,9 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) do { if (kthread_should_stop()) break; - bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos, - copy->file_dst, dst_pos, bytes_total); + bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file, + src_pos, copy->nf_dst->nf_file, dst_pos, + bytes_total); if (bytes_copied <= 0) break; bytes_total -= bytes_copied; @@ -1204,8 +1199,8 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) status = nfs_ok; } - fput(copy->file_src); - fput(copy->file_dst); + nfsd_file_put(copy->nf_src); + nfsd_file_put(copy->nf_dst); return status; } @@ -1218,16 +1213,16 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res)); memcpy(&dst->fh, &src->fh, sizeof(src->fh)); dst->cp_clp = src->cp_clp; - dst->file_dst = get_file(src->file_dst); - dst->file_src = get_file(src->file_src); + dst->nf_dst = nfsd_file_get(src->nf_dst); + dst->nf_src = nfsd_file_get(src->nf_src); memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid)); } static void cleanup_async_copy(struct nfsd4_copy *copy) { nfs4_free_cp_state(copy); - fput(copy->file_dst); - fput(copy->file_src); + nfsd_file_put(copy->nf_dst); + nfsd_file_put(copy->nf_src); spin_lock(©->cp_clp->async_lock); list_del(©->copies); spin_unlock(©->cp_clp->async_lock); @@ -1264,8 +1259,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_copy *async_copy = NULL; status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, - ©->file_src, ©->cp_dst_stateid, - ©->file_dst); + ©->nf_src, ©->cp_dst_stateid, + ©->nf_dst); if (status) goto out; @@ -1347,21 +1342,21 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, - WR_STATE, &file, NULL); + WR_STATE, &nf); if (status != nfs_ok) { dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); return status; } - status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, fallocate->falloc_length, flags); - fput(file); + nfsd_file_put(nf); return status; } static __be32 @@ -1406,11 +1401,11 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek = &u->seek; int whence; __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, - RD_STATE, &file, NULL); + RD_STATE, &nf); if (status) { dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); return status; @@ -1432,14 +1427,14 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Note: This call does change file->f_pos, but nothing in NFSD * should ever file->f_pos. */ - seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence); + seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence); if (seek->seek_pos < 0) status = nfserrno(seek->seek_pos); - else if (seek->seek_pos >= i_size_read(file_inode(file))) + else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file))) seek->seek_eof = true; out: - fput(file); + nfsd_file_put(nf); return status; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 87679557d0d6..cdc75ad4438b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -59,8 +59,13 @@ struct nfsd4_client_tracking_ops { void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); void (*grace_done)(struct nfsd_net *); + uint8_t version; + size_t msglen; }; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; + /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -173,6 +178,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, const char *dname, int len, struct nfsd_net *nn) { struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; struct nfs4_client_reclaim *crp; name.data = kmemdup(dname, len, GFP_KERNEL); @@ -182,7 +188,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, return; } name.len = len; - crp = nfs4_client_to_reclaim(name, nn); + crp = nfs4_client_to_reclaim(name, princhash, nn); if (!crp) { kfree(name.data); return; @@ -482,6 +488,7 @@ static int load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; if (child->d_name.len != HEXDIR_LEN - 1) { printk("%s: illegal name %pd in recovery directory\n", @@ -496,7 +503,7 @@ load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) goto out; } name.len = HEXDIR_LEN; - if (!nfs4_client_to_reclaim(name, nn)) + if (!nfs4_client_to_reclaim(name, princhash, nn)) kfree(name.data); out: return 0; @@ -718,6 +725,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .remove = nfsd4_remove_clid_dir, .check = nfsd4_check_legacy_client, .grace_done = nfsd4_recdir_purge_old, + .version = 1, + .msglen = 0, }; /* Globals */ @@ -731,25 +740,32 @@ struct cld_net { struct list_head cn_list; unsigned int cn_xid; bool cn_has_legacy; + struct crypto_shash *cn_tfm; }; struct cld_upcall { struct list_head cu_list; struct cld_net *cu_net; struct completion cu_done; - struct cld_msg cu_msg; + union { + struct cld_msg_hdr cu_hdr; + struct cld_msg cu_msg; + struct cld_msg_v2 cu_msg_v2; + } cu_u; }; static int -__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) { int ret; struct rpc_pipe_msg msg; - struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg); + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); + struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info, + nfsd_net_id); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; - msg.len = sizeof(*cmsg); + msg.len = nn->client_tracking_ops->msglen; ret = rpc_queue_upcall(pipe, &msg); if (ret < 0) { @@ -765,7 +781,7 @@ out: } static int -cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) { int ret; @@ -781,11 +797,11 @@ cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) } static ssize_t -__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, +__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, struct nfsd_net *nn) { - uint8_t cmd; - struct xdr_netobj name; + uint8_t cmd, princhashlen; + struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; uint16_t namelen; struct cld_net *cn = nn->cld_net; @@ -794,22 +810,48 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, return -EFAULT; } if (cmd == Cld_GraceStart) { - if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len)) - return -EFAULT; - name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen); - if (IS_ERR_OR_NULL(name.data)) - return -EFAULT; - name.len = namelen; + if (nn->client_tracking_ops->version >= 2) { + const struct cld_clntinfo __user *ci; + + ci = &cmsg->cm_u.cm_clntinfo; + if (get_user(namelen, &ci->cc_name.cn_len)) + return -EFAULT; + name.data = memdup_user(&ci->cc_name.cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + get_user(princhashlen, &ci->cc_princhash.cp_len); + if (princhashlen > 0) { + princhash.data = memdup_user( + &ci->cc_princhash.cp_data, + princhashlen); + if (IS_ERR_OR_NULL(princhash.data)) + return -EFAULT; + princhash.len = princhashlen; + } else + princhash.len = 0; + } else { + const struct cld_name __user *cnm; + + cnm = &cmsg->cm_u.cm_name; + if (get_user(namelen, &cnm->cn_len)) + return -EFAULT; + name.data = memdup_user(&cnm->cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + } if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { name.len = name.len - 5; memmove(name.data, name.data + 5, name.len); cn->cn_has_legacy = true; } - if (!nfs4_client_to_reclaim(name, nn)) { + if (!nfs4_client_to_reclaim(name, princhash, nn)) { kfree(name.data); + kfree(princhash.data); return -EFAULT; } - return sizeof(*cmsg); + return nn->client_tracking_ops->msglen; } return -EFAULT; } @@ -818,21 +860,22 @@ static ssize_t cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; - struct cld_msg __user *cmsg = (struct cld_msg __user *)src; + struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; + struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfsd_net_id); struct cld_net *cn = nn->cld_net; int16_t status; - if (mlen != sizeof(*cmsg)) { + if (mlen != nn->client_tracking_ops->msglen) { dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, - sizeof(*cmsg)); + nn->client_tracking_ops->msglen); return -EINVAL; } /* copy just the xid so we can try to find that */ - if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) { + if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) { dprintk("%s: error when copying xid from userspace", __func__); return -EFAULT; } @@ -842,7 +885,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) * list (for -EINPROGRESS, we just want to make sure the xid is * valid, not remove the upcall from the list) */ - if (get_user(status, &cmsg->cm_status)) { + if (get_user(status, &hdr->cm_status)) { dprintk("%s: error when copying status from userspace", __func__); return -EFAULT; } @@ -851,7 +894,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) cup = NULL; spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) { + if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) { cup = tmp; if (status != -EINPROGRESS) list_del_init(&cup->cu_list); @@ -869,7 +912,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (status == -EINPROGRESS) return __cld_pipe_inprogress_downcall(cmsg, nn); - if (copy_from_user(&cup->cu_msg, src, mlen) != 0) + if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); @@ -881,7 +924,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct cld_msg *cmsg = msg->data; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, - cu_msg); + cu_u.cu_msg); /* errno >= 0 means we got a downcall */ if (msg->errno >= 0) @@ -1007,14 +1050,17 @@ nfsd4_remove_cld_pipe(struct net *net) nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); + if (cn->cn_tfm) + crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } static struct cld_upcall * -alloc_cld_upcall(struct cld_net *cn) +alloc_cld_upcall(struct nfsd_net *nn) { struct cld_upcall *new, *tmp; + struct cld_net *cn = nn->cld_net; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) @@ -1024,20 +1070,20 @@ alloc_cld_upcall(struct cld_net *cn) restart_search: spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (tmp->cu_msg.cm_xid == cn->cn_xid) { + if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) { cn->cn_xid++; spin_unlock(&cn->cn_lock); goto restart_search; } } init_completion(&new->cu_done); - new->cu_msg.cm_vers = CLD_UPCALL_VERSION; - put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid); + new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version; + put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid); new->cu_net = cn; list_add(&new->cu_list, &cn->cn_list); spin_unlock(&cn->cn_lock); - dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid); + dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid); return new; } @@ -1066,20 +1112,20 @@ nfsd4_cld_create(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Create; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Create; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1092,6 +1138,75 @@ out_err: /* Ask daemon to create a new record */ static void +nfsd4_cld_create_v2(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct cld_msg_v2 *cmsg; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + SHASH_DESC_ON_STACK(desc, tfm); + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cmsg = &cup->cu_u.cu_msg_v2; + cmsg->cm_cmd = Cld_Create; + cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal) { + desc->tfm = tfm; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) { + ret = -ENOMEM; + goto out; + } + ret = crypto_shash_digest(desc, principal, strlen(principal), + cksum.data); + shash_desc_zero(desc); + if (ret) { + kfree(cksum.data); + goto out; + } + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, + cksum.data, cksum.len); + kfree(cksum.data); + } else + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; + + ret = cld_pipe_upcall(cn->cn_pipe, cmsg); + if (!ret) { + ret = cmsg->cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + +out: + free_cld_upcall(cup); +out_err: + if (ret) + pr_err("NFSD: Unable to create client record on stable storage: %d\n", + ret); +} + +/* Ask daemon to create a new record */ +static void nfsd4_cld_remove(struct nfs4_client *clp) { int ret; @@ -1103,20 +1218,20 @@ nfsd4_cld_remove(struct nfs4_client *clp) if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Remove; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Remove; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1145,21 +1260,21 @@ nfsd4_cld_check_v0(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { printk(KERN_ERR "NFSD: Unable to check client record on " "stable storage: %d\n", -ENOMEM); return -ENOMEM; } - cup->cu_msg.cm_cmd = Cld_Check; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Check; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1217,22 +1332,95 @@ found: } static int +nfsd4_cld_check_v2(struct nfs4_client *clp) +{ + struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + int status; + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + SHASH_DESC_ON_STACK(desc, tfm); + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(clp->cl_name, nn); + if (crp) + goto found; + + if (cn->cn_has_legacy) { + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return -ENOENT; + + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data\n", + __func__); + return -ENOENT; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + goto found; + + } + return -ENOENT; +found: + if (crp->cr_princhash.len) { + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal == NULL) + return -ENOENT; + desc->tfm = tfm; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) + return -ENOENT; + status = crypto_shash_digest(desc, principal, strlen(principal), + cksum.data); + shash_desc_zero(desc); + if (status) { + kfree(cksum.data); + return -ENOENT; + } + if (memcmp(crp->cr_princhash.data, cksum.data, + crp->cr_princhash.len)) { + kfree(cksum.data); + return -ENOENT; + } + kfree(cksum.data); + } + crp->cr_clp = clp; + return 0; +} + +static int nfsd4_cld_grace_start(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceStart; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1250,17 +1438,17 @@ nfsd4_cld_grace_done_v0(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + cup->cu_u.cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1279,16 +1467,16 @@ nfsd4_cld_grace_done(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1337,6 +1525,53 @@ cld_running(struct nfsd_net *nn) } static int +nfsd4_cld_get_version(struct nfsd_net *nn) +{ + int ret = 0; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + uint8_t version; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + if (ret) + goto out_free; + version = cup->cu_u.cu_msg.cm_u.cm_version; + dprintk("%s: userspace returned version %u\n", + __func__, version); + if (version < 1) + version = 1; + else if (version > CLD_UPCALL_VERSION) + version = CLD_UPCALL_VERSION; + + switch (version) { + case 1: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + break; + case 2: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2; + break; + default: + break; + } + } +out_free: + free_cld_upcall(cup); +out_err: + if (ret) + dprintk("%s: Unable to get version from userspace: %d\n", + __func__, ret); + return ret; +} + +static int nfsd4_cld_tracking_init(struct net *net) { int status; @@ -1351,6 +1586,11 @@ nfsd4_cld_tracking_init(struct net *net) status = __nfsd4_init_cld_pipe(net); if (status) goto err_shutdown; + nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(nn->cld_net->cn_tfm)) { + status = PTR_ERR(nn->cld_net->cn_tfm); + goto err_remove; + } /* * rpc pipe upcalls take 30 seconds to time out, so we don't want to @@ -1368,10 +1608,14 @@ nfsd4_cld_tracking_init(struct net *net) goto err_remove; } + status = nfsd4_cld_get_version(nn); + if (status == -EOPNOTSUPP) + pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n"); + status = nfsd4_cld_grace_start(nn); if (status) { if (status == -EOPNOTSUPP) - printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n"); + pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n"); nfs4_release_reclaim(nn); goto err_remove; } else @@ -1403,6 +1647,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = { .remove = nfsd4_cld_remove, .check = nfsd4_cld_check_v0, .grace_done = nfsd4_cld_grace_done_v0, + .version = 1, + .msglen = sizeof(struct cld_msg), }; /* For newer nfsdcld's */ @@ -1413,6 +1659,20 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .remove = nfsd4_cld_remove, .check = nfsd4_cld_check, .grace_done = nfsd4_cld_grace_done, + .version = 1, + .msglen = sizeof(struct cld_msg), +}; + +/* v2 create/check ops include the principal, if available */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { + .init = nfsd4_cld_tracking_init, + .exit = nfsd4_cld_tracking_exit, + .create = nfsd4_cld_create_v2, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check_v2, + .grace_done = nfsd4_cld_grace_done, + .version = 2, + .msglen = sizeof(struct cld_msg_v2), }; /* upcall via usermodehelper */ @@ -1760,6 +2020,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .remove = nfsd4_umh_cltrack_remove, .check = nfsd4_umh_cltrack_check, .grace_done = nfsd4_umh_cltrack_grace_done, + .version = 1, + .msglen = 0, }; int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7857942c5ca6..c65aeaa812d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -50,6 +50,7 @@ #include "netns.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -429,18 +430,18 @@ put_nfs4_file(struct nfs4_file *fi) } } -static struct file * +static struct nfsd_file * __nfs4_get_fd(struct nfs4_file *f, int oflag) { if (f->fi_fds[oflag]) - return get_file(f->fi_fds[oflag]); + return nfsd_file_get(f->fi_fds[oflag]); return NULL; } -static struct file * +static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); @@ -450,10 +451,10 @@ find_writeable_file_locked(struct nfs4_file *f) return ret; } -static struct file * +static struct nfsd_file * find_writeable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_writeable_file_locked(f); @@ -462,9 +463,10 @@ find_writeable_file(struct nfs4_file *f) return ret; } -static struct file *find_readable_file_locked(struct nfs4_file *f) +static struct nfsd_file * +find_readable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); @@ -474,10 +476,10 @@ static struct file *find_readable_file_locked(struct nfs4_file *f) return ret; } -static struct file * +static struct nfsd_file * find_readable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_readable_file_locked(f); @@ -486,10 +488,10 @@ find_readable_file(struct nfs4_file *f) return ret; } -struct file * +struct nfsd_file * find_any_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = __nfs4_get_fd(f, O_RDWR); @@ -590,17 +592,17 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) might_lock(&fp->fi_lock); if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { - struct file *f1 = NULL; - struct file *f2 = NULL; + struct nfsd_file *f1 = NULL; + struct nfsd_file *f2 = NULL; swap(f1, fp->fi_fds[oflag]); if (atomic_read(&fp->fi_access[1 - oflag]) == 0) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - fput(f1); + nfsd_file_put(f1); if (f2) - fput(f2); + nfsd_file_put(f2); } } @@ -933,25 +935,25 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) static void put_deleg_file(struct nfs4_file *fp) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; spin_lock(&fp->fi_lock); if (--fp->fi_delegees == 0) - swap(filp, fp->fi_deleg_file); + swap(nf, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file *filp = fp->fi_deleg_file; + struct nfsd_file *nf = fp->fi_deleg_file; WARN_ON_ONCE(!fp->fi_delegees); - vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp); + vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -1289,11 +1291,14 @@ static void nfs4_free_lock_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); - struct file *file; + struct nfsd_file *nf; - file = find_any_file(stp->st_stid.sc_file); - if (file) - filp_close(file, (fl_owner_t)lo); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, (fl_owner_t)lo); + nfsd_file_put(nf); + } nfs4_free_ol_stateid(stid); } @@ -1563,21 +1568,39 @@ static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) * re-negotiate active sessions and reduce their slot usage to make * room for new connections. For now we just fail the create session. */ -static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) +static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; unsigned long avail, total_avail; + unsigned int scale_factor; spin_lock(&nfsd_drc_lock); - total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + if (nfsd_drc_max_mem > nfsd_drc_mem_used) + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + else + /* We have handed out more space than we chose in + * set_max_drc() to allow. That isn't really a + * problem as long as that doesn't make us think we + * have lots more due to integer overflow. + */ + total_avail = 0; avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* - * Never use more than a third of the remaining memory, - * unless it's the only way to give this client a slot: + * Never use more than a fraction of the remaining memory, + * unless it's the only way to give this client a slot. + * The chosen fraction is either 1/8 or 1/number of threads, + * whichever is smaller. This ensures there are adequate + * slots to support multiple clients per thread. + * Give the client one slot even if that would require + * over-allocation--it is better than failure. */ - avail = clamp_t(unsigned long, avail, slotsize, total_avail/3); + scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads); + + avail = clamp_t(unsigned long, avail, slotsize, + total_avail/scale_factor); num = min_t(int, num, avail / slotsize); + num = max_t(int, num, 1); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -2323,9 +2346,9 @@ static void states_stop(struct seq_file *s, void *v) spin_unlock(&clp->cl_lock); } -static void nfs4_show_superblock(struct seq_file *s, struct file *f) +static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { - struct inode *inode = file_inode(f); + struct inode *inode = f->nf_inode; seq_printf(s, "superblock: \"%02x:%02x:%ld\"", MAJOR(inode->i_sb->s_dev), @@ -2343,7 +2366,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; struct nfs4_stateowner *oo; unsigned int access, deny; @@ -2370,7 +2393,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - fput(file); + nfsd_file_put(file); return 0; } @@ -2379,7 +2402,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; struct nfs4_stateowner *oo; ols = openlockstateid(st); @@ -2401,7 +2424,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - fput(file); + nfsd_file_put(file); return 0; } @@ -2410,7 +2433,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_delegation *ds; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; ds = delegstateid(st); nf = st->sc_file; @@ -2433,7 +2456,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_layout_stateid *ls; - struct file *file; + struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); file = ls->ls_file; @@ -3169,10 +3192,10 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs * performance. When short on memory we therefore prefer to * decrease number of slots instead of their size. Clients that * request larger slots than they need will get poor results: + * Note that we always allow at least one slot, because our + * accounting is soft and provides no guarantees either way. */ - ca->maxreqs = nfsd4_get_drc_mem(ca); - if (!ca->maxreqs) - return nfserr_jukebox; + ca->maxreqs = nfsd4_get_drc_mem(ca, nn); return nfs_ok; } @@ -4651,7 +4674,7 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status; int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); @@ -4687,18 +4710,18 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp); + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); if (status) goto out_put_access; spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { - fp->fi_fds[oflag] = filp; - filp = NULL; + fp->fi_fds[oflag] = nf; + nf = NULL; } } spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); status = nfsd4_truncate(rqstp, cur_fh, open); if (status) @@ -4767,7 +4790,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, fl->fl_end = OFFSET_MAX; fl->fl_owner = (fl_owner_t)dp; fl->fl_pid = current->tgid; - fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file; + fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } @@ -4777,7 +4800,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, { int status = 0; struct nfs4_delegation *dp; - struct file *filp; + struct nfsd_file *nf; struct file_lock *fl; /* @@ -4788,8 +4811,8 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - filp = find_readable_file(fp); - if (!filp) { + nf = find_readable_file(fp); + if (!nf) { /* We should always have a readable file here */ WARN_ON_ONCE(1); return ERR_PTR(-EBADF); @@ -4799,17 +4822,17 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; else if (!fp->fi_deleg_file) { - fp->fi_deleg_file = filp; + fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being * cleared */ fp->fi_delegees = 1; - filp = NULL; + nf = NULL; } else fp->fi_delegees++; spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (status) return ERR_PTR(status); @@ -4822,7 +4845,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (!fl) goto out_clnt_odstate; - status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); + status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); if (status) @@ -4842,7 +4865,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, return dp; out_unlock: - vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); + vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); @@ -5513,7 +5536,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, return nfs_ok; } -static struct file * +static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { if (!s) @@ -5523,7 +5546,7 @@ nfs4_find_file(struct nfs4_stid *s, int flags) case NFS4_DELEG_STID: if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) return NULL; - return get_file(s->sc_file->fi_deleg_file); + return nfsd_file_get(s->sc_file->fi_deleg_file); case NFS4_OPEN_STID: case NFS4_LOCK_STID: if (flags & RD_STATE) @@ -5549,32 +5572,28 @@ nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) static __be32 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, - struct file **filpp, bool *tmp_file, int flags) + struct nfsd_file **nfp, int flags) { int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; - struct file *file; + struct nfsd_file *nf; __be32 status; - file = nfs4_find_file(s, flags); - if (file) { + nf = nfs4_find_file(s, flags); + if (nf) { status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); if (status) { - fput(file); - return status; + nfsd_file_put(nf); + goto out; } - - *filpp = file; } else { - status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp); + status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; - - if (tmp_file) - *tmp_file = true; } - - return 0; + *nfp = nf; +out: + return status; } /* @@ -5583,7 +5602,7 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file) + stateid_t *stateid, int flags, struct nfsd_file **nfp) { struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); @@ -5591,10 +5610,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfs4_stid *s = NULL; __be32 status; - if (filpp) - *filpp = NULL; - if (tmp_file) - *tmp_file = false; + if (nfp) + *nfp = NULL; if (grace_disallows_io(net, ino)) return nfserr_grace; @@ -5631,8 +5648,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, status = nfs4_check_fh(fhp, s); done: - if (!status && filpp) - status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags); + if (status == nfs_ok && nfp) + status = nfs4_check_file(rqstp, fhp, s, nfp, flags); out: if (s) nfs4_put_stid(s); @@ -6392,7 +6409,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *lock_stp = NULL; struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; @@ -6474,8 +6491,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Fallthrough */ case NFS4_READ_LT: spin_lock(&fp->fi_lock); - filp = find_readable_file_locked(fp); - if (filp) + nf = find_readable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); fl_type = F_RDLCK; @@ -6486,8 +6503,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Fallthrough */ case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); - filp = find_writeable_file_locked(fp); - if (filp) + nf = find_writeable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); fl_type = F_WRLCK; @@ -6497,7 +6514,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - if (!filp) { + if (!nf) { status = nfserr_openmode; goto out; } @@ -6513,7 +6530,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock->fl_type = fl_type; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; + file_lock->fl_file = nf->nf_file; file_lock->fl_flags = fl_flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; @@ -6535,7 +6552,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_unlock(&nn->blocked_locks_lock); } - err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock); switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); @@ -6570,8 +6587,8 @@ out: } free_blocked_lock(nbl); } - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (lock_stp) { /* Bump seqid manually if the 4.0 replay owner is openowner */ if (cstate->replay_owner && @@ -6606,11 +6623,11 @@ out: */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { - struct file *file; - __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + struct nfsd_file *nf; + __be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (!err) { - err = nfserrno(vfs_test_lock(file, lock)); - fput(file); + err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + nfsd_file_put(nf); } return err; } @@ -6698,7 +6715,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_locku *locku = &u->locku; struct nfs4_ol_stateid *stp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct file_lock *file_lock = NULL; __be32 status; int err; @@ -6716,8 +6733,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &stp, nn); if (status) goto out; - filp = find_any_file(stp->st_stid.sc_file); - if (!filp) { + nf = find_any_file(stp->st_stid.sc_file); + if (!nf) { status = nfserr_lock_range; goto put_stateid; } @@ -6725,13 +6742,13 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; - goto fput; + goto put_file; } file_lock->fl_type = F_UNLCK; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; + file_lock->fl_file = nf->nf_file; file_lock->fl_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; @@ -6740,14 +6757,14 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_length); nfs4_transform_lock_offset(file_lock); - err = vfs_lock_file(filp, F_SETLK, file_lock, NULL); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); -fput: - fput(filp); +put_file: + nfsd_file_put(nf); put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); @@ -6759,7 +6776,7 @@ out: out_nfserr: status = nfserrno(err); - goto fput; + goto put_file; } /* @@ -6772,17 +6789,17 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct file *filp = find_any_file(fp); + struct nfsd_file *nf = find_any_file(fp); struct inode *inode; struct file_lock_context *flctx; - if (!filp) { + if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); return status; } - inode = locks_inode(filp); + inode = locks_inode(nf->nf_file); flctx = inode->i_flctx; if (flctx && !list_empty_careful(&flctx->flc_posix)) { @@ -6795,7 +6812,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } spin_unlock(&flctx->flc_lock); } - fput(filp); + nfsd_file_put(nf); return status; } @@ -6888,7 +6905,8 @@ nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) * will be freed in nfs4_remove_reclaim_record in the normal case). */ struct nfs4_client_reclaim * -nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn) +nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, + struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp; @@ -6901,6 +6919,8 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn) list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); crp->cr_name.data = name.data; crp->cr_name.len = name.len; + crp->cr_princhash.data = princhash.data; + crp->cr_princhash.len = princhash.len; crp->cr_clp = NULL; nn->reclaim_str_hashtbl_size++; } @@ -6912,6 +6932,7 @@ nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) { list_del(&crp->cr_strhash); kfree(crp->cr_name.data); + kfree(crp->cr_princhash.data); kfree(crp); nn->reclaim_str_hashtbl_size--; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 442811809f3d..533d0fc3c96b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -49,6 +49,7 @@ #include "cache.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -203,6 +204,13 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) return p; } +static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp) +{ + unsigned int this = (char *)argp->end - (char *)argp->p; + + return this + argp->pagelen; +} + static int zero_clientid(clientid_t *clid) { return (clid->cl_boot == 0) && (clid->cl_id == 0); @@ -211,10 +219,10 @@ static int zero_clientid(clientid_t *clid) /** * svcxdr_tmpalloc - allocate memory to be freed after compound processing * @argp: NFSv4 compound argument structure - * @p: pointer to be freed (with kfree()) + * @len: length of buffer to allocate * - * Marks @p to be freed when processing the compound operation - * described in @argp finishes. + * Allocates a buffer of size @len to be freed when processing the compound + * operation described in @argp finishes. */ static void * svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) @@ -347,7 +355,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(4); len += 4; nace = be32_to_cpup(p++); - if (nace > NFS4_ACL_MAX) + if (nace > compoundargs_bytes_left(argp)/20) + /* + * Even with 4-byte names there wouldn't be + * space for that many aces; something fishy is + * going on: + */ return nfserr_fbig; *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); @@ -1418,7 +1431,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, struct nfsd4_create_session *sess) { DECODE_HEAD; - u32 dummy; READ_BUF(16); COPYMEM(&sess->clientid, 8); @@ -1427,7 +1439,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Fore channel attrs */ READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + p++; /* headerpadsz is always 0 */ sess->fore_channel.maxreq_sz = be32_to_cpup(p++); sess->fore_channel.maxresp_sz = be32_to_cpup(p++); sess->fore_channel.maxresp_cached = be32_to_cpup(p++); @@ -1444,7 +1456,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Back channel attrs */ READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + p++; /* headerpadsz is always 0 */ sess->back_channel.maxreq_sz = be32_to_cpup(p++); sess->back_channel.maxresp_sz = be32_to_cpup(p++); sess->back_channel.maxresp_cached = be32_to_cpup(p++); @@ -1736,7 +1748,6 @@ static __be32 nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) { DECODE_HEAD; - unsigned int tmp; status = nfsd4_decode_stateid(argp, ©->cp_src_stateid); if (status) @@ -1751,7 +1762,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) p = xdr_decode_hyper(p, ©->cp_count); p++; /* ca_consecutive: we always do consecutive copies */ copy->cp_synchronous = be32_to_cpup(p++); - tmp = be32_to_cpup(p); /* Source server list not supported */ + /* tmp = be32_to_cpup(p); Source server list not supported */ DECODE_TAIL; } @@ -3217,9 +3228,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ if (!p) return nfserr_resource; encode_cinfo(p, &create->cr_cinfo); - nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], + return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], create->cr_bmval[1], create->cr_bmval[2]); - return 0; } static __be32 @@ -3462,7 +3472,7 @@ static __be32 nfsd4_encode_splice_read( len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, - file, read->rd_offset, &maxcount); + file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; if (nfserr) { /* @@ -3474,9 +3484,6 @@ static __be32 nfsd4_encode_splice_read( return nfserr; } - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3547,15 +3554,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, len = maxcount; nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount); + resp->rqstp->rq_vec, read->rd_vlen, &maxcount, + &eof); read->rd_length = maxcount; if (nfserr) return nfserr; xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); tmp = htonl(maxcount); @@ -3574,11 +3579,14 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, { unsigned long maxcount; struct xdr_stream *xdr = &resp->xdr; - struct file *file = read->rd_filp; + struct file *file; int starting_len = xdr->buf->len; - struct raparms *ra = NULL; __be32 *p; + if (nfserr) + return nfserr; + file = read->rd_nf->nf_file; + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); @@ -3596,18 +3604,12 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, (xdr->buf->buflen - xdr->buf->len)); maxcount = min_t(unsigned long, maxcount, read->rd_length); - if (read->rd_tmp_file) - ra = nfsd_init_raparms(file); - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else nfserr = nfsd4_encode_readv(resp, read, file, maxcount); - if (ra) - nfsd_put_raparams(file, ra); - if (nfserr) xdr_truncate_encode(xdr, starting_len); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2c215171c0eb..11b42c523f04 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1476,6 +1476,7 @@ static __net_init int nfsd_init_net(struct net *net) atomic_set(&nn->ntf_refcnt, 0); init_waitqueue_head(&nn->ntf_wq); + seqlock_init(&nn->boot_lock); mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL); if (IS_ERR(mnt)) { diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0d20fd161225..c83ddac22f38 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -172,6 +172,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; __be32 nfserr; + u32 eof; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -195,7 +196,8 @@ nfsd_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); + &resp->count, + &eof); if (nfserr) return nfserr; return fh_getattr(&resp->fh, &resp->stat); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 18d94ea984ba..fdf7ed4bd5dd 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -27,6 +27,7 @@ #include "cache.h" #include "vfs.h" #include "netns.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_SVC @@ -313,22 +314,17 @@ static int nfsd_startup_generic(int nrservs) if (nfsd_users++) return 0; - /* - * Readahead param cache - will no-op if it already exists. - * (Note therefore results will be suboptimal if number of - * threads is modified after nfsd start.) - */ - ret = nfsd_racache_init(2*nrservs); + ret = nfsd_file_cache_init(); if (ret) goto dec_users; ret = nfs4_state_start(); if (ret) - goto out_racache; + goto out_file_cache; return 0; -out_racache: - nfsd_racache_shutdown(); +out_file_cache: + nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; @@ -340,7 +336,7 @@ static void nfsd_shutdown_generic(void) return; nfs4_state_shutdown(); - nfsd_racache_shutdown(); + nfsd_file_cache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) @@ -348,6 +344,35 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) +{ + int seq = 0; + + do { + read_seqbegin_or_lock(&nn->boot_lock, &seq); + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy. y2038 time_t overflow is + * irrelevant in this usage + */ + verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; + } while (need_seqretry(&nn->boot_lock, seq)); + done_seqretry(&nn->boot_lock, seq); +} + +static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +{ + ktime_get_real_ts64(&nn->nfssvc_boot); +} + +void nfsd_reset_boot_verifier(struct nfsd_net *nn) +{ + write_seqlock(&nn->boot_lock); + nfsd_reset_boot_verifier_locked(nn); + write_sequnlock(&nn->boot_lock); +} + static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -391,6 +416,7 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfsd_file_cache_purge(net); nfs4_state_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); @@ -599,7 +625,7 @@ int nfsd_create_serv(struct net *net) #endif } atomic_inc(&nn->ntf_refcnt); - ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ + nfsd_reset_boot_verifier(nn); return 0; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 5dbd16946e8e..46f56afb6cb8 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -378,6 +378,7 @@ struct nfs4_client_reclaim { struct list_head cr_strhash; /* hash by cr_name */ struct nfs4_client *cr_clp; /* pointer to associated clp */ struct xdr_netobj cr_name; /* recovery dir name */ + struct xdr_netobj cr_princhash; }; /* A reasonable value for REPLAY_ISIZE was estimated as follows: @@ -506,7 +507,7 @@ struct nfs4_file { }; struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ - struct file * fi_fds[3]; + struct nfsd_file *fi_fds[3]; /* * Each open or lock stateid contributes 0-4 to the counts * below depending on which bits are set in st_access_bitmap: @@ -516,7 +517,7 @@ struct nfs4_file { */ atomic_t fi_access[2]; u32 fi_share_deny; - struct file *fi_deleg_file; + struct nfsd_file *fi_deleg_file; int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; @@ -565,7 +566,7 @@ struct nfs4_layout_stateid { spinlock_t ls_lock; struct list_head ls_layouts; u32 ls_layout_type; - struct file *ls_file; + struct nfsd_file *ls_file; struct nfsd4_callback ls_recall; stateid_t ls_recall_sid; bool ls_recalled; @@ -616,7 +617,7 @@ struct nfsd4_copy; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filp, bool *tmp_file); + stateid_t *stateid, int flags, struct nfsd_file **filp); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); @@ -645,7 +646,7 @@ extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, - struct nfsd_net *nn); + struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); struct nfs4_file *find_file(struct knfsd_fh *fh); @@ -657,7 +658,7 @@ static inline void get_nfs4_file(struct nfs4_file *fi) { refcount_inc(&fi->fi_ref); } -struct file *find_any_file(struct nfs4_file *f); +struct nfsd_file *find_any_file(struct nfs4_file *f); /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 80933e4334d8..ffc78a0e28b2 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -126,6 +126,8 @@ DEFINE_NFSD_ERR_EVENT(read_err); DEFINE_NFSD_ERR_EVENT(write_err); #include "state.h" +#include "filecache.h" +#include "vfs.h" DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), @@ -164,6 +166,144 @@ DEFINE_STATEID_EVENT(layout_recall_done); DEFINE_STATEID_EVENT(layout_recall_fail); DEFINE_STATEID_EVENT(layout_recall_release); +#define show_nf_flags(val) \ + __print_flags(val, "|", \ + { 1 << NFSD_FILE_HASHED, "HASHED" }, \ + { 1 << NFSD_FILE_PENDING, "PENDING" }, \ + { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \ + { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \ + { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}) + +/* FIXME: This should probably be fleshed out in the future. */ +#define show_nf_may(val) \ + __print_flags(val, "|", \ + { NFSD_MAY_READ, "READ" }, \ + { NFSD_MAY_WRITE, "WRITE" }, \ + { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }) + +DECLARE_EVENT_CLASS(nfsd_file_class, + TP_PROTO(struct nfsd_file *nf), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(unsigned int, nf_hashval) + __field(void *, nf_inode) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_hashval = nf->nf_hashval; + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = atomic_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p", + __entry->nf_hashval, + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), + __entry->nf_file) +) + +#define DEFINE_NFSD_FILE_EVENT(name) \ +DEFINE_EVENT(nfsd_file_class, name, \ + TP_PROTO(struct nfsd_file *nf), \ + TP_ARGS(nf)) + +DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked); + +TRACE_EVENT(nfsd_file_acquire, + TP_PROTO(struct svc_rqst *rqstp, unsigned int hash, + struct inode *inode, unsigned int may_flags, + struct nfsd_file *nf, __be32 status), + + TP_ARGS(rqstp, hash, inode, may_flags, nf, status), + + TP_STRUCT__entry( + __field(__be32, xid) + __field(unsigned int, hash) + __field(void *, inode) + __field(unsigned int, may_flags) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + __field(__be32, status) + ), + + TP_fast_assign( + __entry->xid = rqstp->rq_xid; + __entry->hash = hash; + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->nf_ref = nf ? atomic_read(&nf->nf_ref) : 0; + __entry->nf_flags = nf ? nf->nf_flags : 0; + __entry->nf_may = nf ? nf->nf_may : 0; + __entry->nf_file = nf ? nf->nf_file : NULL; + __entry->status = status; + ), + + TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u", + be32_to_cpu(__entry->xid), __entry->hash, __entry->inode, + show_nf_may(__entry->may_flags), __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), __entry->nf_file, + be32_to_cpu(__entry->status)) +); + +DECLARE_EVENT_CLASS(nfsd_file_search_class, + TP_PROTO(struct inode *inode, unsigned int hash, int found), + TP_ARGS(inode, hash, found), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, hash) + __field(int, found) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->hash = hash; + __entry->found = found; + ), + TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash, + __entry->inode, __entry->found) +); + +#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \ +DEFINE_EVENT(nfsd_file_search_class, name, \ + TP_PROTO(struct inode *inode, unsigned int hash, int found), \ + TP_ARGS(inode, hash, found)) + +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached); + +TRACE_EVENT(nfsd_file_fsnotify_handle_event, + TP_PROTO(struct inode *inode, u32 mask), + TP_ARGS(inode, mask), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, nlink) + __field(umode_t, mode) + __field(u32, mask) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->nlink = inode->i_nlink; + __entry->mode = inode->i_mode; + __entry->mask = mask; + ), + TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, + __entry->nlink, __entry->mode, __entry->mask) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c85783e536d5..bd0a385df3fc 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -44,38 +44,11 @@ #include "nfsd.h" #include "vfs.h" +#include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP - -/* - * This is a cache of readahead params that help us choose the proper - * readahead strategy. Initially, we set all readahead parameters to 0 - * and let the VFS handle things. - * If you increase the number of cached files very much, you'll need to - * add a hash table here. - */ -struct raparms { - struct raparms *p_next; - unsigned int p_count; - ino_t p_ino; - dev_t p_dev; - int p_set; - struct file_ra_state p_ra; - unsigned int p_hindex; -}; - -struct raparm_hbucket { - struct raparms *pb_head; - spinlock_t pb_lock; -} ____cacheline_aligned_in_smp; - -#define RAPARM_HASH_BITS 4 -#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) -#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) -static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; - /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. @@ -699,7 +672,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ -static int nfsd_open_break_lease(struct inode *inode, int access) +int nfsd_open_break_lease(struct inode *inode, int access) { unsigned int mode; @@ -715,8 +688,8 @@ static int nfsd_open_break_lease(struct inode *inode, int access) * and additional flags. * N.B. After this call fhp needs an fh_put */ -__be32 -nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, +static __be32 +__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { struct path path; @@ -726,25 +699,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, __be32 err; int host_err = 0; - validate_process_creds(); - - /* - * If we get here, then the client has already done an "open", - * and (hopefully) checked permission - so allow OWNER_OVERRIDE - * in case a chmod has now revoked permission. - * - * Arguably we should also allow the owner override for - * directories, but we never have and it doesn't seem to have - * caused anyone a problem. If we were to change this, note - * also that our filldir callbacks would need a variant of - * lookup_one_len that doesn't check permissions. - */ - if (type == S_IFREG) - may_flags |= NFSD_MAY_OWNER_OVERRIDE; - err = fh_verify(rqstp, fhp, type, may_flags); - if (err) - goto out; - path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; inode = d_inode(path.dentry); @@ -798,67 +752,46 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, out_nfserr: err = nfserrno(host_err); out: - validate_process_creds(); return err; } -struct raparms * -nfsd_init_raparms(struct file *file) +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) { - struct inode *inode = file_inode(file); - dev_t dev = inode->i_sb->s_dev; - ino_t ino = inode->i_ino; - struct raparms *ra, **rap, **frap = NULL; - int depth = 0; - unsigned int hash; - struct raparm_hbucket *rab; - - hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; - rab = &raparm_hash[hash]; - - spin_lock(&rab->pb_lock); - for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { - if (ra->p_ino == ino && ra->p_dev == dev) - goto found; - depth++; - if (ra->p_count == 0) - frap = rap; - } - depth = nfsdstats.ra_size; - if (!frap) { - spin_unlock(&rab->pb_lock); - return NULL; - } - rap = frap; - ra = *frap; - ra->p_dev = dev; - ra->p_ino = ino; - ra->p_set = 0; - ra->p_hindex = hash; -found: - if (rap != &rab->pb_head) { - *rap = ra->p_next; - ra->p_next = rab->pb_head; - rab->pb_head = ra; - } - ra->p_count++; - nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; - spin_unlock(&rab->pb_lock); + __be32 err; - if (ra->p_set) - file->f_ra = ra->p_ra; - return ra; + validate_process_creds(); + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_len that doesn't check permissions. + */ + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; + err = fh_verify(rqstp, fhp, type, may_flags); + if (!err) + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + validate_process_creds(); + return err; } -void nfsd_put_raparams(struct file *file, struct raparms *ra) +__be32 +nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) { - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + __be32 err; - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); + validate_process_creds(); + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + validate_process_creds(); + return err; } /* @@ -901,12 +834,23 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } +static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len, + size_t expected) +{ + if (expected != 0 && len == 0) + return 1; + if (offset+len >= i_size_read(file_inode(file))) + return 1; + return 0; +} + static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count, int host_err) + unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; + *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); trace_nfsd_read_io_done(rqstp, fhp, offset, *count); @@ -918,7 +862,8 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, unsigned long *count) + struct file *file, loff_t offset, unsigned long *count, + u32 *eof) { struct splice_desc sd = { .len = 0, @@ -926,25 +871,27 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, .pos = offset, .u.data = rqstp, }; - int host_err; + ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count) + struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { struct iov_iter iter; - int host_err; + loff_t ppos = offset; + ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); iov_iter_kvec(&iter, READ, vec, vlen, *count); - host_err = vfs_iter_read(file, &iter, &offset, 0); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + host_err = vfs_iter_read(file, &iter, &ppos, 0); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } /* @@ -1025,8 +972,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_write += *cnt; fsnotify_modify(file); - if (stable && use_wgather) + if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); + if (host_err < 0) + nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), + nfsd_net_id)); + } out_nfserr: if (host_err >= 0) { @@ -1047,27 +998,25 @@ out_nfserr: * N.B. After this call fhp needs an fh_put */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) + loff_t offset, struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { + struct nfsd_file *nf; struct file *file; - struct raparms *ra; __be32 err; trace_nfsd_read_start(rqstp, fhp, offset, *count); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; - ra = nfsd_init_raparms(file); - + file = nf->nf_file; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - err = nfsd_splice_read(rqstp, fhp, file, offset, count); + err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); + err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); - if (ra) - nfsd_put_raparams(file, ra); - fput(file); + nfsd_file_put(nf); trace_nfsd_read_done(rqstp, fhp, offset, *count); @@ -1083,17 +1032,18 @@ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int stable) { - struct file *file = NULL; - __be32 err = 0; + struct nfsd_file *nf; + __be32 err; trace_nfsd_write_start(rqstp, fhp, offset, *cnt); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf); if (err) goto out; - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); - fput(file); + err = nfsd_vfs_write(rqstp, fhp, nf->nf_file, offset, vec, + vlen, cnt, stable); + nfsd_file_put(nf); out: trace_nfsd_write_done(rqstp, fhp, offset, *cnt); return err; @@ -1113,9 +1063,9 @@ __be32 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long count) { - struct file *file; - loff_t end = LLONG_MAX; - __be32 err = nfserr_inval; + struct nfsd_file *nf; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; if (offset < 0) goto out; @@ -1125,20 +1075,27 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - err = nfsd_open(rqstp, fhp, S_IFREG, - NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); + err = nfsd_file_acquire(rqstp, fhp, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2 = vfs_fsync_range(file, offset, end, 0); + int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); - if (err2 != -EINVAL) - err = nfserrno(err2); - else + switch (err2) { + case 0: + break; + case -EINVAL: err = nfserr_notsupp; + break; + default: + err = nfserrno(err2); + nfsd_reset_boot_verifier(net_generic(nf->nf_net, + nfsd_net_id)); + } } - fput(file); + nfsd_file_put(nf); out: return err; } @@ -1659,6 +1616,26 @@ out_nfserr: goto out_unlock; } +static void +nfsd_close_cached_files(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + nfsd_file_close_inode_sync(inode); +} + +static bool +nfsd_has_cached_files(struct dentry *dentry) +{ + bool ret = false; + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + ret = nfsd_file_is_cached(inode); + return ret; +} + /* * Rename a file * N.B. After this call _both_ ffhp and tfhp need an fh_put @@ -1671,6 +1648,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct inode *fdir, *tdir; __be32 err; int host_err; + bool has_cached = false; err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1689,6 +1667,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; +retry: host_err = fh_want_write(ffhp); if (host_err) { err = nfserrno(host_err); @@ -1728,11 +1707,16 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); - if (!host_err) { - host_err = commit_metadata(tfhp); - if (!host_err) - host_err = commit_metadata(ffhp); + if (nfsd_has_cached_files(ndentry)) { + has_cached = true; + goto out_dput_old; + } else { + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } } out_dput_new: dput(ndentry); @@ -1745,12 +1729,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, * as that would do the wrong thing if the two directories * were the same, so again we do it by hand. */ - fill_post_wcc(ffhp); - fill_post_wcc(tfhp); + if (!has_cached) { + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + } unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = false; fh_drop_write(ffhp); + /* + * If the target dentry has cached open files, then we need to try to + * close them prior to doing the rename. Flushing delayed fput + * shouldn't be done with locks held however, so we delay it until this + * point and then reattempt the whole shebang. + */ + if (has_cached) { + has_cached = false; + nfsd_close_cached_files(ndentry); + dput(ndentry); + goto retry; + } out: return err; } @@ -1797,10 +1795,13 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - if (type != S_IFDIR) + if (type != S_IFDIR) { + nfsd_close_cached_files(rdentry); host_err = vfs_unlink(dirp, rdentry, NULL); - else + } else { host_err = vfs_rmdir(dirp, rdentry); + } + if (!host_err) host_err = commit_metadata(fhp); dput(rdentry); @@ -2074,63 +2075,3 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return err? nfserrno(err) : 0; } - -void -nfsd_racache_shutdown(void) -{ - struct raparms *raparm, *last_raparm; - unsigned int i; - - dprintk("nfsd: freeing readahead buffers.\n"); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - raparm = raparm_hash[i].pb_head; - while(raparm) { - last_raparm = raparm; - raparm = raparm->p_next; - kfree(last_raparm); - } - raparm_hash[i].pb_head = NULL; - } -} -/* - * Initialize readahead param cache - */ -int -nfsd_racache_init(int cache_size) -{ - int i; - int j = 0; - int nperbucket; - struct raparms **raparm = NULL; - - - if (raparm_hash[0].pb_head) - return 0; - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - nperbucket = max(2, nperbucket); - cache_size = nperbucket * RAPARM_HASH_SIZE; - - dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - spin_lock_init(&raparm_hash[i].pb_lock); - - raparm = &raparm_hash[i].pb_head; - for (j = 0; j < nperbucket; j++) { - *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); - if (!*raparm) - goto out_nomem; - raparm = &(*raparm)->p_next; - } - *raparm = NULL; - } - - nfsdstats.ra_size = cache_size; - return 0; - -out_nomem: - dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); - nfsd_racache_shutdown(); - return -ENOMEM; -} diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index db351247892d..a13fd9d7e1f5 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -40,8 +40,6 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); /* nfsd/vfs.c */ -int nfsd_racache_init(int); -void nfsd_racache_shutdown(void); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, @@ -75,18 +73,23 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ +int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -struct raparms; +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, + int, struct file **); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *); + loff_t, struct kvec *, int, unsigned long *, + u32 *eof); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *, int); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -115,9 +118,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -struct raparms *nfsd_init_raparms(struct file *file); -void nfsd_put_raparams(struct file *file, struct raparms *ra); - static inline int fh_want_write(struct svc_fh *fh) { int ret; @@ -152,23 +152,4 @@ static inline int nfsd_create_is_exclusive(int createmode) || createmode == NFS4_CREATE_EXCLUSIVE4_1; } -static inline bool nfsd_eof_on_read(long requested, long read, - loff_t offset, loff_t size) -{ - /* We assume a short read means eof: */ - if (requested > read) - return true; - /* - * A non-short read might also reach end of file. The spec - * still requires us to set eof in that case. - * - * Further operations may have modified the file size since - * the read, so the following check is not atomic with the read. - * We've only seen that cause a problem for a client in the case - * where the read returned a count of 0 without setting eof. - * That case was fixed by the addition of the above check. - */ - return (offset + read >= size); -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 2cb29e961a76..99ff9f403ff1 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -151,7 +151,7 @@ struct nfsd3_readres { __be32 status; struct svc_fh fh; unsigned long count; - int eof; + __u32 eof; }; struct nfsd3_writeres { diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d64c870f998a..f4737d66ee98 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,15 +273,14 @@ struct nfsd4_open_downgrade { struct nfsd4_read { - stateid_t rd_stateid; /* request */ - u64 rd_offset; /* request */ - u32 rd_length; /* request */ - int rd_vlen; - struct file *rd_filp; - bool rd_tmp_file; + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct nfsd_file *rd_nf; - struct svc_rqst *rd_rqstp; /* response */ - struct svc_fh * rd_fhp; /* response */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh *rd_fhp; /* response */ }; struct nfsd4_readdir { @@ -538,8 +537,8 @@ struct nfsd4_copy { struct nfs4_client *cp_clp; - struct file *file_src; - struct file *file_dst; + struct nfsd_file *nf_src; + struct nfsd_file *nf_dst; stateid_t cp_stateid; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 5a00121fb219..f3462828a0e2 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -54,8 +54,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(&sb->s_fsnotify_marks); } -/* Wait until all marks queued for destruction are destroyed */ -extern void fsnotify_wait_marks_destroyed(void); /* * update the dentry->d_flags of all of inode's children to indicate if inode cares diff --git a/fs/notify/group.c b/fs/notify/group.c index 0391190305cc..133f723aca07 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -108,6 +108,7 @@ void fsnotify_put_group(struct fsnotify_group *group) if (refcount_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } +EXPORT_SYMBOL_GPL(fsnotify_put_group); /* * Create a new fsnotify_group and hold a reference for the group returned. @@ -137,6 +138,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } +EXPORT_SYMBOL_GPL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 99ddd126f6f0..1d96216dffd1 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -276,6 +276,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) queue_delayed_work(system_unbound_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } +EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object @@ -430,6 +431,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); fsnotify_free_mark(mark); } +EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. @@ -685,6 +687,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, mutex_unlock(&group->mark_mutex); return ret; } +EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found @@ -711,6 +714,7 @@ struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, spin_unlock(&conn->lock); return NULL; } +EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, @@ -809,6 +813,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, mark->group = group; WRITE_ONCE(mark->connector, NULL); } +EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before @@ -837,3 +842,4 @@ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } +EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 20c841a906f2..3aac5c917afe 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -71,7 +71,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) } /* Read, map, and pin the page. */ page = ntfs_map_page(mft_vi->i_mapping, index); - if (likely(!IS_ERR(page))) { + if (!IS_ERR(page)) { /* Catch multi sector transfer fixup errors. */ if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + ofs)))) { @@ -154,7 +154,7 @@ MFT_RECORD *map_mft_record(ntfs_inode *ni) mutex_lock(&ni->mrec_lock); m = map_mft_record_page(ni); - if (likely(!IS_ERR(m))) + if (!IS_ERR(m)) return m; mutex_unlock(&ni->mrec_lock); @@ -271,7 +271,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, m = map_mft_record(ni); /* map_mft_record() has incremented this on success. */ atomic_dec(&ni->count); - if (likely(!IS_ERR(m))) { + if (!IS_ERR(m)) { /* Verify the sequence number. */ if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { ntfs_debug("Done 1."); @@ -1303,7 +1303,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mftbmp_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft bitmap attribute."); @@ -1734,7 +1734,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mft_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mft_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mft_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft data attribute."); @@ -1776,7 +1776,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) do { rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, true); - if (likely(!IS_ERR(rl2))) + if (!IS_ERR(rl2)) break; if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { ntfs_error(vol->sb, "Failed to allocate the minimal " diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 2d3cc9e3395d..4e6a44bc654c 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -115,7 +115,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, dent_ino = MREF(mref); ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); dent_inode = ntfs_iget(vol->sb, dent_ino); - if (likely(!IS_ERR(dent_inode))) { + if (!IS_ERR(dent_inode)) { /* Consistency check. */ if (is_bad_inode(dent_inode) || MSEQNO(mref) == NTFS_I(dent_inode)->seq_no || diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 508744a93180..97932fb5179c 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -951,7 +951,7 @@ mpa_err: } /* Now combine the new and old runlists checking for overlaps. */ old_rl = ntfs_runlists_merge(old_rl, rl); - if (likely(!IS_ERR(old_rl))) + if (!IS_ERR(old_rl)) return old_rl; ntfs_free(rl); ntfs_error(vol->sb, "Failed to merge runlists."); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 29621d40f448..7dc3bc604f78 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1475,7 +1475,7 @@ not_enabled: kfree(name); /* Get the inode. */ tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { if (!IS_ERR(tmp_ino)) iput(tmp_ino); ntfs_error(vol->sb, "Failed to load $UsnJrnl."); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8de1c9d644f6..9cd0a6815933 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2049,7 +2049,8 @@ out_write_size: inode->i_mtime = inode->i_ctime = current_time(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ocfs2_update_inode_fsync_trans(handle, inode, 1); + if (handle) + ocfs2_update_inode_fsync_trans(handle, inode, 1); } if (handle) ocfs2_journal_dirty(handle, wc->w_di_bh); @@ -2146,13 +2147,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, struct ocfs2_dio_write_ctxt *dwc = NULL; struct buffer_head *di_bh = NULL; u64 p_blkno; - loff_t pos = iblock << inode->i_sb->s_blocksize_bits; + unsigned int i_blkbits = inode->i_sb->s_blocksize_bits; + loff_t pos = iblock << i_blkbits; + sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits; unsigned len, total_len = bh_result->b_size; int ret = 0, first_get_block = 0; len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); len = min(total_len, len); + /* + * bh_result->b_size is count in get_more_blocks according to write + * "pos" and "end", we need map twice to return different buffer state: + * 1. area in file size, not set NEW; + * 2. area out file size, set NEW. + * + * iblock endblk + * |--------|---------|---------|--------- + * |<-------area in file------->| + */ + + if ((iblock <= endblk) && + ((iblock + ((len - 1) >> i_blkbits)) > endblk)) + len = (endblk - iblock + 1) << i_blkbits; + mlog(0, "get block of %lu at %llu:%u req %u\n", inode->i_ino, pos, len, total_len); @@ -2236,6 +2254,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, if (desc->c_needs_zero) set_buffer_new(bh_result); + if (iblock > endblk) + set_buffer_new(bh_result); + /* May sleep in end_io. It should not happen in a irq context. So defer * it to dio work queue. */ set_buffer_defer_completion(bh_result); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index d6f7b299eb23..efeea208fdeb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, if (inode_alloc) inode_lock(inode_alloc); - if (o2info_coherent(&fi->ifi_req)) { + if (inode_alloc && o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 90c830e3758e..d8507972ee13 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc, return loc->xl_ops->xlo_check_space(loc, xi); } -static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) -{ - loc->xl_ops->xlo_add_entry(loc, name_hash); - loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); - /* - * We can't leave the new entry's xe_name_offset at zero or - * add_namevalue() will go nuts. We set it to the size of our - * storage so that it can never be less than any other entry. - */ - loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); -} - static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_info *xi) { @@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, if (rc) goto out; - if (loc->xl_entry) { - if (ocfs2_xa_can_reuse_entry(loc, xi)) { - orig_value_size = loc->xl_entry->xe_value_size; - rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); - if (rc) - goto out; - goto alloc_value; - } + if (!loc->xl_entry) { + rc = -EINVAL; + goto out; + } - if (!ocfs2_xattr_is_local(loc->xl_entry)) { - orig_clusters = ocfs2_xa_value_clusters(loc); - rc = ocfs2_xa_value_truncate(loc, 0, ctxt); - if (rc) { - mlog_errno(rc); - ocfs2_xa_cleanup_value_truncate(loc, - "overwriting", - orig_clusters); - goto out; - } + if (ocfs2_xa_can_reuse_entry(loc, xi)) { + orig_value_size = loc->xl_entry->xe_value_size; + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); + if (rc) + goto out; + goto alloc_value; + } + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + ocfs2_xa_cleanup_value_truncate(loc, + "overwriting", + orig_clusters); + goto out; } - ocfs2_xa_wipe_namevalue(loc); - } else - ocfs2_xa_add_entry(loc, name_hash); + } + ocfs2_xa_wipe_namevalue(loc); /* * If we get here, we have a blank entry. Fill it. We grow our diff --git a/fs/open.c b/fs/open.c index c60cd22cc052..b62f5c0923a8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -776,7 +776,7 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); - if (unlikely(WARN_ON(!f->f_op))) { + if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index f5834488b67d..e2ed8e08cc7a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -31,6 +31,7 @@ #include <linux/ioport.h> #include <linux/memory.h> #include <linux/sched/task.h> +#include <linux/security.h> #include <asm/sections.h> #include "internal.h" @@ -545,9 +546,14 @@ out: static int open_kcore(struct inode *inode, struct file *filp) { + int ret = security_locked_down(LOCKDOWN_KCORE); + if (!capable(CAP_SYS_RAWIO)) return -EPERM; + if (ret) + return ret; + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!filp->private_data) return -ENOMEM; diff --git a/fs/readdir.c b/fs/readdir.c index 2f6a4534e0df..6e2623e57b2e 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -20,9 +20,23 @@ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/compat.h> - #include <linux/uaccess.h> +#include <asm/unaligned.h> + +/* + * Note the "unsafe_put_user() semantics: we goto a + * label for errors. + */ +#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ + char __user *dst = (_dst); \ + const char *src = (_src); \ + size_t len = (_len); \ + unsafe_put_user(0, dst+len, label); \ + unsafe_copy_to_user(dst, src, len, label); \ +} while (0) + + int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); @@ -65,6 +79,40 @@ out: EXPORT_SYMBOL(iterate_dir); /* + * POSIX says that a dirent name cannot contain NULL or a '/'. + * + * It's not 100% clear what we should really do in this case. + * The filesystem is clearly corrupted, but returning a hard + * error means that you now don't see any of the other names + * either, so that isn't a perfect alternative. + * + * And if you return an error, what error do you use? Several + * filesystems seem to have decided on EUCLEAN being the error + * code for EFSCORRUPTED, and that may be the error to use. Or + * just EIO, which is perhaps more obvious to users. + * + * In order to see the other file names in the directory, the + * caller might want to make this a "soft" error: skip the + * entry, and return the error at the end instead. + * + * Note that this should likely do a "memchr(name, 0, len)" + * check too, since that would be filesystem corruption as + * well. However, that case can't actually confuse user space, + * which has to do a strlen() on the name anyway to find the + * filename length, and the above "soft error" worry means + * that it's probably better left alone until we have that + * issue clarified. + */ +static int verify_dirent_name(const char *name, int len) +{ + if (WARN_ON_ONCE(!len)) + return -EIO; + if (WARN_ON_ONCE(memchr(name, '/', len))) + return -EIO; + return 0; +} + +/* * Traditional linux readdir() handling.. * * "count=1" is a special case, meaning that the buffer is one @@ -173,6 +221,9 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; @@ -182,28 +233,31 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, return -EOVERFLOW; } dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + if (dirent && signal_pending(current)) + return -EINTR; + + /* + * Note! This range-checks 'previous' (which may be NULL). + * The real range was checked in getdents + */ + if (!user_access_begin(dirent, sizeof(*dirent))) goto efault; + if (dirent) + unsafe_put_user(offset, &dirent->d_off, efault_end); + dirent = buf->current_dir; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_access_end(); + buf->previous = dirent; dirent = (void __user *)dirent + reclen; buf->current_dir = dirent; buf->count -= reclen; return 0; +efault_end: + user_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -259,34 +313,38 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(ino, &dirent->d_ino)) - goto efault; - if (__put_user(0, &dirent->d_off)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (__put_user(d_type, &dirent->d_type)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) + if (dirent && signal_pending(current)) + return -EINTR; + + /* + * Note! This range-checks 'previous' (which may be NULL). + * The real range was checked in getdents + */ + if (!user_access_begin(dirent, sizeof(*dirent))) goto efault; + if (dirent) + unsafe_put_user(offset, &dirent->d_off, efault_end); + dirent = buf->current_dir; + unsafe_put_user(ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, &dirent->d_type, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_access_end(); + buf->previous = dirent; dirent = (void __user *)dirent + reclen; buf->current_dir = dirent; buf->count -= reclen; return 0; +efault_end: + user_access_end(); efault: buf->error = -EFAULT; return -EFAULT; diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 9c02d96d3a42..4075e41408b4 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -239,10 +239,8 @@ static int balance_leaf_when_delete_left(struct tree_balance *tb) static int balance_leaf_when_delete(struct tree_balance *tb, int flag) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); struct buffer_info bi; int n; - struct item_head *ih; RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, "vs- 12000: level: wrong FR %z", tb->FR[0]); @@ -251,7 +249,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), "PAP-12010: tree can not be empty"); - ih = item_head(tbS0, item_pos); buffer_info_init_tbS0(tb, &bi); /* Delete or truncate the item */ @@ -298,7 +295,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ int new_item_len, shift; - int version; ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); @@ -317,8 +313,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, min_t(int, tb->zeroes_num, ih_item_len(ih))); - version = ih_version(ih); - /* * Calculate key component, item length and body to * insert into S[0] @@ -632,7 +626,6 @@ static void balance_leaf_insert_right(struct tree_balance *tb, struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); struct buffer_info bi; - int ret; /* new item or part of it doesn't fall into R[0] */ if (n - tb->rnum[0] >= tb->item_pos) { @@ -646,13 +639,11 @@ static void balance_leaf_insert_right(struct tree_balance *tb, if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { loff_t old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version, shift; + int shift; loff_t offset; leaf_shift_right(tb, tb->rnum[0] - 1, -1); - version = ih_version(ih); - /* Remember key component and item length */ old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); @@ -698,7 +689,7 @@ static void balance_leaf_insert_right(struct tree_balance *tb, /* whole new item falls into R[0] */ /* Shift rnum[0]-1 items to R[0] */ - ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); /* Insert new item into R[0] */ buffer_info_init_right(tb, &bi); @@ -950,14 +941,12 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb, if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { int old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version; /* Move snum[i]-1 items from S[0] to S_new[i] */ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, tb->S_new[i]); /* Remember key component and item length */ - version = ih_version(ih); old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 6b0ddb2a9091..117092224111 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -376,7 +376,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, int to, int to_bytes, short *snum012, int flow) { int i; - int cur_free; int units; struct virtual_node *vn = tb->tb_vn; int total_node_size, max_node_size, current_item_size; @@ -438,7 +437,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, /* leaf level */ needed_nodes = 1; total_node_size = 0; - cur_free = max_node_size; /* start from 'from'-th item */ start_item = from; @@ -1734,14 +1732,12 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) * and Fh is its father. */ struct buffer_head *Sh, *Fh; - int maxsize, ret; + int ret; int lfree, rfree /* free space in L and R */ ; Sh = PATH_H_PBUFFER(tb->tb_path, h); Fh = PATH_H_PPARENT(tb->tb_path, h); - maxsize = MAX_CHILD_SIZE(Sh); - /* * using tb->insert_size[h], which is negative in this case, * create_virtual_node calculates: diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4517a1394c6f..4b3e3e73b512 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -891,7 +891,6 @@ static int flush_older_commits(struct super_block *s, struct list_head *entry; unsigned int trans_id = jl->j_trans_id; unsigned int other_trans_id; - unsigned int first_trans_id; find_first: /* @@ -914,8 +913,6 @@ find_first: return 0; } - first_trans_id = first_jl->j_trans_id; - entry = &first_jl->j_list; while (1) { other_jl = JOURNAL_LIST_ENTRY(entry); @@ -1351,7 +1348,7 @@ static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { struct reiserfs_journal_list *pjl; - struct reiserfs_journal_cnode *cn, *last; + struct reiserfs_journal_cnode *cn; int count; int was_jwait = 0; int was_dirty = 0; @@ -1509,7 +1506,6 @@ static int flush_journal_list(struct super_block *s, b_blocknr, __func__); } free_cnode: - last = cn; cn = cn->next; if (saved_bh) { /* @@ -1792,7 +1788,6 @@ static int flush_used_journal_lists(struct super_block *s, { unsigned long len = 0; unsigned long cur_len; - int ret; int i; int limit = 256; struct reiserfs_journal_list *tjl; @@ -1829,9 +1824,9 @@ static int flush_used_journal_lists(struct super_block *s, * transactions, but only bother if we've actually spanned * across multiple lists */ - if (flush_jl != jl) { - ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); - } + if (flush_jl != jl) + kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + flush_journal_list(s, flush_jl, 1); put_journal_list(s, flush_jl); put_journal_list(s, jl); @@ -1911,7 +1906,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { struct reiserfs_transaction_handle myth; - int flushed = 0; struct reiserfs_journal *journal = SB_JOURNAL(sb); /* @@ -1933,7 +1927,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, 1); journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); do_journal_end(&myth, FLUSH_ALL); - flushed = 1; } } @@ -3444,9 +3437,8 @@ static int remove_from_transaction(struct super_block *sb, if (cn == journal->j_last) { journal->j_last = cn->prev; } - if (bh) - remove_journal_hash(sb, journal->j_hash_table, NULL, - bh->b_blocknr, 0); + remove_journal_hash(sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); clear_buffer_journaled(bh); /* don't log this one */ if (!already_cleaned) { @@ -3988,7 +3980,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) struct buffer_head *c_bh; /* commit bh */ struct buffer_head *d_bh; /* desc bh */ int cur_write_start = 0; /* start index of current log write */ - int old_start; int i; int flush; int wait_on_commit; @@ -4245,7 +4236,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) journal->j_num_work_lists++; /* reset journal values for the next transaction */ - old_start = journal->j_start; journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(sb); diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index f5cebd70d903..7f868569d4d0 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -1322,7 +1322,7 @@ void leaf_paste_entries(struct buffer_info *bi, char *item; struct reiserfs_de_head *deh; char *insert_point; - int i, old_entry_num; + int i; struct buffer_head *bh = bi->bi_bh; if (new_entry_count == 0) @@ -1362,7 +1362,6 @@ void leaf_paste_entries(struct buffer_info *bi, put_deh_location(&deh[i], deh_location(&deh[i]) + paste_size); - old_entry_num = ih_entry_count(ih); put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); /* prepare space for pasted records */ diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 415d66ca87d1..34baf5c0f265 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -183,13 +183,12 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s) int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; int old_max = sb_oid_maxsize(disk_sb); struct reiserfs_super_block_v1 *disk_sb_v1; - __le32 *objectid_map, *new_objectid_map; + __le32 *objectid_map; int i; disk_sb_v1 = (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); objectid_map = (__le32 *) (disk_sb_v1 + 1); - new_objectid_map = (__le32 *) (disk_sb + 1); if (cur_size > new_size) { /* diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 9fed1c05f1f4..500f2000eb41 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -746,9 +746,6 @@ static void check_leaf_block_head(struct buffer_head *bh) static void check_internal_block_head(struct buffer_head *bh) { - struct block_head *blkh; - - blkh = B_BLK_HEAD(bh); if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 0037aea97d39..da9ebe33882b 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -593,7 +593,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, struct buffer_head *bh; struct path_element *last_element; int node_level, retval; - int right_neighbor_of_leaf_node; int fs_gen; struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; @@ -614,8 +613,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, pathrelse(search_path); - right_neighbor_of_leaf_node = 0; - /* * With each iteration of this loop we search through the items in the * current node, and calculate the next current node(next path element) @@ -701,7 +698,6 @@ io_error: */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; - right_neighbor_of_leaf_node = 0; /* repeat search from the root */ continue; diff --git a/fs/statfs.c b/fs/statfs.c index eea7af6f2f22..2616424012ea 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -318,19 +318,10 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { struct compat_statfs64 buf; - if (sizeof(ubuf->f_bsize) == 4) { - if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | - kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } + + if ((kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + memset(&buf, 0, sizeof(struct compat_statfs64)); buf.f_type = kbuf->f_type; buf.f_bsize = kbuf->f_bsize; diff --git a/fs/super.c b/fs/super.c index f627b7c53d2b..cfadab2cbf35 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1300,6 +1300,7 @@ int get_tree_bdev(struct fs_context *fc, mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); + blkdev_put(bdev, mode); warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); return -EBUSY; } @@ -1308,8 +1309,10 @@ int get_tree_bdev(struct fs_context *fc, fc->sget_key = bdev; s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); mutex_unlock(&bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) + if (IS_ERR(s)) { + blkdev_put(bdev, mode); return PTR_ERR(s); + } if (s->s_root) { /* Don't summarily change the RO/RW state. */ diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index eeeae0475da9..9fc14e38927f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -20,6 +20,7 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/security.h> #define TRACEFS_DEFAULT_MODE 0700 @@ -27,6 +28,25 @@ static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; +static int default_open_file(struct inode *inode, struct file *filp) +{ + struct dentry *dentry = filp->f_path.dentry; + struct file_operations *real_fops; + int ret; + + if (!dentry) + return -EINVAL; + + ret = security_locked_down(LOCKDOWN_TRACEFS); + if (ret) + return ret; + + real_fops = dentry->d_fsdata; + if (!real_fops->open) + return 0; + return real_fops->open(inode, filp); +} + static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -221,6 +241,12 @@ static int tracefs_apply_options(struct super_block *sb) return 0; } +static void tracefs_destroy_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + kfree(inode->i_fop); +} + static int tracefs_remount(struct super_block *sb, int *flags, char *data) { int err; @@ -257,6 +283,7 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations tracefs_super_operations = { .statfs = simple_statfs, .remount_fs = tracefs_remount, + .destroy_inode = tracefs_destroy_inode, .show_options = tracefs_show_options, }; @@ -387,6 +414,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { + struct file_operations *proxy_fops; struct dentry *dentry; struct inode *inode; @@ -402,8 +430,20 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, if (unlikely(!inode)) return failed_creating(dentry); + proxy_fops = kzalloc(sizeof(struct file_operations), GFP_KERNEL); + if (unlikely(!proxy_fops)) { + iput(inode); + return failed_creating(dentry); + } + + if (!fops) + fops = &tracefs_file_operations; + + dentry->d_fsdata = (void *)fops; + memcpy(proxy_fops, fops, sizeof(*proxy_fops)); + proxy_fops->open = default_open_file; inode->i_mode = mode; - inode->i_fop = fops ? fops : &tracefs_file_operations; + inode->i_fop = proxy_fops; inode->i_private = data; d_instantiate(dentry, inode); fsnotify_create(dentry->d_parent->d_inode, dentry); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index fe6d804a38dc..f9fd18670e22 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1272,21 +1272,23 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, } static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) + __u64 *start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) + *start = untagged_addr(*start); + + if (*start & ~PAGE_MASK) return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; - if (start < mmap_min_addr) + if (*start < mmap_min_addr) return -EINVAL; - if (start >= task_size) + if (*start >= task_size) return -EINVAL; - if (len > task_size - start) + if (len > task_size - *start) return -EINVAL; return 0; } @@ -1336,7 +1338,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; } - ret = validate_range(mm, uffdio_register.range.start, + ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); if (ret) goto out; @@ -1525,7 +1527,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out; - ret = validate_range(mm, uffdio_unregister.start, + ret = validate_range(mm, &uffdio_unregister.start, uffdio_unregister.len); if (ret) goto out; @@ -1676,7 +1678,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out; - ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len); if (ret) goto out; @@ -1716,7 +1718,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; /* @@ -1772,7 +1774,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, sizeof(uffdio_zeropage)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + ret = validate_range(ctx->mm, &uffdio_zeropage.range.start, uffdio_zeropage.range.len); if (ret) goto out; diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 5de296b34ab1..14fbdf22b7e7 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -28,12 +28,11 @@ xfs_get_aghdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, - int flags, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0); if (!bp) return NULL; @@ -345,7 +344,7 @@ xfs_ag_init_hdr( { struct xfs_buf *bp; - bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops); + bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops); if (!bp) return -ENOMEM; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 58fa85cec325..d6ed5d2c07c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -81,9 +81,10 @@ typedef struct xfs_alloc_arg { /* * Defines for datatype */ -#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ +#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ static inline bool xfs_alloc_is_userdata(int datatype) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b9f019603d0b..f0089e862216 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -826,32 +826,17 @@ xfs_attr_shortform_to_leaf( sf = (xfs_attr_shortform_t *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); - xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); + xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); bp = NULL; error = xfs_da_grow_inode(args, &blkno); - if (error) { - /* - * If we hit an IO error middle of the transaction inside - * grow_inode(), we may have inconsistent data. Bail out. - */ - if (error == -EIO) - goto out; - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ - memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + if (error) goto out; - } ASSERT(blkno == 0); error = xfs_attr3_leaf_create(args, blkno, &bp); - if (error) { - /* xfs_attr3_leaf_create may not have instantiated a block */ - if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0)) - goto out; - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ - memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + if (error) goto out; - } memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 054b4ce30033..02469d59c787 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -792,6 +792,7 @@ out_root_realloc: */ void xfs_bmap_local_to_extents_empty( + struct xfs_trans *tp, struct xfs_inode *ip, int whichfork) { @@ -808,6 +809,7 @@ xfs_bmap_local_to_extents_empty( ifp->if_u1.if_root = NULL; ifp->if_height = 0; XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -840,7 +842,7 @@ xfs_bmap_local_to_extents( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { - xfs_bmap_local_to_extents_empty(ip, whichfork); + xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags = XFS_ILOG_CORE; goto done; } @@ -887,7 +889,7 @@ xfs_bmap_local_to_extents( /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_bmap_local_to_extents_empty(ip, whichfork); + xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; ifp->if_u1.if_root = NULL; @@ -4042,8 +4044,12 @@ xfs_bmapi_allocate( */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK && bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + if (whichfork == XFS_DATA_FORK) { + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + else + bma->datatype |= XFS_ALLOC_USERDATA; + } if (bma->flags & XFS_BMAPI_ZERO) bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } @@ -5621,6 +5627,11 @@ xfs_bmse_merge( if (error) return error; + /* change to extent format if required after extent removal */ + error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork); + if (error) + return error; + done: xfs_iext_remove(ip, icur, 0); xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 5bb446d80542..e2798c6f3a5f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -182,7 +182,8 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); -void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); +void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, bool skip_discard); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9595ced393dc..49e4bc39e7bb 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1096,7 +1096,7 @@ xfs_dir2_sf_to_block( memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); - xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK); + xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK); dp->i_d.di_size = 0; /* diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a08dd8f40346..ac6cdca63e15 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -928,7 +928,7 @@ xfs_log_sb( xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); - xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index a43d1813c4ff..5533e48e605d 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -97,7 +97,6 @@ xchk_allocbt_rec( xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t bno; xfs_extlen_t len; - int error = 0; bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); @@ -109,7 +108,7 @@ xchk_allocbt_rec( xchk_allocbt_xref(bs->sc, bno, len); - return error; + return 0; } /* Scrub the freespace btrees for some AG. */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 93b3793bc5b3..0cab11a5d390 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -341,7 +341,6 @@ xchk_refcountbt_rec( xfs_extlen_t len; xfs_nlink_t refcount; bool has_cowflag; - int error = 0; bno = be32_to_cpu(rec->refc.rc_startblock); len = be32_to_cpu(rec->refc.rc_blockcount); @@ -366,7 +365,7 @@ xchk_refcountbt_rec( xchk_refcountbt_xref(bs->sc, bno, len, refcount); - return error; + return 0; } /* Make sure we have as many refc blocks as the rmap says. */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0910cb75b65d..4f443703065e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -864,6 +864,7 @@ xfs_alloc_file_space( xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; + xfs_fileoff_t endoffset_fsb; int nimaps; int quota_flag; int rt; @@ -891,7 +892,8 @@ xfs_alloc_file_space( imapp = &imaps[0]; nimaps = 1; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); - allocatesize_fsb = XFS_B_TO_FSB(mp, count); + endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); + allocatesize_fsb = endoffset_fsb - startoffset_fsb; /* * Allocate file space until done or until there is an error diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 120ef99d09e8..0abba171aa89 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -345,6 +345,15 @@ xfs_buf_allocate_memory( unsigned short page_count, i; xfs_off_t start, end; int error; + xfs_km_flags_t kmflag_mask = 0; + + /* + * assure zeroed buffer for non-read cases. + */ + if (!(flags & XBF_READ)) { + kmflag_mask |= KM_ZERO; + gfp_mask |= __GFP_ZERO; + } /* * for buffers that are contained within a single page, just allocate @@ -354,7 +363,8 @@ xfs_buf_allocate_memory( size = BBTOB(bp->b_length); if (size < PAGE_SIZE) { int align_mask = xfs_buftarg_dma_alignment(bp->b_target); - bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS); + bp->b_addr = kmem_alloc_io(size, align_mask, + KM_NOFS | kmflag_mask); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; @@ -2097,7 +2107,7 @@ xfs_verify_magic( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; } @@ -2115,7 +2125,7 @@ xfs_verify_magic16( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a2beee9f74da..641d07f30a27 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1443,7 +1443,7 @@ xlog_alloc_log( prev_iclog = iclog; iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, - KM_MAYFAIL); + KM_MAYFAIL | KM_ZERO); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 508319039dce..c1a514ffff55 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -127,7 +127,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL); + return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO); } /* diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ddd0bf7a4740..f1bc88f4367c 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -63,19 +63,6 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; -/* - * xfs_mount kobject. The mp kobject also serves as the per-mount parent object - * that is identified by the fsname under sysfs. - */ - -static inline struct xfs_mount * -to_mp(struct kobject *kobject) -{ - struct xfs_kobj *kobj = to_kobj(kobject); - - return container_of(kobj, struct xfs_mount, m_kobj); -} - static struct attribute *xfs_mp_attrs[] = { NULL, }; |