Diffstat (limited to 'fs'): 95 files changed, 2235 insertions, 1773 deletions
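Besides the 9p lock-timeout, xarray conversion, and CIFS reconnect/locking fixes, this series adds a new passthrough ioctl, CIFS_QUERY_INFO, which lets a caller issue an SMB3 QUERY_INFO request against an open file on a cifs mount (see the fs/cifs/cifs_ioctl.h, fs/cifs/ioctl.c and fs/cifs/smb2ops.c hunks below). As a rough illustration of how that interface might be driven from userspace, here is a minimal, hypothetical sketch. It is not part of the patch; the numeric info_type/file_info_class values (SMB2_0_INFO_FILE = 1, FileAllInformation = 18) are taken from the SMB3 protocol documents rather than from this diff, and the field handling simply mirrors what smb2_ioctl_query_info() in the patch appears to expect.

/*
 * Hypothetical example of calling the new CIFS_QUERY_INFO ioctl.
 * struct smb_query_info and the ioctl number are copied from the
 * fs/cifs/cifs_ioctl.h hunk in this diff.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <linux/types.h>

struct smb_query_info {
	__u32 info_type;
	__u32 file_info_class;
	__u32 additional_information;
	__u32 flags;
	__u32 input_buffer_length;
	__u32 output_buffer_length;
	/* char buffer[]; */
} __attribute__((packed));

#define CIFS_IOCTL_MAGIC 0xCF
#define CIFS_QUERY_INFO  _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info)

int main(int argc, char **argv)
{
	size_t buflen = 1024;	/* smb2_ioctl_query_info() rejects larger output buffers */
	struct smb_query_info *qi;
	int fd, rc;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file on cifs mount>\n", argv[0]);
		return 1;
	}

	/* header followed by buflen bytes of in/out buffer */
	qi = calloc(1, sizeof(*qi) + buflen);
	if (!qi)
		return 1;

	qi->info_type = 1;		/* SMB2_0_INFO_FILE (assumed value) */
	qi->file_info_class = 18;	/* FileAllInformation (assumed value) */
	qi->input_buffer_length = buflen;  /* rewritten on return to bytes copied back */
	qi->output_buffer_length = buflen;

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		free(qi);
		return 1;
	}

	rc = ioctl(fd, CIFS_QUERY_INFO, qi);
	if (rc == 0)
		printf("received %u bytes of query info\n",
		       qi->input_buffer_length);
	else
		perror("CIFS_QUERY_INFO");

	close(fd);
	free(qi);
	return rc;
}

Internally the kernel side builds the same open/query-info/close compound that smb2_queryfs() and the new smb2_compound_op() use, sending all three requests in a single round trip before copying the response buffer back to the caller.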
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 082d227fa56b..6261719f6f2a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - struct iattr iattr; + struct iattr iattr = { 0 }; struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 89bac3d2f05b..619128b55837 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -61,6 +61,8 @@ enum { Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, + /* Lock timeout option */ + Opt_locktimeout, /* Error token */ Opt_err }; @@ -80,6 +82,7 @@ static const match_table_t tokens = { {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, + {Opt_locktimeout, "locktimeout=%u"}, {Opt_err, NULL} }; @@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif + v9ses->session_lock_timeout = P9_LOCK_TIMEOUT; if (!opts) return 0; @@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #endif break; + case Opt_locktimeout: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + if (option < 1) { + p9_debug(P9_DEBUG_ERROR, + "locktimeout must be a greater than zero integer.\n"); + ret = -EINVAL; + continue; + } + v9ses->session_lock_timeout = (long)option * HZ; + break; + default: continue; } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 982e017acadb..129e5243a6bf 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -116,6 +116,7 @@ struct v9fs_session_info { struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; + long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index b0405d6aac85..cb6c4031af55 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -76,15 +76,6 @@ static inline int dt_type(struct p9_wstat *mistat) return rettype; } -static void p9stat_init(struct p9_wstat *stbuf) -{ - stbuf->name = NULL; - stbuf->uid = NULL; - stbuf->gid = NULL; - stbuf->muid = NULL; - stbuf->extension = NULL; -} - /** * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir * @filp: opened file structure @@ -114,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) int err = 0; struct p9_fid *fid; int buflen; - int reclen = 0; struct p9_rdir *rdir; struct kvec kvec; @@ -145,15 +135,12 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) rdir->tail = n; } while (rdir->head < rdir->tail) { - p9stat_init(&st); err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); - if (err) { + if (err <= 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); - p9stat_free(&st); return -EIO; } - reclen = st.size+2; over = !dir_emit(ctx, st.name, strlen(st.name), v9fs_qid2ino(&st.qid), dt_type(&st)); @@ -161,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) if (over) return 0; - rdir->head += reclen; - ctx->pos += reclen; + rdir->head += err; + ctx->pos += err; } } } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5f2e48d41d72..a25efa782fcc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -154,6 +154,7 @@ 
static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) uint8_t status = P9_LOCK_ERROR; int res = 0; unsigned char fl_type; + struct v9fs_session_info *v9ses; fid = filp->private_data; BUG_ON(fid == NULL); @@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; + v9ses = v9fs_inode2v9ses(file_inode(filp)); + /* * if its a blocked request and we get P9_LOCK_BLOCKED as the status * for lock request, keep on trying @@ -202,8 +205,17 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + if (schedule_timeout_interruptible(v9ses->session_lock_timeout) + != 0) break; + /* + * p9_client_lock_dotl overwrites flock.client_id with the + * server message, free and reuse the client name + */ + if (flock.client_id != fid->clnt->name) { + kfree(flock.client_id); + flock.client_id = fid->clnt->name; + } } /* map 9p status to VFS status */ @@ -216,7 +228,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; default: WARN_ONCE(1, "unknown lock status code: %d\n", status); - /* fallthough */ + /* fall through */ case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; @@ -235,6 +247,8 @@ out_unlock: locks_lock_file_wait(filp, fl); fl->fl_type = fl_type; } + if (flock.client_id != fid->clnt->name) + kfree(flock.client_id); out: return res; } @@ -269,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, &glock); if (res < 0) - return res; + goto out; /* map 9p lock type to os lock type */ switch (glock.type) { case P9_LOCK_TYPE_RDLCK: @@ -290,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = -glock.proc_id; } - kfree(glock.client_id); +out: + if (glock.client_id != fid->clnt->name) + kfree(glock.client_id); return res; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8703ce68fe9d..2955a4ea2fa8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -437,10 +437,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, pg_index); - rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + page = xa_load(&mapping->i_pages, pg_index); + if (page && !xa_is_value(page)) { misses++; if (misses > 4) break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6877a74c7469..d228f706ff3e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3784,7 +3784,7 @@ int btree_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { @@ -3909,7 +3909,7 @@ static int extent_write_cache_pages(struct address_space *mapping, pgoff_t done_index; int range_whole = 0; int scanned = 0; - int tag; + xa_mark_t tag; /* * We have to hold onto the inode so that ordered extents can do their @@ -5159,11 +5159,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) clear_page_dirty_for_io(page); xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->i_pages, - page_index(page), - PAGECACHE_TAG_DIRTY); - } + if (!PageDirty(page)) + 
__xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); diff --git a/fs/buffer.c b/fs/buffer.c index 109f55196866..d60d61e8ed7d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -562,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) EXPORT_SYMBOL(mark_buffer_dirty_inode); /* - * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * Mark the page dirty, and set it dirty in the page cache, and mark the inode * dirty. * * If warn is true, then emit a warning if the page is not uptodate and has @@ -579,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping, if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -1050,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page is tagged dirty in its radix tree. + * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is @@ -1073,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * - * mark_buffer_dirty() will set the dirty bit against the buffer, then set its - * backing page dirty, then tag the page as dirty in its address_space's radix - * tree and then attach the address_space's inode to its superblock's dirty + * mark_buffer_dirty() will set the dirty bit against the buffer, then set + * its backing page dirty, then tag the page as dirty in the page cache + * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. 
It takes bh->b_page->mapping->private_lock, diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index f1fbea947fef..3e812428ac8d 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -132,7 +132,7 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; - seq_printf(m, "\t\tSpeed: %zu bps\n", iface->speed); + seq_printf(m, "\tSpeed: %zu bps\n", iface->speed); seq_puts(m, "\t\tCapabilities: "); if (iface->rdma_capable) seq_puts(m, "rdma "); @@ -285,7 +285,7 @@ skip_rdma: if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d\t", + seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ", i, ses->serverName, ses->ses_count, ses->capabilities, ses->status); if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) @@ -296,16 +296,18 @@ skip_rdma: seq_printf(m, "\n%d) Name: %s Domain: %s Uses: %d OS:" " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB" - " session status: %d\t", + " session status: %d ", i, ses->serverName, ses->serverDomain, ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); } if (server->rdma) seq_printf(m, "RDMA\n\t"); - seq_printf(m, "TCP status: %d\n\tLocal Users To " + seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To " "Server: %d SecMode: 0x%x Req On Wire: %d", - server->tcpStatus, server->srv_count, + server->tcpStatus, + server->reconnect_instance, + server->srv_count, server->sec_mode, in_flight(server)); #ifdef CONFIG_CIFS_STATS2 @@ -352,7 +354,7 @@ skip_rdma: seq_printf(m, "\n\tServer interfaces: %zu\n", ses->iface_count); for (j = 0; j < ses->iface_count; j++) { - seq_printf(m, "\t%d)\n", j); + seq_printf(m, "\t%d)", j); cifs_dump_iface(m, &ses->iface_list[j]); } spin_unlock(&ses->iface_lock); @@ -383,6 +385,9 @@ static ssize_t cifs_stats_proc_write(struct file *file, atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); #endif /* CONFIG_CIFS_STATS2 */ + atomic_set(&tcpSesReconnectCount, 0); + atomic_set(&tconInfoReconnectCount, 0); + spin_lock(&GlobalMid_Lock); GlobalMaxActiveXid = 0; GlobalCurrentXid = 0; diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index f4f3f0853c6e..631dc1bb21c1 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -47,6 +47,29 @@ extern int cifsFYI; */ #ifdef CONFIG_CIFS_DEBUG + +/* + * When adding tracepoints and debug messages we have various choices. + * Some considerations: + * + * Use cifs_dbg(VFS, ...) for things we always want logged, and the user to see + * cifs_info(...) slightly less important, admin can filter via loglevel > 6 + * cifs_dbg(FYI, ...) minor debugging messages, off by default + * trace_smb3_* ftrace functions are preferred for complex debug messages + * intended for developers or experienced admins, off by default + */ + +/* Information level messages, minor events */ +#define cifs_info_func(ratefunc, fmt, ...) \ +do { \ + pr_info_ ## ratefunc("CIFS: " fmt, ##__VA_ARGS__); \ +} while (0) + +#define cifs_info(fmt, ...) \ +do { \ + cifs_info_func(ratelimited, fmt, ##__VA_ARGS__); \ +} while (0) + /* information message: e.g., configuration, major event */ #define cifs_dbg_func(ratefunc, type, fmt, ...) \ do { \ @@ -81,6 +104,11 @@ do { \ if (0) \ pr_debug(fmt, ##__VA_ARGS__); \ } while (0) + +#define cifs_info(fmt, ...) 
\ +do { \ + pr_info("CIFS: "fmt, ##__VA_ARGS__); \ +} while (0) #endif #endif /* _H_CIFS_DEBUG */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 6b61df117fd4..b97c74efd04a 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -304,12 +304,17 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) */ mnt = ERR_PTR(-ENOMEM); + cifs_sb = CIFS_SB(mntpt->d_sb); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) { + mnt = ERR_PTR(-EREMOTE); + goto cdda_exit; + } + /* always use tree name prefix */ full_path = build_path_from_dentry_optional_prefix(mntpt, true); if (full_path == NULL) goto cdda_exit; - cifs_sb = CIFS_SB(mntpt->d_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { mnt = ERR_CAST(tlink); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 9731d0d891e7..63d7530f2e1d 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -51,6 +51,7 @@ */ #define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */ #define CIFS_MOUNT_NO_HANDLE_CACHE 0x4000000 /* disable caching dir handles */ +#define CIFS_MOUNT_NO_DFS 0x8000000 /* disable DFS resolving */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 57ff0756e30c..d8bce2f862de 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -43,8 +43,19 @@ struct smb_snapshot_array { /* snapshots[]; */ } __packed; +struct smb_query_info { + __u32 info_type; + __u32 file_info_class; + __u32 additional_information; + __u32 flags; + __u32 input_buffer_length; + __u32 output_buffer_length; + /* char buffer[]; */ +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) +#define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 7065426b3280..7de9603c54f1 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -81,6 +81,14 @@ module_param(cifs_max_pending, uint, 0444); MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for " "CIFS/SMB1 dialect (N/A for SMB3) " "Default: 32767 Range: 2 to 32767."); +#ifdef CONFIG_CIFS_STATS2 +unsigned int slow_rsp_threshold = 1; +module_param(slow_rsp_threshold, uint, 0644); +MODULE_PARM_DESC(slow_rsp_threshold, "Amount of time (in seconds) to wait " + "before logging that a response is delayed. " + "Default: 1 (if set to 0 disables msg)."); +#endif /* STATS2 */ + module_param(enable_oplocks, bool, 0644); MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. 
Default: y/Y/1"); @@ -492,6 +500,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",unix"); else seq_puts(s, ",nounix"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) + seq_puts(s, ",nodfs"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) @@ -707,7 +717,14 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, struct cifs_mnt_data mnt_data; struct dentry *root; - cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags); + /* + * Prints in Kernel / CIFS log the attempted mount operation + * If CIFS_DEBUG && cifs_FYI + */ + if (cifsFYI) + cifs_dbg(FYI, "Devname: %s flags: %d\n", dev_name, flags); + else + cifs_info("Attempting to mount %s\n", dev_name); volume_info = cifs_get_volume_info((char *)data, dev_name, is_smb3); if (IS_ERR(volume_info)) @@ -1418,6 +1435,11 @@ init_cifs(void) #ifdef CONFIG_CIFS_STATS2 atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); + if (slow_rsp_threshold < 1) + cifs_dbg(FYI, "slow_response_threshold msgs disabled\n"); + else if (slow_rsp_threshold > 32767) + cifs_dbg(VFS, + "slow response threshold set higher than recommended (0 to 32767)\n"); #endif /* CONFIG_CIFS_STATS2 */ atomic_set(&midCount, 0); @@ -1538,11 +1560,11 @@ exit_cifs(void) cifs_proc_clean(); } -MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); +MODULE_AUTHOR("Steve French"); MODULE_LICENSE("GPL"); /* combination of LGPL + GPL source behaves as GPL */ MODULE_DESCRIPTION - ("VFS to access servers complying with the SNIA CIFS Specification " - "e.g. Samba and Windows"); + ("VFS to access SMB3 servers e.g. Samba, Macs, Azure and Windows (and " + "also older servers complying with the SNIA CIFS Specification)"); MODULE_VERSION(CIFS_VERSION); MODULE_SOFTDEP("pre: arc4"); MODULE_SOFTDEP("pre: des"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index f047e87871a1..24e265a51874 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -148,5 +148,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.13" +#define CIFS_VERSION "2.14" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9dcaed031843..ed1e0fcb69e3 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -33,6 +33,7 @@ #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ +#define SMB_PATH_MAX 260 #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -465,6 +466,11 @@ struct smb_version_operations { enum securityEnum (*select_sectype)(struct TCP_Server_Info *, enum securityEnum); int (*next_header)(char *); + /* ioctl passthrough for query_info */ + int (*ioctl_query_info)(const unsigned int xid, + struct cifs_tcon *tcon, + __le16 *path, int is_dir, + unsigned long p); }; struct smb_version_values { @@ -654,6 +660,7 @@ struct TCP_Server_Info { /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ + __u32 reconnect_instance; /* incremented on each reconnect */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ @@ -798,6 +805,7 @@ compare_mid(__u16 mid, const struct smb_hdr *smb) * a single wsize request with a single call. 
*/ #define CIFS_DEFAULT_IOSIZE (1024 * 1024) +#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) /* * Windows only supports a max of 60kb reads and 65535 byte writes. Default to @@ -924,6 +932,8 @@ struct cifs_tcon { struct list_head tcon_list; int tc_count; struct list_head rlist; /* reconnect list */ + atomic_t num_local_opens; /* num of all opens including disconnected */ + atomic_t num_remote_opens; /* num of all network opens on server */ struct list_head openFileList; spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ @@ -1072,7 +1082,8 @@ struct cifsLockInfo { __u64 offset; __u64 length; __u32 pid; - __u32 type; + __u16 type; + __u16 flags; }; /* @@ -1715,6 +1726,7 @@ GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ #ifdef CONFIG_CIFS_STATS2 GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ GLOBAL_EXTERN atomic_t totSmBufAllocCount; +extern unsigned int slow_rsp_threshold; /* number of secs before logging */ #endif GLOBAL_EXTERN atomic_t smBufAllocCount; GLOBAL_EXTERN atomic_t midCount; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 20adda4de83b..fa361bc00602 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -219,7 +219,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon); extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, - __u64 length, __u8 type, + __u64 length, __u8 type, __u16 flags, struct cifsLockInfo **conf_lock, int rw_check); extern void cifs_add_pending_open(struct cifs_fid *fid, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5657b79dbc99..f82fd342bca5 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1607,6 +1607,7 @@ cifs_readv_callback(struct mid_q_entry *mid) struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, + .rq_offset = rdata->page_offset, .rq_npages = rdata->nr_pages, .rq_pagesz = rdata->pagesz, .rq_tailsz = rdata->tailsz }; @@ -2210,6 +2211,7 @@ cifs_async_writev(struct cifs_writedata *wdata, rqst.rq_iov = iov; rqst.rq_nvec = 2; rqst.rq_pages = wdata->pages; + rqst.rq_offset = wdata->page_offset; rqst.rq_npages = wdata->nr_pages; rqst.rq_pagesz = wdata->pagesz; rqst.rq_tailsz = wdata->tailsz; @@ -5027,6 +5029,13 @@ oldQFSInfoRetry: le16_to_cpu(response_data->BytesPerSector) * le32_to_cpu(response_data-> SectorsPerAllocationUnit); + /* + * much prefer larger but if server doesn't report + * a valid size than 4K is a reasonable minimum + */ + if (FSData->f_bsize < 512) + FSData->f_bsize = 4096; + FSData->f_blocks = le32_to_cpu(response_data->TotalAllocationUnits); FSData->f_bfree = FSData->f_bavail = @@ -5107,6 +5116,13 @@ QFSInfoRetry: le32_to_cpu(response_data->BytesPerSector) * le32_to_cpu(response_data-> SectorsPerAllocationUnit); + /* + * much prefer larger but if server doesn't report + * a valid size than 4K is a reasonable minimum + */ + if (FSData->f_bsize < 512) + FSData->f_bsize = 4096; + FSData->f_blocks = le64_to_cpu(response_data->TotalAllocationUnits); FSData->f_bfree = FSData->f_bavail = @@ -5470,6 +5486,13 @@ QFSPosixRetry: data_offset); FSData->f_bsize = le32_to_cpu(response_data->BlockSize); + /* + * much prefer larger but if server doesn't report + * a valid size than 4K is a reasonable minimum + */ + if (FSData->f_bsize < 512) + FSData->f_bsize = 4096; + FSData->f_blocks = le64_to_cpu(response_data->TotalBlocks); FSData->f_bfree = diff --git 
a/fs/cifs/connect.c b/fs/cifs/connect.c index 52d71b64c0c6..d82f0cc71755 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -250,6 +250,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_ignore, "dev" }, { Opt_ignore, "mand" }, { Opt_ignore, "nomand" }, + { Opt_ignore, "relatime" }, { Opt_ignore, "_netdev" }, { Opt_err, NULL } @@ -347,7 +348,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->maxBuf = 0; server->max_read = 0; - cifs_dbg(FYI, "Reconnecting tcp session\n"); + cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); trace_smb3_reconnect(server->CurrentMid, server->hostname); /* before reconnecting the tcp session, mark the smb session (uid) @@ -2396,6 +2397,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) volume_info->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; + tcp_ses->reconnect_instance = 0; tcp_ses->lstrp = jiffies; spin_lock_init(&tcp_ses->req_lock); INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); @@ -3085,10 +3087,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) if (rc) goto out_fail; - if (volume_info->nodfs) { - tcon->Flags &= ~SMB_SHARE_IS_IN_DFS; - cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags); - } tcon->use_persistent = false; /* check if SMB2 or later, CIFS does not support persistent handles */ if (volume_info->persistent) { @@ -3663,6 +3661,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->actimeo = pvolume_info->actimeo; cifs_sb->local_nls = pvolume_info->local_nls; + if (pvolume_info->nodfs) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_DFS; if (pvolume_info->noperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) @@ -3819,6 +3819,9 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, struct dfs_info3_param *referrals = NULL; char *full_path = NULL, *ref_path = NULL, *mdata = NULL; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) + return -EREMOTE; + full_path = build_unc_path_to_root(volume_info, cifs_sb); if (IS_ERR(full_path)) return PTR_ERR(full_path); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8d41ca7bfcf1..c620d4b5d5d4 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -334,6 +334,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); + atomic_inc(&tcon->num_local_opens); /* if readable file instance put first in list*/ if (file->f_mode & FMODE_READ) @@ -395,6 +396,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) /* remove it from the lists */ list_del(&cifs_file->flist); list_del(&cifs_file->tlist); + atomic_dec(&tcon->num_local_opens); if (list_empty(&cifsi->openFileList)) { cifs_dbg(FYI, "closing last open instance for inode %p\n", @@ -864,7 +866,7 @@ int cifs_closedir(struct inode *inode, struct file *file) } static struct cifsLockInfo * -cifs_lock_init(__u64 offset, __u64 length, __u8 type) +cifs_lock_init(__u64 offset, __u64 length, __u8 type, __u16 flags) { struct cifsLockInfo *lock = kmalloc(sizeof(struct cifsLockInfo), GFP_KERNEL); @@ -874,6 +876,7 @@ cifs_lock_init(__u64 offset, __u64 length, __u8 type) lock->length = length; lock->type = type; lock->pid = current->tgid; + lock->flags = flags; INIT_LIST_HEAD(&lock->blist); init_waitqueue_head(&lock->block_q); return lock; @@ -896,7 +899,8 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) /* @rw_check : 0 - no op, 1 - read, 2 - write */ static bool cifs_find_fid_lock_conflict(struct cifs_fid_locks 
*fdlocks, __u64 offset, - __u64 length, __u8 type, struct cifsFileInfo *cfile, + __u64 length, __u8 type, __u16 flags, + struct cifsFileInfo *cfile, struct cifsLockInfo **conf_lock, int rw_check) { struct cifsLockInfo *li; @@ -918,6 +922,10 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, ((server->ops->compare_fids(cfile, cur_cfile) && current->tgid == li->pid) || type == li->type)) continue; + if (rw_check == CIFS_LOCK_OP && + (flags & FL_OFDLCK) && (li->flags & FL_OFDLCK) && + server->ops->compare_fids(cfile, cur_cfile)) + continue; if (conf_lock) *conf_lock = li; return true; @@ -927,8 +935,8 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, - __u8 type, struct cifsLockInfo **conf_lock, - int rw_check) + __u8 type, __u16 flags, + struct cifsLockInfo **conf_lock, int rw_check) { bool rc = false; struct cifs_fid_locks *cur; @@ -936,7 +944,8 @@ cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, list_for_each_entry(cur, &cinode->llist, llist) { rc = cifs_find_fid_lock_conflict(cur, offset, length, type, - cfile, conf_lock, rw_check); + flags, cfile, conf_lock, + rw_check); if (rc) break; } @@ -964,7 +973,8 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, down_read(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, offset, length, type, - &conf_lock, CIFS_LOCK_OP); + flock->fl_flags, &conf_lock, + CIFS_LOCK_OP); if (exist) { flock->fl_start = conf_lock->offset; flock->fl_end = conf_lock->offset + conf_lock->length - 1; @@ -1011,7 +1021,8 @@ try_again: down_write(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, - lock->type, &conf_lock, CIFS_LOCK_OP); + lock->type, lock->flags, &conf_lock, + CIFS_LOCK_OP); if (!exist && cinode->can_cache_brlcks) { list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); @@ -1321,7 +1332,7 @@ cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock, cifs_dbg(FYI, "Lease on file - not implemented yet\n"); if (flock->fl_flags & (~(FL_POSIX | FL_FLOCK | FL_SLEEP | - FL_ACCESS | FL_LEASE | FL_CLOSE))) + FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK))) cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags); *type = server->vals->large_lock_type; @@ -1584,7 +1595,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, if (lock) { struct cifsLockInfo *lock; - lock = cifs_lock_init(flock->fl_start, length, type); + lock = cifs_lock_init(flock->fl_start, length, type, + flock->fl_flags); if (!lock) return -ENOMEM; @@ -1653,7 +1665,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag, tcon->ses->server); - cifs_sb = CIFS_FILE_SB(file); netfid = cfile->fid.netfid; cinode = CIFS_I(file_inode(file)); @@ -2098,6 +2109,7 @@ static int cifs_writepages(struct address_space *mapping, pgoff_t end, index; struct cifs_writedata *wdata; int rc = 0; + unsigned int xid; /* * If wsize is smaller than the page cache size, default to writing @@ -2106,6 +2118,7 @@ static int cifs_writepages(struct address_space *mapping, if (cifs_sb->wsize < PAGE_SIZE) return generic_writepages(mapping, wbc); + xid = get_xid(); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -2199,6 +2212,7 @@ retry: if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 
mapping->writeback_index = index; + free_xid(xid); return rc; } @@ -2817,8 +2831,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) goto out; if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), - server->vals->exclusive_lock_type, NULL, - CIFS_WRITE_OP)) + server->vals->exclusive_lock_type, 0, + NULL, CIFS_WRITE_OP)) rc = __generic_file_write_iter(iocb, from); else rc = -EACCES; @@ -3388,7 +3402,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) down_read(&cinode->lock_sem); if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to), tcon->ses->server->vals->shared_lock_type, - NULL, CIFS_READ_OP)) + 0, NULL, CIFS_READ_OP)) rc = generic_file_read_iter(iocb, to); up_read(&cinode->lock_sem); return rc; @@ -3743,7 +3757,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct TCP_Server_Info *server; pid_t pid; + unsigned int xid; + xid = get_xid(); /* * Reads as many pages as possible from fscache. Returns -ENOBUFS * immediately if the cookie is negative @@ -3753,8 +3769,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, */ rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, &num_pages); - if (rc == 0) + if (rc == 0) { + free_xid(xid); return rc; + } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; @@ -3798,6 +3816,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, */ if (unlikely(rsize < PAGE_SIZE)) { add_credits_and_wake_if(server, credits, 0); + free_xid(xid); return 0; } @@ -3862,6 +3881,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * allocator. */ cifs_fscache_readpages_cancel(mapping->host, page_list); + free_xid(xid); return rc; } @@ -3889,8 +3909,12 @@ static int cifs_readpage_worker(struct file *file, struct page *page, else cifs_dbg(FYI, "Bytes read %d\n", rc); - file_inode(file)->i_atime = - current_time(file_inode(file)); + /* we do not want atime to be less than mtime, it broke some apps */ + file_inode(file)->i_atime = current_time(file_inode(file)); + if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime))) + file_inode(file)->i_atime = file_inode(file)->i_mtime; + else + file_inode(file)->i_atime = current_time(file_inode(file)); if (PAGE_SIZE > rc) memset(read_data + rc, 0, PAGE_SIZE - rc); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6e8765f44508..1023d78673fb 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -162,7 +162,11 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); - inode->i_atime = fattr->cf_atime; + /* we do not want atime to be less than mtime, it broke some apps */ + if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime)) + inode->i_atime = fattr->cf_mtime; + else + inode->i_atime = fattr->cf_atime; inode->i_mtime = fattr->cf_mtime; inode->i_ctime = fattr->cf_ctime; inode->i_rdev = fattr->cf_rdev; @@ -777,38 +781,53 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, sb); rc = 0; - } else if (rc == -EACCES && backup_cred(cifs_sb)) { - srchinf = kzalloc(sizeof(struct cifs_search_info), - GFP_KERNEL); - if (srchinf == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + } else if ((rc == -EACCES) && backup_cred(cifs_sb) && + (strcmp(server->vals->version_string, SMB1_VERSION_STRING) + == 0)) { + /* + 
* For SMB2 and later the backup intent flag is already + * sent if needed on open and there is no path based + * FindFirst operation to use to retry with + */ - srchinf->endOfSearch = false; + srchinf = kzalloc(sizeof(struct cifs_search_info), + GFP_KERNEL); + if (srchinf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } + + srchinf->endOfSearch = false; + if (tcon->unix_ext) + srchinf->info_level = SMB_FIND_FILE_UNIX; + else if ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) + srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + else /* no srvino useful for fallback to some netapp */ + srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO; - srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | - CIFS_SEARCH_CLOSE_AT_END | - CIFS_SEARCH_BACKUP_SEARCH; + srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | + CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_BACKUP_SEARCH; - rc = CIFSFindFirst(xid, tcon, full_path, - cifs_sb, NULL, srchflgs, srchinf, false); - if (!rc) { - data = - (FILE_ALL_INFO *)srchinf->srch_entries_start; + rc = CIFSFindFirst(xid, tcon, full_path, + cifs_sb, NULL, srchflgs, srchinf, false); + if (!rc) { + data = (FILE_ALL_INFO *)srchinf->srch_entries_start; - cifs_dir_info_to_fattr(&fattr, - (FILE_DIRECTORY_INFO *)data, cifs_sb); - fattr.cf_uniqueid = le64_to_cpu( - ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); - validinum = true; + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)data, cifs_sb); + fattr.cf_uniqueid = le64_to_cpu( + ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); + validinum = true; - cifs_buf_release(srchinf->ntwrk_buf_start); - } - kfree(srchinf); - if (rc) - goto cgii_exit; + cifs_buf_release(srchinf->ntwrk_buf_start); + } + kfree(srchinf); + if (rc) + goto cgii_exit; } else goto cgii_exit; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 54f32f9143a9..76ddd98b6298 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -32,8 +32,51 @@ #include "cifs_debug.h" #include "cifsfs.h" #include "cifs_ioctl.h" +#include "smb2proto.h" #include <linux/btrfs.h> +static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, + unsigned long p) +{ + struct inode *inode = file_inode(filep); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct dentry *dentry = filep->f_path.dentry; + unsigned char *path; + __le16 *utf16_path = NULL, root_path; + int rc = 0; + + path = build_path_from_dentry(dentry); + if (path == NULL) + return -ENOMEM; + + cifs_dbg(FYI, "%s %s\n", __func__, path); + + if (!path[0]) { + root_path = 0; + utf16_path = &root_path; + } else { + utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb); + if (!utf16_path) { + rc = -ENOMEM; + goto ici_exit; + } + } + + if (tcon->ses->server->ops->ioctl_query_info) + rc = tcon->ses->server->ops->ioctl_query_info( + xid, tcon, utf16_path, + filep->private_data ? 
0 : 1, p); + else + rc = -EOPNOTSUPP; + + ici_exit: + if (utf16_path != &root_path) + kfree(utf16_path); + kfree(path); + return rc; +} + static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, unsigned long srcfd) { @@ -123,7 +166,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) struct inode *inode = file_inode(filep); int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; - struct cifs_sb_info *cifs_sb; struct cifsFileInfo *pSMBFile = filep->private_data; struct cifs_tcon *tcon; __u64 ExtAttrBits = 0; @@ -131,7 +173,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); - cifs_sb = CIFS_SB(inode->i_sb); cifs_dbg(FYI, "cifs ioctl 0x%x\n", command); switch (command) { case FS_IOC_GETFLAGS: @@ -196,6 +237,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) case CIFS_IOC_COPYCHUNK_FILE: rc = cifs_ioctl_copychunk(xid, filep, arg); break; + case CIFS_QUERY_INFO: + rc = cifs_ioctl_query_info(xid, filep, arg); + break; case CIFS_IOC_SET_INTEGRITY: if (pSMBFile == NULL) break; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 6926685e513c..fc43d5d25d1d 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -123,6 +123,8 @@ tconInfoAlloc(void) ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid), GFP_KERNEL); spin_lock_init(&ret_buf->stat_lock); + atomic_set(&ret_buf->num_local_opens, 0); + atomic_set(&ret_buf->num_remote_opens, 0); } return ret_buf; } diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 0ffa18094335..dd10f0ce4cd5 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -33,7 +33,7 @@ /* * Identifiers for functions that use the open, operation, close pattern - * in smb2inode.c:smb2_open_op_close() + * in smb2inode.c:smb2_compound_op() */ #define SMB2_OP_SET_DELETE 1 #define SMB2_OP_SET_INFO 2 diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 1eef1791d0c4..9e7ef7ec2d70 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -38,54 +38,83 @@ #include "smb2proto.h" static int -smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, const char *full_path, - __u32 desired_access, __u32 create_disposition, - __u32 create_options, void *data, int command) +smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path, + __u32 desired_access, __u32 create_disposition, + __u32 create_options, void *ptr, int command) { - int rc, tmprc = 0; + int rc; __le16 *utf16_path = NULL; __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; - bool use_cached_root_handle = false; - - if ((strcmp(full_path, "") == 0) && (create_options == 0) && - (desired_access == FILE_READ_ATTRIBUTES) && - (create_disposition == FILE_OPEN) && - (tcon->nohandlecache == false)) { - rc = open_shroot(xid, tcon, &fid); - if (rc == 0) - use_cached_root_handle = true; - } + struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; + int num_rqst = 0; + struct smb_rqst rqst[3]; + int resp_buftype[3]; + struct kvec rsp_iov[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov[1]; + struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE]; + struct kvec close_iov[1]; + struct smb2_query_info_rsp *qi_rsp = NULL; + int flags = 0; + __u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0}; + unsigned int size[2]; + void *data[2]; + struct smb2_file_rename_info rename_info; + struct smb2_file_link_info link_info; 
+ int len; - if (use_cached_root_handle == false) { - utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); - if (!utf16_path) - return -ENOMEM; - - oparms.tcon = tcon; - oparms.desired_access = desired_access; - oparms.disposition = create_disposition; - oparms.create_options = create_options; - oparms.fid = &fid; - oparms.reconnect = false; - - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, - NULL); - if (rc) { - kfree(utf16_path); - return rc; - } - } + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + /* Open */ + utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = desired_access; + oparms.disposition = create_disposition; + oparms.create_options = create_options; + if (backup_cred(cifs_sb)) + oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; + oparms.fid = &fid; + oparms.reconnect = false; + memset(&open_iov, 0, sizeof(open_iov)); + rqst[num_rqst].rq_iov = open_iov; + rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE; + rc = SMB2_open_init(tcon, &rqst[num_rqst], &oplock, &oparms, + utf16_path); + kfree(utf16_path); + if (rc) + goto finished; + + smb2_set_next_command(server, &rqst[num_rqst++]); + + /* Operation */ switch (command) { - case SMB2_OP_DELETE: - break; case SMB2_OP_QUERY_INFO: - tmprc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, - (struct smb2_file_all_info *)data); + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[num_rqst].rq_iov = qi_iov; + rqst[num_rqst].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + break; + case SMB2_OP_DELETE: break; case SMB2_OP_MKDIR: /* @@ -94,39 +123,156 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon, */ break; case SMB2_OP_RMDIR: - tmprc = SMB2_rmdir(xid, tcon, fid.persistent_fid, - fid.volatile_fid); - break; - case SMB2_OP_RENAME: - tmprc = SMB2_rename(xid, tcon, fid.persistent_fid, - fid.volatile_fid, (__le16 *)data); - break; - case SMB2_OP_HARDLINK: - tmprc = SMB2_set_hardlink(xid, tcon, fid.persistent_fid, - fid.volatile_fid, (__le16 *)data); + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 1; + + size[0] = 8; + data[0] = &delete_pending[0]; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_DISPOSITION_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_SET_EOF: - tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid, - fid.volatile_fid, current->tgid, - (__le64 *)data, false); + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 1; + + size[0] = 8; /* sizeof __le64 */ + data[0] = ptr; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); break; case SMB2_OP_SET_INFO: - tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid, - 
fid.volatile_fid, - (FILE_BASIC_INFO *)data); + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 1; + + + size[0] = sizeof(FILE_BASIC_INFO); + data[0] = ptr; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_BASIC_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + break; + case SMB2_OP_RENAME: + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 2; + + len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); + + rename_info.ReplaceIfExists = 1; + rename_info.RootDirectory = 0; + rename_info.FileNameLength = cpu_to_le32(len); + + size[0] = sizeof(struct smb2_file_rename_info); + data[0] = &rename_info; + + size[1] = len + 2 /* null */; + data[1] = (__le16 *)ptr; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_RENAME_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); + break; + case SMB2_OP_HARDLINK: + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num_rqst].rq_iov = si_iov; + rqst[num_rqst].rq_nvec = 2; + + len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX)); + + link_info.ReplaceIfExists = 0; + link_info.RootDirectory = 0; + link_info.FileNameLength = cpu_to_le32(len); + + size[0] = sizeof(struct smb2_file_link_info); + data[0] = &link_info; + + size[1] = len + 2 /* null */; + data[1] = (__le16 *)ptr; + + rc = SMB2_set_info_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID, current->tgid, + FILE_LINK_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_next_command(server, &rqst[num_rqst]); + smb2_set_related(&rqst[num_rqst++]); break; default: cifs_dbg(VFS, "Invalid command\n"); - break; + rc = -EINVAL; } + if (rc) + goto finished; - if (use_cached_root_handle) - close_shroot(&tcon->crfid); - else - rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); - if (tmprc) - rc = tmprc; - kfree(utf16_path); + /* Close */ + memset(&close_iov, 0, sizeof(close_iov)); + rqst[num_rqst].rq_iov = close_iov; + rqst[num_rqst].rq_nvec = 1; + rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID, + COMPOUND_FID); + smb2_set_related(&rqst[num_rqst++]); + if (rc) + goto finished; + + rc = compound_send_recv(xid, ses, flags, num_rqst, rqst, + resp_buftype, rsp_iov); + + finished: + SMB2_open_free(&rqst[0]); + switch (command) { + case SMB2_OP_QUERY_INFO: + if (rc == 0) { + qi_rsp = (struct smb2_query_info_rsp *) + rsp_iov[1].iov_base; + rc = smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + le32_to_cpu(qi_rsp->OutputBufferLength), + &rsp_iov[1], sizeof(struct smb2_file_all_info), + ptr); + } + if (rqst[1].rq_iov) + SMB2_query_info_free(&rqst[1]); + if (rqst[2].rq_iov) + SMB2_close_free(&rqst[2]); + break; + case SMB2_OP_DELETE: + case SMB2_OP_MKDIR: + if (rqst[1].rq_iov) + SMB2_close_free(&rqst[1]); + break; + case SMB2_OP_HARDLINK: + case SMB2_OP_RENAME: + case SMB2_OP_RMDIR: + case SMB2_OP_SET_EOF: + case SMB2_OP_SET_INFO: + if (rqst[1].rq_iov) + SMB2_set_info_free(&rqst[1]); + if (rqst[2].rq_iov) + SMB2_close_free(&rqst[2]); + break; + } + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); return rc; } @@ -147,6 +293,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, { 
int rc; struct smb2_file_all_info *smb2_data; + __u32 create_options = 0; *adjust_tz = false; *symlink = false; @@ -155,17 +302,21 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; - rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, 0, - smb2_data, SMB2_OP_QUERY_INFO); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, + smb2_data, SMB2_OP_QUERY_INFO); if (rc == -EOPNOTSUPP) { *symlink = true; + create_options |= OPEN_REPARSE_POINT; + /* Failed on a symbolic link - query a reparse point info */ - rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_READ_ATTRIBUTES, FILE_OPEN, - OPEN_REPARSE_POINT, smb2_data, - SMB2_OP_QUERY_INFO); + rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_READ_ATTRIBUTES, FILE_OPEN, + create_options, smb2_data, + SMB2_OP_QUERY_INFO); } if (rc) goto out; @@ -180,9 +331,9 @@ int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { - return smb2_open_op_close(xid, tcon, cifs_sb, name, - FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR); + return smb2_compound_op(xid, tcon, cifs_sb, name, + FILE_WRITE_ATTRIBUTES, FILE_CREATE, + CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR); } void @@ -199,9 +350,9 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_i = CIFS_I(inode); dosattrs = cifs_i->cifsAttrs | ATTR_READONLY; data.Attributes = cpu_to_le32(dosattrs); - tmprc = smb2_open_op_close(xid, tcon, cifs_sb, name, - FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO); + tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, + FILE_WRITE_ATTRIBUTES, FILE_CREATE, + CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -210,18 +361,18 @@ int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { - return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, - NULL, SMB2_OP_RMDIR); + return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, + CREATE_NOT_FILE, + NULL, SMB2_OP_RMDIR); } int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { - return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - NULL, SMB2_OP_DELETE); + return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, + CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, + NULL, SMB2_OP_DELETE); } static int @@ -238,8 +389,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, goto smb2_rename_path; } - rc = smb2_open_op_close(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, smb2_to_name, command); + rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, + FILE_OPEN, 0, smb2_to_name, command); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -269,9 +420,10 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, bool set_alloc) { __le64 eof = cpu_to_le64(size); - return smb2_open_op_close(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, &eof, - SMB2_OP_SET_EOF); + + return smb2_compound_op(xid, tcon, cifs_sb, full_path, + FILE_WRITE_DATA, FILE_OPEN, 0, &eof, + SMB2_OP_SET_EOF); } int @@ -291,9 +443,9 @@ 
smb2_set_file_info(struct inode *inode, const char *full_path, if (IS_ERR(tlink)) return PTR_ERR(tlink); - rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, - FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, - SMB2_OP_SET_INFO); + rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path, + FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, + SMB2_OP_SET_INFO); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 20a2d304c603..d47b7f5dfa6c 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -288,7 +288,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_FLT_BUFFER_TOO_SMALL, -ENOBUFS, "STATUS_FLT_BUFFER_TOO_SMALL"}, {STATUS_FVE_PARTIAL_METADATA, -EIO, "STATUS_FVE_PARTIAL_METADATA"}, {STATUS_UNSUCCESSFUL, -EIO, "STATUS_UNSUCCESSFUL"}, - {STATUS_NOT_IMPLEMENTED, -ENOSYS, "STATUS_NOT_IMPLEMENTED"}, + {STATUS_NOT_IMPLEMENTED, -EOPNOTSUPP, "STATUS_NOT_IMPLEMENTED"}, {STATUS_INVALID_INFO_CLASS, -EIO, "STATUS_INVALID_INFO_CLASS"}, {STATUS_INFO_LENGTH_MISMATCH, -EIO, "STATUS_INFO_LENGTH_MISMATCH"}, {STATUS_ACCESS_VIOLATION, -EACCES, "STATUS_ACCESS_VIOLATION"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 89985a0a6819..f85fc5aa2710 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -74,6 +74,12 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, int *val, rc = 0; spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); + + /* eg found case where write overlapping reconnect messed up credits */ + if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) + trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, + server->hostname, *val); + *val += add; if (*val > 65000) { *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ @@ -104,7 +110,12 @@ smb2_set_credits(struct TCP_Server_Info *server, const int val) { spin_lock(&server->req_lock); server->credits = val; + if (val == 1) + server->reconnect_instance++; spin_unlock(&server->req_lock); + /* don't log while holding the lock */ + if (val == 1) + cifs_dbg(FYI, "set credits to 1 due to smb2 reconnect\n"); } static int * @@ -270,6 +281,31 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) } static unsigned int +smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int wsize; + + /* start with specified wsize, or default */ + wsize = volume_info->wsize ? volume_info->wsize : SMB3_DEFAULT_IOSIZE; + wsize = min_t(unsigned int, wsize, server->max_write); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->rdma) { + if (server->sign) + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_fragmented_send_size); + else + wsize = min_t(unsigned int, + wsize, server->smbd_conn->max_readwrite_size); + } +#endif + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); + + return wsize; +} + +static unsigned int smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) { struct TCP_Server_Info *server = tcon->ses->server; @@ -295,6 +331,31 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) return rsize; } +static unsigned int +smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) +{ + struct TCP_Server_Info *server = tcon->ses->server; + unsigned int rsize; + + /* start with specified rsize, or default */ + rsize = volume_info->rsize ? 
volume_info->rsize : SMB3_DEFAULT_IOSIZE; + rsize = min_t(unsigned int, rsize, server->max_read); +#ifdef CONFIG_CIFS_SMB_DIRECT + if (server->rdma) { + if (server->sign) + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_fragmented_recv_size); + else + rsize = min_t(unsigned int, + rsize, server->smbd_conn->max_readwrite_size); + } +#endif + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); + + return rsize; +} static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, @@ -962,6 +1023,9 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, "\nBytes read: %llu Bytes written: %llu", (long long)(tcon->bytes_read), (long long)(tcon->bytes_written)); + seq_printf(m, "\nOpen files: %d total (local), %d open on server", + atomic_read(&tcon->num_local_opens), + atomic_read(&tcon->num_remote_opens)); seq_printf(m, "\nTreeConnects: %d total %d failed", atomic_read(&sent[SMB2_TREE_CONNECT_HE]), atomic_read(&failed[SMB2_TREE_CONNECT_HE])); @@ -1057,6 +1121,131 @@ req_res_key_exit: return rc; } +static int +smb2_ioctl_query_info(const unsigned int xid, + struct cifs_tcon *tcon, + __le16 *path, int is_dir, + unsigned long p) +{ + struct cifs_ses *ses = tcon->ses; + char __user *arg = (char __user *)p; + struct smb_query_info qi; + struct smb_query_info __user *pqi; + int rc = 0; + int flags = 0; + struct smb2_query_info_rsp *rsp = NULL; + void *buffer = NULL; + struct smb_rqst rqst[3]; + int resp_buftype[3]; + struct kvec rsp_iov[3]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct cifs_open_parms oparms; + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_fid fid; + struct kvec qi_iov[1]; + struct kvec close_iov[1]; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) + return -EFAULT; + + if (qi.output_buffer_length > 1024) + return -EINVAL; + + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + buffer = kmalloc(qi.output_buffer_length, GFP_KERNEL); + if (buffer == NULL) + return -ENOMEM; + + if (copy_from_user(buffer, arg + sizeof(struct smb_query_info), + qi.output_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } + + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + + memset(&oparms, 0, sizeof(oparms)); + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_ATTRIBUTES | READ_CONTROL; + oparms.disposition = FILE_OPEN; + if (is_dir) + oparms.create_options = CREATE_NOT_FILE; + else + oparms.create_options = CREATE_NOT_DIR; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, path); + if (rc) + goto iqinf_exit; + smb2_set_next_command(ses->server, &rqst[0]); + + /* Query */ + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, + qi.file_info_class, qi.info_type, + qi.additional_information, + qi.input_buffer_length, + qi.output_buffer_length, buffer); + if (rc) + goto iqinf_exit; + smb2_set_next_command(ses->server, &rqst[1]); + smb2_set_related(&rqst[1]); + + /* Close */ + memset(&close_iov, 0, sizeof(close_iov)); + rqst[2].rq_iov = close_iov; + rqst[2].rq_nvec = 1; + + rc = SMB2_close_init(tcon, &rqst[2], 
COMPOUND_FID, COMPOUND_FID); + if (rc) + goto iqinf_exit; + smb2_set_related(&rqst[2]); + + rc = compound_send_recv(xid, ses, flags, 3, rqst, + resp_buftype, rsp_iov); + if (rc) + goto iqinf_exit; + pqi = (struct smb_query_info __user *)arg; + rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } + + iqinf_exit: + kfree(buffer); + SMB2_open_free(&rqst[0]); + SMB2_query_info_free(&rqst[1]); + SMB2_close_free(&rqst[2]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); + free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + return rc; +} + static ssize_t smb2_copychunk_range(const unsigned int xid, struct cifsFileInfo *srcfile, @@ -1301,7 +1490,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, } return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, &eof, false); + cfile->fid.volatile_fid, cfile->pid, &eof); } static int @@ -1556,7 +1745,7 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, CIFS_CACHE_READ(cinode) ? 1 : 0); } -static void +void smb2_set_related(struct smb_rqst *rqst) { struct smb2_sync_hdr *shdr; @@ -1567,7 +1756,7 @@ smb2_set_related(struct smb_rqst *rqst) char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0}; -static void +void smb2_set_next_command(struct TCP_Server_Info *server, struct smb_rqst *rqst) { struct smb2_sync_hdr *shdr; @@ -1610,7 +1799,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, flags |= CIFS_TRANSFORM_REQ; memset(rqst, 0, sizeof(rqst)); - memset(resp_buftype, 0, sizeof(resp_buftype)); + resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER; memset(rsp_iov, 0, sizeof(rsp_iov)); memset(&open_iov, 0, sizeof(open_iov)); @@ -1636,7 +1825,8 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, FS_FULL_SIZE_INFORMATION, SMB2_O_INFO_FILESYSTEM, 0, - sizeof(struct smb2_fs_full_size_info)); + sizeof(struct smb2_fs_full_size_info), 0, + NULL); if (rc) goto qfs_exit; smb2_set_next_command(server, &rqst[1]); @@ -3303,6 +3493,7 @@ struct smb_version_operations smb20_operations = { .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ .next_header = smb2_next_header, + .ioctl_query_info = smb2_ioctl_query_info, }; struct smb_version_operations smb21_operations = { @@ -3398,6 +3589,7 @@ struct smb_version_operations smb21_operations = { .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ .next_header = smb2_next_header, + .ioctl_query_info = smb2_ioctl_query_info, }; struct smb_version_operations smb30_operations = { @@ -3425,8 +3617,8 @@ struct smb_version_operations smb30_operations = { .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, - .negotiate_wsize = smb2_negotiate_wsize, - .negotiate_rsize = smb2_negotiate_rsize, + .negotiate_wsize = smb3_negotiate_wsize, + .negotiate_rsize = smb3_negotiate_rsize, .sess_setup = SMB2_sess_setup, .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, @@ -3502,6 +3694,7 @@ struct smb_version_operations smb30_operations = { .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ 
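
The smb2_ioctl_query_info() path above builds a three-request compound (open, query_info, close): smb2_set_next_command() records where the following request begins, the seven-byte smb2_padding buffer keeps each compound member 8-byte aligned, and smb2_set_related() marks the later requests so the server applies them to the handle produced by the open (COMPOUND_FID). Below is a minimal userspace sketch of that chaining idea only, with a simplified header and made-up field names, not the real SMB2 sync header layout:

/* Sketch: chain three requests, padding each to an 8-byte boundary and
 * recording the offset of the next one; later requests are "related" so
 * they reuse the handle opened earlier in the chain.  Hypothetical layout. */
#include <stdint.h>
#include <stdio.h>

struct fake_hdr {
	uint32_t next_command;	/* offset to the next request, 0 for the last */
	uint32_t flags;		/* e.g. a "related operations" flag */
	uint32_t body_len;	/* unpadded length of this request */
};

#define FLAG_RELATED 0x2

static uint32_t pad8(uint32_t len)
{
	return (len + 7) & ~7u;		/* round up to a multiple of 8 */
}

int main(void)
{
	struct fake_hdr rqst[3] = {
		{ .body_len = 89 },	/* open  */
		{ .body_len = 41 },	/* query */
		{ .body_len = 24 },	/* close */
	};
	uint32_t off = 0;
	int i;

	for (i = 0; i < 3; i++) {
		if (i > 0)
			rqst[i].flags |= FLAG_RELATED;
		if (i < 2)
			rqst[i].next_command = pad8(rqst[i].body_len);
		printf("rqst[%d]: start=%u next_command=%u related=%d\n",
		       i, (unsigned)off, (unsigned)rqst[i].next_command,
		       !!(rqst[i].flags & FLAG_RELATED));
		off += pad8(rqst[i].body_len);
	}
	return 0;
}
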
.next_header = smb2_next_header, + .ioctl_query_info = smb2_ioctl_query_info, }; struct smb_version_operations smb311_operations = { @@ -3529,8 +3722,8 @@ struct smb_version_operations smb311_operations = { .downgrade_oplock = smb2_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, - .negotiate_wsize = smb2_negotiate_wsize, - .negotiate_rsize = smb2_negotiate_rsize, + .negotiate_wsize = smb3_negotiate_wsize, + .negotiate_rsize = smb3_negotiate_rsize, .sess_setup = SMB2_sess_setup, .logoff = SMB2_logoff, .tree_connect = SMB2_tcon, @@ -3607,6 +3800,7 @@ struct smb_version_operations smb311_operations = { .set_acl = set_smb2_acl, #endif /* CIFS_ACL */ .next_header = smb2_next_header, + .ioctl_query_info = smb2_ioctl_query_info, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f54d07bda067..7d7b016fe8bb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1478,7 +1478,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* SMB2 TREE_CONNECT request must be called with TreeId == 0 */ tcon->tid = 0; - + atomic_set(&tcon->num_remote_opens, 0); rc = smb2_plain_req_init(SMB2_TREE_CONNECT, tcon, (void **) &req, &total_len); if (rc) { @@ -2243,10 +2243,12 @@ SMB2_open_free(struct smb_rqst *rqst) { int i; - cifs_small_buf_release(rqst->rq_iov[0].iov_base); - for (i = 1; i < rqst->rq_nvec; i++) - if (rqst->rq_iov[i].iov_base != smb2_padding) - kfree(rqst->rq_iov[i].iov_base); + if (rqst && rqst->rq_iov) { + cifs_small_buf_release(rqst->rq_iov[0].iov_base); + for (i = 1; i < rqst->rq_nvec; i++) + if (rqst->rq_iov[i].iov_base != smb2_padding) + kfree(rqst->rq_iov[i].iov_base); + } } int @@ -2261,7 +2263,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, struct cifs_ses *ses = tcon->ses; struct kvec iov[SMB2_CREATE_IOV_SIZE]; struct kvec rsp_iov = {NULL, 0}; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; int rc = 0; int flags = 0; @@ -2303,6 +2305,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, ses->Suid, oparms->create_options, oparms->desired_access); + atomic_inc(&tcon->num_remote_opens); oparms->fid->persistent_fid = rsp->PersistentFileId; oparms->fid->volatile_fid = rsp->VolatileFileId; @@ -2474,13 +2477,13 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - *out_data = kmalloc(*plen, GFP_KERNEL); + *out_data = kmemdup((char *)rsp + le32_to_cpu(rsp->OutputOffset), + *plen, GFP_KERNEL); if (*out_data == NULL) { rc = -ENOMEM; goto ioctl_exit; } - memcpy(*out_data, (char *)rsp + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -2535,7 +2538,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, void SMB2_close_free(struct smb_rqst *rqst) { - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } int @@ -2547,7 +2551,7 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_ses *ses = tcon->ses; struct kvec iov[1]; struct kvec rsp_iov; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; int rc = 0; cifs_dbg(FYI, "Close\n"); @@ -2577,6 +2581,8 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, goto close_exit; } + atomic_dec(&tcon->num_remote_opens); + /* BB FIXME - decode close response, update inode for caching */ close_exit: @@ -2627,10 +2633,10 @@ 
smb2_validate_iov(unsigned int offset, unsigned int buffer_length, * If SMB buffer fields are valid, copy into temporary buffer to hold result. * Caller must free buffer. */ -static int -validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, - struct kvec *iov, unsigned int minbufsize, - char *data) +int +smb2_validate_and_copy_iov(unsigned int offset, unsigned int buffer_length, + struct kvec *iov, unsigned int minbufsize, + char *data) { char *begin_of_buf = offset + (char *)iov->iov_base; int rc; @@ -2651,7 +2657,7 @@ int SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type, u32 additional_info, - size_t output_len) + size_t output_len, size_t input_len, void *input) { struct smb2_query_info_req *req; struct kvec *iov = rqst->rq_iov; @@ -2669,23 +2675,25 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, req->VolatileFileId = volatile_fid; req->AdditionalInformation = cpu_to_le32(additional_info); - /* - * We do not use the input buffer (do not send extra byte) - */ - req->InputBufferOffset = 0; - req->OutputBufferLength = cpu_to_le32(output_len); + if (input_len) { + req->InputBufferLength = cpu_to_le32(input_len); + /* total_len for smb query request never close to le16 max */ + req->InputBufferOffset = cpu_to_le16(total_len - 1); + memcpy(req->Buffer, input, input_len); + } iov[0].iov_base = (char *)req; /* 1 for Buffer */ - iov[0].iov_len = total_len - 1; + iov[0].iov_len = total_len - 1 + input_len; return 0; } void SMB2_query_info_free(struct smb_rqst *rqst) { - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } static int @@ -2699,7 +2707,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, struct kvec iov[1]; struct kvec rsp_iov; int rc = 0; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; struct cifs_ses *ses = tcon->ses; int flags = 0; @@ -2718,7 +2726,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_query_info_init(tcon, &rqst, persistent_fid, volatile_fid, info_class, info_type, additional_info, - output_len); + output_len, 0, NULL); if (rc) goto qinf_exit; @@ -2746,9 +2754,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, } } - rc = validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), - &rsp_iov, min_len, *data); + rc = smb2_validate_and_copy_iov(le16_to_cpu(rsp->OutputBufferOffset), + le32_to_cpu(rsp->OutputBufferLength), + &rsp_iov, min_len, *data); qinf_exit: SMB2_query_info_free(&rqst); @@ -3754,45 +3762,22 @@ qdir_exit: return rc; } -static int -send_set_info(const unsigned int xid, struct cifs_tcon *tcon, +int +SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class, - u8 info_type, u32 additional_info, unsigned int num, + u8 info_type, u32 additional_info, void **data, unsigned int *size) { - struct smb_rqst rqst; struct smb2_set_info_req *req; - struct smb2_set_info_rsp *rsp = NULL; - struct kvec *iov; - struct kvec rsp_iov; - int rc = 0; - int resp_buftype; - unsigned int i; - struct cifs_ses *ses = tcon->ses; - int flags = 0; - unsigned int total_len; - - if (!ses || !(ses->server)) - return -EIO; - - if (!num) - return -EINVAL; - - iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL); - if (!iov) - return -ENOMEM; + struct kvec *iov = rqst->rq_iov; + unsigned int 
i, total_len; + int rc; rc = smb2_plain_req_init(SMB2_SET_INFO, tcon, (void **) &req, &total_len); - if (rc) { - kfree(iov); + if (rc) return rc; - } - - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; req->sync_hdr.ProcessId = cpu_to_le32(pid); - req->InfoType = info_type; req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; @@ -3810,19 +3795,66 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, /* 1 for Buffer */ iov[0].iov_len = total_len - 1; - for (i = 1; i < num; i++) { + for (i = 1; i < rqst->rq_nvec; i++) { le32_add_cpu(&req->BufferLength, size[i]); iov[i].iov_base = (char *)data[i]; iov[i].iov_len = size[i]; } + return 0; +} + +void +SMB2_set_info_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) + cifs_buf_release(rqst->rq_iov[0].iov_base); /* request */ +} + +static int +send_set_info(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, u32 pid, u8 info_class, + u8 info_type, u32 additional_info, unsigned int num, + void **data, unsigned int *size) +{ + struct smb_rqst rqst; + struct smb2_set_info_rsp *rsp = NULL; + struct kvec *iov; + struct kvec rsp_iov; + int rc = 0; + int resp_buftype; + struct cifs_ses *ses = tcon->ses; + int flags = 0; + + if (!ses || !(ses->server)) + return -EIO; + + if (!num) + return -EINVAL; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + iov = kmalloc_array(num, sizeof(struct kvec), GFP_KERNEL); + if (!iov) + return -ENOMEM; + memset(&rqst, 0, sizeof(struct smb_rqst)); rqst.rq_iov = iov; rqst.rq_nvec = num; + rc = SMB2_set_info_init(tcon, &rqst, persistent_fid, volatile_fid, pid, + info_class, info_type, additional_info, + data, size); + if (rc) { + kfree(iov); + return rc; + } + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_buf_release(req); + SMB2_set_info_free(&rqst); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; if (rc != 0) { @@ -3837,88 +3869,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, } int -SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, __le16 *target_file) -{ - struct smb2_file_rename_info info; - void **data; - unsigned int size[2]; - int rc; - int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); - - data = kmalloc_array(2, sizeof(void *), GFP_KERNEL); - if (!data) - return -ENOMEM; - - info.ReplaceIfExists = 1; /* 1 = replace existing target with new */ - /* 0 = fail if target already exists */ - info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) 
*/ - info.FileNameLength = cpu_to_le32(len); - - data[0] = &info; - size[0] = sizeof(struct smb2_file_rename_info); - - data[1] = target_file; - size[1] = len + 2 /* null */; - - rc = send_set_info(xid, tcon, persistent_fid, volatile_fid, - current->tgid, FILE_RENAME_INFORMATION, SMB2_O_INFO_FILE, - 0, 2, data, size); - kfree(data); - return rc; -} - -int -SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid) -{ - __u8 delete_pending = 1; - void *data; - unsigned int size; - - data = &delete_pending; - size = 1; /* sizeof __u8 */ - - return send_set_info(xid, tcon, persistent_fid, volatile_fid, - current->tgid, FILE_DISPOSITION_INFORMATION, SMB2_O_INFO_FILE, - 0, 1, &data, &size); -} - -int -SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, __le16 *target_file) -{ - struct smb2_file_link_info info; - void **data; - unsigned int size[2]; - int rc; - int len = (2 * UniStrnlen((wchar_t *)target_file, PATH_MAX)); - - data = kmalloc_array(2, sizeof(void *), GFP_KERNEL); - if (!data) - return -ENOMEM; - - info.ReplaceIfExists = 0; /* 1 = replace existing link with new */ - /* 0 = fail if link already exists */ - info.RootDirectory = 0; /* MBZ for network ops (why does spec say?) */ - info.FileNameLength = cpu_to_le32(len); - - data[0] = &info; - size[0] = sizeof(struct smb2_file_link_info); - - data[1] = target_file; - size[1] = len + 2 /* null */; - - rc = send_set_info(xid, tcon, persistent_fid, volatile_fid, - current->tgid, FILE_LINK_INFORMATION, SMB2_O_INFO_FILE, - 0, 2, data, size); - kfree(data); - return rc; -} - -int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc) + u64 volatile_fid, u32 pid, __le64 *eof) { struct smb2_file_eof_info info; void *data; @@ -3929,28 +3881,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, data = &info; size = sizeof(struct smb2_file_eof_info); - if (is_falloc) - return send_set_info(xid, tcon, persistent_fid, volatile_fid, - pid, FILE_ALLOCATION_INFORMATION, SMB2_O_INFO_FILE, - 0, 1, &data, &size); - else - return send_set_info(xid, tcon, persistent_fid, volatile_fid, + return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE, 0, 1, &data, &size); } int -SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, FILE_BASIC_INFO *buf) -{ - unsigned int size; - size = sizeof(FILE_BASIC_INFO); - return send_set_info(xid, tcon, persistent_fid, volatile_fid, - current->tgid, FILE_BASIC_INFORMATION, SMB2_O_INFO_FILE, - 0, 1, (void **)&buf, &size); -} - -int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct cifs_ntsd *pnntsd, int pacllen, int aclflag) @@ -4350,6 +4286,8 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, struct kvec iov[1]; struct kvec rsp_iov; int resp_buf_type; + __u64 *please_key_high; + __u64 *please_key_low; cifs_dbg(FYI, "SMB2_lease_break\n"); rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req, @@ -4379,10 +4317,16 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); + please_key_low = (__u64 *)req->LeaseKey; + please_key_high = (__u64 *)(req->LeaseKey+8); if (rc) { cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE); + 
trace_smb3_lease_err(le32_to_cpu(lease_state), tcon->tid, + ses->Suid, *please_key_low, *please_key_high, rc); cifs_dbg(FYI, "Send error in Lease Break = %d\n", rc); - } + } else + trace_smb3_lease_done(le32_to_cpu(lease_state), tcon->tid, + ses->Suid, *please_key_low, *please_key_high); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 8fb7887f2b3d..f753f424d7f1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -613,6 +613,8 @@ struct smb2_tree_disconnect_rsp { #define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83 #define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C +/* Flag (SMB3 open response) values */ +#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 /* * Maximum number of iovs we need for an open/create request. @@ -650,7 +652,7 @@ struct smb2_create_rsp { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 89 */ __u8 OplockLevel; - __u8 Reserved; + __u8 Flag; /* 0x01 if reparse point */ __le32 CreateAction; __le64 CreationTime; __le64 LastAccessTime; @@ -1174,6 +1176,15 @@ struct smb2_query_info_rsp { __u8 Buffer[1]; } __packed; +/* + * Maximum number of iovs we need for a set-info request. + * The largest one is rename/hardlink + * [0] : struct smb2_set_info_req + smb2_file_[rename|link]_info + * [1] : path + * [2] : compound padding + */ +#define SMB2_SET_INFO_IOV_SIZE 3 + struct smb2_set_info_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index b4076577eeb7..9f4e9ed9ce53 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -116,6 +116,9 @@ extern void smb2_reconnect_server(struct work_struct *work); extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server); extern unsigned long smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst); +extern void smb2_set_next_command(struct TCP_Server_Info *server, + struct smb_rqst *rqst); +extern void smb2_set_related(struct smb_rqst *rqst); /* * SMB2 Worker functions - most of protocol specific implementation details @@ -160,7 +163,8 @@ extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, u64 persistent_fid, u64 volatile_fid, u8 info_class, u8 info_type, - u32 additional_info, size_t output_len); + u32 additional_info, size_t output_len, + size_t input_len, void *input); extern void SMB2_query_info_free(struct smb_rqst *rqst); extern int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, @@ -179,20 +183,14 @@ extern int SMB2_echo(struct TCP_Server_Info *server); extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int index, struct cifs_search_info *srch_inf); -extern int SMB2_rename(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - __le16 *target_file); -extern int SMB2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid); -extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - __le16 *target_file); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, - __le64 *eof, bool is_fallocate); -extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, - FILE_BASIC_INFO *buf); + __le64 *eof); 
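
The lease-break change above logs the 128-bit lease key as two 64-bit halves (please_key_low / please_key_high) in the new smb3_lease_done and smb3_lease_err trace points. A tiny standalone sketch of the same split, using memcpy rather than the pointer casts in the patch and a made-up key value:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 16-byte lease key as it would sit in the request buffer */
	uint8_t lease_key[16] = {
		0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
		0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
	};
	uint64_t key_low, key_high;

	memcpy(&key_low, lease_key, sizeof(key_low));	    /* first 8 bytes */
	memcpy(&key_high, lease_key + 8, sizeof(key_high)); /* last 8 bytes  */

	/* host-endian halves, printed like the "lease_key=0x%llx%llx" format */
	printf("lease_key=0x%llx%llx\n",
	       (unsigned long long)key_high, (unsigned long long)key_low);
	return 0;
}
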
+extern int SMB2_set_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 pid, + u8 info_class, u8 info_type, u32 additional_info, + void **data, unsigned int *size); +extern void SMB2_set_info_free(struct smb_rqst *rqst); extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct cifs_ntsd *pnntsd, int pacllen, int aclflag); @@ -232,6 +230,10 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, extern int smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); +extern int smb2_validate_and_copy_iov(unsigned int offset, + unsigned int buffer_length, + struct kvec *iov, + unsigned int minbufsize, char *data); extern void smb2_copy_fs_info_to_kstatfs( struct smb2_fs_full_size_info *pfs_inf, struct kstatfs *kst); diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 5fdb9a509a97..5e282368cc4a 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -2295,8 +2295,12 @@ static void smbd_mr_recovery_work(struct work_struct *work) int rc; list_for_each_entry(smbdirect_mr, &info->mr_list, list) { - if (smbdirect_mr->state == MR_INVALIDATED || - smbdirect_mr->state == MR_ERROR) { + if (smbdirect_mr->state == MR_INVALIDATED) + ib_dma_unmap_sg( + info->id->device, smbdirect_mr->sgl, + smbdirect_mr->sgl_count, + smbdirect_mr->dir); + else if (smbdirect_mr->state == MR_ERROR) { /* recover this MR entry */ rc = ib_dereg_mr(smbdirect_mr->mr); @@ -2320,25 +2324,21 @@ static void smbd_mr_recovery_work(struct work_struct *work) smbd_disconnect_rdma_connection(info); continue; } + } else + /* This MR is being used, don't recover it */ + continue; - if (smbdirect_mr->state == MR_INVALIDATED) - ib_dma_unmap_sg( - info->id->device, smbdirect_mr->sgl, - smbdirect_mr->sgl_count, - smbdirect_mr->dir); - - smbdirect_mr->state = MR_READY; + smbdirect_mr->state = MR_READY; - /* smbdirect_mr->state is updated by this function - * and is read and updated by I/O issuing CPUs trying - * to get a MR, the call to atomic_inc_return - * implicates a memory barrier and guarantees this - * value is updated before waking up any calls to - * get_mr() from the I/O issuing CPUs - */ - if (atomic_inc_return(&info->mr_ready_count) == 1) - wake_up_interruptible(&info->wait_mr); - } + /* smbdirect_mr->state is updated by this function + * and is read and updated by I/O issuing CPUs trying + * to get a MR, the call to atomic_inc_return + * implicates a memory barrier and guarantees this + * value is updated before waking up any calls to + * get_mr() from the I/O issuing CPUs + */ + if (atomic_inc_return(&info->mr_ready_count) == 1) + wake_up_interruptible(&info->wait_mr); } } diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index d4aed5217a56..cce8414fe7ec 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -460,6 +460,85 @@ DEFINE_EVENT(smb3_open_done_class, smb3_##name, \ DEFINE_SMB3_OPEN_DONE_EVENT(open_done); DEFINE_SMB3_OPEN_DONE_EVENT(posix_mkdir_done); + +DECLARE_EVENT_CLASS(smb3_lease_done_class, + TP_PROTO(__u32 lease_state, + __u32 tid, + __u64 sesid, + __u64 lease_key_low, + __u64 lease_key_high), + TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high), + TP_STRUCT__entry( + __field(__u32, lease_state) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, lease_key_low) + __field(__u64, lease_key_high) + ), + TP_fast_assign( + __entry->lease_state 
= lease_state; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->lease_key_low = lease_key_low; + __entry->lease_key_high = lease_key_high; + ), + TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x", + __entry->sesid, __entry->tid, __entry->lease_key_high, + __entry->lease_key_low, __entry->lease_state) +) + +#define DEFINE_SMB3_LEASE_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_lease_done_class, smb3_##name, \ + TP_PROTO(__u32 lease_state, \ + __u32 tid, \ + __u64 sesid, \ + __u64 lease_key_low, \ + __u64 lease_key_high), \ + TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high)) + +DEFINE_SMB3_LEASE_DONE_EVENT(lease_done); + +DECLARE_EVENT_CLASS(smb3_lease_err_class, + TP_PROTO(__u32 lease_state, + __u32 tid, + __u64 sesid, + __u64 lease_key_low, + __u64 lease_key_high, + int rc), + TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc), + TP_STRUCT__entry( + __field(__u32, lease_state) + __field(__u32, tid) + __field(__u64, sesid) + __field(__u64, lease_key_low) + __field(__u64, lease_key_high) + __field(int, rc) + ), + TP_fast_assign( + __entry->lease_state = lease_state; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->lease_key_low = lease_key_low; + __entry->lease_key_high = lease_key_high; + __entry->rc = rc; + ), + TP_printk("sid=0x%llx tid=0x%x lease_key=0x%llx%llx lease_state=0x%x rc=%d", + __entry->sesid, __entry->tid, __entry->lease_key_high, + __entry->lease_key_low, __entry->lease_state, __entry->rc) +) + +#define DEFINE_SMB3_LEASE_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \ + TP_PROTO(__u32 lease_state, \ + __u32 tid, \ + __u64 sesid, \ + __u64 lease_key_low, \ + __u64 lease_key_high, \ + int rc), \ + TP_ARGS(lease_state, tid, sesid, lease_key_low, lease_key_high, rc)) + +DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); + DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, char *hostname), @@ -486,6 +565,36 @@ DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \ DEFINE_SMB3_RECONNECT_EVENT(reconnect); DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); +DECLARE_EVENT_CLASS(smb3_credit_class, + TP_PROTO(__u64 currmid, + char *hostname, + int credits), + TP_ARGS(currmid, hostname, credits), + TP_STRUCT__entry( + __field(__u64, currmid) + __field(char *, hostname) + __field(int, credits) + ), + TP_fast_assign( + __entry->currmid = currmid; + __entry->hostname = hostname; + __entry->credits = credits; + ), + TP_printk("server=%s current_mid=0x%llx credits=%d", + __entry->hostname, + __entry->currmid, + __entry->credits) +) + +#define DEFINE_SMB3_CREDIT_EVENT(name) \ +DEFINE_EVENT(smb3_credit_class, smb3_##name, \ + TP_PROTO(__u64 currmid, \ + char *hostname, \ + int credits), \ + TP_ARGS(currmid, hostname, credits)) + +DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); + #endif /* _CIFS_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index b48f43963da6..f8112433f0c8 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -113,9 +113,18 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) cifs_small_buf_release(midEntry->resp_buf); #ifdef CONFIG_CIFS_STATS2 now = jiffies; - /* commands taking longer than one second are indications that - something is wrong, unless it is quite a slow link or server */ - if (time_after(now, midEntry->when_alloc + HZ) && + /* + * commands taking longer than one second (default) can be indications + * that something is wrong, unless it is quite a slow link or a very + * busy server. 
Note that this calc is unlikely or impossible to wrap + * as long as slow_rsp_threshold is not set way above recommended max + * value (32767 ie 9 hours) and is generally harmless even if wrong + * since only affects debug counters - so leaving the calc as simple + * comparison rather than doing multiple conversions and overflow + * checks + */ + if ((slow_rsp_threshold != 0) && + time_after(now, midEntry->when_alloc + (slow_rsp_threshold * HZ)) && (midEntry->command != command)) { /* smb2slowcmd[NUMBER_OF_SMB2_COMMANDS] counts by command */ if ((le16_to_cpu(midEntry->command) < NUMBER_OF_SMB2_COMMANDS) && @@ -128,7 +137,7 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) if (cifsFYI & CIFS_TIMER) { pr_debug(" CIFS slow rsp: cmd %d mid %llu", midEntry->command, midEntry->mid); - pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n", + cifs_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n", now - midEntry->when_alloc, now - midEntry->when_sent, now - midEntry->when_received); @@ -786,7 +795,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, int i, j, rc = 0; int timeout, optype; struct mid_q_entry *midQ[MAX_COMPOUND]; - unsigned int credits = 1; + unsigned int credits = 0; char *buf; timeout = flags & CIFS_TIMEOUT_MASK; @@ -851,21 +860,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, mutex_unlock(&ses->server->srv_mutex); - for (i = 0; i < num_rqst; i++) { - if (rc < 0) - goto out; + if (rc < 0) + goto out; - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) - smb311_update_preauth_hash(ses, rqst[i].rq_iov, - rqst[i].rq_nvec); + /* + * Compounding is never used during session establish. + */ + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) + smb311_update_preauth_hash(ses, rqst[0].rq_iov, + rqst[0].rq_nvec); - if (timeout == CIFS_ASYNC_OP) - goto out; + if (timeout == CIFS_ASYNC_OP) + goto out; + for (i = 0; i < num_rqst; i++) { rc = wait_for_response(ses->server, midQ[i]); if (rc != 0) { - cifs_dbg(FYI, "Cancelling wait for mid %llu\n", - midQ[i]->mid); + cifs_dbg(VFS, "Cancelling wait for mid %llu cmd: %d\n", + midQ[i]->mid, le16_to_cpu(midQ[i]->command)); send_cancel(ses->server, &rqst[i], midQ[i]); spin_lock(&GlobalMid_Lock); if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { @@ -877,10 +889,21 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } spin_unlock(&GlobalMid_Lock); } + } + + for (i = 0; i < num_rqst; i++) + if (midQ[i]->resp_buf) + credits += ses->server->ops->get_credits(midQ[i]); + if (!credits) + credits = 1; + + for (i = 0; i < num_rqst; i++) { + if (rc < 0) + goto out; rc = cifs_sync_mid_result(midQ[i], ses->server); if (rc != 0) { - add_credits(ses->server, 1, optype); + add_credits(ses->server, credits, optype); return rc; } @@ -901,23 +924,26 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, else resp_buf_type[i] = CIFS_SMALL_BUFFER; - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { - struct kvec iov = { - .iov_base = resp_iov[i].iov_base, - .iov_len = resp_iov[i].iov_len - }; - smb311_update_preauth_hash(ses, &iov, 1); - } - - credits = ses->server->ops->get_credits(midQ[i]); - rc = ses->server->ops->check_receive(midQ[i], ses->server, flags & CIFS_LOG_ERROR); /* mark it so buf will not be freed by cifs_delete_mid */ if ((flags & CIFS_NO_RESP) == 0) midQ[i]->resp_buf = NULL; + + } + + /* + * Compounding is never used during session establish. 
+ */ + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { + struct kvec iov = { + .iov_base = resp_iov[0].iov_base, + .iov_len = resp_iov[0].iov_len + }; + smb311_update_preauth_hash(ses, &iov, 1); } + out: /* * This will dequeue all mids. After this it is important that the diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ce2cc2169040..6e30949d9f77 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -64,11 +64,6 @@ #include <linux/hiddev.h> -#define __DVB_CORE__ -#include <linux/dvb/audio.h> -#include <linux/dvb/dmx.h> -#include <linux/dvb/frontend.h> -#include <linux/dvb/video.h> #include <linux/sort.h> @@ -95,71 +90,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return vfs_ioctl(file, cmd, arg); } -struct compat_video_event { - int32_t type; - compat_time_t timestamp; - union { - video_size_t size; - unsigned int frame_rate; - } u; -}; - -static int do_video_get_event(struct file *file, - unsigned int cmd, struct compat_video_event __user *up) -{ - struct video_event __user *kevent = - compat_alloc_user_space(sizeof(*kevent)); - int err; - - if (kevent == NULL) - return -EFAULT; - - err = do_ioctl(file, cmd, (unsigned long)kevent); - if (!err) { - err = convert_in_user(&kevent->type, &up->type); - err |= convert_in_user(&kevent->timestamp, &up->timestamp); - err |= convert_in_user(&kevent->u.size.w, &up->u.size.w); - err |= convert_in_user(&kevent->u.size.h, &up->u.size.h); - err |= convert_in_user(&kevent->u.size.aspect_ratio, - &up->u.size.aspect_ratio); - if (err) - err = -EFAULT; - } - - return err; -} - -struct compat_video_still_picture { - compat_uptr_t iFrame; - int32_t size; -}; - -static int do_video_stillpicture(struct file *file, - unsigned int cmd, struct compat_video_still_picture __user *up) -{ - struct video_still_picture __user *up_native; - compat_uptr_t fp; - int32_t size; - int err; - - err = get_user(fp, &up->iFrame); - err |= get_user(size, &up->size); - if (err) - return -EFAULT; - - up_native = - compat_alloc_user_space(sizeof(struct video_still_picture)); - - err = put_user(compat_ptr(fp), &up_native->iFrame); - err |= put_user(size, &up_native->size); - if (err) - return -EFAULT; - - err = do_ioctl(file, cmd, (unsigned long) up_native); - - return err; -} - #ifdef CONFIG_BLOCK typedef struct sg_io_hdr32 { compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ @@ -958,61 +888,6 @@ COMPATIBLE_IOCTL(HIDIOCGFLAG) COMPATIBLE_IOCTL(HIDIOCSFLAG) COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINDEX) COMPATIBLE_IOCTL(HIDIOCGCOLLECTIONINFO) -/* dvb */ -COMPATIBLE_IOCTL(AUDIO_STOP) -COMPATIBLE_IOCTL(AUDIO_PLAY) -COMPATIBLE_IOCTL(AUDIO_PAUSE) -COMPATIBLE_IOCTL(AUDIO_CONTINUE) -COMPATIBLE_IOCTL(AUDIO_SELECT_SOURCE) -COMPATIBLE_IOCTL(AUDIO_SET_MUTE) -COMPATIBLE_IOCTL(AUDIO_SET_AV_SYNC) -COMPATIBLE_IOCTL(AUDIO_SET_BYPASS_MODE) -COMPATIBLE_IOCTL(AUDIO_CHANNEL_SELECT) -COMPATIBLE_IOCTL(AUDIO_GET_STATUS) -COMPATIBLE_IOCTL(AUDIO_GET_CAPABILITIES) -COMPATIBLE_IOCTL(AUDIO_CLEAR_BUFFER) -COMPATIBLE_IOCTL(AUDIO_SET_ID) -COMPATIBLE_IOCTL(AUDIO_SET_MIXER) -COMPATIBLE_IOCTL(AUDIO_SET_STREAMTYPE) -COMPATIBLE_IOCTL(DMX_START) -COMPATIBLE_IOCTL(DMX_STOP) -COMPATIBLE_IOCTL(DMX_SET_FILTER) -COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) -COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) -COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) -COMPATIBLE_IOCTL(DMX_GET_STC) -COMPATIBLE_IOCTL(DMX_REQBUFS) -COMPATIBLE_IOCTL(DMX_QUERYBUF) -COMPATIBLE_IOCTL(DMX_EXPBUF) -COMPATIBLE_IOCTL(DMX_QBUF) -COMPATIBLE_IOCTL(DMX_DQBUF) -COMPATIBLE_IOCTL(VIDEO_STOP) -COMPATIBLE_IOCTL(VIDEO_PLAY) 
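
In the compound_send_recv() change above, the credits handed back via add_credits() are now the sum of ->get_credits() over every mid that actually received a response, with a floor of one, instead of one credit per request. A toy userspace model of that accounting, with a hypothetical stand-in for struct mid_q_entry:

#include <stdio.h>

/* stand-in for struct mid_q_entry: did a response arrive, and how many
 * credits did that response grant? */
struct fake_mid {
	int has_resp;
	unsigned int granted;
};

static unsigned int credits_from_compound(const struct fake_mid *mids, int num)
{
	unsigned int credits = 0;
	int i;

	for (i = 0; i < num; i++)
		if (mids[i].has_resp)
			credits += mids[i].granted;
	/* never return zero credits, or the sender could stall */
	if (!credits)
		credits = 1;
	return credits;
}

int main(void)
{
	struct fake_mid ok[3]   = { {1, 1}, {1, 1}, {1, 2} };
	struct fake_mid none[3] = { {0, 0}, {0, 0}, {0, 0} };

	printf("all responses received: %u credits\n",
	       credits_from_compound(ok, 3));
	printf("no responses received:  %u credit\n",
	       credits_from_compound(none, 3));
	return 0;
}
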
-COMPATIBLE_IOCTL(VIDEO_FREEZE) -COMPATIBLE_IOCTL(VIDEO_CONTINUE) -COMPATIBLE_IOCTL(VIDEO_SELECT_SOURCE) -COMPATIBLE_IOCTL(VIDEO_SET_BLANK) -COMPATIBLE_IOCTL(VIDEO_GET_STATUS) -COMPATIBLE_IOCTL(VIDEO_SET_DISPLAY_FORMAT) -COMPATIBLE_IOCTL(VIDEO_FAST_FORWARD) -COMPATIBLE_IOCTL(VIDEO_SLOWMOTION) -COMPATIBLE_IOCTL(VIDEO_GET_CAPABILITIES) -COMPATIBLE_IOCTL(VIDEO_CLEAR_BUFFER) -COMPATIBLE_IOCTL(VIDEO_SET_STREAMTYPE) -COMPATIBLE_IOCTL(VIDEO_SET_FORMAT) -COMPATIBLE_IOCTL(VIDEO_GET_SIZE) -/* cec */ -COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS) -COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS) -COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS) -COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR) -COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR) -COMPATIBLE_IOCTL(CEC_G_MODE) -COMPATIBLE_IOCTL(CEC_S_MODE) -COMPATIBLE_IOCTL(CEC_TRANSMIT) -COMPATIBLE_IOCTL(CEC_RECEIVE) -COMPATIBLE_IOCTL(CEC_DQEVENT) - /* joystick */ COMPATIBLE_IOCTL(JSIOCGVERSION) COMPATIBLE_IOCTL(JSIOCGAXES) @@ -1080,12 +955,6 @@ static long do_ioctl_trans(unsigned int cmd, case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: return rtc_ioctl(file, cmd, argp); - - /* dvb */ - case VIDEO_GET_EVENT: - return do_video_get_event(file, cmd, argp); - case VIDEO_STILLPICTURE: - return do_video_stillpicture(file, cmd, argp); } /* diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index f408994fc632..0c35e62f108d 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -418,9 +418,12 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) int i; vma->vm_flags |= VM_MIXEDMAP; for (i = 0; i < pages && !ret; i++) { + vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); - ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); + vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn); + if (vmf & VM_FAULT_ERROR) + ret = vm_fault_to_errno(vmf, 0); } } @@ -38,6 +38,17 @@ #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> +static inline unsigned int pe_order(enum page_entry_size pe_size) +{ + if (pe_size == PE_SIZE_PTE) + return PAGE_SHIFT - PAGE_SHIFT; + if (pe_size == PE_SIZE_PMD) + return PMD_SHIFT - PAGE_SHIFT; + if (pe_size == PE_SIZE_PUD) + return PUD_SHIFT - PAGE_SHIFT; + return ~0; +} + /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -46,6 +57,9 @@ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) +/* The order of a PMD entry */ +#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -59,63 +73,74 @@ static int __init init_dax_wait_table(void) fs_initcall(init_dax_wait_table); /* - * We use lowest available bit in exceptional entry for locking, one bit for - * the entry size (PMD) and two more to tell us if the entry is a zero page or - * an empty entry that is just used for locking. In total four special bits. + * DAX pagecache entries use XArray value entries so they can't be mistaken + * for pages. We use one bit for locking, one bit for the entry size (PMD) + * and two more to tell us if the entry is a zero page or an empty entry that + * is just used for locking. In total four special bits. * * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem * block allocation. 
*/ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) +#define DAX_SHIFT (4) +#define DAX_LOCKED (1UL << 0) +#define DAX_PMD (1UL << 1) +#define DAX_ZERO_PAGE (1UL << 2) +#define DAX_EMPTY (1UL << 3) -static unsigned long dax_radix_pfn(void *entry) +static unsigned long dax_to_pfn(void *entry) { - return (unsigned long)entry >> RADIX_DAX_SHIFT; + return xa_to_value(entry) >> DAX_SHIFT; } -static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) +static void *dax_make_entry(pfn_t pfn, unsigned long flags) { - return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | - (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); + return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); } -static unsigned int dax_radix_order(void *entry) +static void *dax_make_page_entry(struct page *page) { - if ((unsigned long)entry & RADIX_DAX_PMD) - return PMD_SHIFT - PAGE_SHIFT; + pfn_t pfn = page_to_pfn_t(page); + return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0); +} + +static bool dax_is_locked(void *entry) +{ + return xa_to_value(entry) & DAX_LOCKED; +} + +static unsigned int dax_entry_order(void *entry) +{ + if (xa_to_value(entry) & DAX_PMD) + return PMD_ORDER; return 0; } static int dax_is_pmd_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_PMD; + return xa_to_value(entry) & DAX_PMD; } static int dax_is_pte_entry(void *entry) { - return !((unsigned long)entry & RADIX_DAX_PMD); + return !(xa_to_value(entry) & DAX_PMD); } static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; + return xa_to_value(entry) & DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_EMPTY; + return xa_to_value(entry) & DAX_EMPTY; } /* - * DAX radix tree locking + * DAX page cache entry locking */ struct exceptional_entry_key { - struct address_space *mapping; + struct xarray *xa; pgoff_t entry_start; }; @@ -124,10 +149,11 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, - pgoff_t index, void *entry, struct exceptional_entry_key *key) +static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, + void *entry, struct exceptional_entry_key *key) { unsigned long hash; + unsigned long index = xas->xa_index; /* * If 'entry' is a PMD, align the 'index' that we use for the wait @@ -136,22 +162,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, */ if (dax_is_pmd_entry(entry)) index &= ~PG_PMD_COLOUR; - - key->mapping = mapping; + key->xa = xas->xa; key->entry_start = index; - hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); + hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, - int sync, void *keyp) +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; struct wait_exceptional_entry_queue *ewait = container_of(wait, struct wait_exceptional_entry_queue, wait); - if (key->mapping != ewait->key.mapping || + if (key->xa != ewait->key.xa || 
key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); @@ -162,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all) +static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) { struct exceptional_entry_key key; wait_queue_head_t *wq; - wq = dax_entry_waitqueue(mapping, index, entry, &key); + wq = dax_entry_waitqueue(xas, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -181,55 +205,16 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, } /* - * Check whether the given slot is locked. Must be called with the i_pages - * lock held. - */ -static inline int slot_locked(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - return entry & RADIX_DAX_ENTRY_LOCK; -} - -/* - * Mark the given slot as locked. Must be called with the i_pages lock held. - */ -static inline void *lock_slot(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - - entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); - return (void *)entry; -} - -/* - * Mark the given slot as unlocked. Must be called with the i_pages lock held. - */ -static inline void *unlock_slot(struct address_space *mapping, void **slot) -{ - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - - entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); - return (void *)entry; -} - -/* - * Lookup entry in radix tree, wait for it to become unlocked if it is - * exceptional entry and return it. The caller must call - * put_unlocked_mapping_entry() when he decided not to lock the entry or - * put_locked_mapping_entry() when he locked the entry and now wants to - * unlock it. + * Look up entry in page cache, wait for it to become unlocked if it + * is a DAX entry and return it. The caller must subsequently call + * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() + * if it did. * * Must be called with the i_pages lock held. 
*/ -static void *__get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp, bool (*wait_fn)(void)) +static void *get_unlocked_entry(struct xa_state *xas) { - void *entry, **slot; + void *entry; struct wait_exceptional_entry_queue ewait; wait_queue_head_t *wq; @@ -237,80 +222,54 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - bool revalidate; - - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, - &slot); - if (!entry || - WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || - !slot_locked(mapping, slot)) { - if (slotp) - *slotp = slot; + entry = xas_load(xas); + if (!entry || xa_is_internal(entry) || + WARN_ON_ONCE(!xa_is_value(entry)) || + !dax_is_locked(entry)) return entry; - } - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); + wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - xa_unlock_irq(&mapping->i_pages); - revalidate = wait_fn(); + xas_unlock_irq(xas); + xas_reset(xas); + schedule(); finish_wait(wq, &ewait.wait); - xa_lock_irq(&mapping->i_pages); - if (revalidate) - return ERR_PTR(-EAGAIN); + xas_lock_irq(xas); } } -static bool entry_wait(void) -{ - schedule(); - /* - * Never return an ERR_PTR() from - * __get_unlocked_mapping_entry(), just keep looping. - */ - return false; -} - -static void *get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp) +static void put_unlocked_entry(struct xa_state *xas, void *entry) { - return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait); -} - -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *entry, **slot; - - xa_lock_irq(&mapping->i_pages); - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot))) { - xa_unlock_irq(&mapping->i_pages); - return; - } - unlock_slot(mapping, slot); - xa_unlock_irq(&mapping->i_pages); - dax_wake_mapping_entry_waiter(mapping, index, entry, false); + /* If we were the only waiter woken, wake the next one */ + if (entry) + dax_wake_entry(xas, entry, false); } -static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index) +/* + * We used the xa_state to get the entry, but then we locked the entry and + * dropped the xa_lock, so we know the xa_state is stale and must be reset + * before use. + */ +static void dax_unlock_entry(struct xa_state *xas, void *entry) { - unlock_mapping_entry(mapping, index); + void *old; + + xas_reset(xas); + xas_lock_irq(xas); + old = xas_store(xas, entry); + xas_unlock_irq(xas); + BUG_ON(!dax_is_locked(old)); + dax_wake_entry(xas, entry, false); } /* - * Called when we are done with radix tree entry we looked up via - * get_unlocked_mapping_entry() and which we didn't lock in the end. + * Return: The entry stored at this location before it was locked. 
*/ -static void put_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void *dax_lock_entry(struct xa_state *xas, void *entry) { - if (!entry) - return; - - /* We have to wake up next waiter for the radix tree entry lock */ - dax_wake_mapping_entry_waiter(mapping, index, entry, false); + unsigned long v = xa_to_value(entry); + return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); } static unsigned long dax_entry_size(void *entry) @@ -325,9 +284,9 @@ static unsigned long dax_entry_size(void *entry) return PAGE_SIZE; } -static unsigned long dax_radix_end_pfn(void *entry) +static unsigned long dax_end_pfn(void *entry) { - return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; + return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; } /* @@ -335,8 +294,8 @@ static unsigned long dax_radix_end_pfn(void *entry) * 'empty' and 'zero' entries. */ #define for_each_mapped_pfn(entry, pfn) \ - for (pfn = dax_radix_pfn(entry); \ - pfn < dax_radix_end_pfn(entry); pfn++) + for (pfn = dax_to_pfn(entry); \ + pfn < dax_end_pfn(entry); pfn++) /* * TODO: for reflink+dax we need a way to associate a single page with @@ -393,33 +352,16 @@ static struct page *dax_busy_page(void *entry) return NULL; } -static bool entry_wait_revalidate(void) -{ - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - - /* - * Tell __get_unlocked_mapping_entry() to take a break, we need - * to revalidate page->mapping after dropping locks - */ - return true; -} - bool dax_lock_mapping_entry(struct page *page) { - pgoff_t index; - struct inode *inode; - bool did_lock = false; - void *entry = NULL, **slot; - struct address_space *mapping; + XA_STATE(xas, NULL, 0); + void *entry; - rcu_read_lock(); for (;;) { - mapping = READ_ONCE(page->mapping); + struct address_space *mapping = READ_ONCE(page->mapping); if (!dax_mapping(mapping)) - break; + return false; /* * In the device-dax case there's no need to lock, a @@ -428,98 +370,94 @@ bool dax_lock_mapping_entry(struct page *page) * otherwise we would not have a valid pfn_to_page() * translation. */ - inode = mapping->host; - if (S_ISCHR(inode->i_mode)) { - did_lock = true; - break; - } + if (S_ISCHR(mapping->host->i_mode)) + return true; - xa_lock_irq(&mapping->i_pages); + xas.xa = &mapping->i_pages; + xas_lock_irq(&xas); if (mapping != page->mapping) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); continue; } - index = page->index; - - entry = __get_unlocked_mapping_entry(mapping, index, &slot, - entry_wait_revalidate); - if (!entry) { - xa_unlock_irq(&mapping->i_pages); - break; - } else if (IS_ERR(entry)) { - xa_unlock_irq(&mapping->i_pages); - WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); - continue; + xas_set(&xas, page->index); + entry = xas_load(&xas); + if (dax_is_locked(entry)) { + entry = get_unlocked_entry(&xas); + /* Did the page move while we slept? 
*/ + if (dax_to_pfn(entry) != page_to_pfn(page)) { + xas_unlock_irq(&xas); + continue; + } } - lock_slot(mapping, slot); - did_lock = true; - xa_unlock_irq(&mapping->i_pages); - break; + dax_lock_entry(&xas, entry); + xas_unlock_irq(&xas); + return true; } - rcu_read_unlock(); - - return did_lock; } void dax_unlock_mapping_entry(struct page *page) { struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + XA_STATE(xas, &mapping->i_pages, page->index); - if (S_ISCHR(inode->i_mode)) + if (S_ISCHR(mapping->host->i_mode)) return; - unlock_mapping_entry(mapping, page->index); + dax_unlock_entry(&xas, dax_make_page_entry(page)); } /* - * Find radix tree entry at given index. If it points to an exceptional entry, - * return it with the radix tree entry locked. If the radix tree doesn't - * contain given index, create an empty exceptional entry for the index and - * return with it locked. + * Find page cache entry at given index. If it is a DAX entry, return it + * with the entry locked. If the page cache doesn't contain an entry at + * that index, add a locked empty entry. * - * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will - * either return that locked entry or will return an error. This error will - * happen if there are any 4k entries within the 2MiB range that we are - * requesting. + * When requesting an entry with size DAX_PMD, grab_mapping_entry() will + * either return that locked entry or will return VM_FAULT_FALLBACK. + * This will happen if there are any PTE entries within the PMD range + * that we are requesting. * - * We always favor 4k entries over 2MiB entries. There isn't a flow where we - * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB - * insertion will fail if it finds any 4k entries already in the tree, and a - * 4k insertion will cause an existing 2MiB entry to be unmapped and - * downgraded to 4k entries. This happens for both 2MiB huge zero pages as - * well as 2MiB empty entries. + * We always favor PTE entries over PMD entries. There isn't a flow where we + * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD + * insertion will fail if it finds any PTE entries already in the tree, and a + * PTE insertion will cause an existing PMD entry to be unmapped and + * downgraded to PTE entries. This happens for both PMD zero pages as + * well as PMD empty entries. * - * The exception to this downgrade path is for 2MiB DAX PMD entries that have - * real storage backing them. We will leave these real 2MiB DAX entries in - * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. + * The exception to this downgrade path is for PMD entries that have + * real storage backing them. We will leave these real PMD entries in + * the tree, and PTE writes will simply dirty the entire PMD entry. * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. + * + * On error, this function does not return an ERR_PTR. Instead it returns + * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values + * overlap with xarray value entries. */ -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, - unsigned long size_flag) +static void *grab_mapping_entry(struct xa_state *xas, + struct address_space *mapping, unsigned long size_flag) { - bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? 
*/ - void *entry, **slot; - -restart: - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + unsigned long index = xas->xa_index; + bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ + void *entry; - if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { - entry = ERR_PTR(-EIO); - goto out_unlock; - } +retry: + xas_lock_irq(xas); + entry = get_unlocked_entry(xas); + if (xa_is_internal(entry)) + goto fallback; if (entry) { - if (size_flag & RADIX_DAX_PMD) { + if (WARN_ON_ONCE(!xa_is_value(entry))) { + xas_set_err(xas, EIO); + goto out_unlock; + } + + if (size_flag & DAX_PMD) { if (dax_is_pte_entry(entry)) { - put_unlocked_mapping_entry(mapping, index, - entry); - entry = ERR_PTR(-EEXIST); - goto out_unlock; + put_unlocked_entry(xas, entry); + goto fallback; } } else { /* trying to grab a PTE entry */ if (dax_is_pmd_entry(entry) && @@ -530,87 +468,57 @@ restart: } } - /* No entry for given index? Make sure radix tree is big enough. */ - if (!entry || pmd_downgrade) { - int err; - - if (pmd_downgrade) { - /* - * Make sure 'entry' remains valid while we drop - * the i_pages lock. - */ - entry = lock_slot(mapping, slot); - } + if (pmd_downgrade) { + /* + * Make sure 'entry' remains valid while we drop + * the i_pages lock. + */ + dax_lock_entry(xas, entry); - xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be * unmapped. */ - if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, - PG_PMD_NR, false); - - err = radix_tree_preload( - mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) { - if (pmd_downgrade) - put_locked_mapping_entry(mapping, index); - return ERR_PTR(err); - } - xa_lock_irq(&mapping->i_pages); - - if (!entry) { - /* - * We needed to drop the i_pages lock while calling - * radix_tree_preload() and we didn't have an entry to - * lock. See if another thread inserted an entry at - * our index during this time. - */ - entry = __radix_tree_lookup(&mapping->i_pages, index, - NULL, &slot); - if (entry) { - radix_tree_preload_end(); - xa_unlock_irq(&mapping->i_pages); - goto restart; - } + if (dax_is_zero_entry(entry)) { + xas_unlock_irq(xas); + unmap_mapping_pages(mapping, + xas->xa_index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); + xas_reset(xas); + xas_lock_irq(xas); } - if (pmd_downgrade) { - dax_disassociate_entry(entry, mapping, false); - radix_tree_delete(&mapping->i_pages, index); - mapping->nrexceptional--; - dax_wake_mapping_entry_waiter(mapping, index, entry, - true); - } + dax_disassociate_entry(entry, mapping, false); + xas_store(xas, NULL); /* undo the PMD join */ + dax_wake_entry(xas, entry, true); + mapping->nrexceptional--; + entry = NULL; + xas_set(xas, index); + } - entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); - - err = __radix_tree_insert(&mapping->i_pages, index, - dax_radix_order(entry), entry); - radix_tree_preload_end(); - if (err) { - xa_unlock_irq(&mapping->i_pages); - /* - * Our insertion of a DAX entry failed, most likely - * because we were inserting a PMD entry and it - * collided with a PTE sized entry at a different - * index in the PMD range. We haven't inserted - * anything into the radix tree and have no waiters to - * wake. - */ - return ERR_PTR(err); - } - /* Good, we have inserted empty locked entry into the tree. 
*/ + if (entry) { + dax_lock_entry(xas, entry); + } else { + entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY); + dax_lock_entry(xas, entry); + if (xas_error(xas)) + goto out_unlock; mapping->nrexceptional++; - xa_unlock_irq(&mapping->i_pages); - return entry; } - entry = lock_slot(mapping, slot); - out_unlock: - xa_unlock_irq(&mapping->i_pages); + +out_unlock: + xas_unlock_irq(xas); + if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) + goto retry; + if (xas->xa_node == XA_ERROR(-ENOMEM)) + return xa_mk_internal(VM_FAULT_OOM); + if (xas_error(xas)) + return xa_mk_internal(VM_FAULT_SIGBUS); return entry; +fallback: + xas_unlock_irq(xas); + return xa_mk_internal(VM_FAULT_FALLBACK); } /** @@ -630,11 +538,10 @@ restart: */ struct page *dax_layout_busy_page(struct address_space *mapping) { - pgoff_t indices[PAGEVEC_SIZE]; + XA_STATE(xas, &mapping->i_pages, 0); + void *entry; + unsigned int scanned = 0; struct page *page = NULL; - struct pagevec pvec; - pgoff_t index, end; - unsigned i; /* * In the 'limited' case get_user_pages() for dax is disabled. @@ -645,13 +552,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (!dax_mapping(mapping) || !mapping_mapped(mapping)) return NULL; - pagevec_init(&pvec); - index = 0; - end = -1; - /* * If we race get_user_pages_fast() here either we'll see the - * elevated page count in the pagevec_lookup and wait, or + * elevated page count in the iteration and wait, or * get_user_pages_fast() will see that the page it took a reference * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by @@ -663,94 +566,68 @@ struct page *dax_layout_busy_page(struct address_space *mapping) */ unmap_mapping_range(mapping, 0, 0, 1); - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { - pgoff_t nr_pages = 1; - - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *pvec_ent = pvec.pages[i]; - void *entry; - - index = indices[i]; - if (index >= end) - break; - - if (WARN_ON_ONCE( - !radix_tree_exceptional_entry(pvec_ent))) - continue; - - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (entry) { - page = dax_busy_page(entry); - /* - * Account for multi-order entries at - * the end of the pagevec. - */ - if (i + 1 >= pagevec_count(&pvec)) - nr_pages = 1UL << dax_radix_order(entry); - } - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(&mapping->i_pages); - if (page) - break; - } - - /* - * We don't expect normal struct page entries to exist in our - * tree, but we keep these pagevec calls so that this code is - * consistent with the common pattern for handling pagevecs - * throughout the kernel. 
- */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - index += nr_pages; - + xas_lock_irq(&xas); + xas_for_each(&xas, entry, ULONG_MAX) { + if (WARN_ON_ONCE(!xa_is_value(entry))) + continue; + if (unlikely(dax_is_locked(entry))) + entry = get_unlocked_entry(&xas); + if (entry) + page = dax_busy_page(entry); + put_unlocked_entry(&xas, entry); if (page) break; + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } + xas_unlock_irq(&xas); return page; } EXPORT_SYMBOL_GPL(dax_layout_busy_page); -static int __dax_invalidate_mapping_entry(struct address_space *mapping, +static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index, bool trunc) { + XA_STATE(xas, &mapping->i_pages, index); int ret = 0; void *entry; - struct radix_tree_root *pages = &mapping->i_pages; - xa_lock_irq(pages); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) + xas_lock_irq(&xas); + entry = get_unlocked_entry(&xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && - (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) + (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || + xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); - radix_tree_delete(pages, index); + xas_store(&xas, NULL); mapping->nrexceptional--; ret = 1; out: - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(pages); + put_unlocked_entry(&xas, entry); + xas_unlock_irq(&xas); return ret; } + /* - * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree - * entry to get unlocked before deleting it. + * Delete DAX entry at @index from @mapping. Wait for it + * to be unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { - int ret = __dax_invalidate_mapping_entry(mapping, index, true); + int ret = __dax_invalidate_entry(mapping, index, true); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the - * radix tree (usually fs-private i_mmap_sem for writing). Since the - * caller has seen exceptional entry for this index, we better find it + * page cache (usually fs-private i_mmap_sem for writing). Since the + * caller has seen a DAX entry for this index, we better find it * at that index as well... */ WARN_ON_ONCE(!ret); @@ -758,12 +635,12 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) } /* - * Invalidate exceptional DAX entry if it is clean. + * Invalidate DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index) { - return __dax_invalidate_mapping_entry(mapping, index, false); + return __dax_invalidate_entry(mapping, index, false); } static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, @@ -799,30 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. 
*/ -static void *dax_insert_mapping_entry(struct address_space *mapping, - struct vm_fault *vmf, - void *entry, pfn_t pfn_t, - unsigned long flags, bool dirty) +static void *dax_insert_entry(struct xa_state *xas, + struct address_space *mapping, struct vm_fault *vmf, + void *entry, pfn_t pfn, unsigned long flags, bool dirty) { - struct radix_tree_root *pages = &mapping->i_pages; - unsigned long pfn = pfn_t_to_pfn(pfn_t); - pgoff_t index = vmf->pgoff; - void *new_entry; + void *new_entry = dax_make_entry(pfn, flags); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { + unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, - PG_PMD_NR, false); + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_pages(mapping, vmf->pgoff, 1, false); + unmap_mapping_pages(mapping, index, 1, false); } - xa_lock_irq(pages); - new_entry = dax_radix_locked_entry(pfn, flags); + xas_reset(xas); + xas_lock_irq(xas); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); @@ -830,33 +704,30 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* - * Only swap our new entry into the radix tree if the current + * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or - * PMD entry is already in the tree, we leave it alone. This + * PMD entry is already in the cache, we leave it alone. This * means that if we are trying to insert a PTE and the * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary. 
*/ - struct radix_tree_node *node; - void **slot; - void *ret; - - ret = __radix_tree_lookup(pages, index, &node, &slot); - WARN_ON_ONCE(ret != entry); - __radix_tree_replace(pages, node, slot, - new_entry, NULL); + void *old = dax_lock_entry(xas, new_entry); + WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | + DAX_LOCKED)); entry = new_entry; + } else { + xas_load(xas); /* Walk the xa_state */ } if (dirty) - radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); + xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - xa_unlock_irq(pages); + xas_unlock_irq(xas); return entry; } -static inline unsigned long -pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +static inline +unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) { unsigned long address; @@ -866,8 +737,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) } /* Walk all mappings of a given index of a file and writeprotect them */ -static void dax_mapping_entry_mkclean(struct address_space *mapping, - pgoff_t index, unsigned long pfn) +static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, + unsigned long pfn) { struct vm_area_struct *vma; pte_t pte, *ptep = NULL; @@ -937,11 +808,9 @@ unlock_pte: i_mmap_unlock_read(mapping); } -static int dax_writeback_one(struct dax_device *dax_dev, - struct address_space *mapping, pgoff_t index, void *entry) +static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, + struct address_space *mapping, void *entry) { - struct radix_tree_root *pages = &mapping->i_pages; - void *entry2, **slot; unsigned long pfn; long ret = 0; size_t size; @@ -950,32 +819,38 @@ static int dax_writeback_one(struct dax_device *dax_dev, * A page got tagged dirty in DAX mapping? Something is seriously * wrong. */ - if (WARN_ON(!radix_tree_exceptional_entry(entry))) + if (WARN_ON(!xa_is_value(entry))) return -EIO; - xa_lock_irq(pages); - entry2 = get_unlocked_mapping_entry(mapping, index, &slot); - /* Entry got punched out / reallocated? */ - if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) - goto put_unlocked; - /* - * Entry got reallocated elsewhere? No need to writeback. We have to - * compare pfns as we must not bail out due to difference in lockbit - * or entry type. - */ - if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) - goto put_unlocked; - if (WARN_ON_ONCE(dax_is_empty_entry(entry) || - dax_is_zero_entry(entry))) { - ret = -EIO; - goto put_unlocked; + if (unlikely(dax_is_locked(entry))) { + void *old_entry = entry; + + entry = get_unlocked_entry(xas); + + /* Entry got punched out / reallocated? */ + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. + * We have to compare pfns as we must not bail out due to + * difference in lockbit or entry type. 
+ */ + if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) + goto put_unlocked; + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || + dax_is_zero_entry(entry))) { + ret = -EIO; + goto put_unlocked; + } + + /* Another fsync thread may have already done this entry */ + if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; } - /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) - goto put_unlocked; /* Lock the entry to serialize with page faults */ - entry = lock_slot(mapping, slot); + dax_lock_entry(xas, entry); + /* * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we @@ -983,8 +858,8 @@ static int dax_writeback_one(struct dax_device *dax_dev, * at the entry only under the i_pages lock and once they do that * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); - xa_unlock_irq(pages); + xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irq(xas); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start @@ -993,10 +868,10 @@ static int dax_writeback_one(struct dax_device *dax_dev, * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ - pfn = dax_radix_pfn(entry); - size = PAGE_SIZE << dax_radix_order(entry); + pfn = dax_to_pfn(entry); + size = PAGE_SIZE << dax_entry_order(entry); - dax_mapping_entry_mkclean(mapping, index, pfn); + dax_entry_mkclean(mapping, xas->xa_index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); /* * After we have flushed the cache, we can clear the dirty tag. There @@ -1004,16 +879,18 @@ static int dax_writeback_one(struct dax_device *dax_dev, * the pfn mappings are writeprotected and fault waits for mapping * entry lock. 
*/ - xa_lock_irq(pages); - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); - xa_unlock_irq(pages); - trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); - put_locked_mapping_entry(mapping, index); + xas_reset(xas); + xas_lock_irq(xas); + xas_store(xas, entry); + xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); + dax_wake_entry(xas, entry, false); + + trace_dax_writeback_one(mapping->host, xas->xa_index, + size >> PAGE_SHIFT); return ret; put_unlocked: - put_unlocked_mapping_entry(mapping, index, entry2); - xa_unlock_irq(pages); + put_unlocked_entry(xas, entry); return ret; } @@ -1025,13 +902,13 @@ static int dax_writeback_one(struct dax_device *dax_dev, int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { + XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; - pgoff_t start_index, end_index; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; struct dax_device *dax_dev; - struct pagevec pvec; - bool done = false; - int i, ret = 0; + void *entry; + int ret = 0; + unsigned int scanned = 0; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; @@ -1043,41 +920,29 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!dax_dev) return -EIO; - start_index = wbc->range_start >> PAGE_SHIFT; - end_index = wbc->range_end >> PAGE_SHIFT; - - trace_dax_writeback_range(inode, start_index, end_index); + trace_dax_writeback_range(inode, xas.xa_index, end_index); - tag_pages_for_writeback(mapping, start_index, end_index); + tag_pages_for_writeback(mapping, xas.xa_index, end_index); - pagevec_init(&pvec); - while (!done) { - pvec.nr = find_get_entries_tag(mapping, start_index, - PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, - pvec.pages, indices); - - if (pvec.nr == 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { + ret = dax_writeback_one(&xas, dax_dev, mapping, entry); + if (ret < 0) { + mapping_set_error(mapping, ret); break; - - for (i = 0; i < pvec.nr; i++) { - if (indices[i] > end_index) { - done = true; - break; - } - - ret = dax_writeback_one(dax_dev, mapping, indices[i], - pvec.pages[i]); - if (ret < 0) { - mapping_set_error(mapping, ret); - goto out; - } } - start_index = indices[pvec.nr - 1] + 1; + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } -out: + xas_unlock_irq(&xas); put_dax(dax_dev); - trace_dax_writeback_range_done(inode, start_index, end_index); - return (ret < 0 ? ret : 0); + trace_dax_writeback_range_done(inode, xas.xa_index, end_index); + return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); @@ -1125,16 +990,18 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. 
*/ -static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, - struct vm_fault *vmf) +static vm_fault_t dax_load_hole(struct xa_state *xas, + struct address_space *mapping, void **entry, + struct vm_fault *vmf) { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; - dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, - false); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_ZERO_PAGE, false); + ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1342,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; + XA_STATE(xas, &mapping->i_pages, vmf->pgoff); struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1368,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (write && !vmf->cow_page) flags |= IOMAP_WRITE; - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - ret = dax_fault_return(PTR_ERR(entry)); + entry = grab_mapping_entry(&xas, mapping, 0); + if (xa_is_internal(entry)) { + ret = xa_to_internal(entry); goto out; } @@ -1443,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto error_finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, 0, write && !sync); /* @@ -1471,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!write) { - ret = dax_load_hole(mapping, entry, vmf); + ret = dax_load_hole(&xas, mapping, &entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1498,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff); + dax_unlock_entry(&xas, entry); out: trace_dax_pte_fault_done(inode, vmf, ret); return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void *entry) +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + struct iomap *iomap, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; struct inode *inode = mapping->host; struct page *zero_page; - void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; pfn_t pfn; @@ -1523,8 +1390,8 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; pfn = page_to_pfn_t(zero_page); - ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_PMD | DAX_ZERO_PAGE, false); ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1536,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); return VM_FAULT_NOPAGE; fallback: - 
trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); return VM_FAULT_FALLBACK; } @@ -1549,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; @@ -1556,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct inode *inode = mapping->host; vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; - pgoff_t max_pgoff, pgoff; + pgoff_t max_pgoff; void *entry; loff_t pos; int error; @@ -1567,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ - pgoff = linear_page_index(vma, pmd_addr); max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1576,7 +1443,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * Make sure that the faulting address's PMD offset (color) matches * the PMD offset from the start of the file. This is necessary so * that a PMD range in the page table overlaps exactly with a PMD - * range in the radix tree. + * range in the page cache. */ if ((vmf->pgoff & PG_PMD_COLOUR) != ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) @@ -1592,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff >= max_pgoff) { + if (xas.xa_index >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) + if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* - * grab_mapping_entry() will make sure we get a 2MiB empty entry, a - * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page - * is already in the tree, for instance), it will return -EEXIST and - * we just fall back to 4k entries. + * grab_mapping_entry() will make sure we get an empty PMD entry, + * a zero PMD entry or a DAX PMD. If it can't (because a PTE + * entry is already in the array, for instance), it will return + * VM_FAULT_FALLBACK. */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) + entry = grab_mapping_entry(&xas, mapping, DAX_PMD); + if (xa_is_internal(entry)) { + result = xa_to_internal(entry); goto fallback; + } /* * It is possible, particularly with mixed reads & writes to private @@ -1628,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. 
*/ - pos = (loff_t)pgoff << PAGE_SHIFT; + pos = (loff_t)xas.xa_index << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) goto unlock_entry; @@ -1644,8 +1513,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_PMD, write && !sync); + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, + DAX_PMD, write && !sync); /* * If we are doing synchronous page fault and inode needs fsync, @@ -1669,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, entry); + result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1692,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff); + dax_unlock_entry(&xas, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); @@ -1737,54 +1606,49 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, } EXPORT_SYMBOL_GPL(dax_iomap_fault); -/** +/* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted * @pfn: PFN to insert + * @order: Order of entry to insert. * - * This function inserts writeable PTE or PMD entry into page tables for mmaped - * DAX file. It takes care of marking corresponding radix tree entry as dirty - * as well. + * This function inserts a writeable PTE or PMD entry into the page tables + * for an mmaped DAX file. It also marks the page cache entry as dirty. */ -static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, - enum page_entry_size pe_size, - pfn_t pfn) +static vm_fault_t +dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - void *entry, **slot; - pgoff_t index = vmf->pgoff; + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); + void *entry; vm_fault_t ret; - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + xas_lock_irq(&xas); + entry = get_unlocked_entry(&xas); /* Did we race with someone splitting entry or so? 
*/ if (!entry || - (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || - (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(&mapping->i_pages); + (order == 0 && !dax_is_pte_entry(entry)) || + (order == PMD_ORDER && (xa_is_internal(entry) || + !dax_is_pmd_entry(entry)))) { + put_unlocked_entry(&xas, entry); + xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - xa_unlock_irq(&mapping->i_pages); - switch (pe_size) { - case PE_SIZE_PTE: + xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); + dax_lock_entry(&xas, entry); + xas_unlock_irq(&xas); + if (order == 0) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - break; #ifdef CONFIG_FS_DAX_PMD - case PE_SIZE_PMD: + else if (order == PMD_ORDER) ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, true); - break; #endif - default: + else ret = VM_FAULT_FALLBACK; - } - put_locked_mapping_entry(mapping, index); + dax_unlock_entry(&xas, entry); trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret; } @@ -1804,17 +1668,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - size_t len = 0; + unsigned int order = pe_order(pe_size); + size_t len = PAGE_SIZE << order; - if (pe_size == PE_SIZE_PTE) - len = PAGE_SIZE; - else if (pe_size == PE_SIZE_PMD) - len = PMD_SIZE; - else - WARN_ON_ONCE(1); err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); if (err) return VM_FAULT_SIGBUS; - return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); + return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/fs/dcache.c b/fs/dcache.c index 2e7e8d85e9b4..c2e443fb76ae 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -257,24 +257,10 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -static void __d_free_external_name(struct rcu_head *head) -{ - struct external_name *name = container_of(head, struct external_name, - u.head); - - mod_node_page_state(page_pgdat(virt_to_page(name)), - NR_INDIRECTLY_RECLAIMABLE_BYTES, - -ksize(name)); - - kfree(name); -} - static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - - __d_free_external_name(&external_name(dentry)->u.head); - + kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } @@ -306,7 +292,7 @@ void release_dentry_name_snapshot(struct name_snapshot *name) struct external_name *p; p = container_of(name->name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) - call_rcu(&p->u.head, __d_free_external_name); + kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); @@ -1606,7 +1592,6 @@ EXPORT_SYMBOL(d_invalidate); struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { - struct external_name *ext = NULL; struct dentry *dentry; char *dname; int err; @@ -1627,14 +1612,15 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - - ext = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT); - if (!ext) { + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT | + __GFP_RECLAIMABLE); + if (!p) { 
kmem_cache_free(dentry_cache, dentry); return NULL; } - atomic_set(&ext->u.count, 1); - dname = ext->name; + atomic_set(&p->u.count, 1); + dname = p->name; } else { dname = dentry->d_iname; } @@ -1673,12 +1659,6 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } } - if (unlikely(ext)) { - pg_data_t *pgdat = page_pgdat(virt_to_page(ext)); - mod_node_page_state(pgdat, NR_INDIRECTLY_RECLAIMABLE_BYTES, - ksize(ext)); - } - this_cpu_inc(nr_dentry); return dentry; @@ -2707,7 +2687,7 @@ static void copy_name(struct dentry *dentry, struct dentry *target) dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) - call_rcu(&old_name->u.head, __d_free_external_name); + kfree_rcu(old_name, u.head); } /* diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 224c04abb2e5..cf4c77f8dd08 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -256,11 +256,15 @@ ext2_init_acl(struct inode *inode, struct inode *dir) if (default_acl) { error = __ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext2_set_acl(inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); + } else { + inode->i_acl = NULL; } return error; } diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 00e759f05161..e770cd100a6a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -390,11 +390,7 @@ struct ext2_inode { #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ -#ifdef CONFIG_FS_DAX #define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ -#else -#define EXT2_MOUNT_DAX 0 -#endif #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 73bd58fa13de..cb91baa4275d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -309,20 +309,17 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); -#if defined(CONFIG_QUOTA) if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA) seq_puts(seq, ",usrquota"); if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA) seq_puts(seq, ",grpquota"); -#endif -#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) seq_puts(seq, ",dax"); -#endif if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c3d9a42c561e..05f01fbd9c7f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2643,7 +2643,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; - int tag; + xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 106f116466bf..b293cb3e27a2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2071,7 +2071,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; int nwritten = 0; pagevec_init(&pvec); @@ -2787,13 +2787,13 @@ const struct address_space_operations f2fs_dblock_aops = { #endif }; -void f2fs_clear_radix_tree_dirty_tag(struct page *page) +void f2fs_clear_page_cache_dirty_tag(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - 
radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 2ef84b4590ea..bacc667950b6 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -726,7 +726,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, page->index, page->index + 1)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 56204a8f8a12..1e031971a466 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3108,7 +3108,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); -void f2fs_clear_radix_tree_dirty_tag(struct page *page); +void f2fs_clear_page_cache_dirty_tag(struct page *page); /* * gc.c diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index cb31a719b048..7b0cff7e6051 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -243,7 +243,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 2b34206486d8..d338740d0fda 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -101,7 +101,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { if (PageDirty(page)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } @@ -1306,9 +1306,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (f2fs_check_nid_range(sbi, nid)) return; - rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); - rcu_read_unlock(); + apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); if (apage) return; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 471d863958bc..b40168fcc94a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -339,9 +339,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; - struct radix_tree_iter iter; + XA_STATE(xas, &mapping->i_pages, 0); + struct page *page; bool switched = false; - void **slot; /* * By the time control reaches here, RCU grace period has passed @@ -375,25 +375,18 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to * pages actually under writeback. 
*/ - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, - PAGECACHE_TAG_DIRTY) { - struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (likely(page) && PageDirty(page)) { + xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) { + if (PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, - PAGECACHE_TAG_WRITEBACK) { - struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (likely(page)) { - WARN_ON_ONCE(!PageWriteback(page)); - dec_wb_stat(old_wb, WB_WRITEBACK); - inc_wb_stat(new_wb, WB_WRITEBACK); - } + xas_set(&xas, 0); + xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { + WARN_ON_ONCE(!PageWriteback(page)); + dec_wb_stat(old_wb, WB_WRITEBACK); + inc_wb_stat(new_wb, WB_WRITEBACK); } wb_get(new_wb); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 31e8270d0b26..8afbb35559b9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -366,7 +366,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { diff --git a/fs/inode.c b/fs/inode.c index 42f6d25f32a5..9b808986d440 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -349,7 +349,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); + xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..90c2febc93ac 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1057,7 +1057,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, return length; } -int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 947ce22f5b3c..f0fe641893a5 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -46,7 +46,7 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod return i; } -/* Acorn extensions written by Matthew Wilcox <willy@bofh.ai> 1998 */ +/* Acorn extensions written by Matthew Wilcox <willy@infradead.org> 1998 */ int get_acorn_filename(struct iso_directory_record *de, char *retname, struct inode *inode) { diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index ff2716f9322e..fdf527b6d79c 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -236,6 +236,9 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; + /* sysfs dentries and inodes don't require IO to create */ + sb->s_shrink.seeks = 0; + /* get root inode, initialize and unlock it */ mutex_lock(&kernfs_mutex); inode = kernfs_get_inode(sb, info->root->kn); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 305b220af45d..162f43b80c84 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -72,6 +72,9 @@ static int kernfs_get_target_path(struct kernfs_node *parent, if (base == kn) break; + if ((s - path) + 3 >= PATH_MAX) + return -ENAMETOOLONG; + strcpy(s, "../"); s += 3; base = base->parent; @@ -88,7 
+91,7 @@ static int kernfs_get_target_path(struct kernfs_node *parent, if (len < 2) return -EINVAL; len--; - if ((s - path) + len > PATH_MAX) + if ((s - path) + len >= PATH_MAX) return -ENAMETOOLONG; /* reverse fillup of target string from target to base */ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 06cb0c1d9aee..d3781cd983f6 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -896,7 +896,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (end != inode->i_mapping->nrpages) { rcu_read_lock(); - end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); + end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index f033f3a69a3b..07b839560576 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -93,7 +93,7 @@ int nfs4_check_delegation(struct inode *inode, fmode_t flags) return nfs4_do_check_delegation(inode, flags, false); } -static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid) +static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_stateid *stateid) { struct inode *inode = state->inode; struct file_lock *fl; @@ -108,7 +108,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ spin_lock(&flctx->flc_lock); restart: list_for_each_entry(fl, list, fl_list) { - if (nfs_file_open_context(fl->fl_file) != ctx) + if (nfs_file_open_context(fl->fl_file)->state != state) continue; spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); @@ -136,8 +136,8 @@ static int nfs_delegation_claim_opens(struct inode *inode, int err; again: - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -147,15 +147,16 @@ again: continue; if (!nfs4_stateid_match(&state->stateid, stateid)) continue; - get_nfs_open_context(ctx); - spin_unlock(&inode->i_lock); + if (!get_nfs_open_context(ctx)) + continue; + rcu_read_unlock(); sp = state->owner; /* Block nfs4_proc_unlck */ mutex_lock(&sp->so_delegreturn_mutex); seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid, type); if (!err) - err = nfs_delegation_claim_locks(ctx, state, stateid); + err = nfs_delegation_claim_locks(state, stateid); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) err = -EAGAIN; mutex_unlock(&sp->so_delegreturn_mutex); @@ -164,7 +165,7 @@ again: return err; goto again; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); return 0; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8bfaa658b2c1..71b2e390becf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1072,6 +1072,100 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } +static int +nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry, + struct inode *inode, int error) +{ + switch (error) { + case 1: + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", + __func__, dentry); + return 1; + case 0: + nfs_mark_for_revalidate(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. 
*/ + nfs_zap_caches(inode); + /* + * We can't d_drop the root of a disconnected tree: + * its d_hash is on the s_anon list and d_drop() would hide + * it from shrink_dcache_for_unmount(), leading to busy + * inodes on unmount and further oopses. + */ + if (IS_ROOT(dentry)) + return 1; + } + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", + __func__, dentry); + return 0; + } + dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", + __func__, dentry, error); + return error; +} + +static int +nfs_lookup_revalidate_negative(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + int ret = 1; + if (nfs_neg_need_reval(dir, dentry, flags)) { + if (flags & LOOKUP_RCU) + return -ECHILD; + ret = 0; + } + return nfs_lookup_revalidate_done(dir, dentry, NULL, ret); +} + +static int +nfs_lookup_revalidate_delegated(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +} + +static int +nfs_lookup_revalidate_dentry(struct inode *dir, struct dentry *dentry, + struct inode *inode) +{ + struct nfs_fh *fhandle; + struct nfs_fattr *fattr; + struct nfs4_label *label; + int ret; + + ret = -ENOMEM; + fhandle = nfs_alloc_fhandle(); + fattr = nfs_alloc_fattr(); + label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); + if (fhandle == NULL || fattr == NULL || IS_ERR(label)) + goto out; + + ret = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + if (ret < 0) { + if (ret == -ESTALE || ret == -ENOENT) + ret = 0; + goto out; + } + ret = 0; + if (nfs_compare_fh(NFS_FH(inode), fhandle)) + goto out; + if (nfs_refresh_inode(inode, fattr) < 0) + goto out; + + nfs_setsecurity(inode, fattr, label); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + + /* set a readdirplus hint that we had a cache miss */ + nfs_force_use_readdirplus(dir); + ret = 1; +out: + nfs_free_fattr(fattr); + nfs_free_fhandle(fhandle); + nfs4_label_free(label); + return nfs_lookup_revalidate_done(dir, dentry, inode, ret); +} + /* * This is called every time the dcache has a lookup hit, * and we should check whether we can really trust that @@ -1083,58 +1177,36 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. 
*/ -static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int +nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, + unsigned int flags) { - struct inode *dir; struct inode *inode; - struct dentry *parent; - struct nfs_fh *fhandle = NULL; - struct nfs_fattr *fattr = NULL; - struct nfs4_label *label = NULL; int error; - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - } nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); inode = d_inode(dentry); - if (!inode) { - if (nfs_neg_need_reval(dir, dentry, flags)) { - if (flags & LOOKUP_RCU) - return -ECHILD; - goto out_bad; - } - goto out_valid; - } + if (!inode) + return nfs_lookup_revalidate_negative(dir, dentry, flags); if (is_bad_inode(inode)) { - if (flags & LOOKUP_RCU) - return -ECHILD; dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n", __func__, dentry); goto out_bad; } if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) - goto out_set_verifier; + return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* Force a full look up iff the parent directory has changed */ if (!(flags & (LOOKUP_EXCL | LOOKUP_REVAL)) && nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) { error = nfs_lookup_verify_inode(inode, flags); if (error) { - if (flags & LOOKUP_RCU) - return -ECHILD; if (error == -ESTALE) - goto out_zap_parent; - goto out_error; + nfs_zap_caches(dir); + goto out_bad; } nfs_advise_use_readdirplus(dir); goto out_valid; @@ -1146,81 +1218,45 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (NFS_STALE(inode)) goto out_bad; - error = -ENOMEM; - fhandle = nfs_alloc_fhandle(); - fattr = nfs_alloc_fattr(); - if (fhandle == NULL || fattr == NULL) - goto out_error; - - label = nfs4_label_alloc(NFS_SERVER(inode), GFP_NOWAIT); - if (IS_ERR(label)) - goto out_error; - trace_nfs_lookup_revalidate_enter(dir, dentry, flags); - error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); + error = nfs_lookup_revalidate_dentry(dir, dentry, inode); trace_nfs_lookup_revalidate_exit(dir, dentry, flags, error); - if (error == -ESTALE || error == -ENOENT) - goto out_bad; - if (error) - goto out_error; - if (nfs_compare_fh(NFS_FH(inode), fhandle)) - goto out_bad; - if ((error = nfs_refresh_inode(inode, fattr)) != 0) - goto out_bad; - - nfs_setsecurity(inode, fattr, label); - - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); + return error; +out_valid: + return nfs_lookup_revalidate_done(dir, dentry, inode, 1); +out_bad: + if (flags & LOOKUP_RCU) + return -ECHILD; + return nfs_lookup_revalidate_done(dir, dentry, inode, 0); +} - /* set a readdirplus hint that we had a cache miss */ - nfs_force_use_readdirplus(dir); +static int +__nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, + int (*reval)(struct inode *, struct dentry *, unsigned int)) +{ + struct dentry *parent; + struct inode *dir; + int ret; -out_set_verifier: - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - out_valid: if (flags & LOOKUP_RCU) { + parent = READ_ONCE(dentry->d_parent); + dir = d_inode_rcu(parent); + if (!dir) + return -ECHILD; + ret = reval(dir, dentry, flags); if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; - } else + } else { + parent = dget_parent(dentry); + ret = reval(d_inode(parent), dentry, flags); dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", - __func__, dentry); 
- return 1; -out_zap_parent: - nfs_zap_caches(dir); - out_bad: - WARN_ON(flags & LOOKUP_RCU); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); - nfs_mark_for_revalidate(dir); - if (inode && S_ISDIR(inode->i_mode)) { - /* Purge readdir caches. */ - nfs_zap_caches(inode); - /* - * We can't d_drop the root of a disconnected tree: - * its d_hash is on the s_anon list and d_drop() would hide - * it from shrink_dcache_for_unmount(), leading to busy - * inodes on unmount and further oopses. - */ - if (IS_ROOT(dentry)) - goto out_valid; } - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", - __func__, dentry); - return 0; -out_error: - WARN_ON(flags & LOOKUP_RCU); - nfs_free_fattr(fattr); - nfs_free_fhandle(fhandle); - nfs4_label_free(label); - dput(parent); - dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) lookup returned error %d\n", - __func__, dentry, error); - return error; + return ret; +} + +static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate); } /* @@ -1579,62 +1615,55 @@ no_open: } EXPORT_SYMBOL_GPL(nfs_atomic_open); -static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +static int +nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, + unsigned int flags) { struct inode *inode; - int ret = 0; if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) - goto no_open; + goto full_reval; if (d_mountpoint(dentry)) - goto no_open; - if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) - goto no_open; + goto full_reval; inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. */ - if (inode == NULL) { - struct dentry *parent; - struct inode *dir; - - if (flags & LOOKUP_RCU) { - parent = READ_ONCE(dentry->d_parent); - dir = d_inode_rcu(parent); - if (!dir) - return -ECHILD; - } else { - parent = dget_parent(dentry); - dir = d_inode(parent); - } - if (!nfs_neg_need_reval(dir, dentry, flags)) - ret = 1; - else if (flags & LOOKUP_RCU) - ret = -ECHILD; - if (!(flags & LOOKUP_RCU)) - dput(parent); - else if (parent != READ_ONCE(dentry->d_parent)) - return -ECHILD; - goto out; - } + if (inode == NULL) + goto full_reval; + + if (NFS_PROTO(dir)->have_delegation(inode, FMODE_READ)) + return nfs_lookup_revalidate_delegated(dir, dentry, inode); /* NFS only supports OPEN on regular files */ if (!S_ISREG(inode->i_mode)) - goto no_open; + goto full_reval; + /* We cannot do exclusive creation on a positive dentry */ - if (flags & LOOKUP_EXCL) - goto no_open; + if (flags & (LOOKUP_EXCL | LOOKUP_REVAL)) + goto reval_dentry; + + /* Check if the directory changed */ + if (!nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) + goto reval_dentry; /* Let f_op->open() actually open (and revalidate) the file */ - ret = 1; + return 1; +reval_dentry: + if (flags & LOOKUP_RCU) + return -ECHILD; + return nfs_lookup_revalidate_dentry(dir, dentry, inode);; -out: - return ret; +full_reval: + return nfs_do_lookup_revalidate(dir, dentry, flags); +} -no_open: - return nfs_lookup_revalidate(dentry, flags); +static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) +{ + return __nfs_lookup_revalidate(dentry, flags, + nfs4_do_lookup_revalidate); } #endif /* CONFIG_NFSV4 */ diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d175724ff566..61f46facb39c 100644 --- a/fs/nfs/filelayout/filelayout.c +++ 
b/fs/nfs/filelayout/filelayout.c @@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, .alloc_lseg = filelayout_alloc_lseg, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index cae43333ef16..86bcba40ca61 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -2356,6 +2356,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, .flags = PNFS_LAYOUTGET_ON_OPEN, + .max_layoutget_response = 4096, /* 1 page or so... */ .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr = ff_layout_free_layout_hdr, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 59aa04976331..74d8d5352438 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -453,7 +453,7 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); struct rpc_cred *cred; - if (mirror) { + if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); if (!cred) cred = get_rpccred(mdscred); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b65aee481d13..5b1eee4952b7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -857,15 +857,14 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { - struct nfs_lock_context *head = &ctx->lock_context; - struct nfs_lock_context *pos = head; + struct nfs_lock_context *pos; - do { + list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) { if (pos->lockowner != current->files) continue; - refcount_inc(&pos->count); - return pos; - } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); + if (refcount_inc_not_zero(&pos->count)) + return pos; + } return NULL; } @@ -874,10 +873,10 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) struct nfs_lock_context *res, *new = NULL; struct inode *inode = d_inode(ctx->dentry); - spin_lock(&inode->i_lock); + rcu_read_lock(); res = __nfs_find_lock_context(ctx); + rcu_read_unlock(); if (res == NULL) { - spin_unlock(&inode->i_lock); new = kmalloc(sizeof(*new), GFP_KERNEL); if (new == NULL) return ERR_PTR(-ENOMEM); @@ -885,14 +884,14 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); if (res == NULL) { - list_add_tail(&new->list, &ctx->lock_context.list); + list_add_tail_rcu(&new->list, &ctx->lock_context.list); new->open_context = ctx; res = new; new = NULL; } + spin_unlock(&inode->i_lock); + kfree(new); } - spin_unlock(&inode->i_lock); - kfree(new); return res; } EXPORT_SYMBOL_GPL(nfs_get_lock_context); @@ -904,9 +903,9 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; - list_del(&l_ctx->list); + list_del_rcu(&l_ctx->list); spin_unlock(&inode->i_lock); - kfree(l_ctx); + kfree_rcu(l_ctx, rcu_head); } EXPORT_SYMBOL_GPL(nfs_put_lock_context); @@ 
-978,9 +977,9 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { - if (ctx != NULL) - refcount_inc(&ctx->lock_context.count); - return ctx; + if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count)) + return ctx; + return NULL; } EXPORT_SYMBOL_GPL(get_nfs_open_context); @@ -989,13 +988,13 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; + if (!refcount_dec_and_test(&ctx->lock_context.count)) + return; if (!list_empty(&ctx->list)) { - if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) - return; - list_del(&ctx->list); + spin_lock(&inode->i_lock); + list_del_rcu(&ctx->list); spin_unlock(&inode->i_lock); - } else if (!refcount_dec_and_test(&ctx->lock_context.count)) - return; + } if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); if (ctx->cred != NULL) @@ -1003,7 +1002,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) dput(ctx->dentry); nfs_sb_deactive(sb); kfree(ctx->mdsthreshold); - kfree(ctx); + kfree_rcu(ctx, rcu_head); } void put_nfs_open_context(struct nfs_open_context *ctx) @@ -1027,10 +1026,7 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); - if (ctx->mode & FMODE_WRITE) - list_add(&ctx->list, &nfsi->open_files); - else - list_add_tail(&ctx->list, &nfsi->open_files); + list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); @@ -1051,16 +1047,17 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; - spin_lock(&inode->i_lock); - list_for_each_entry(pos, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, &nfsi->open_files, list) { if (cred != NULL && pos->cred != cred) continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; ctx = get_nfs_open_context(pos); - break; + if (ctx) + break; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); return ctx; } @@ -1078,9 +1075,6 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx->error < 0) invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; - spin_lock(&inode->i_lock); - list_move_tail(&ctx->list, &NFS_I(inode)->open_files); - spin_unlock(&inode->i_lock); put_nfs_open_context_sync(ctx); } } @@ -1329,19 +1323,11 @@ static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; - assert_spin_locked(&inode->i_lock); - if (!S_ISREG(inode->i_mode)) return false; if (list_empty(&nfsi->open_files)) return false; - /* Note: This relies on nfsi->open_files being ordered with writers - * being placed at the head of the list. 
- * See nfs_inode_attach_open_context() - */ - return (list_first_entry(&nfsi->open_files, - struct nfs_open_context, - list)->mode & FMODE_WRITE) == FMODE_WRITE; + return inode_is_open_for_write(inode); } static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ec8a9efa268f..71bc16225b98 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -786,6 +786,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; + struct nfs_server *server = NFS_SERVER(inode); if (hdr->pgio_done_cb != NULL) return hdr->pgio_done_cb(task, hdr); @@ -793,6 +794,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; + if (task->tk_status >= 0 && !server->read_hdrsize) + cmpxchg(&server->read_hdrsize, 0, hdr->res.replen); + nfs_invalidate_atime(inode); nfs_refresh_inode(inode, &hdr->fattr); return 0; @@ -802,6 +806,7 @@ static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr, struct rpc_message *msg) { msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; + hdr->args.replen = NFS_SERVER(hdr->inode)->read_hdrsize; } static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 64e4fa33d89f..78df4eb60f85 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -983,10 +983,11 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, const void *data) { const struct nfs_pgio_args *args = data; + unsigned int replen = args->replen ? args->replen : NFS3_readres_sz; encode_read3args(xdr, args); prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, NFS3_readres_sz); + args->count, replen); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -1364,10 +1365,12 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, encode_nfs_fh3(xdr, args->fh); encode_uint32(xdr, args->mask); - if (args->mask & (NFS_ACL | NFS_DFACL)) + if (args->mask & (NFS_ACL | NFS_DFACL)) { prepare_reply_buffer(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, ACL3_getaclres_sz); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; + } } static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, @@ -1673,9 +1676,11 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { struct nfs_pgio_res *result = data; + unsigned int pos; enum nfs_stat status; int error; + pos = xdr_stream_pos(xdr); error = decode_nfsstat3(xdr, &status); if (unlikely(error)) goto out; @@ -1685,6 +1690,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS3_OK) goto out_status; + result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3a6904173214..8d59c9655ec4 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -188,9 +188,10 @@ struct nfs4_state { unsigned int n_wronly; /* Number of write-only references */ unsigned int n_rdwr; /* Number of read/write references */ fmode_t state; /* State on the server (R,W, or RW) */ - atomic_t count; + refcount_t count; wait_queue_head_t waitq; + struct rcu_head rcu_head; }; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 146e30862234..8f53455c4765 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -950,10 +950,10 @@ 
EXPORT_SYMBOL_GPL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. - * Set the mount rsize and wsize with negotiated fore channel - * attributes which will be bound checked in nfs_server_set_fsinfo. + * Limit the mount rsize, wsize and dtsize using negotiated fore + * channel attributes. */ -static void nfs4_session_set_rwsize(struct nfs_server *server) +static void nfs4_session_limit_rwsize(struct nfs_server *server) { #ifdef CONFIG_NFS_V4_1 struct nfs4_session *sess; @@ -966,9 +966,11 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - if (!server->rsize || server->rsize > server_resp_sz) + if (server->dtsize > server_resp_sz) + server->dtsize = server_resp_sz; + if (server->rsize > server_resp_sz) server->rsize = server_resp_sz; - if (!server->wsize || server->wsize > server_rqst_sz) + if (server->wsize > server_rqst_sz) server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } @@ -1015,12 +1017,12 @@ static int nfs4_server_common_setup(struct nfs_server *server, (unsigned long long) server->fsid.minor); nfs_display_fhandle(mntfh, "Pseudo-fs root FH"); - nfs4_session_set_rwsize(server); - error = nfs_probe_fsinfo(server, mntfh, fattr); if (error < 0) goto out; + nfs4_session_limit_rwsize(server); + if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8220a168282e..db84b4adbc49 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1349,12 +1349,20 @@ static bool nfs4_mode_match_open_stateid(struct nfs4_state *state, return false; } -static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode) +static int can_open_cached(struct nfs4_state *state, fmode_t mode, + int open_mode, enum open_claim_type4 claim) { int ret = 0; if (open_mode & (O_EXCL|O_TRUNC)) goto out; + switch (claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + goto out; + default: + break; + } switch (mode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 @@ -1747,7 +1755,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) for (;;) { spin_lock(&state->owner->so_lock); - if (can_open_cached(state, fmode, open_mode)) { + if (can_open_cached(state, fmode, open_mode, claim)) { update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); goto out_return_state; @@ -1777,7 +1785,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) out: return ERR_PTR(ret); out_return_state: - atomic_inc(&state->count); + refcount_inc(&state->count); return state; } @@ -1849,7 +1857,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data) update: update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); - atomic_inc(&state->count); + refcount_inc(&state->count); return state; } @@ -1887,7 +1895,7 @@ nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data) return ERR_CAST(inode); if (data->state != NULL && data->state->inode == inode) { state = data->state; - atomic_inc(&state->count); + refcount_inc(&state->count); } else state = nfs4_get_open_state(inode, data->owner); iput(inode); @@ -1933,23 +1941,41 @@ nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) return ret; } -static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state 
*state) +static struct nfs_open_context * +nfs4_state_find_open_context_mode(struct nfs4_state *state, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_open_context *ctx; - spin_lock(&state->inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { if (ctx->state != state) continue; - get_nfs_open_context(ctx); - spin_unlock(&state->inode->i_lock); + if ((ctx->mode & mode) != mode) + continue; + if (!get_nfs_open_context(ctx)) + continue; + rcu_read_unlock(); return ctx; } - spin_unlock(&state->inode->i_lock); + rcu_read_unlock(); return ERR_PTR(-ENOENT); } +static struct nfs_open_context * +nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + + ctx = nfs4_state_find_open_context_mode(state, FMODE_READ|FMODE_WRITE); + if (!IS_ERR(ctx)) + return ctx; + ctx = nfs4_state_find_open_context_mode(state, FMODE_WRITE); + if (!IS_ERR(ctx)) + return ctx; + return nfs4_state_find_open_context_mode(state, FMODE_READ); +} + static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context *ctx, struct nfs4_state *state, enum open_claim_type4 claim) { @@ -1960,7 +1986,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; - atomic_inc(&state->count); + refcount_inc(&state->count); return opendata; } @@ -2276,7 +2302,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) if (data->state != NULL) { struct nfs_delegation *delegation; - if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags)) + if (can_open_cached(data->state, data->o_arg.fmode, + data->o_arg.open_flags, claim)) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 40a08cd483f0..62ae0fd345ad 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -655,7 +655,7 @@ nfs4_alloc_open_state(void) state = kzalloc(sizeof(*state), GFP_NOFS); if (!state) return NULL; - atomic_set(&state->count, 1); + refcount_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); seqlock_init(&state->seqlock); @@ -684,12 +684,12 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) struct nfs_inode *nfsi = NFS_I(inode); struct nfs4_state *state; - list_for_each_entry(state, &nfsi->open_states, inode_states) { + list_for_each_entry_rcu(state, &nfsi->open_states, inode_states) { if (state->owner != owner) continue; if (!nfs4_valid_open_stateid(state)) continue; - if (atomic_inc_not_zero(&state->count)) + if (refcount_inc_not_zero(&state->count)) return state; } return NULL; @@ -698,7 +698,7 @@ __nfs4_find_state_byowner(struct inode *inode, struct nfs4_state_owner *owner) static void nfs4_free_open_state(struct nfs4_state *state) { - kfree(state); + kfree_rcu(state, rcu_head); } struct nfs4_state * @@ -707,9 +707,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) struct nfs4_state *state, *new; struct nfs_inode *nfsi = NFS_I(inode); - spin_lock(&inode->i_lock); + rcu_read_lock(); state = __nfs4_find_state_byowner(inode, owner); - spin_unlock(&inode->i_lock); + rcu_read_unlock(); if (state) goto out; new = nfs4_alloc_open_state(); @@ -720,7 +720,7 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state = new; state->owner = owner; 
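nfs4_state_find_open_context() above now retries the lookup with progressively weaker mode requirements: a read-write context first, then write-only, then read-only. A compact sketch of that fallback order, assuming simplified types and illustrative FMODE_* values:

#include <stddef.h>

#define FMODE_READ	0x1u
#define FMODE_WRITE	0x2u

struct open_ctx { unsigned int mode; };

/* Return the first context whose mode covers every bit in 'want'. */
static struct open_ctx *find_ctx_mode(struct open_ctx *v, size_t n,
				      unsigned int want)
{
	for (size_t i = 0; i < n; i++)
		if ((v[i].mode & want) == want)
			return &v[i];
	return NULL;
}

/* Prefer a read-write context, then write-only, then read-only. */
static struct open_ctx *find_ctx(struct open_ctx *v, size_t n)
{
	struct open_ctx *c = find_ctx_mode(v, n, FMODE_READ | FMODE_WRITE);

	if (!c)
		c = find_ctx_mode(v, n, FMODE_WRITE);
	if (!c)
		c = find_ctx_mode(v, n, FMODE_READ);
	return c;
}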
atomic_inc(&owner->so_count); - list_add(&state->inode_states, &nfsi->open_states); + list_add_rcu(&state->inode_states, &nfsi->open_states); ihold(inode); state->inode = inode; spin_unlock(&inode->i_lock); @@ -743,10 +743,10 @@ void nfs4_put_open_state(struct nfs4_state *state) struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; - if (!atomic_dec_and_lock(&state->count, &owner->so_lock)) + if (!refcount_dec_and_lock(&state->count, &owner->so_lock)) return; spin_lock(&inode->i_lock); - list_del(&state->inode_states); + list_del_rcu(&state->inode_states); list_del(&state->open_states); spin_unlock(&inode->i_lock); spin_unlock(&owner->so_lock); @@ -1437,8 +1437,8 @@ void nfs_inode_find_state_and_recover(struct inode *inode, struct nfs4_state *state; bool found = false; - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -1456,7 +1456,7 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_state_mark_reclaim_nograce(clp, state)) found = true; } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); nfs_inode_find_delegation_state_and_recover(inode, stateid); if (found) @@ -1469,13 +1469,13 @@ static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { if (ctx->state != state) continue; set_bit(NFS_CONTEXT_BAD, &ctx->flags); } - spin_unlock(&inode->i_lock); + rcu_read_unlock(); } static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) @@ -1549,10 +1549,62 @@ out: return status; } +#ifdef CONFIG_NFS_V4_2 +static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs4_copy_state *copy; + + if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) + return; + + spin_lock(&sp->so_server->nfs_client->cl_lock); + list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { + if (nfs4_stateid_match_other(&state->stateid, ©->parent_state->stateid)) + continue; + copy->flags = 1; + complete(©->completion); + break; + } + spin_unlock(&sp->so_server->nfs_client->cl_lock); +} +#else /* !CONFIG_NFS_V4_2 */ +static inline void nfs42_complete_copies(struct nfs4_state_owner *sp, + struct nfs4_state *state) +{ +} +#endif /* CONFIG_NFS_V4_2 */ + +static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_state *state, + const struct nfs4_state_recovery_ops *ops) +{ + struct nfs4_lock_state *lock; + int status; + + status = ops->recover_open(sp, state); + if (status < 0) + return status; + + status = nfs4_reclaim_locks(state, ops); + if (status < 0) + return status; + + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { + spin_lock(&state->state_lock); + list_for_each_entry(lock, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) + pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); + } + spin_unlock(&state->state_lock); + } + + nfs42_complete_copies(sp, state); + clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + return status; +} + static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops) { struct nfs4_state *state; - struct nfs4_lock_state *lock; int status = 0; /* Note: we rely on 
the sp->so_states list being ordered @@ -1573,79 +1625,45 @@ restart: continue; if (state->state == 0) continue; - atomic_inc(&state->count); + refcount_inc(&state->count); spin_unlock(&sp->so_lock); - status = ops->recover_open(sp, state); - if (status >= 0) { - status = nfs4_reclaim_locks(state, ops); - if (status >= 0) { - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { - spin_lock(&state->state_lock); - list_for_each_entry(lock, &state->lock_states, ls_locks) { - if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) - pr_warn_ratelimited("NFS: " - "%s: Lock reclaim " - "failed!\n", __func__); - } - spin_unlock(&state->state_lock); - } - clear_bit(NFS_STATE_RECLAIM_NOGRACE, - &state->flags); -#ifdef CONFIG_NFS_V4_2 - if (test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) { - struct nfs4_copy_state *copy; - - spin_lock(&sp->so_server->nfs_client->cl_lock); - list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { - if (memcmp(&state->stateid.other, ©->parent_state->stateid.other, NFS4_STATEID_SIZE)) - continue; - copy->flags = 1; - complete(©->completion); - printk("AGLO: server rebooted waking up the copy\n"); - break; - } - spin_unlock(&sp->so_server->nfs_client->cl_lock); - } -#endif /* CONFIG_NFS_V4_2 */ - nfs4_put_open_state(state); - spin_lock(&sp->so_lock); - goto restart; - } - } + status = __nfs4_reclaim_open_state(sp, state, ops); + switch (status) { - default: - printk(KERN_ERR "NFS: %s: unhandled error %d\n", - __func__, status); - /* Fall through */ - case -ENOENT: - case -ENOMEM: - case -EACCES: - case -EROFS: - case -EIO: - case -ESTALE: - /* Open state on this file cannot be recovered */ - nfs4_state_mark_recovery_failed(state, status); - break; - case -EAGAIN: - ssleep(1); - /* Fall through */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_OLD_STATEID: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_RECLAIM_BAD: - case -NFS4ERR_RECLAIM_CONFLICT: - nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + default: + if (status >= 0) break; - case -NFS4ERR_EXPIRED: - case -NFS4ERR_NO_GRACE: - nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - goto out_err; + printk(KERN_ERR "NFS: %s: unhandled error %d\n", __func__, status); + /* Fall through */ + case -ENOENT: + case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: + case -ESTALE: + /* Open state on this file cannot be recovered */ + nfs4_state_mark_recovery_failed(state, status); + break; + case -EAGAIN: + ssleep(1); + /* Fall through */ + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out_err; } nfs4_put_open_state(state); spin_lock(&sp->so_lock); @@ -1795,38 +1813,38 @@ static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { - case 0: - break; - case -NFS4ERR_CB_PATH_DOWN: - 
nfs40_handle_cb_pathdown(clp); - break; - case -NFS4ERR_NO_GRACE: - nfs4_state_end_reclaim_reboot(clp); - break; - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_reboot(clp); - break; - case -NFS4ERR_EXPIRED: - set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_start_reclaim_nograce(clp); - break; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_DEADSESSION: - case -NFS4ERR_SEQ_FALSE_RETRY: - case -NFS4ERR_SEQ_MISORDERED: - set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); - /* Zero session reset errors */ - break; - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); - break; - default: - dprintk("%s: failed to handle error %d for server %s\n", - __func__, error, clp->cl_hostname); - return error; + case 0: + break; + case -NFS4ERR_CB_PATH_DOWN: + nfs40_handle_cb_pathdown(clp); + break; + case -NFS4ERR_NO_GRACE: + nfs4_state_end_reclaim_reboot(clp); + break; + case -NFS4ERR_STALE_CLIENTID: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_reboot(clp); + break; + case -NFS4ERR_EXPIRED: + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + nfs4_state_start_reclaim_nograce(clp); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + /* Zero session reset errors */ + break; + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); + break; + default: + dprintk("%s: failed to handle error %d for server %s\n", + __func__, error, clp->cl_hostname); + return error; } dprintk("%s: handled error %d for server %s\n", __func__, error, clp->cl_hostname); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index b7bde12d8cd5..2fc8f6fa25e4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3516,7 +3516,7 @@ static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh) { __be32 *p; - int len; + u32 len; if (fh != NULL) memset(fh, 0, sizeof(*fh)); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index bb5476a6d264..5c4568a0804b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -63,14 +63,14 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) { - spin_lock(&hdr->lock); - if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags) - || pos < hdr->io_start + hdr->good_bytes) { + unsigned int new = pos - hdr->io_start; + + if (hdr->good_bytes > new) { + hdr->good_bytes = new; clear_bit(NFS_IOHDR_EOF, &hdr->flags); - hdr->good_bytes = pos - hdr->io_start; - hdr->error = error; + if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)) + hdr->error = error; } - spin_unlock(&hdr->lock); } static inline struct nfs_page * @@ -494,7 +494,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) if (hdr) { INIT_LIST_HEAD(&hdr->pages); - spin_lock_init(&hdr->lock); hdr->rw_ops = ops; } return hdr; @@ -1111,6 +1110,20 @@ static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc, return ret; } +static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) +{ + u32 midx; + struct nfs_pgio_mirror *mirror; + + if (!desc->pg_error) + return; + + for (midx = 0; midx < desc->pg_mirror_count; 
midx++) { + mirror = &desc->pg_mirrors[midx]; + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + } +} + int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { @@ -1161,25 +1174,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, return 1; out_failed: - /* - * We might have failed before sending any reqs over wire. - * Clean up rest of the reqs in mirror pg_list. - */ - if (desc->pg_error) { - struct nfs_pgio_mirror *mirror; - void (*func)(struct list_head *); - - /* remember fatal errors */ - if (nfs_error_is_fatal(desc->pg_error)) - nfs_context_set_write_error(req->wb_context, - desc->pg_error); - - func = desc->pg_completion_ops->error_cleanup; - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; - func(&mirror->pg_list); - } - } + nfs_pageio_error_cleanup(desc); return 0; } @@ -1251,6 +1246,8 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) for (midx = 0; midx < desc->pg_mirror_count; midx++) nfs_pageio_complete_mirror(desc, midx); + if (desc->pg_error < 0) + nfs_pageio_error_cleanup(desc); if (desc->pg_ops->pg_cleanup) desc->pg_ops->pg_cleanup(desc); nfs_pageio_cleanup_mirroring(desc); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7d9a51e6b847..06cb90e9bc6e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -965,7 +965,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) struct page **pages; int i; - pages = kcalloc(size, sizeof(struct page *), gfp_flags); + pages = kmalloc_array(size, sizeof(struct page *), gfp_flags); if (!pages) { dprintk("%s: can't alloc array of %zu pages\n", __func__, size); return NULL; @@ -975,7 +975,7 @@ static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) pages[i] = alloc_page(gfp_flags); if (!pages[i]) { dprintk("%s: failed to allocate page\n", __func__); - nfs4_free_pages(pages, size); + nfs4_free_pages(pages, i); return NULL; } } @@ -991,6 +991,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, gfp_t gfp_flags) { struct nfs_server *server = pnfs_find_server(ino, ctx); + size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response; size_t max_pages = max_response_pages(server); struct nfs4_layoutget *lgp; @@ -1000,6 +1001,12 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, if (lgp == NULL) return NULL; + if (max_reply_sz) { + size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages < max_pages) + max_pages = npages; + } + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); if (!lgp->args.layout.pages) { kfree(lgp); @@ -1332,6 +1339,7 @@ bool pnfs_roc(struct inode *ino, if (!nfs_have_layout(ino)) return false; retry: + rcu_read_lock(); spin_lock(&ino->i_lock); lo = nfsi->layout; if (!lo || !pnfs_layout_is_valid(lo) || @@ -1342,6 +1350,7 @@ retry: pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { spin_unlock(&ino->i_lock); + rcu_read_unlock(); wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); @@ -1355,7 +1364,7 @@ retry: skip_read = true; } - list_for_each_entry(ctx, &nfsi->open_files, list) { + list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { state = ctx->state; if (state == NULL) continue; @@ -1403,6 +1412,7 @@ retry: out_noroc: spin_unlock(&ino->i_lock); + rcu_read_unlock(); pnfs_layoutcommit_inode(ino, true); if (roc) { struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index ece367ebde69..e2e9fcd5341d 100644 --- 
a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -125,6 +125,7 @@ struct pnfs_layoutdriver_type { struct module *owner; unsigned flags; unsigned max_deviceinfo_size; + unsigned max_layoutget_response; int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); int (*clear_layoutdriver) (struct nfs_server *); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 48d7277c60a9..f9f19784db82 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -276,16 +276,14 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_header *hdr) { if (hdr->res.eof) { - loff_t bound; + loff_t pos = hdr->args.offset + hdr->res.count; + unsigned int new = pos - hdr->io_start; - bound = hdr->args.offset + hdr->res.count; - spin_lock(&hdr->lock); - if (bound < hdr->io_start + hdr->good_bytes) { + if (hdr->good_bytes > new) { + hdr->good_bytes = new; set_bit(NFS_IOHDR_EOF, &hdr->flags); clear_bit(NFS_IOHDR_ERROR, &hdr->flags); - hdr->good_bytes = bound - hdr->io_start; } - spin_unlock(&hdr->lock); } else if (hdr->res.count < hdr->args.count) nfs_readpage_retry(task, hdr); } diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index ebb24a314f43..de99db518571 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -168,24 +168,18 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, ctxt->newbh = NULL; if (inode->i_blkbits == PAGE_SHIFT) { - lock_page(obh->b_page); - /* - * We cannot call radix_tree_preload for the kernels older - * than 2.6.23, because it is not exported for modules. - */ + struct page *opage = obh->b_page; + lock_page(opage); retry: - err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (err) - goto failed_unlock; /* BUG_ON(oldkey != obh->b_page->index); */ - if (unlikely(oldkey != obh->b_page->index)) - NILFS_PAGE_BUG(obh->b_page, + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); xa_lock_irq(&btnc->i_pages); - err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS); xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until @@ -193,7 +187,6 @@ retry: * To protect the page in intermediate state, the page lock * is held. */ - radix_tree_preload_end(); if (!err) return 0; else if (err != -EEXIST) @@ -203,7 +196,7 @@ retry: if (!err) goto retry; /* fallback to copy mode */ - unlock_page(obh->b_page); + unlock_page(opage); } nbh = nilfs_btnode_create_block(btnc, newkey); @@ -243,9 +236,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, mark_buffer_dirty(obh); xa_lock_irq(&btnc->i_pages); - radix_tree_delete(&btnc->i_pages, oldkey); - radix_tree_tag_set(&btnc->i_pages, newkey, - PAGECACHE_TAG_DIRTY); + __xa_erase(&btnc->i_pages, oldkey); + __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; @@ -275,7 +267,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ xa_lock_irq(&btnc->i_pages); - radix_tree_delete(&btnc->i_pages, newkey); + __xa_erase(&btnc->i_pages, newkey); xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 329a056b73b1..d7fc8d369d89 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -289,7 +289,7 @@ repeat: * @dmap: destination page cache * @smap: source page cache * - * No pages must no be added to the cache during this process. 
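The nfs4_alloc_pages() hunk above fixes a cleanup bug: when page i fails to allocate, only the i pages that were actually obtained may be freed, not the full array length. A userspace sketch of that allocate-or-unwind pattern, assuming plain malloc/free:

#include <stdlib.h>

/* Allocate n buffers of bufsize bytes; on failure release only what exists. */
static void **alloc_buffers(size_t n, size_t bufsize)
{
	void **v = calloc(n, sizeof(*v));
	size_t i;

	if (!v)
		return NULL;
	for (i = 0; i < n; i++) {
		v[i] = malloc(bufsize);
		if (!v[i]) {
			/* mirrors nfs4_free_pages(pages, i): free i, not n */
			while (i--)
				free(v[i]);
			free(v);
			return NULL;
		}
	}
	return v;
}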
+ * No pages must be added to the cache during this process. * This must be ensured by the caller. */ void nilfs_copy_back_pages(struct address_space *dmap, @@ -298,7 +298,6 @@ void nilfs_copy_back_pages(struct address_space *dmap, struct pagevec pvec; unsigned int i, n; pgoff_t index = 0; - int err; pagevec_init(&pvec); repeat: @@ -313,35 +312,34 @@ repeat: lock_page(page); dpage = find_lock_page(dmap, offset); if (dpage) { - /* override existing page on the destination cache */ + /* overwrite existing page in the destination cache */ WARN_ON(PageDirty(dpage)); nilfs_copy_page(dpage, page, 0); unlock_page(dpage); put_page(dpage); + /* Do we not need to remove page from smap here? */ } else { - struct page *page2; + struct page *p; /* move the page to the destination cache */ xa_lock_irq(&smap->i_pages); - page2 = radix_tree_delete(&smap->i_pages, offset); - WARN_ON(page2 != page); - + p = __xa_erase(&smap->i_pages, offset); + WARN_ON(page != p); smap->nrpages--; xa_unlock_irq(&smap->i_pages); xa_lock_irq(&dmap->i_pages); - err = radix_tree_insert(&dmap->i_pages, offset, page); - if (unlikely(err < 0)) { - WARN_ON(err == -EEXIST); + p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS); + if (unlikely(p)) { + /* Probably -ENOMEM */ page->mapping = NULL; - put_page(page); /* for cache */ + put_page(page); } else { page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->i_pages, - offset, - PAGECACHE_TAG_DIRTY); + __xa_set_mark(&dmap->i_pages, offset, + PAGECACHE_TAG_DIRTY); } xa_unlock_irq(&dmap->i_pages); } @@ -467,8 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page) if (mapping) { xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->i_pages, - page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 94b52157bf8d..5769cf3ff035 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -25,7 +25,7 @@ static bool should_merge(struct fsnotify_event *old_fsn, old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid && + if (old_fsn->inode == new_fsn->inode && old->pid == new->pid && old->path.mnt == new->path.mnt && old->path.dentry == new->path.dentry) return true; @@ -131,8 +131,8 @@ static bool fanotify_should_send_event(struct fsnotify_iter_info *iter_info, !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return false; - if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask & - ~marks_ignored_mask) + if (event_mask & FANOTIFY_OUTGOING_EVENTS & + marks_mask & ~marks_ignored_mask) return true; return false; @@ -171,7 +171,10 @@ struct fanotify_event_info *fanotify_alloc_event(struct fsnotify_group *group, goto out; init: __maybe_unused fsnotify_init_event(&event->fse, inode, mask); - event->tgid = get_pid(task_tgid(current)); + if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) + event->pid = get_pid(task_pid(current)); + else + event->pid = get_pid(task_tgid(current)); if (path) { event->path = *path; path_get(&event->path); @@ -205,6 +208,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 10); + if (!fanotify_should_send_event(iter_info, mask, data, data_type)) return 0; @@ -236,7 
+241,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, ret = fsnotify_add_event(group, fsn_event, fanotify_merge); if (ret) { /* Permission events shouldn't be merged */ - BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); + BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); @@ -268,7 +273,7 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) event = FANOTIFY_E(fsn_event); path_put(&event->path); - put_pid(event->tgid); + put_pid(event->pid); if (fanotify_is_perm_event(fsn_event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PE(fsn_event)); diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 8609ba06f474..ea05b8a401e7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -19,7 +19,7 @@ struct fanotify_event_info { * during this object's lifetime */ struct path path; - struct pid *tgid; + struct pid *pid; }; /* @@ -44,7 +44,7 @@ FANOTIFY_PE(struct fsnotify_event *fse) static inline bool fanotify_is_perm_event(u32 mask) { return IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS) && - mask & FAN_ALL_PERM_EVENTS; + mask & FANOTIFY_PERM_EVENTS; } static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 69054886915b..e03be5071362 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -131,8 +131,8 @@ static int fill_event_metadata(struct fsnotify_group *group, metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->reserved = 0; - metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS; - metadata->pid = pid_vnr(event->tgid); + metadata->mask = fsn_event->mask & FANOTIFY_OUTGOING_EVENTS; + metadata->pid = pid_vnr(event->pid); if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW)) metadata->fd = FAN_NOFD; else { @@ -191,7 +191,7 @@ static int process_access_response(struct fsnotify_group *group, if (fd < 0) return -EINVAL; - if ((response & FAN_AUDIT) && !group->fanotify_data.audit) + if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL; event = dequeue_event(group, fd); @@ -395,7 +395,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ while (!fsnotify_notify_queue_is_empty(group)) { fsn_event = fsnotify_remove_first_event(group); - if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS)) { + if (!(fsn_event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); fsnotify_destroy_event(group, fsn_event); spin_lock(&group->notification_lock); @@ -506,18 +506,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { - __u32 tmask = fsn_mark->mask & ~mask; - - if (flags & FAN_MARK_ONDIR) - tmask &= ~FAN_ONDIR; - oldmask = fsn_mark->mask; - fsn_mark->mask = tmask; + fsn_mark->mask &= ~mask; } else { - __u32 tmask = fsn_mark->ignored_mask & ~mask; - if (flags & FAN_MARK_ONDIR) - tmask &= ~FAN_ONDIR; - fsn_mark->ignored_mask = tmask; + fsn_mark->ignored_mask &= ~mask; } *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); @@ -563,6 +555,13 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, mask, flags); } +static int fanotify_remove_sb_mark(struct fsnotify_group *group, + struct super_block *sb, __u32 mask, + 
unsigned int flags) +{ + return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask, flags); +} + static int fanotify_remove_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags) @@ -579,19 +578,10 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { - __u32 tmask = fsn_mark->mask | mask; - - if (flags & FAN_MARK_ONDIR) - tmask |= FAN_ONDIR; - oldmask = fsn_mark->mask; - fsn_mark->mask = tmask; + fsn_mark->mask |= mask; } else { - __u32 tmask = fsn_mark->ignored_mask | mask; - if (flags & FAN_MARK_ONDIR) - tmask |= FAN_ONDIR; - - fsn_mark->ignored_mask = tmask; + fsn_mark->ignored_mask |= mask; if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } @@ -658,6 +648,14 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags); } +static int fanotify_add_sb_mark(struct fsnotify_group *group, + struct super_block *sb, __u32 mask, + unsigned int flags) +{ + return fanotify_add_mark(group, &sb->s_fsnotify_marks, + FSNOTIFY_OBJ_TYPE_SB, mask, flags); +} + static int fanotify_add_inode_mark(struct fsnotify_group *group, struct inode *inode, __u32 mask, unsigned int flags) @@ -686,16 +684,16 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) struct user_struct *user; struct fanotify_event_info *oevent; - pr_debug("%s: flags=%d event_f_flags=%d\n", - __func__, flags, event_f_flags); + pr_debug("%s: flags=%x event_f_flags=%x\n", + __func__, flags, event_f_flags); if (!capable(CAP_SYS_ADMIN)) return -EPERM; #ifdef CONFIG_AUDITSYSCALL - if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT)) + if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) #else - if (flags & ~FAN_ALL_INIT_FLAGS) + if (flags & ~FANOTIFY_INIT_FLAGS) #endif return -EINVAL; @@ -731,6 +729,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->fanotify_data.user = user; + group->fanotify_data.flags = flags; atomic_inc(&user->fanotify_listeners); group->memcg = get_mem_cgroup_from_mm(current->mm); @@ -746,7 +745,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); - switch (flags & FAN_ALL_CLASS_BITS) { + switch (flags & FANOTIFY_CLASS_BITS) { case FAN_CLASS_NOTIF: group->priority = FS_PRIO_0; break; @@ -783,7 +782,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) goto out_destroy_group; - group->fanotify_data.audit = true; } fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); @@ -805,7 +803,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct fd f; struct path path; - u32 valid_mask = FAN_ALL_EVENTS | FAN_EVENT_ON_CHILD; + u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; int ret; pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n", @@ -815,8 +814,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (mask & ((__u64)0xffffffff << 32)) return -EINVAL; - if (flags & ~FAN_ALL_MARK_FLAGS) + if (flags & ~FANOTIFY_MARK_FLAGS) + return -EINVAL; + + switch (mark_type) { + case 
FAN_MARK_INODE: + case FAN_MARK_MOUNT: + case FAN_MARK_FILESYSTEM: + break; + default: return -EINVAL; + } + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { case FAN_MARK_ADD: /* fallthrough */ case FAN_MARK_REMOVE: @@ -824,20 +833,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; break; case FAN_MARK_FLUSH: - if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH)) + if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH)) return -EINVAL; break; default: return -EINVAL; } - if (mask & FAN_ONDIR) { - flags |= FAN_MARK_ONDIR; - mask &= ~FAN_ONDIR; - } - if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) - valid_mask |= FAN_ALL_PERM_EVENTS; + valid_mask |= FANOTIFY_PERM_EVENTS; if (mask & ~valid_mask) return -EINVAL; @@ -857,14 +861,16 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * allowed to set permissions events. */ ret = -EINVAL; - if (mask & FAN_ALL_PERM_EVENTS && + if (mask & FANOTIFY_PERM_EVENTS && group->priority == FS_PRIO_0) goto fput_and_out; if (flags & FAN_MARK_FLUSH) { ret = 0; - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) fsnotify_clear_vfsmount_marks_by_group(group); + else if (mark_type == FAN_MARK_FILESYSTEM) + fsnotify_clear_sb_marks_by_group(group); else fsnotify_clear_inode_marks_by_group(group); goto fput_and_out; @@ -875,7 +881,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; /* inode held in place by reference to path; group by fget on fd */ - if (!(flags & FAN_MARK_MOUNT)) + if (mark_type == FAN_MARK_INODE) inode = path.dentry->d_inode; else mnt = path.mnt; @@ -883,14 +889,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, /* create/update an inode mark */ switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { case FAN_MARK_ADD: - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); + else if (mark_type == FAN_MARK_FILESYSTEM) + ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask, flags); else ret = fanotify_add_inode_mark(group, inode, mask, flags); break; case FAN_MARK_REMOVE: - if (flags & FAN_MARK_MOUNT) + if (mark_type == FAN_MARK_MOUNT) ret = fanotify_remove_vfsmount_mark(group, mnt, mask, flags); + else if (mark_type == FAN_MARK_FILESYSTEM) + ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask, flags); else ret = fanotify_remove_inode_mark(group, inode, mask, flags); break; @@ -934,6 +944,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, */ static int __init fanotify_user_setup(void) { + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); + fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC); diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 86fcf5814279..348a184bcdda 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -131,37 +131,20 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); + } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) { + struct super_block *sb = fsnotify_conn_sb(mark->connector); + + seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", + sb->s_dev, mflags, mark->mask, mark->ignored_mask); } } void fanotify_show_fdinfo(struct seq_file *m, struct file *f) 
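do_fanotify_mark() above now extracts a single mark type from the flags and rejects invalid combinations up front, so inode, mount and filesystem marks take clearly separated paths. A small sketch of that validation; the numeric values are placeholders, not the real fanotify uapi constants:

#include <stdbool.h>

#define MARK_MOUNT	0x10u
#define MARK_FILESYSTEM	0x100u
#define MARK_TYPE_BITS	(MARK_MOUNT | MARK_FILESYSTEM)
#define MARK_INODE	0x0u	/* "inode" is the absence of the other two */

/* Accept exactly one mark type, as the new switch (mark_type) does. */
static bool mark_type_valid(unsigned int flags)
{
	switch (flags & MARK_TYPE_BITS) {
	case MARK_INODE:
	case MARK_MOUNT:
	case MARK_FILESYSTEM:
		return true;
	default:	/* e.g. MOUNT and FILESYSTEM requested together */
		return false;
	}
}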
{ struct fsnotify_group *group = f->private_data; - unsigned int flags = 0; - - switch (group->priority) { - case FS_PRIO_0: - flags |= FAN_CLASS_NOTIF; - break; - case FS_PRIO_1: - flags |= FAN_CLASS_CONTENT; - break; - case FS_PRIO_2: - flags |= FAN_CLASS_PRE_CONTENT; - break; - } - - if (group->max_events == UINT_MAX) - flags |= FAN_UNLIMITED_QUEUE; - - if (group->fanotify_data.max_marks == UINT_MAX) - flags |= FAN_UNLIMITED_MARKS; - - if (group->fanotify_data.audit) - flags |= FAN_ENABLE_AUDIT; seq_printf(m, "fanotify flags:%x event-flags:%x\n", - flags, group->fanotify_data.f_flags); + group->fanotify_data.flags, group->fanotify_data.f_flags); show_fdinfo(m, f, fanotify_fdinfo); } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ababdbfab537..2172ba516c61 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -48,7 +48,7 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) * Called during unmount with no locks held, so needs to be safe against * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ -void fsnotify_unmount_inodes(struct super_block *sb) +static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; @@ -96,6 +96,15 @@ void fsnotify_unmount_inodes(struct super_block *sb) if (iput_inode) iput(iput_inode); + /* Wait for outstanding inode references from connectors */ + wait_var_event(&sb->s_fsnotify_inode_refs, + !atomic_long_read(&sb->s_fsnotify_inode_refs)); +} + +void fsnotify_sb_delete(struct super_block *sb) +{ + fsnotify_unmount_inodes(sb); + fsnotify_clear_marks_by_sb(sb); } /* @@ -190,7 +199,7 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; - __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignored_mask = 0; struct fsnotify_mark *mark; @@ -319,15 +328,17 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct fsnotify_iter_info iter_info = {}; - struct mount *mnt; + struct super_block *sb = NULL; + struct mount *mnt = NULL; + __u32 mnt_or_sb_mask = 0; int ret = 0; - /* global tests shouldn't care about events on child only the specific event */ - __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); - if (data_is == FSNOTIFY_EVENT_PATH) + if (data_is == FSNOTIFY_EVENT_PATH) { mnt = real_mount(((const struct path *)data)->mnt); - else - mnt = NULL; + sb = mnt->mnt.mnt_sb; + mnt_or_sb_mask = mnt->mnt_fsnotify_mask | sb->s_fsnotify_mask; + } /* * Optimization: srcu_read_lock() has a memory barrier which can @@ -337,16 +348,15 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * need SRCU to keep them "alive". */ if (!to_tell->i_fsnotify_marks && - (!mnt || !mnt->mnt_fsnotify_marks)) + (!mnt || (!mnt->mnt_fsnotify_marks && !sb->s_fsnotify_marks))) return 0; /* * if this is a modify event we may need to clear the ignored masks - * otherwise return if neither the inode nor the vfsmount care about + * otherwise return if neither the inode nor the vfsmount/sb care about * this type of event. 
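fsnotify() above folds the superblock mask into the existing inode and mount checks, so an event is dropped early unless at least one of the three watch points cares about it, while modify events are always let through so ignore masks can be cleared. A simplified sketch of that fast path; masks and names are illustrative:

#include <stdbool.h>

#define EV_MODIFY	0x2u

struct watch_masks {
	unsigned int inode_mask;
	unsigned int mnt_mask;
	unsigned int sb_mask;
};

static bool event_wanted(const struct watch_masks *w, unsigned int event)
{
	unsigned int mnt_or_sb_mask = w->mnt_mask | w->sb_mask;

	if (event & EV_MODIFY)		/* may need to clear ignore masks */
		return true;
	return (event & (w->inode_mask | mnt_or_sb_mask)) != 0;
}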
*/ if (!(mask & FS_MODIFY) && - !(test_mask & to_tell->i_fsnotify_mask) && - !(mnt && test_mask & mnt->mnt_fsnotify_mask)) + !(test_mask & (to_tell->i_fsnotify_mask | mnt_or_sb_mask))) return 0; iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); @@ -356,11 +366,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, if (mnt) { iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] = fsnotify_first_mark(&mnt->mnt_fsnotify_marks); + iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] = + fsnotify_first_mark(&sb->s_fsnotify_marks); } /* - * We need to merge inode & vfsmount mark lists so that inode mark - * ignore masks are properly reflected for mount mark notifications. + * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark + * ignore masks are properly reflected for mount/sb mark notifications. * That's why this traversal is so complicated... */ while (fsnotify_iter_select_report_types(&iter_info)) { @@ -386,7 +398,7 @@ static __init int fsnotify_init(void) { int ret; - BUG_ON(hweight32(ALL_FSNOTIFY_EVENTS) != 23); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 7902653dd577..5a00121fb219 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -21,6 +21,12 @@ static inline struct mount *fsnotify_conn_mount( return container_of(conn->obj, struct mount, mnt_fsnotify_marks); } +static inline struct super_block *fsnotify_conn_sb( + struct fsnotify_mark_connector *conn) +{ + return container_of(conn->obj, struct super_block, s_fsnotify_marks); +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -43,6 +49,11 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } +/* run the list of all marks associated with sb and destroy them */ +static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) +{ + fsnotify_destroy_marks(&sb->s_fsnotify_marks); +} /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ac6978d3208c..105576daca4a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -815,7 +815,7 @@ static int __init inotify_user_setup(void) BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); - BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22); + BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC|SLAB_ACCOUNT); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 59cdb27826de..d2dd16cb5989 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -115,6 +115,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) return &fsnotify_conn_inode(conn)->i_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; + else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) + return &fsnotify_conn_sb(conn)->s_fsnotify_mask; return NULL; } @@ -179,19 +181,24 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } -static struct inode *fsnotify_detach_connector_from_object( - struct fsnotify_mark_connector *conn) +static void *fsnotify_detach_connector_from_object( + struct fsnotify_mark_connector *conn, + unsigned int 
*type) { struct inode *inode = NULL; + *type = conn->type; if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) return NULL; if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; + atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs); } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; + } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { + fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } rcu_assign_pointer(*(conn->obj), NULL); @@ -211,10 +218,29 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) fsnotify_put_group(group); } +/* Drop object reference originally held by a connector */ +static void fsnotify_drop_object(unsigned int type, void *objp) +{ + struct inode *inode; + struct super_block *sb; + + if (!objp) + return; + /* Currently only inode references are passed to be dropped */ + if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) + return; + inode = objp; + sb = inode->i_sb; + iput(inode); + if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs)) + wake_up_var(&sb->s_fsnotify_inode_refs); +} + void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn; - struct inode *inode = NULL; + void *objp = NULL; + unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED; bool free_conn = false; /* Catch marks that were actually never attached to object */ @@ -234,7 +260,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) conn = mark->connector; hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { - inode = fsnotify_detach_connector_from_object(conn); + objp = fsnotify_detach_connector_from_object(conn, &type); free_conn = true; } else { __fsnotify_recalc_mask(conn); @@ -242,7 +268,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) mark->connector = NULL; spin_unlock(&conn->lock); - iput(inode); + fsnotify_drop_object(type, objp); if (free_conn) { spin_lock(&destroy_lock); @@ -709,7 +735,8 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp) { struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark, *old_mark = NULL; - struct inode *inode; + void *objp; + unsigned int type; conn = fsnotify_grab_connector(connp); if (!conn) @@ -735,11 +762,11 @@ void fsnotify_destroy_marks(fsnotify_connp_t *connp) * mark references get dropped. It would lead to strange results such * as delaying inode deletion or blocking unmount. */ - inode = fsnotify_detach_connector_from_object(conn); + objp = fsnotify_detach_connector_from_object(conn, &type); spin_unlock(&conn->lock); if (old_mark) fsnotify_put_mark(old_mark); - iput(inode); + fsnotify_drop_object(type, objp); } /* diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a342f008e42f..d1cbb27808e2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5106,8 +5106,6 @@ int ocfs2_split_extent(handle_t *handle, * rightmost extent list. 
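fsnotify_drop_object() above pairs each iput() with a decrement of s_fsnotify_inode_refs and wakes anyone sleeping in wait_var_event(), which is how fsnotify_sb_delete() can wait for connectors to release their inodes before the superblock is torn down. A userspace analogue of that drain pattern using pthreads; all names here are illustrative:

#include <pthread.h>

struct refwait {
	pthread_mutex_t lock;
	pthread_cond_t drained;
	long refs;	/* plays the role of s_fsnotify_inode_refs */
};

static void refwait_get(struct refwait *rw)
{
	pthread_mutex_lock(&rw->lock);
	rw->refs++;
	pthread_mutex_unlock(&rw->lock);
}

static void refwait_put(struct refwait *rw)
{
	pthread_mutex_lock(&rw->lock);
	if (--rw->refs == 0)
		pthread_cond_broadcast(&rw->drained);	/* wake_up_var() */
	pthread_mutex_unlock(&rw->lock);
}

/* Block until every outstanding reference has been dropped. */
static void refwait_drain(struct refwait *rw)
{
	pthread_mutex_lock(&rw->lock);
	while (rw->refs != 0)				/* wait_var_event() */
		pthread_cond_wait(&rw->drained, &rw->lock);
	pthread_mutex_unlock(&rw->lock);
}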
*/ if (path->p_tree_depth) { - struct ocfs2_extent_block *eb; - ret = ocfs2_read_extent_block(et->et_ci, ocfs2_et_get_last_eb_blk(et), &last_eb_bh); @@ -5115,8 +5113,6 @@ int ocfs2_split_extent(handle_t *handle, mlog_errno(ret); goto out; } - - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; } if (rec->e_cpos == split_rec->e_cpos && diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 302cd7caa4a7..da578ad4c08f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1392,8 +1392,7 @@ retry: unlock: spin_unlock(&oi->ip_lock); out: - if (new) - kfree(new); + kfree(new); return ret; } diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 9b984cae4c4e..1d6dc8422899 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -329,7 +329,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) { char *buf; - buf = (char *) get_zeroed_page(GFP_NOFS); + buf = (char *) get_zeroed_page(GFP_ATOMIC); if (buf) { dump_mle(mle, buf, PAGE_SIZE - 1); free_page((unsigned long)buf); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 838a06d4066a..074d5de17bb2 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -531,7 +531,7 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) assert_spin_locked(&res->spinlock); /* don't shuffle secondary queues */ - if ((res->owner == dlm->node_num)) { + if (res->owner == dlm->node_num) { if (res->state & (DLM_LOCK_RES_MIGRATING | DLM_LOCK_RES_BLOCK_DIRTY)) return; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..1114ef02e780 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4135,7 +4135,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, struct buffer_head *ref_root_bh = NULL; struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); - struct ocfs2_refcount_block *rb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; struct ocfs2_refcount_tree *ref_tree; @@ -4162,7 +4161,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode, mlog_errno(ret); goto out; } - rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, &ref_tree->rf_ci, ref_root_bh, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fc5306a31a1d..5792f9e39466 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -516,6 +516,9 @@ int proc_fill_super(struct super_block *s, void *data, int silent) */ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); if (!root_inode) { diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index d06694757201..8468baee951d 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -10,9 +10,6 @@ #include <linux/seqlock.h> #include <linux/time.h> -#define LOAD_INT(x) ((x) >> FSHIFT) -#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) - static int loadavg_proc_show(struct seq_file *m, void *v) { unsigned long avnrun[3]; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index edda898714eb..568d90e17c17 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -38,6 +38,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) long cached; long available; unsigned long pages[NR_LRU_LISTS]; + unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); @@ -53,6 +54,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) pages[lru] = 
			global_node_page_state(NR_LRU_BASE + lru);
 
 	available = si_mem_available();
+	sreclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE);
+	sunreclaim = global_node_page_state(NR_SLAB_UNRECLAIMABLE);
 
 	show_val_kb(m, "MemTotal: ", i.totalram);
 	show_val_kb(m, "MemFree: ", i.freeram);
@@ -94,14 +97,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "Mapped: ",
 		    global_node_page_state(NR_FILE_MAPPED));
 	show_val_kb(m, "Shmem: ", i.sharedram);
-	show_val_kb(m, "Slab: ",
-		    global_node_page_state(NR_SLAB_RECLAIMABLE) +
-		    global_node_page_state(NR_SLAB_UNRECLAIMABLE));
-
-	show_val_kb(m, "SReclaimable: ",
-		    global_node_page_state(NR_SLAB_RECLAIMABLE));
-	show_val_kb(m, "SUnreclaim: ",
-		    global_node_page_state(NR_SLAB_UNRECLAIMABLE));
+	show_val_kb(m, "KReclaimable: ", sreclaimable +
+		    global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE));
+	show_val_kb(m, "Slab: ", sreclaimable + sunreclaim);
+	show_val_kb(m, "SReclaimable: ", sreclaimable);
+	show_val_kb(m, "SUnreclaim: ", sunreclaim);
 	seq_printf(m, "KernelStack: %8lu kB\n",
 		   global_zone_page_state(NR_KERNEL_STACK_KB));
 	show_val_kb(m, "PageTables: ",
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5ea1d64cb0b4..47c3764c469b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -521,7 +521,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		if (!page)
 			return;
 
-		if (radix_tree_exceptional_entry(page))
+		if (xa_is_value(page))
 			mss->swap += PAGE_SIZE;
 		else
 			put_page(page);
@@ -713,6 +713,8 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 	smaps_walk.private = mss;
 
 #ifdef CONFIG_SHMEM
+	/* In case of smaps_rollup, reset the value from previous vma */
+	mss->check_shmem_swap = false;
 	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
 		/*
 		 * For shared or readonly shmem mappings we know that all
@@ -728,7 +730,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 		if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
 		    !(vma->vm_flags & VM_WRITE)) {
-			mss->swap = shmem_swapped;
+			mss->swap += shmem_swapped;
 		} else {
 			mss->check_shmem_swap = true;
 			smaps_walk.pte_hole = smaps_pte_hole;
diff --git a/fs/super.c b/fs/super.c
index f3a8c008e164..ca53a08497ed 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -442,7 +442,7 @@ void generic_shutdown_super(struct super_block *sb)
 		sync_filesystem(sb);
 		sb->s_flags &= ~SB_ACTIVE;
 
-		fsnotify_unmount_inodes(sb);
+		fsnotify_sb_delete(sb);
 		cgroup_writeback_umount();
 
 		evict_inodes(sb);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index fcda0fc97b90..ec85aeaed54a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -175,8 +175,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	int alloc_count = 0;
-	int bit, block, block_group, group_start;
-	int nr_groups, bitmap_nr;
+	int bit, block, block_group;
+	int bitmap_nr;
 	struct buffer_head *bh;
 	__u32 part_len;
 
@@ -189,10 +189,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 		block_count = part_len - first_block;
 
 	do {
-		nr_groups = udf_compute_nr_groups(sb, partition);
 		block = first_block + (sizeof(struct spaceBitmapDesc) << 3);
 		block_group = block >> (sb->s_blocksize_bits + 3);
-		group_start = block_group ? 0 : sizeof(struct spaceBitmapDesc);
 
 		bitmap_nr = load_block_bitmap(sb, bitmap, block_group);
 		if (bitmap_nr < 0)
@@ -652,12 +650,6 @@ void udf_free_blocks(struct super_block *sb, struct inode *inode,
 	} else if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) {
 		udf_table_free_blocks(sb, map->s_uspace.s_table,
 				      bloc, offset, count);
-	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
-		udf_bitmap_free_blocks(sb, map->s_fspace.s_bitmap,
-				       bloc, offset, count);
-	} else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
-		udf_table_free_blocks(sb, map->s_fspace.s_table,
-				      bloc, offset, count);
 	}
 
 	if (inode) {
@@ -684,16 +676,6 @@ inline int udf_prealloc_blocks(struct super_block *sb,
 					       map->s_uspace.s_table,
 					       partition, first_block,
 					       block_count);
-	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-		allocated = udf_bitmap_prealloc_blocks(sb,
-						       map->s_fspace.s_bitmap,
-						       partition, first_block,
-						       block_count);
-	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-		allocated = udf_table_prealloc_blocks(sb,
-						      map->s_fspace.s_table,
-						      partition, first_block,
-						      block_count);
 	else
 		return 0;
@@ -717,14 +699,6 @@ inline udf_pblk_t udf_new_block(struct super_block *sb,
 		block = udf_table_new_block(sb,
 					    map->s_uspace.s_table,
 					    partition, goal, err);
-	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-		block = udf_bitmap_new_block(sb,
-					     map->s_fspace.s_bitmap,
-					     partition, goal, err);
-	else if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-		block = udf_table_new_block(sb,
-					    map->s_fspace.s_table,
-					    partition, goal, err);
 	else {
 		*err = -EIO;
 		return 0;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 6f515651a2c2..8f2f56d9a1bb 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -290,12 +290,8 @@ static void udf_free_partition(struct udf_part_map *map)
 
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
 		iput(map->s_uspace.s_table);
-	if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
-		iput(map->s_fspace.s_table);
 	if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
 		udf_sb_free_bitmap(map->s_uspace.s_bitmap);
-	if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
-		udf_sb_free_bitmap(map->s_fspace.s_bitmap);
 	if (map->s_partition_type == UDF_SPARABLE_MAP15)
 		for (i = 0; i < 4; i++)
 			brelse(map->s_type_specific.s_sparing.s_spar_map[i]);
@@ -613,14 +609,11 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	struct udf_options uopt;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	int error = 0;
-	struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
+
+	if (!(*flags & SB_RDONLY) && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
+		return -EACCES;
 
 	sync_filesystem(sb);
-	if (lvidiu) {
-		int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
-		if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY))
-			return -EACCES;
-	}
 
 	uopt.flags = sbi->s_flags;
 	uopt.uid = sbi->s_uid;
@@ -988,12 +981,62 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
 	return bitmap;
 }
 
+static int check_partition_desc(struct super_block *sb,
+				struct partitionDesc *p,
+				struct udf_part_map *map)
+{
+	bool umap, utable, fmap, ftable;
+	struct partitionHeaderDesc *phd;
+
+	switch (le32_to_cpu(p->accessType)) {
+	case PD_ACCESS_TYPE_READ_ONLY:
+	case PD_ACCESS_TYPE_WRITE_ONCE:
+	case PD_ACCESS_TYPE_REWRITABLE:
+	case PD_ACCESS_TYPE_NONE:
+		goto force_ro;
+	}
+
+	/* No Partition Header Descriptor? */
+	if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
+	    strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+		goto force_ro;
+
+	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
+	utable = phd->unallocSpaceTable.extLength;
+	umap = phd->unallocSpaceBitmap.extLength;
+	ftable = phd->freedSpaceTable.extLength;
+	fmap = phd->freedSpaceBitmap.extLength;
+
+	/* No allocation info? */
+	if (!utable && !umap && !ftable && !fmap)
+		goto force_ro;
+
+	/* We don't support blocks that require erasing before overwrite */
+	if (ftable || fmap)
+		goto force_ro;
+	/* UDF 2.60: 2.3.3 - no mixing of tables & bitmaps, no VAT. */
+	if (utable && umap)
+		goto force_ro;
+
+	if (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
+	    map->s_partition_type == UDF_VIRTUAL_MAP20)
+		goto force_ro;
+
+	return 0;
+force_ro:
+	if (!sb_rdonly(sb))
+		return -EACCES;
+	UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
+	return 0;
+}
+
 static int udf_fill_partdesc_info(struct super_block *sb,
 		struct partitionDesc *p, int p_index)
 {
 	struct udf_part_map *map;
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct partitionHeaderDesc *phd;
+	int err;
 
 	map = &sbi->s_partmaps[p_index];
@@ -1013,8 +1056,16 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 		  p_index, map->s_partition_type,
 		  map->s_partition_root, map->s_partition_len);
 
-	if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
-	    strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
+	err = check_partition_desc(sb, p, map);
+	if (err)
+		return err;
+
+	/*
+	 * Skip loading allocation info it we cannot ever write to the fs.
+	 * This is a correctness thing as we may have decided to force ro mount
+	 * to avoid allocation info we don't support.
+	 */
+	if (UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
 		return 0;
 
 	phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
@@ -1050,40 +1101,6 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 			  p_index, bitmap->s_extPosition);
 	}
 
-	if (phd->partitionIntegrityTable.extLength)
-		udf_debug("partitionIntegrityTable (part %d)\n", p_index);
-
-	if (phd->freedSpaceTable.extLength) {
-		struct kernel_lb_addr loc = {
-			.logicalBlockNum = le32_to_cpu(
-				phd->freedSpaceTable.extPosition),
-			.partitionReferenceNum = p_index,
-		};
-		struct inode *inode;
-
-		inode = udf_iget_special(sb, &loc);
-		if (IS_ERR(inode)) {
-			udf_debug("cannot load freedSpaceTable (part %d)\n",
-				  p_index);
-			return PTR_ERR(inode);
-		}
-		map->s_fspace.s_table = inode;
-		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
-		udf_debug("freedSpaceTable (part %d) @ %lu\n",
-			  p_index, map->s_fspace.s_table->i_ino);
-	}
-
-	if (phd->freedSpaceBitmap.extLength) {
-		struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
-		if (!bitmap)
-			return -ENOMEM;
-		map->s_fspace.s_bitmap = bitmap;
-		bitmap->s_extPosition = le32_to_cpu(
-				phd->freedSpaceBitmap.extPosition);
-		map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
-		udf_debug("freedSpaceBitmap (part %d) @ %u\n",
-			  p_index, bitmap->s_extPosition);
-	}
 	return 0;
 }
@@ -1257,6 +1274,7 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
 			ret = -EACCES;
 			goto out_bh;
 		}
+		UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
 		ret = udf_load_vat(sb, i, type1_idx);
 		if (ret < 0)
 			goto out_bh;
@@ -2155,10 +2173,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 				UDF_MAX_READ_VERSION);
 			ret = -EINVAL;
 			goto error_out;
-		} else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION &&
-			   !sb_rdonly(sb)) {
-			ret = -EACCES;
-			goto error_out;
+		} else if (minUDFWriteRev > UDF_MAX_WRITE_VERSION) {
+			if (!sb_rdonly(sb)) {
+				ret = -EACCES;
+				goto error_out;
+			}
+			UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
 		}
 
 		sbi->s_udfrev = minUDFWriteRev;
@@ -2176,10 +2196,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	}
 
 	if (sbi->s_partmaps[sbi->s_partition].s_partition_flags &
-			UDF_PART_FLAG_READ_ONLY &&
-	    !sb_rdonly(sb)) {
-		ret = -EACCES;
-		goto error_out;
+			UDF_PART_FLAG_READ_ONLY) {
+		if (!sb_rdonly(sb)) {
+			ret = -EACCES;
+			goto error_out;
+		}
+		UDF_SET_FLAG(sb, UDF_FLAG_RW_INCOMPAT);
 	}
 
 	if (udf_find_fileset(sb, &fileset, &rootdir)) {
@@ -2433,10 +2455,6 @@ static unsigned int udf_count_free(struct super_block *sb)
 		accum += udf_count_free_bitmap(sb,
 					       map->s_uspace.s_bitmap);
 	}
-	if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP) {
-		accum += udf_count_free_bitmap(sb,
-					       map->s_fspace.s_bitmap);
-	}
 	if (accum)
 		return accum;
@@ -2444,11 +2462,6 @@ static unsigned int udf_count_free(struct super_block *sb)
 		accum += udf_count_free_table(sb,
 					      map->s_uspace.s_table);
 	}
-	if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE) {
-		accum += udf_count_free_table(sb,
-					      map->s_fspace.s_table);
-	}
-
 	return accum;
 }
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 9424d7cab790..3d83be54c474 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -30,11 +30,11 @@
 #define UDF_FLAG_LASTBLOCK_SET	16
 #define UDF_FLAG_BLOCKSIZE_SET	17
 #define UDF_FLAG_INCONSISTENT	18
+#define UDF_FLAG_RW_INCOMPAT	19	/* Set when we find RW incompatible
+					 * feature */
 
 #define UDF_PART_FLAG_UNALLOC_BITMAP	0x0001
 #define UDF_PART_FLAG_UNALLOC_TABLE	0x0002
-#define UDF_PART_FLAG_FREED_BITMAP	0x0004
-#define UDF_PART_FLAG_FREED_TABLE	0x0008
 #define UDF_PART_FLAG_READ_ONLY	0x0010
 #define UDF_PART_FLAG_WRITE_ONCE	0x0020
 #define UDF_PART_FLAG_REWRITABLE	0x0040
@@ -50,8 +50,6 @@
 #define UDF_INVALID_MODE	((umode_t)-1)
 
-#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
-
 #define MF_DUPLICATE_MD		0x01
 #define MF_MIRROR_FE_LOADED	0x02
@@ -93,10 +91,6 @@ struct udf_part_map {
 		struct udf_bitmap	*s_bitmap;
 		struct inode		*s_table;
 	} s_uspace;
-	union {
-		struct udf_bitmap	*s_bitmap;
-		struct inode		*s_table;
-	} s_fspace;
 	__u32	s_partition_root;
 	__u32	s_partition_len;
 	__u16	s_partition_type;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bfa0ec69f924..356d2b8568c1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1026,7 +1026,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 	struct userfaultfd_ctx *fork_nctx = NULL;
 
 	/* always take the fd_wqh lock before the fault_pending_wqh lock */
-	spin_lock(&ctx->fd_wqh.lock);
+	spin_lock_irq(&ctx->fd_wqh.lock);
 	__add_wait_queue(&ctx->fd_wqh, &wait);
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -1112,13 +1112,13 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 			ret = -EAGAIN;
 			break;
 		}
-		spin_unlock(&ctx->fd_wqh.lock);
+		spin_unlock_irq(&ctx->fd_wqh.lock);
 		schedule();
-		spin_lock(&ctx->fd_wqh.lock);
+		spin_lock_irq(&ctx->fd_wqh.lock);
 	}
 	__remove_wait_queue(&ctx->fd_wqh, &wait);
 	__set_current_state(TASK_RUNNING);
-	spin_unlock(&ctx->fd_wqh.lock);
+	spin_unlock_irq(&ctx->fd_wqh.lock);
 
 	if (!ret && msg->event == UFFD_EVENT_FORK) {
 		ret = resolve_userfault_fork(ctx, fork_nctx, msg);