aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c4
-rw-r--r--fs/9p/fid.c22
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/v9fs.h10
-rw-r--r--fs/9p/vfs_addr.c28
-rw-r--r--fs/9p/vfs_inode.c13
-rw-r--r--fs/9p/vfs_inode_dotl.c3
-rw-r--r--fs/afs/callback.c2
-rw-r--r--fs/afs/dir.c32
-rw-r--r--fs/afs/dir_edit.c10
-rw-r--r--fs/afs/dir_silly.c4
-rw-r--r--fs/afs/dynroot.c2
-rw-r--r--fs/afs/file.c12
-rw-r--r--fs/afs/fs_operation.c6
-rw-r--r--fs/afs/inode.c41
-rw-r--r--fs/afs/internal.h23
-rw-r--r--fs/afs/super.c6
-rw-r--r--fs/afs/volume.c3
-rw-r--r--fs/afs/write.c23
-rw-r--r--fs/attr.c26
-rw-r--r--fs/btrfs/block-group.h1
-rw-r--r--fs/btrfs/ctree.h20
-rw-r--r--fs/btrfs/delayed-inode.c84
-rw-r--r--fs/btrfs/disk-io.c192
-rw-r--r--fs/btrfs/extent-tree.c22
-rw-r--r--fs/btrfs/extent_io.c125
-rw-r--r--fs/btrfs/file.c92
-rw-r--r--fs/btrfs/inode.c32
-rw-r--r--fs/btrfs/locking.c3
-rw-r--r--fs/btrfs/reflink.c16
-rw-r--r--fs/btrfs/send.c40
-rw-r--r--fs/btrfs/super.c47
-rw-r--r--fs/btrfs/tests/btrfs-tests.c24
-rw-r--r--fs/btrfs/transaction.c112
-rw-r--r--fs/btrfs/zoned.c61
-rw-r--r--fs/btrfs/zoned.h5
-rw-r--r--fs/cachefiles/ondemand.c3
-rw-r--r--fs/ceph/addr.c27
-rw-r--r--fs/ceph/cache.c4
-rw-r--r--fs/ceph/cache.h2
-rw-r--r--fs/ceph/caps.c105
-rw-r--r--fs/ceph/file.c2
-rw-r--r--fs/ceph/inode.c10
-rw-r--r--fs/ceph/mds_client.c4
-rw-r--r--fs/ceph/snap.c8
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/ceph/super.h10
-rw-r--r--fs/ceph/xattr.c14
-rw-r--r--fs/cifs/cifs_debug.c12
-rw-r--r--fs/cifs/cifsfs.c10
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h70
-rw-r--r--fs/cifs/cifsproto.h7
-rw-r--r--fs/cifs/connect.c65
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/cifs/fscache.c8
-rw-r--r--fs/cifs/fscache.h10
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/misc.c40
-rw-r--r--fs/cifs/sess.c185
-rw-r--r--fs/cifs/smb2ops.c181
-rw-r--r--fs/cifs/smb2pdu.c27
-rw-r--r--fs/cifs/trace.h38
-rw-r--r--fs/exec.c2
-rw-r--r--fs/exfat/namei.c4
-rw-r--r--fs/ext2/dir.c9
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/ext4/mballoc.c26
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/namei.c3
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/resize.c10
-rw-r--r--fs/ext4/super.c172
-rw-r--r--fs/ext4/xattr.c3
-rw-r--r--fs/f2fs/iostat.c31
-rw-r--r--fs/f2fs/namei.c17
-rw-r--r--fs/f2fs/node.c4
-rw-r--r--fs/fs-writeback.c37
-rw-r--r--fs/fscache/cookie.c26
-rw-r--r--fs/fscache/volume.c4
-rw-r--r--fs/hugetlbfs/inode.c68
-rw-r--r--fs/inode.c2
-rw-r--r--fs/io_uring.c411
-rw-r--r--fs/jbd2/transaction.c2
-rw-r--r--fs/ksmbd/smb2pdu.c48
-rw-r--r--fs/ksmbd/transport_rdma.c10
-rw-r--r--fs/ksmbd/transport_tcp.c2
-rw-r--r--fs/ksmbd/vfs.c12
-rw-r--r--fs/lockd/svcsubs.c14
-rw-r--r--fs/netfs/buffered_read.c26
-rw-r--r--fs/netfs/internal.h2
-rw-r--r--fs/netfs/objects.c8
-rw-r--r--fs/nfs/callback_proc.c1
-rw-r--r--fs/nfs/dir.c1
-rw-r--r--fs/nfs/nfs4file.c1
-rw-r--r--fs/nfs/nfs4proc.c19
-rw-r--r--fs/nfs/nfs4state.c1
-rw-r--r--fs/nfs/pnfs.c21
-rw-r--r--fs/nfs/pnfs.h1
-rw-r--r--fs/nfsd/filecache.c9
-rw-r--r--fs/nfsd/nfs4xdr.c9
-rw-r--r--fs/nfsd/nfsd.h3
-rw-r--r--fs/nfsd/vfs.c11
-rw-r--r--fs/nilfs2/nilfs.h3
-rw-r--r--fs/notify/fanotify/fanotify_user.c34
-rw-r--r--fs/ntfs/attrib.c8
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/slot_map.c46
-rw-r--r--fs/ocfs2/super.c21
-rw-r--r--fs/overlayfs/super.c25
-rw-r--r--fs/quota/dquot.c10
-rw-r--r--fs/read_write.c80
-rw-r--r--fs/remap_range.c3
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/userfaultfd.c12
-rw-r--r--fs/xfs/libxfs/xfs_attr.c47
-rw-r--r--fs/xfs/libxfs/xfs_attr.h17
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c37
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h4
-rw-r--r--fs/xfs/xfs_attr_item.c42
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_icache.c56
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c64
-rw-r--r--fs/xfs/xfs_ioctl.c3
-rw-r--r--fs/xfs/xfs_log.c9
-rw-r--r--fs/xfs/xfs_mount.h2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c9
-rw-r--r--fs/xfs/xfs_super.c9
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--fs/xfs/xfs_xattr.c17
-rw-r--r--fs/zonefs/super.c111
134 files changed, 2252 insertions, 1474 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 1c8dc696d516..cebba4eaa0b5 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -62,12 +62,12 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
version = cpu_to_le32(v9inode->qid.version);
path = cpu_to_le64(v9inode->qid.path);
v9ses = v9fs_inode2v9ses(inode);
- v9inode->netfs_ctx.cache =
+ v9inode->netfs.cache =
fscache_acquire_cookie(v9fs_session_cache(v9ses),
0,
&path, sizeof(path),
&version, sizeof(version),
- i_size_read(&v9inode->vfs_inode));
+ i_size_read(&v9inode->netfs.inode));
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9fs_inode_cookie(v9inode));
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 79df61fe0e59..baf2b152229e 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -152,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
const unsigned char **wnames, *uname;
int i, n, l, clone, access;
struct v9fs_session_info *v9ses;
- struct p9_fid *fid, *old_fid = NULL;
+ struct p9_fid *fid, *old_fid;
v9ses = v9fs_dentry2v9ses(dentry);
access = v9ses->flags & V9FS_ACCESS_MASK;
@@ -194,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
if (IS_ERR(fid))
return fid;
+ refcount_inc(&fid->count);
v9fs_fid_add(dentry->d_sb->s_root, fid);
}
/* If we are root ourself just return that */
- if (dentry->d_sb->s_root == dentry) {
- refcount_inc(&fid->count);
+ if (dentry->d_sb->s_root == dentry)
return fid;
- }
/*
* Do a multipath walk with attached root.
* When walking parent we need to make sure we
@@ -212,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
fid = ERR_PTR(n);
goto err_out;
}
+ old_fid = fid;
clone = 1;
i = 0;
while (i < n) {
@@ -221,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
* walk to ensure none of the patch component change
*/
fid = p9_client_walk(fid, l, &wnames[i], clone);
+ /* non-cloning walk will return the same fid */
+ if (fid != old_fid) {
+ p9_client_clunk(old_fid);
+ old_fid = fid;
+ }
if (IS_ERR(fid)) {
- if (old_fid) {
- /*
- * If we fail, clunk fid which are mapping
- * to path component and not the last component
- * of the path.
- */
- p9_client_clunk(old_fid);
- }
kfree(wnames);
goto err_out;
}
- old_fid = fid;
i += l;
clone = 0;
}
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index e28ddf763b3b..0129de2ea31a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -625,7 +625,7 @@ static void v9fs_inode_init_once(void *foo)
struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
memset(&v9inode->qid, 0, sizeof(v9inode->qid));
- inode_init_once(&v9inode->vfs_inode);
+ inode_init_once(&v9inode->netfs.inode);
}
/**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index ec0e8df3b2eb..6acabc2e7dc9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -109,11 +109,7 @@ struct v9fs_session_info {
#define V9FS_INO_INVALID_ATTR 0x01
struct v9fs_inode {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct p9_qid qid;
unsigned int cache_validity;
struct p9_fid *writeback_fid;
@@ -122,13 +118,13 @@ struct v9fs_inode {
static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
{
- return container_of(inode, struct v9fs_inode, vfs_inode);
+ return container_of(inode, struct v9fs_inode, netfs.inode);
}
static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode)
{
#ifdef CONFIG_9P_FSCACHE
- return netfs_i_cookie(&v9inode->vfs_inode);
+ return netfs_i_cookie(&v9inode->netfs);
#else
return NULL;
#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 8ce82ff1e40a..d0833fa69faf 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -58,21 +58,33 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
+ struct inode *inode = file_inode(file);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
struct p9_fid *fid = file->private_data;
+ BUG_ON(!fid);
+
+ /* we might need to read from a fid that was opened write-only
+ * for read-modify-write of page cache, use the writeback fid
+ * for that */
+ if (rreq->origin == NETFS_READ_FOR_WRITE &&
+ (fid->mode & O_ACCMODE) == O_WRONLY) {
+ fid = v9inode->writeback_fid;
+ BUG_ON(!fid);
+ }
+
refcount_inc(&fid->count);
rreq->netfs_priv = fid;
return 0;
}
/**
- * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_request
- * @mapping: unused mapping of request to cleanup
- * @priv: private data to cleanup, a fid, guaranted non-null.
+ * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq
+ * @rreq: The I/O request to clean up
*/
-static void v9fs_req_cleanup(struct address_space *mapping, void *priv)
+static void v9fs_free_request(struct netfs_io_request *rreq)
{
- struct p9_fid *fid = priv;
+ struct p9_fid *fid = rreq->netfs_priv;
p9_client_clunk(fid);
}
@@ -94,9 +106,9 @@ static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
+ .free_request = v9fs_free_request,
.begin_cache_operation = v9fs_begin_cache_operation,
.issue_read = v9fs_issue_read,
- .cleanup = v9fs_req_cleanup,
};
/**
@@ -140,7 +152,7 @@ static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
transferred_or_error != -ENOBUFS) {
version = cpu_to_le32(v9inode->qid.version);
fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
- i_size_read(&v9inode->vfs_inode), 0);
+ i_size_read(&v9inode->netfs.inode), 0);
}
}
@@ -274,7 +286,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- retval = netfs_write_begin(filp, mapping, pos, len, &folio, fsdata);
+ retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
if (retval < 0)
return retval;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 55367ecb9442..3d8297714772 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -234,7 +234,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
v9inode->writeback_fid = NULL;
v9inode->cache_validity = 0;
mutex_init(&v9inode->v_mutex);
- return &v9inode->vfs_inode;
+ return &v9inode->netfs.inode;
}
/**
@@ -252,7 +252,8 @@ void v9fs_free_inode(struct inode *inode)
*/
static void v9fs_set_netfs_context(struct inode *inode)
{
- netfs_i_context_init(inode, &v9fs_req_ops);
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
@@ -1250,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry,
return ERR_PTR(-ECHILD);
v9ses = v9fs_dentry2v9ses(dentry);
- fid = v9fs_fid_lookup(dentry);
+ if (!v9fs_proto_dotu(v9ses))
+ return ERR_PTR(-EBADF);
+
p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
+ fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return ERR_CAST(fid);
- if (!v9fs_proto_dotu(v9ses))
- return ERR_PTR(-EBADF);
-
st = p9_client_stat(fid);
p9_client_clunk(fid);
if (IS_ERR(st))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index d17502a738a9..b6eb1160296c 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+ p9_client_clunk(dfid);
goto out;
}
@@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err) {
p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
err);
+ p9_client_clunk(dfid);
goto error;
}
err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
@@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
err);
+ p9_client_clunk(dfid);
goto error;
}
v9fs_invalidate_inode_attr(dir);
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 1b4d5809808d..a484fa642808 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work)
{
struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work);
- unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false);
+ unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
}
void afs_server_init_callback_work(struct work_struct *work)
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 79f6b74336d2..56ae5cd5184f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -109,7 +109,7 @@ struct afs_lookup_cookie {
*/
static void afs_dir_read_cleanup(struct afs_read *req)
{
- struct address_space *mapping = req->vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = req->vnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
@@ -153,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio,
block = kmap_local_folio(folio, offset);
if (block->hdr.magic != AFS_DIR_MAGIC) {
printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n",
- __func__, dvnode->vfs_inode.i_ino,
+ __func__, dvnode->netfs.inode.i_ino,
pos, offset, size, ntohs(block->hdr.magic));
trace_afs_dir_check_failed(dvnode, pos + offset, i_size);
kunmap_local(block);
@@ -183,7 +183,7 @@ error:
static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
{
union afs_xdr_dir_block *block;
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
size_t offset, size;
@@ -217,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
*/
static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
{
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t last = req->nr_pages - 1;
int ret = 0;
@@ -269,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file)
static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
__acquires(&dvnode->validate_lock)
{
- struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct address_space *mapping = dvnode->netfs.inode.i_mapping;
struct afs_read *req;
loff_t i_size;
int nr_pages, i;
@@ -287,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
req->cleanup = afs_dir_read_cleanup;
expand:
- i_size = i_size_read(&dvnode->vfs_inode);
+ i_size = i_size_read(&dvnode->netfs.inode);
if (i_size < 2048) {
ret = afs_bad(dvnode, afs_file_error_dir_small);
goto error;
@@ -305,7 +305,7 @@ expand:
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages,
+ iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages,
0, i_size);
req->iter = &req->def_iter;
@@ -897,7 +897,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
out_op:
if (op->error == 0) {
- inode = &op->file[1].vnode->vfs_inode;
+ inode = &op->file[1].vnode->netfs.inode;
op->file[1].vnode = NULL;
}
@@ -1139,7 +1139,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
afs_stat_v(dir, n_reval);
/* search the directory for this vnode */
- ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version);
+ ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version);
switch (ret) {
case 0:
/* the filename maps to something */
@@ -1170,7 +1170,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
_debug("%pd: file deleted (uq %u -> %u I:%u)",
dentry, fid.unique,
vnode->fid.unique,
- vnode->vfs_inode.i_generation);
+ vnode->netfs.inode.i_generation);
goto not_found;
}
goto out_valid;
@@ -1368,7 +1368,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
if (d_really_is_positive(dentry)) {
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- clear_nlink(&vnode->vfs_inode);
+ clear_nlink(&vnode->netfs.inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
@@ -1487,8 +1487,8 @@ static void afs_dir_remove_link(struct afs_operation *op)
/* Already done */
} else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
write_seqlock(&vnode->cb_lock);
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
+ drop_nlink(&vnode->netfs.inode);
+ if (vnode->netfs.inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
__afs_break_callback(vnode, afs_cb_break_for_unlink);
}
@@ -1504,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
op->error = ret;
}
- _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error);
+ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
}
static void afs_unlink_success(struct afs_operation *op)
@@ -1680,8 +1680,8 @@ static void afs_link_success(struct afs_operation *op)
afs_update_dentry_version(op, dvp, op->dentry);
if (op->dentry_2->d_parent == op->dentry->d_parent)
afs_update_dentry_version(op, dvp, op->dentry_2);
- ihold(&vp->vnode->vfs_inode);
- d_instantiate(op->dentry, &vp->vnode->vfs_inode);
+ ihold(&vp->vnode->netfs.inode);
+ d_instantiate(op->dentry, &vp->vnode->netfs.inode);
}
static void afs_link_put(struct afs_operation *op)
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index d98e109ecee9..0ab7752d1b75 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
*/
static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
{
- struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = vnode->netfs.inode.i_mapping;
struct folio *folio;
folio = __filemap_get_folio(mapping, index,
@@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
_enter(",,{%d,%s},", name->len, name->name);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
@@ -336,7 +336,7 @@ found_space:
if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] -= need_slots;
- inode_inc_iversion_raw(&vnode->vfs_inode);
+ inode_inc_iversion_raw(&vnode->netfs.inode);
afs_stat_v(vnode, n_dir_cr);
_debug("Insert %s in %u[%u]", name->name, b, slot);
@@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
_enter(",,{%d,%s},", name->len, name->name);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (i_size < AFS_DIR_BLOCK_SIZE ||
i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
@@ -463,7 +463,7 @@ found_dirent:
if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] += need_slots;
- inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version);
+ inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
afs_stat_v(vnode, n_dir_rm);
_debug("Remove %s from %u[%u]", name->name, b, slot);
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 45cfd50a9521..bb5807e87fa4 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
goto out;
} while (!d_is_negative(sdentry));
- ihold(&vnode->vfs_inode);
+ ihold(&vnode->netfs.inode);
ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key);
switch (ret) {
@@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
d_drop(sdentry);
}
- iput(&vnode->vfs_inode);
+ iput(&vnode->netfs.inode);
dput(sdentry);
out:
_leave(" = %d", ret);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index f120bcb8bf73..d7d9402ff718 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
- netfs_i_context_init(inode, NULL);
+ netfs_inode_init(&vnode->netfs, NULL);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index a8e8832179e4..d1cfb235c4b9 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -194,7 +194,7 @@ int afs_release(struct inode *inode, struct file *file)
afs_put_wb_key(af->wb);
if ((file->f_mode & FMODE_WRITE)) {
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
afs_set_cache_aux(vnode, &aux);
fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size);
} else {
@@ -325,7 +325,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->iter = &fsreq->def_iter;
iov_iter_xarray(&fsreq->def_iter, READ,
- &fsreq->vnode->vfs_inode.i_mapping->i_pages,
+ &fsreq->vnode->netfs.inode.i_mapping->i_pages,
fsreq->pos, fsreq->len);
afs_fetch_data(fsreq->vnode, fsreq);
@@ -375,24 +375,24 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq)
}
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
- struct folio *folio, void **_fsdata)
+ struct folio **foliop, void **_fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
}
-static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv)
+static void afs_free_request(struct netfs_io_request *rreq)
{
- key_put(netfs_priv);
+ key_put(rreq->netfs_priv);
}
const struct netfs_request_ops afs_req_ops = {
.init_request = afs_init_request,
+ .free_request = afs_free_request,
.begin_cache_operation = afs_begin_cache_operation,
.check_write_begin = afs_check_write_begin,
.issue_read = afs_issue_read,
- .cleanup = afs_priv_cleanup,
};
int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index d222dfbe976b..7a3803ce3a22 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op)
if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)
clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);
if (op->file[0].put_vnode)
- iput(&op->file[0].vnode->vfs_inode);
+ iput(&op->file[0].vnode->netfs.inode);
if (op->file[1].put_vnode)
- iput(&op->file[1].vnode->vfs_inode);
+ iput(&op->file[1].vnode->netfs.inode);
if (op->more_files) {
for (i = 0; i < op->nr_files - 2; i++)
if (op->more_files[i].put_vnode)
- iput(&op->more_files[i].vnode->vfs_inode);
+ iput(&op->more_files[i].vnode->netfs.inode);
kfree(op->more_files);
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 30b066299d39..64dab70d4a4f 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
*/
static void afs_set_netfs_context(struct afs_vnode *vnode)
{
- netfs_i_context_init(&vnode->vfs_inode, &afs_req_ops);
+ netfs_inode_init(&vnode->netfs, &afs_req_ops);
}
/*
@@ -96,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_flags |= S_NOATIME;
inode->i_uid = make_kuid(&init_user_ns, status->owner);
inode->i_gid = make_kgid(&init_user_ns, status->group);
- set_nlink(&vnode->vfs_inode, status->nlink);
+ set_nlink(&vnode->netfs.inode, status->nlink);
switch (status->type) {
case AFS_FTYPE_FILE:
@@ -139,7 +139,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
afs_set_netfs_context(vnode);
vnode->invalid_before = status->data_version;
- inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
+ inode_set_iversion_raw(&vnode->netfs.inode, status->data_version);
if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver
@@ -163,7 +163,7 @@ static void afs_apply_status(struct afs_operation *op,
{
struct afs_file_status *status = &vp->scb.status;
struct afs_vnode *vnode = vp->vnode;
- struct inode *inode = &vnode->vfs_inode;
+ struct inode *inode = &vnode->netfs.inode;
struct timespec64 t;
umode_t mode;
bool data_changed = false;
@@ -246,7 +246,7 @@ static void afs_apply_status(struct afs_operation *op,
* idea of what the size should be that's not the same as
* what's on the server.
*/
- vnode->netfs_ctx.remote_i_size = status->size;
+ vnode->netfs.remote_i_size = status->size;
if (change_size) {
afs_set_i_size(vnode, status->size);
inode->i_ctime = t;
@@ -289,7 +289,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
*/
if (vp->scb.status.abort_code == VNOVNODE) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_nlink(&vnode->vfs_inode);
+ clear_nlink(&vnode->netfs.inode);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
@@ -306,8 +306,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
if (vp->scb.have_cb)
afs_apply_callback(op, vp);
} else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) {
- drop_nlink(&vnode->vfs_inode);
- if (vnode->vfs_inode.i_nlink == 0) {
+ drop_nlink(&vnode->netfs.inode);
+ if (vnode->netfs.inode.i_nlink == 0) {
set_bit(AFS_VNODE_DELETED, &vnode->flags);
__afs_break_callback(vnode, afs_cb_break_for_deleted);
}
@@ -326,7 +326,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
struct afs_vnode *vnode = vp->vnode;
int ret;
- if (vnode->vfs_inode.i_state & I_NEW) {
+ if (vnode->netfs.inode.i_state & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
op->error = ret;
if (ret == 0)
@@ -430,7 +430,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
struct afs_vnode_cache_aux aux;
if (vnode->status.type != AFS_FTYPE_FILE) {
- vnode->netfs_ctx.cache = NULL;
+ vnode->netfs.cache = NULL;
return;
}
@@ -457,7 +457,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp)
{
struct afs_vnode_param *dvp = &op->file[0];
- struct super_block *sb = dvp->vnode->vfs_inode.i_sb;
+ struct super_block *sb = dvp->vnode->netfs.inode.i_sb;
struct afs_vnode *vnode;
struct inode *inode;
int ret;
@@ -582,10 +582,10 @@ static void afs_zap_data(struct afs_vnode *vnode)
/* nuke all the non-dirty pages that aren't locked, mapped or being
* written back in a regular file and completely discard the pages in a
* directory or symlink */
- if (S_ISREG(vnode->vfs_inode.i_mode))
- invalidate_remote_inode(&vnode->vfs_inode);
+ if (S_ISREG(vnode->netfs.inode.i_mode))
+ invalidate_remote_inode(&vnode->netfs.inode);
else
- invalidate_inode_pages2(vnode->vfs_inode.i_mapping);
+ invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
}
/*
@@ -683,8 +683,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
key_serial(key));
if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->vfs_inode.i_nlink)
- clear_nlink(&vnode->vfs_inode);
+ if (vnode->netfs.inode.i_nlink)
+ clear_nlink(&vnode->netfs.inode);
goto valid;
}
@@ -745,7 +745,8 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path,
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
- if (!(query_flags & AT_STATX_DONT_SYNC) &&
+ if (vnode->volume &&
+ !(query_flags & AT_STATX_DONT_SYNC) &&
!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key))
@@ -826,7 +827,7 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->vfs_inode;
+ struct inode *inode = &vp->vnode->netfs.inode;
loff_t old_i_size = i_size_read(inode);
op->setattr.old_i_size = old_i_size;
@@ -843,7 +844,7 @@ static void afs_setattr_success(struct afs_operation *op)
static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->vfs_inode;
+ struct inode *inode = &vp->vnode->netfs.inode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
@@ -875,7 +876,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH;
struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry));
- struct inode *inode = &vnode->vfs_inode;
+ struct inode *inode = &vnode->netfs.inode;
loff_t i_size;
int ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a30995901266..a6f25d9e75b5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -619,12 +619,7 @@ enum afs_lock_state {
* leak from one inode to another.
*/
struct afs_vnode {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
-
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct afs_volume *volume; /* volume on which vnode resides */
struct afs_fid fid; /* the file identifier for this inode */
struct afs_file_status status; /* AFS status info for this file */
@@ -675,7 +670,7 @@ struct afs_vnode {
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
{
#ifdef CONFIG_AFS_FSCACHE
- return netfs_i_cookie(&vnode->vfs_inode);
+ return netfs_i_cookie(&vnode->netfs);
#else
return NULL;
#endif
@@ -685,7 +680,7 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
struct fscache_cookie *cookie)
{
#ifdef CONFIG_AFS_FSCACHE
- vnode->netfs_ctx.cache = cookie;
+ vnode->netfs.cache = cookie;
#endif
}
@@ -892,7 +887,7 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
afs_set_cache_aux(vnode, &aux);
fscache_invalidate(afs_vnode_cache(vnode), &aux,
- i_size_read(&vnode->vfs_inode), flags);
+ i_size_read(&vnode->netfs.inode), flags);
}
/*
@@ -1217,7 +1212,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode)
static inline struct afs_net *afs_v2net(struct afs_vnode *vnode)
{
- return afs_i2net(&vnode->vfs_inode);
+ return afs_i2net(&vnode->netfs.inode);
}
static inline struct afs_net *afs_sock2net(struct sock *sk)
@@ -1593,12 +1588,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *);
*/
static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
{
- return container_of(inode, struct afs_vnode, vfs_inode);
+ return container_of(inode, struct afs_vnode, netfs.inode);
}
static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
{
- return &vnode->vfs_inode;
+ return &vnode->netfs.inode;
}
/*
@@ -1621,8 +1616,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
*/
static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size)
{
- i_size_write(&vnode->vfs_inode, size);
- vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1;
+ i_size_write(&vnode->netfs.inode, size);
+ vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1;
}
/*
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fea195b0b27..95d713074dc8 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -659,7 +659,7 @@ static void afs_i_init_once(void *_vnode)
struct afs_vnode *vnode = _vnode;
memset(vnode, 0, sizeof(*vnode));
- inode_init_once(&vnode->vfs_inode);
+ inode_init_once(&vnode->netfs.inode);
mutex_init(&vnode->io_lock);
init_rwsem(&vnode->validate_lock);
spin_lock_init(&vnode->wb_lock);
@@ -700,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
init_rwsem(&vnode->rmdir_lock);
INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work);
- _leave(" = %p", &vnode->vfs_inode);
- return &vnode->vfs_inode;
+ _leave(" = %p", &vnode->netfs.inode);
+ return &vnode->netfs.inode;
}
static void afs_free_inode(struct inode *inode)
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 94a3d247924b..cc665cef0abe 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -9,8 +9,7 @@
#include <linux/slab.h>
#include "internal.h"
-unsigned __read_mostly afs_volume_gc_delay = 10;
-unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static unsigned __read_mostly afs_volume_record_life = 60 * 60;
/*
* Insert a volume into a cell. If there's an existing volume record, that is
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 2236b2165e37..2c885b22de34 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -60,7 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
* file. We need to do this before we get a lock on the page in case
* there's more than one writer competing for the same cache block.
*/
- ret = netfs_write_begin(file, mapping, pos, len, &folio, fsdata);
+ ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);
if (ret < 0)
return ret;
@@ -146,10 +146,10 @@ int afs_write_end(struct file *file, struct address_space *mapping,
write_end_pos = pos + copied;
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (write_end_pos > i_size) {
write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->vfs_inode);
+ i_size = i_size_read(&vnode->netfs.inode);
if (write_end_pos > i_size)
afs_set_i_size(vnode, write_end_pos);
write_sequnlock(&vnode->cb_lock);
@@ -257,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc,
*/
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct address_space *mapping = vnode->netfs.inode.i_mapping;
struct folio *folio;
pgoff_t end;
@@ -354,7 +354,6 @@ static const struct afs_operation_ops afs_store_data_operation = {
static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
bool laundering)
{
- struct netfs_i_context *ictx = &vnode->netfs_ctx;
struct afs_operation *op;
struct afs_wb_key *wbk = NULL;
loff_t size = iov_iter_count(iter);
@@ -385,9 +384,9 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
op->store.write_iter = iter;
op->store.pos = pos;
op->store.size = size;
- op->store.i_size = max(pos + size, ictx->remote_i_size);
+ op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
op->store.laundering = laundering;
- op->mtime = vnode->vfs_inode.i_mtime;
+ op->mtime = vnode->netfs.inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
op->ops = &afs_store_data_operation;
@@ -554,7 +553,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
struct iov_iter iter;
unsigned long priv;
unsigned int offset, to, len, max_len;
- loff_t i_size = i_size_read(&vnode->vfs_inode);
+ loff_t i_size = i_size_read(&vnode->netfs.inode);
bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
long count = wbc->nr_to_write;
@@ -845,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
_enter("{%llx:%llu},{%zu},",
vnode->fid.vid, vnode->fid.vnode, count);
- if (IS_SWAPFILE(&vnode->vfs_inode)) {
+ if (IS_SWAPFILE(&vnode->netfs.inode)) {
printk(KERN_INFO
"AFS: Attempt to write to active swap file!\n");
return -EBUSY;
@@ -958,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/* Discard unused keys */
spin_lock(&vnode->wb_lock);
- if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
- !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) {
+ if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) &&
+ !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) {
list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) {
if (refcount_read(&wbk->usage) == 1)
list_move(&wbk->vnode_link, &graveyard);
@@ -1034,6 +1033,6 @@ static void afs_write_to_cache(struct afs_vnode *vnode,
bool caching)
{
fscache_write_to_cache(afs_vnode_cache(vnode),
- vnode->vfs_inode.i_mapping, start, len, i_size,
+ vnode->netfs.inode.i_mapping, start, len, i_size,
afs_write_to_cache_done, vnode, caching);
}
diff --git a/fs/attr.c b/fs/attr.c
index 66899b6e9bd8..dbe996b0dedf 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -61,9 +61,15 @@ static bool chgrp_ok(struct user_namespace *mnt_userns,
const struct inode *inode, kgid_t gid)
{
kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) &&
- (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
- return true;
+ if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
+ kgid_t mapped_gid;
+
+ if (gid_eq(gid, inode->i_gid))
+ return true;
+ mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid);
+ if (in_group_p(mapped_gid))
+ return true;
+ }
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
if (gid_eq(kgid, INVALID_GID) &&
@@ -123,12 +129,20 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) {
+ kgid_t mapped_gid;
+
if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
+
+ if (ia_valid & ATTR_GID)
+ mapped_gid = mapped_kgid_fs(mnt_userns,
+ i_user_ns(inode), attr->ia_gid);
+ else
+ mapped_gid = i_gid_into_mnt(mnt_userns, inode);
+
/* Also check the setgid bit! */
- if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
- i_gid_into_mnt(mnt_userns, inode)) &&
- !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
+ if (!in_group_p(mapped_gid) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 3ac668ace50a..35e0e860cc0b 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -104,6 +104,7 @@ struct btrfs_block_group {
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
unsigned int zone_is_active:1;
+ unsigned int zoned_data_reloc_ongoing:1;
int disk_cache_state;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0e49b1a0c071..9c21e214d29e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,9 +675,8 @@ struct btrfs_fs_info {
rwlock_t global_root_lock;
struct rb_root global_root_tree;
- /* The xarray that holds all the FS roots */
- spinlock_t fs_roots_lock;
- struct xarray fs_roots;
+ spinlock_t fs_roots_radix_lock;
+ struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
rwlock_t block_group_cache_lock;
@@ -995,10 +994,10 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* Extent buffer xarray */
+ /* Extent buffer radix tree */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
- struct xarray extent_buffers;
+ struct radix_tree_root buffer_radix;
/* next backup root to be overwritten */
int backup_root_index;
@@ -1119,8 +1118,7 @@ enum {
*/
BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
- /* The root is tracked in fs_info::fs_roots */
- BTRFS_ROOT_REGISTERED,
+ BTRFS_ROOT_IN_RADIX,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
BTRFS_ROOT_DEFRAG_RUNNING,
BTRFS_ROOT_FORCE_COW,
@@ -1224,10 +1222,10 @@ struct btrfs_root {
struct rb_root inode_tree;
/*
- * Xarray that keeps track of delayed nodes of every inode, protected
- * by inode_lock
+ * radix tree that keeps track of delayed nodes of every inode,
+ * protected by inode_lock
*/
- struct xarray delayed_nodes;
+ struct radix_tree_root delayed_nodes_tree;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -1330,6 +1328,8 @@ struct btrfs_replace_extent_info {
* existing extent into a file range.
*/
bool is_new_extent;
+ /* Indicate if we should update the inode's mtime and ctime. */
+ bool update_times;
/* Meaningful only if is_new_extent is true. */
int qgroup_reserved;
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 66779ab3ed4a..748bf6b0d860 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = xa_load(&root->delayed_nodes, ino);
+ node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the xarray. In this case, the refcount
+ * this node from the radix tree. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the xarray at all; our release
+ * NULL like it was never in the radix at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the xarray, we want to bump the
+ * If this node is properly in the radix, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -128,30 +128,36 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
u64 ino = btrfs_ino(btrfs_inode);
int ret;
- do {
- node = btrfs_get_delayed_node(btrfs_inode);
- if (node)
- return node;
+again:
+ node = btrfs_get_delayed_node(btrfs_inode);
+ if (node)
+ return node;
- node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
- if (!node)
- return ERR_PTR(-ENOMEM);
- btrfs_init_delayed_node(node, root, ino);
+ node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
- /* Cached in the inode and can be accessed */
- refcount_set(&node->refs, 2);
+ /* cached in the btrfs inode and can be accessed */
+ refcount_set(&node->refs, 2);
- spin_lock(&root->inode_lock);
- ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
- if (ret) {
- spin_unlock(&root->inode_lock);
- kmem_cache_free(delayed_node_cache, node);
- if (ret != -EBUSY)
- return ERR_PTR(ret);
- }
- } while (ret);
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ kmem_cache_free(delayed_node_cache, node);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&root->inode_lock);
+ ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+ if (ret == -EEXIST) {
+ spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
+ radix_tree_preload_end();
+ goto again;
+ }
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
+ radix_tree_preload_end();
return node;
}
@@ -270,7 +276,8 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- xa_erase(&root->delayed_nodes, delayed_node->inode_id);
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -1863,35 +1870,34 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- unsigned long index = 0;
- struct btrfs_delayed_node *delayed_node;
+ u64 inode_id = 0;
struct btrfs_delayed_node *delayed_nodes[8];
+ int i, n;
while (1) {
- int n = 0;
-
spin_lock(&root->inode_lock);
- if (xa_empty(&root->delayed_nodes)) {
+ n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+ (void **)delayed_nodes, inode_id,
+ ARRAY_SIZE(delayed_nodes));
+ if (!n) {
spin_unlock(&root->inode_lock);
- return;
+ break;
}
- xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) {
+ inode_id = delayed_nodes[n - 1]->inode_id + 1;
+ for (i = 0; i < n; i++) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (refcount_inc_not_zero(&delayed_node->refs)) {
- delayed_nodes[n] = delayed_node;
- n++;
- }
- if (n >= ARRAY_SIZE(delayed_nodes))
- break;
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
}
- index++;
spin_unlock(&root->inode_lock);
- for (int i = 0; i < n; i++) {
+ for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 89e94ea2fef5..de440ebf5648 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -5,6 +5,7 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
+#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
@@ -485,7 +486,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
- /* A dirty eb shouldn't disappear from extent_buffers */
+ /* A dirty eb shouldn't disappear from buffer_radix */
if (WARN_ON(!eb))
return -EUCLEAN;
@@ -1158,7 +1159,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
+ INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -1210,9 +1211,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&root->leak_list);
- spin_lock(&fs_info->fs_roots_lock);
+ spin_lock(&fs_info->fs_roots_radix_lock);
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}
@@ -1659,11 +1660,12 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *root;
- spin_lock(&fs_info->fs_roots_lock);
- root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)root_id);
if (root)
root = btrfs_grab_root(root);
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return root;
}
@@ -1705,14 +1707,20 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
- spin_lock(&fs_info->fs_roots_lock);
- ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
- root, GFP_NOFS);
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret)
+ return ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_insert(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ root);
if (ret == 0) {
btrfs_grab_root(root);
- set_bit(BTRFS_ROOT_REGISTERED, &root->state);
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
return ret;
}
@@ -2342,9 +2350,9 @@ void btrfs_put_root(struct btrfs_root *root)
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
- spin_lock(&root->fs_info->fs_roots_lock);
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
list_del_init(&root->leak_list);
- spin_unlock(&root->fs_info->fs_roots_lock);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
#endif
kfree(root);
}
@@ -2352,21 +2360,28 @@ void btrfs_put_root(struct btrfs_root *root)
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *root;
- unsigned long index = 0;
+ int ret;
+ struct btrfs_root *gang[8];
+ int i;
while (!list_empty(&fs_info->dead_roots)) {
- root = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
- list_del(&root->root_list);
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
- if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
- btrfs_drop_and_free_fs_root(fs_info, root);
- btrfs_put_root(root);
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
+ btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+ btrfs_put_root(gang[0]);
}
- xa_for_each(&fs_info->fs_roots, index, root) {
- btrfs_drop_and_free_fs_root(fs_info, root);
+ while (1) {
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang));
+ if (!ret)
+ break;
+ for (i = 0; i < ret; i++)
+ btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
}
@@ -3134,8 +3149,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
- xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
- xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -3143,7 +3158,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
- spin_lock_init(&fs_info->fs_roots_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
@@ -3374,7 +3389,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
/*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
- * them into the fs_info->fs_roots. This must be done before
+ * them into the fs_info->fs_roots_radix tree. This must be done before
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
* item before the root's tree is deleted - this means that if we unmount
@@ -4499,11 +4514,12 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
{
bool drop_ref = false;
- spin_lock(&fs_info->fs_roots_lock);
- xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
- if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_delete(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid);
+ if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
drop_ref = true;
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
@@ -4519,48 +4535,50 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
- struct btrfs_root *roots[8];
- unsigned long index = 0;
- int i;
+ u64 root_objectid = 0;
+ struct btrfs_root *gang[8];
+ int i = 0;
int err = 0;
- int grabbed;
+ unsigned int ret = 0;
while (1) {
- struct btrfs_root *root;
-
- spin_lock(&fs_info->fs_roots_lock);
- if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
- spin_unlock(&fs_info->fs_roots_lock);
- return err;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang));
+ if (!ret) {
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ break;
}
+ root_objectid = gang[ret - 1]->root_key.objectid + 1;
- grabbed = 0;
- xa_for_each_start(&fs_info->fs_roots, index, root, index) {
- /* Avoid grabbing roots in dead_roots */
- if (btrfs_root_refs(&root->root_item) > 0)
- roots[grabbed++] = btrfs_grab_root(root);
- if (grabbed >= ARRAY_SIZE(roots))
- break;
+ for (i = 0; i < ret; i++) {
+ /* Avoid to grab roots in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* grab all the search result for later use */
+ gang[i] = btrfs_grab_root(gang[i]);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
- for (i = 0; i < grabbed; i++) {
- if (!roots[i])
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
continue;
- index = roots[i]->root_key.objectid;
- err = btrfs_orphan_cleanup(roots[i]);
+ root_objectid = gang[i]->root_key.objectid;
+ err = btrfs_orphan_cleanup(gang[i]);
if (err)
- goto out;
- btrfs_put_root(roots[i]);
+ break;
+ btrfs_put_root(gang[i]);
}
- index++;
+ root_objectid++;
}
-out:
- /* Release the roots that remain uncleaned due to error */
- for (; i < grabbed; i++) {
- if (roots[i])
- btrfs_put_root(roots[i]);
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_root(gang[i]);
}
return err;
}
@@ -4632,6 +4650,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
int ret;
set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+
+ /*
+ * We may have the reclaim task running and relocating a data block group,
+ * in which case it may create delayed iputs. So stop it before we park
+ * the cleaner kthread otherwise we can get new delayed iputs after
+ * parking the cleaner, and that can make the async reclaim task to hang
+ * if it's waiting for delayed iputs to complete, since the cleaner is
+ * parked and can not run delayed iputs - this will make us hang when
+ * trying to stop the async reclaim task.
+ */
+ cancel_work_sync(&fs_info->reclaim_bgs_work);
/*
* We don't want the cleaner to start new transactions, add more delayed
* iputs, etc. while we're closing. We can't use kthread_stop() yet
@@ -4672,8 +4701,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
- cancel_work_sync(&fs_info->reclaim_bgs_work);
-
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4870,28 +4897,31 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
- unsigned long index = 0;
- int grabbed = 0;
- struct btrfs_root *roots[8];
+ struct btrfs_root *gang[8];
+ u64 root_objectid = 0;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+ (void **)gang, root_objectid,
+ ARRAY_SIZE(gang))) != 0) {
+ int i;
- spin_lock(&fs_info->fs_roots_lock);
- while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
- ULONG_MAX, 8, XA_PRESENT))) {
- for (int i = 0; i < grabbed; i++)
- roots[i] = btrfs_grab_root(roots[i]);
- spin_unlock(&fs_info->fs_roots_lock);
+ for (i = 0; i < ret; i++)
+ gang[i] = btrfs_grab_root(gang[i]);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
- for (int i = 0; i < grabbed; i++) {
- if (!roots[i])
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
continue;
- index = roots[i]->root_key.objectid;
- btrfs_free_log(NULL, roots[i]);
- btrfs_put_root(roots[i]);
+ root_objectid = gang[i]->root_key.objectid;
+ btrfs_free_log(NULL, gang[i]);
+ btrfs_put_root(gang[i]);
}
- index++;
- spin_lock(&fs_info->fs_roots_lock);
+ root_objectid++;
+ spin_lock(&fs_info->fs_roots_radix_lock);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log_root_tree(NULL, fs_info);
}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0867c5cd6e01..a3afc15430ce 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3832,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
- if (block_group->ro) {
+ if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
ret = 1;
goto out;
}
@@ -3894,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
- if (ret && ffe_ctl->for_data_reloc)
+ if (ret && ffe_ctl->for_data_reloc &&
+ fs_info->data_reloc_bg == block_group->start) {
+ /*
+ * Do not allow further allocations from this block group.
+ * Compared to increasing the ->ro, setting the
+ * ->zoned_data_reloc_ongoing flag still allows nocow
+ * writers to come in. See btrfs_inc_nocow_writers().
+ *
+ * We need to disable an allocation to avoid an allocation of
+ * regular (non-relocation data) extent. With mix of relocation
+ * extents and regular extents, we can dispatch WRITE commands
+ * (for relocation extents) and ZONE APPEND commands (for
+ * regular extents) at the same time to the same zone, which
+ * easily break the write pointer.
+ */
+ block_group->zoned_data_reloc_ongoing = 1;
fs_info->data_reloc_bg = 0;
+ }
spin_unlock(&fs_info->relocation_bg_lock);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
@@ -5813,7 +5829,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
btrfs_qgroup_free_meta_all_pertrans(root);
- if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
btrfs_add_dropped_root(trans, root);
else
btrfs_put_root(root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8f6b544ae616..f03ab5dbda7a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2966,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
}
/*
- * Find extent buffer for a given bytenr.
+ * Find extent buffer for a givne bytenr.
*
* This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
* in endio context.
@@ -2985,9 +2985,11 @@ static struct extent_buffer *find_extent_buffer_readpage(
return (struct extent_buffer *)page->private;
}
- /* For subpage case, we need to lookup extent buffer xarray */
- eb = xa_load(&fs_info->extent_buffers,
- bytenr >> fs_info->sectorsize_bits);
+ /* For subpage case, we need to lookup buffer radix tree */
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ bytenr >> fs_info->sectorsize_bits);
+ rcu_read_unlock();
ASSERT(eb);
return eb;
}
@@ -4435,8 +4437,8 @@ static struct extent_buffer *find_extent_buffer_nolock(
struct extent_buffer *eb;
rcu_read_lock();
- eb = xa_load(&fs_info->extent_buffers,
- start >> fs_info->sectorsize_bits);
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits);
if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
return eb;
@@ -5241,13 +5243,14 @@ int extent_writepages(struct address_space *mapping,
*/
btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
ret = extent_write_cache_pages(mapping, wbc, &epd);
- btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
ASSERT(ret <= 0);
if (ret < 0) {
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
end_write_bio(&epd, ret);
return ret;
}
flush_write_bio(&epd);
+ btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
return ret;
}
@@ -6128,22 +6131,24 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
if (!eb)
return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
-
- do {
- ret = xa_insert(&fs_info->extent_buffers,
- start >> fs_info->sectorsize_bits,
- eb, GFP_NOFS);
- if (ret == -ENOMEM) {
- exists = ERR_PTR(ret);
+again:
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
goto free_eb;
- }
- if (ret == -EBUSY) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
- }
- } while (ret);
-
+ else
+ goto again;
+ }
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
@@ -6318,22 +6323,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
- do {
- ret = xa_insert(&fs_info->extent_buffers,
- start >> fs_info->sectorsize_bits,
- eb, GFP_NOFS);
- if (ret == -ENOMEM) {
- exists = ERR_PTR(ret);
+again:
+ ret = radix_tree_preload(GFP_NOFS);
+ if (ret) {
+ exists = ERR_PTR(ret);
+ goto free_eb;
+ }
+
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
goto free_eb;
- }
- if (ret == -EBUSY) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
- }
- } while (ret);
-
+ else
+ goto again;
+ }
/* add one reference for the tree */
check_buffer_tree_ref(eb);
set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
@@ -6378,8 +6386,10 @@ static int release_extent_buffer(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
- xa_erase(&fs_info->extent_buffers,
- eb->start >> fs_info->sectorsize_bits);
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_delete(&fs_info->buffer_radix,
+ eb->start >> fs_info->sectorsize_bits);
+ spin_unlock(&fs_info->buffer_lock);
} else {
spin_unlock(&eb->refs_lock);
}
@@ -7324,25 +7334,42 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
+#define GANG_LOOKUP_SIZE 16
static struct extent_buffer *get_next_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
{
- struct extent_buffer *eb;
- unsigned long index;
+ struct extent_buffer *gang[GANG_LOOKUP_SIZE];
+ struct extent_buffer *found = NULL;
u64 page_start = page_offset(page);
+ u64 cur = page_start;
ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
lockdep_assert_held(&fs_info->buffer_lock);
- xa_for_each_start(&fs_info->extent_buffers, index, eb,
- page_start >> fs_info->sectorsize_bits) {
- if (in_range(eb->start, page_start, PAGE_SIZE))
- return eb;
- else if (eb->start >= page_start + PAGE_SIZE)
- /* Already beyond page end */
- return NULL;
+ while (cur < page_start + PAGE_SIZE) {
+ int ret;
+ int i;
+
+ ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
+ (void **)gang, cur >> fs_info->sectorsize_bits,
+ min_t(unsigned int, GANG_LOOKUP_SIZE,
+ PAGE_SIZE / fs_info->nodesize));
+ if (ret == 0)
+ goto out;
+ for (i = 0; i < ret; i++) {
+ /* Already beyond page end */
+ if (gang[i]->start >= page_start + PAGE_SIZE)
+ goto out;
+ /* Found one */
+ if (gang[i]->start >= bytenr) {
+ found = gang[i];
+ goto out;
+ }
+ }
+ cur = gang[ret - 1]->start + gang[ret - 1]->len;
}
- return NULL;
+out:
+ return found;
}
static int try_release_subpage_extent_buffer(struct page *page)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1fd827b99c1b..9dfde1af8a64 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2323,25 +2323,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
- if (ret != BTRFS_NO_LOG_SYNC) {
+ if (ret == BTRFS_NO_LOG_SYNC) {
+ ret = btrfs_end_transaction(trans);
+ goto out;
+ }
+
+ /* We successfully logged the inode, attempt to sync the log. */
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
if (!ret) {
- ret = btrfs_sync_log(trans, root, &ctx);
- if (!ret) {
- ret = btrfs_end_transaction(trans);
- goto out;
- }
- }
- if (!full_sync) {
- ret = btrfs_wait_ordered_range(inode, start, len);
- if (ret) {
- btrfs_end_transaction(trans);
- goto out;
- }
+ ret = btrfs_end_transaction(trans);
+ goto out;
}
- ret = btrfs_commit_transaction(trans);
- } else {
+ }
+
+ /*
+ * At this point we need to commit the transaction because we had
+ * btrfs_need_log_full_commit() or some other error.
+ *
+ * If we didn't do a full sync we have to stop the trans handle, wait on
+ * the ordered extents, start it again and commit the transaction. If
+ * we attempt to wait on the ordered extents here we could deadlock with
+ * something like fallocate() that is holding the extent lock trying to
+ * start a transaction while some other thread is trying to commit the
+ * transaction while we (fsync) are currently holding the transaction
+ * open.
+ */
+ if (!full_sync) {
ret = btrfs_end_transaction(trans);
+ if (ret)
+ goto out;
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret)
+ goto out;
+
+ /*
+ * This is safe to use here because we're only interested in
+ * making sure the transaction that had the ordered extents is
+ * committed. We aren't waiting on anything past this point,
+ * we're purely getting the transaction and committing it.
+ */
+ trans = btrfs_attach_transaction_barrier(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+
+ /*
+ * We committed the transaction and there's no currently
+ * running transaction, this means everything we care
+ * about made it to disk and we are done.
+ */
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
}
+
+ ret = btrfs_commit_transaction(trans);
out:
ASSERT(list_empty(&ctx.list));
err = file_check_and_advance_wb_err(file);
@@ -2719,7 +2756,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
min_size, false);
- BUG_ON(ret);
+ if (WARN_ON(ret))
+ goto out_trans;
trans->block_rsv = rsv;
cur_offset = start;
@@ -2803,6 +2841,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
extent_info->file_offset += replace_len;
}
+ /*
+ * We are releasing our handle on the transaction, balance the
+ * dirty pages of the btree inode and flush delayed items, and
+ * then get a new transaction handle, which may now point to a
+ * new transaction in case someone else may have committed the
+ * transaction we used to replace/drop file extent items. So
+ * bump the inode's iversion and update mtime and ctime except
+ * if we are called from a dedupe context. This is because a
+ * power failure/crash may happen after the transaction is
+ * committed and before we finish replacing/dropping all the
+ * file extent items we need.
+ */
+ inode_inc_iversion(&inode->vfs_inode);
+
+ if (!extent_info || extent_info->update_times) {
+ inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
+ inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
+ }
+
ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@@ -2819,7 +2876,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
rsv, min_size, false);
- BUG_ON(ret); /* shouldn't happen */
+ if (WARN_ON(ret))
+ break;
trans->block_rsv = rsv;
cur_offset = drop_args.drop_end;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81737eff92f3..d50448bf8eed 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3195,6 +3195,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset,
ordered_extent->file_offset +
logical_len);
+ btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
+ ordered_extent->disk_num_bytes);
} else {
BUG_ON(root == fs_info->tree_root);
ret = insert_ordered_extent_file_extent(trans, ordered_extent);
@@ -3576,7 +3578,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
- /* Bail out if the cleanup is already running. */
if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
return 0;
@@ -3659,17 +3660,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
*
* btrfs_find_orphan_roots() ran before us, which has
* found all deleted roots and loaded them into
- * fs_info->fs_roots. So here we can find if an
+ * fs_info->fs_roots_radix. So here we can find if an
* orphan item corresponds to a deleted root by looking
- * up the root from that xarray.
+ * up the root from that radix tree.
*/
- spin_lock(&fs_info->fs_roots_lock);
- dead_root = xa_load(&fs_info->fs_roots,
- (unsigned long)found_key.objectid);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)found_key.objectid);
if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
is_dead_root = 1;
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (is_dead_root) {
/* prevent this orphan from being found again */
@@ -3909,7 +3910,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in the delayed_nodes xarray.
+ * in delayed_nodes_tree.
*/
if (BTRFS_I(inode)->last_trans == fs_info->generation)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -7679,7 +7680,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- ret = -ENOTBLK;
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fallback to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fallback to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
goto unlock_err;
}
@@ -9897,6 +9910,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
extent_info.file_offset = file_offset;
extent_info.extent_buf = (char *)&stack_fi;
extent_info.is_new_extent = true;
+ extent_info.update_times = true;
extent_info.qgroup_reserved = qgroup_released;
extent_info.insertions = 0;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 313d9d685adb..33461b4f9c8b 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne
start_ns = ktime_get_ns();
down_read_nested(&eb->lock, nest);
- eb->lock_owner = current->pid;
trace_btrfs_tree_read_lock(eb, start_ns);
}
@@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb)
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
- eb->lock_owner = current->pid;
trace_btrfs_try_tree_read_lock(eb);
return 1;
}
@@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
trace_btrfs_tree_read_unlock(eb);
- eb->lock_owner = 0;
up_read(&eb->lock);
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index c39f8b3a5a4a..a3549d587464 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
int ret;
const u64 len = olen_aligned;
u64 last_dest_end = destoff;
+ u64 prev_extent_end = off;
ret = -ENOMEM;
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
@@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
key.offset = off;
while (1) {
- u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
u64 extent_gen;
int type;
@@ -431,14 +431,21 @@ process_slot:
* The first search might have left us at an extent item that
* ends before our target range's start, can happen if we have
* holes and NO_HOLES feature enabled.
+ *
+ * Subsequent searches may leave us on a file range we have
+ * processed before - this happens due to a race with ordered
+ * extent completion for a file range that is outside our source
+ * range, but that range was part of a file extent item that
+ * also covered a leading part of our source range.
*/
- if (key.offset + datal <= off) {
+ if (key.offset + datal <= prev_extent_end) {
path->slots[0]++;
goto process_slot;
} else if (key.offset >= off + len) {
break;
}
- next_key_min_offset = key.offset + datal;
+
+ prev_extent_end = key.offset + datal;
size = btrfs_item_size(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
@@ -489,6 +496,7 @@ process_slot:
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
+ clone_info.update_times = !no_time_update;
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
drop_start, new_key.offset + datal - 1,
&clone_info, &trans);
@@ -550,7 +558,7 @@ process_slot:
break;
btrfs_release_path(path);
- key.offset = next_key_min_offset;
+ key.offset = prev_extent_end;
if (fatal_signal_pending(current)) {
ret = -EINTR;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fa56890ff81f..c7dea639a56f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -10,6 +10,7 @@
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
@@ -127,7 +128,7 @@ struct send_ctx {
struct list_head new_refs;
struct list_head deleted_refs;
- struct xarray name_cache;
+ struct radix_tree_root name_cache;
struct list_head name_cache_list;
int name_cache_size;
@@ -268,13 +269,14 @@ struct orphan_dir_info {
struct name_cache_entry {
struct list_head list;
/*
- * On 32bit kernels, xarray has only 32bit indices, but we need to
- * handle 64bit inums. We use the lower 32bit of the 64bit inum to store
- * it in the tree. If more than one inum would fall into the same entry,
- * we use inum_aliases to store the additional entries. inum_aliases is
- * also used to store entries with the same inum but different generations.
+ * radix_tree has only 32bit entries but we need to handle 64bit inums.
+ * We use the lower 32bit of the 64bit inum to store it in the tree. If
+ * more then one inum would fall into the same entry, we use radix_list
+ * to store the additional entries. radix_list is also used to store
+ * entries where two entries have the same inum but different
+ * generations.
*/
- struct list_head inum_aliases;
+ struct list_head radix_list;
u64 ino;
u64 gen;
u64 parent_ino;
@@ -2024,9 +2026,9 @@ out:
}
/*
- * Insert a name cache entry. On 32bit kernels the xarray index is 32bit,
+ * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
* so we need to do some special handling in case we have clashes. This function
- * takes care of this with the help of name_cache_entry::inum_aliases.
+ * takes care of this with the help of name_cache_entry::radix_list.
* In case of error, nce is kfreed.
*/
static int name_cache_insert(struct send_ctx *sctx,
@@ -2035,7 +2037,8 @@ static int name_cache_insert(struct send_ctx *sctx,
int ret = 0;
struct list_head *nce_head;
- nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
if (!nce_head) {
nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
if (!nce_head) {
@@ -2044,14 +2047,14 @@ static int name_cache_insert(struct send_ctx *sctx,
}
INIT_LIST_HEAD(nce_head);
- ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL);
+ ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
if (ret < 0) {
kfree(nce_head);
kfree(nce);
return ret;
}
}
- list_add_tail(&nce->inum_aliases, nce_head);
+ list_add_tail(&nce->radix_list, nce_head);
list_add_tail(&nce->list, &sctx->name_cache_list);
sctx->name_cache_size++;
@@ -2063,14 +2066,15 @@ static void name_cache_delete(struct send_ctx *sctx,
{
struct list_head *nce_head;
- nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
+ nce_head = radix_tree_lookup(&sctx->name_cache,
+ (unsigned long)nce->ino);
if (!nce_head) {
btrfs_err(sctx->send_root->fs_info,
"name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
nce->ino, sctx->name_cache_size);
}
- list_del(&nce->inum_aliases);
+ list_del(&nce->radix_list);
list_del(&nce->list);
sctx->name_cache_size--;
@@ -2078,7 +2082,7 @@ static void name_cache_delete(struct send_ctx *sctx,
* We may not get to the final release of nce_head if the lookup fails
*/
if (nce_head && list_empty(nce_head)) {
- xa_erase(&sctx->name_cache, (unsigned long)nce->ino);
+ radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
}
@@ -2089,11 +2093,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
struct list_head *nce_head;
struct name_cache_entry *cur;
- nce_head = xa_load(&sctx->name_cache, (unsigned long)ino);
+ nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
if (!nce_head)
return NULL;
- list_for_each_entry(cur, nce_head, inum_aliases) {
+ list_for_each_entry(cur, nce_head, radix_list) {
if (cur->ino == ino && cur->gen == gen)
return cur;
}
@@ -7518,7 +7522,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
- xa_init_flags(&sctx->name_cache, GFP_KERNEL);
+ INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
sctx->flags = arg->flags;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b1fdc6a26c76..6627dd7875ee 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -763,6 +763,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
compress_force = false;
no_compress++;
} else {
+ btrfs_err(info, "unrecognized compression value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -821,8 +823,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
if (ret) {
+ btrfs_err(info, "unrecognized thread_pool value %s",
+ args[0].from);
goto out;
} else if (intarg == 0) {
+ btrfs_err(info, "invalid value 0 for thread_pool");
ret = -EINVAL;
goto out;
}
@@ -883,8 +888,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized metadata_ratio value %s",
+ args[0].from);
goto out;
+ }
info->metadata_ratio = intarg;
btrfs_info(info, "metadata ratio %u",
info->metadata_ratio);
@@ -901,6 +909,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_and_info(info, DISCARD_ASYNC,
"turning on async discard");
} else {
+ btrfs_err(info, "unrecognized discard mode value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -933,6 +943,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
+ btrfs_err(info, "unrecognized space_cache value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -1014,8 +1026,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info,
+ "unrecognized check_integrity_print_mask value %s",
+ args[0].from);
goto out;
+ }
info->check_integrity_print_mask = intarg;
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
@@ -1030,13 +1046,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
goto out;
#endif
case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0)
+ if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else if (strcmp(args[0].from, "bug") == 0)
+ } else if (strcmp(args[0].from, "bug") == 0) {
btrfs_clear_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else {
+ } else {
+ btrfs_err(info, "unrecognized fatal_errors value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -1044,8 +1062,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized commit_interval value %s",
+ args[0].from);
+ ret = -EINVAL;
goto out;
+ }
if (intarg == 0) {
btrfs_info(info,
"using default commit interval %us",
@@ -1059,8 +1081,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_rescue:
ret = parse_rescue_options(info, args[0].from);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_err(info, "unrecognized rescue value %s",
+ args[0].from);
goto out;
+ }
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
@@ -1985,6 +2010,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
+ /* V1 cache is not supported for subpage mount. */
+ if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
+ btrfs_warn(fs_info,
+ "v1 space cache is not supported for page size %lu with sectorsize %u",
+ PAGE_SIZE, fs_info->sectorsize);
+ ret = -EINVAL;
+ goto restore;
+ }
btrfs_remount_begin(fs_info, old_opts, *flags);
btrfs_resize_thread_pool(fs_info,
fs_info->thread_pool_size, old_thread_pool_size);
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 1591bfa55bcc..d8e56edd6991 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
- unsigned long index;
- struct extent_buffer *eb;
+ struct radix_tree_iter iter;
+ void **slot;
struct btrfs_device *dev, *tmp;
if (!fs_info)
@@ -163,9 +163,25 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
test_mnt->mnt_sb->s_fs_info = NULL;
- xa_for_each(&fs_info->extent_buffers, index, eb) {
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+ struct extent_buffer *eb;
+
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+ if (!eb)
+ continue;
+ /* Shouldn't happen but that kind of thinking creates CVE's */
+ if (radix_tree_exception(eb)) {
+ if (radix_tree_deref_retry(eb))
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ slot = radix_tree_iter_resume(slot, &iter);
+ spin_unlock(&fs_info->buffer_lock);
free_extent_buffer_stale(eb);
+ spin_lock(&fs_info->buffer_lock);
}
+ spin_unlock(&fs_info->buffer_lock);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
@@ -186,7 +202,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
if (!root)
return;
/* Will be freed by btrfs_free_fs_roots */
- if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state)))
+ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
return;
btrfs_global_root_delete(root);
btrfs_put_root(root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 06c0a958d114..875b801ab3d7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,7 +23,7 @@
#include "space-info.h"
#include "zoned.h"
-#define BTRFS_ROOT_TRANS_TAG XA_MARK_0
+#define BTRFS_ROOT_TRANS_TAG 0
/*
* Transaction states and transitions
@@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
*/
smp_wmb();
- spin_lock(&fs_info->fs_roots_lock);
+ spin_lock(&fs_info->fs_roots_radix_lock);
if (root->last_trans == trans->transid && !force) {
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
- xa_set_mark(&fs_info->fs_roots,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_lock);
+ radix_tree_tag_set(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
root->last_trans = trans->transid;
/* this is pretty tricky. We don't want to
@@ -487,9 +487,11 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
spin_unlock(&cur_trans->dropped_roots_lock);
/* Make sure we don't try to update the root at commit time */
- xa_clear_mark(&fs_info->fs_roots,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
}
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
@@ -1402,8 +1404,9 @@ void btrfs_add_dead_root(struct btrfs_root *root)
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *root;
- unsigned long index;
+ struct btrfs_root *gang[8];
+ int i;
+ int ret;
/*
* At this point no one can be using this transaction to modify any tree
@@ -1411,46 +1414,57 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
*/
ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
- spin_lock(&fs_info->fs_roots_lock);
- xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) {
- int ret;
-
- /*
- * At this point we can neither have tasks logging inodes
- * from a root nor trying to commit a log tree.
- */
- ASSERT(atomic_read(&root->log_writers) == 0);
- ASSERT(atomic_read(&root->log_commit[0]) == 0);
- ASSERT(atomic_read(&root->log_commit[1]) == 0);
-
- xa_clear_mark(&fs_info->fs_roots,
- (unsigned long)root->root_key.objectid,
- BTRFS_ROOT_TRANS_TAG);
- spin_unlock(&fs_info->fs_roots_lock);
-
- btrfs_free_log(trans, root);
- ret = btrfs_update_reloc_root(trans, root);
- if (ret)
- return ret;
-
- /* See comments in should_cow_block() */
- clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
- smp_mb__after_atomic();
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, 0,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ struct btrfs_root *root = gang[i];
+ int ret2;
+
+ /*
+ * At this point we can neither have tasks logging inodes
+ * from a root nor trying to commit a log tree.
+ */
+ ASSERT(atomic_read(&root->log_writers) == 0);
+ ASSERT(atomic_read(&root->log_commit[0]) == 0);
+ ASSERT(atomic_read(&root->log_commit[1]) == 0);
+
+ radix_tree_tag_clear(&fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ btrfs_free_log(trans, root);
+ ret2 = btrfs_update_reloc_root(trans, root);
+ if (ret2)
+ return ret2;
+
+ /* see comments in should_cow_block() */
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
+
+ if (root->commit_root != root->node) {
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_set_root_node(&root->root_item,
+ root->node);
+ }
- if (root->commit_root != root->node) {
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
- btrfs_set_root_node(&root->root_item, root->node);
+ ret2 = btrfs_update_root(trans, fs_info->tree_root,
+ &root->root_key,
+ &root->root_item);
+ if (ret2)
+ return ret2;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ btrfs_qgroup_free_meta_all_pertrans(root);
}
-
- ret = btrfs_update_root(trans, fs_info->tree_root,
- &root->root_key, &root->root_item);
- if (ret)
- return ret;
- spin_lock(&fs_info->fs_roots_lock);
- btrfs_qgroup_free_meta_all_pertrans(root);
}
- spin_unlock(&fs_info->fs_roots_lock);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
return 0;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 11237a913bee..d99026df6f67 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1735,12 +1735,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
&mapped_length, &bioc);
if (ret || !bioc || mapped_length < PAGE_SIZE) {
- btrfs_put_bioc(bioc);
- return -EIO;
+ ret = -EIO;
+ goto out_put_bioc;
}
- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- return -EINVAL;
+ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = -EINVAL;
+ goto out_put_bioc;
+ }
nofs_flag = memalloc_nofs_save();
nmirrors = (int)bioc->num_stripes;
@@ -1759,7 +1761,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
break;
}
memalloc_nofs_restore(nofs_flag);
-
+out_put_bioc:
+ btrfs_put_bioc(bioc);
return ret;
}
@@ -1885,7 +1888,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct map_lookup *map;
- bool need_zone_finish;
int ret = 0;
int i;
@@ -1942,12 +1944,6 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
}
}
- /*
- * The block group is not fully allocated, so not fully written yet. We
- * need to send ZONE_FINISH command to free up an active zone.
- */
- need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
-
block_group->zone_is_active = 0;
block_group->alloc_offset = block_group->zone_capacity;
block_group->free_space_ctl->free_space = 0;
@@ -1963,15 +1959,13 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
if (device->zone_info->max_active_zones == 0)
continue;
- if (need_zone_finish) {
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- device->zone_info->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
- if (ret)
- return ret;
- }
+ if (ret)
+ return ret;
btrfs_dev_clear_active_zone(device, physical);
}
@@ -2139,3 +2133,30 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
factor = div64_u64(used * 100, total);
return factor >= fs_info->bg_reclaim_threshold;
}
+
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length)
+{
+ struct btrfs_block_group *block_group;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ block_group = btrfs_lookup_block_group(fs_info, logical);
+ /* It should be called on a previous data relocation block group. */
+ ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
+
+ spin_lock(&block_group->lock);
+ if (!block_group->zoned_data_reloc_ongoing)
+ goto out;
+
+ /* All relocation extents are written. */
+ if (block_group->start + block_group->alloc_offset == logical + length) {
+ /* Now, release this block group for further allocations. */
+ block_group->zoned_data_reloc_ongoing = 0;
+ }
+
+out:
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
+}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index bb1a189e11f9..6b2eec99162b 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -77,6 +77,8 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
+void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@@ -243,6 +245,9 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
return false;
}
+
+static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index a41ae6efc545..1fee702d5529 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -21,7 +21,8 @@ static int cachefiles_ondemand_fd_release(struct inode *inode,
* anon_fd.
*/
xas_for_each(&xas, req, ULONG_MAX) {
- if (req->msg.opcode == CACHEFILES_OP_READ) {
+ if (req->msg.object_id == object_id &&
+ req->msg.opcode == CACHEFILES_OP_READ) {
req->error = -EIO;
complete(&req->done);
xas_store(&xas, NULL);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e5221be6eb55..d6e5916138e4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -63,7 +63,7 @@
(CONGESTION_ON_THRESH(congestion_kb) >> 2))
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct folio *folio, void **_fsdata);
+ struct folio **foliop, void **_fsdata);
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
@@ -394,11 +394,10 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
return 0;
}
-static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
+static void ceph_netfs_free_request(struct netfs_io_request *rreq)
{
- struct inode *inode = mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
- int got = (uintptr_t)priv;
+ struct ceph_inode_info *ci = ceph_inode(rreq->inode);
+ int got = (uintptr_t)rreq->netfs_priv;
if (got)
ceph_put_cap_refs(ci, got);
@@ -406,12 +405,12 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
+ .free_request = ceph_netfs_free_request,
.begin_cache_operation = ceph_begin_cache_operation,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
.check_write_begin = ceph_netfs_check_write_begin,
- .cleanup = ceph_readahead_cleanup,
};
#ifdef CONFIG_CEPH_FSCACHE
@@ -1289,18 +1288,19 @@ ceph_find_incompatible(struct page *page)
}
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
- struct folio *folio, void **_fsdata)
+ struct folio **foliop, void **_fsdata)
{
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(folio_page(folio, 0));
+ snapc = ceph_find_incompatible(folio_page(*foliop, 0));
if (snapc) {
int r;
- folio_unlock(folio);
- folio_put(folio);
+ folio_unlock(*foliop);
+ folio_put(*foliop);
+ *foliop = NULL;
if (IS_ERR(snapc))
return PTR_ERR(snapc);
@@ -1322,10 +1322,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct folio *folio = NULL;
int r;
- r = netfs_write_begin(file, inode->i_mapping, pos, len, &folio, NULL);
+ r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL);
if (r == 0)
folio_wait_fscache(folio);
if (r < 0) {
@@ -1798,7 +1799,7 @@ enum {
static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
s64 pool, struct ceph_string *pool_ns)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->netfs.inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
struct rb_node **p, *parent;
@@ -1913,7 +1914,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
0, false, true);
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
- wr_req->r_mtime = ci->vfs_inode.i_mtime;
+ wr_req->r_mtime = ci->netfs.inode.i_mtime;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err)
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index ddea99922073..177d8e8d73fe 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -29,9 +29,9 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
if (!(inode->i_state & I_NEW))
return;
- WARN_ON_ONCE(ci->netfs_ctx.cache);
+ WARN_ON_ONCE(ci->netfs.cache);
- ci->netfs_ctx.cache =
+ ci->netfs.cache =
fscache_acquire_cookie(fsc->fscache, 0,
&ci->i_vino, sizeof(ci->i_vino),
&ci->i_version, sizeof(ci->i_version),
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 7255b790a4c1..dc502daac49a 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -28,7 +28,7 @@ void ceph_fscache_invalidate(struct inode *inode, bool dio_write);
static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci)
{
- return netfs_i_cookie(&ci->vfs_inode);
+ return netfs_i_cookie(&ci->netfs);
}
static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index bf2e94005598..ac8fd5e7f540 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -492,7 +492,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
struct ceph_mount_options *opt = mdsc->fsc->mount_options;
ci->i_hold_caps_max = round_jiffies(jiffies +
opt->caps_wanted_delay_max * HZ);
- dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
+ dout("__cap_set_timeouts %p %lu\n", &ci->netfs.inode,
ci->i_hold_caps_max - jiffies);
}
@@ -507,7 +507,7 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
+ dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->netfs.inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
spin_lock(&mdsc->cap_delay_lock);
@@ -531,7 +531,7 @@ no_change:
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ dout("__cap_delay_requeue_front %p\n", &ci->netfs.inode);
spin_lock(&mdsc->cap_delay_lock);
ci->i_ceph_flags |= CEPH_I_FLUSH;
if (!list_empty(&ci->i_cap_delay_list))
@@ -548,7 +548,7 @@ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
struct ceph_inode_info *ci)
{
- dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ dout("__cap_delay_cancel %p\n", &ci->netfs.inode);
if (list_empty(&ci->i_cap_delay_list))
return;
spin_lock(&mdsc->cap_delay_lock);
@@ -568,7 +568,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* Each time we receive FILE_CACHE anew, we increment
* i_rdcache_gen.
*/
- if (S_ISREG(ci->vfs_inode.i_mode) &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
(issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
@@ -583,14 +583,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
if (issued & CEPH_CAP_FILE_SHARED)
atomic_inc(&ci->i_shared_gen);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
- dout(" marking %p NOT complete\n", &ci->vfs_inode);
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->netfs.inode);
__ceph_dir_clear_complete(ci);
}
}
/* Wipe saved layout if we're losing DIR_CREATE caps */
- if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
+ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
!(issued & CEPH_CAP_DIR_CREATE)) {
ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
@@ -771,7 +771,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
dout("__cap_is_valid %p cap %p issued %s "
- "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ "but STALE (gen %u vs %u)\n", &cap->ci->netfs.inode,
cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
return 0;
}
@@ -797,7 +797,7 @@ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
if (!__cap_is_valid(cap))
continue;
dout("__ceph_caps_issued %p cap %p issued %s\n",
- &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ &ci->netfs.inode, cap, ceph_cap_string(cap->issued));
have |= cap->issued;
if (implemented)
*implemented |= cap->implemented;
@@ -844,12 +844,12 @@ static void __touch_cap(struct ceph_cap *cap)
spin_lock(&s->s_cap_lock);
if (!s->s_cap_iterator) {
- dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->netfs.inode, cap,
s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
} else {
dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
- &cap->ci->vfs_inode, cap, s->s_mds);
+ &cap->ci->netfs.inode, cap, s->s_mds);
}
spin_unlock(&s->s_cap_lock);
}
@@ -867,7 +867,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode),
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(have),
ceph_cap_string(mask));
return 1;
@@ -879,7 +879,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
continue;
if ((cap->issued & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode), cap,
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch)
@@ -891,7 +891,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
have |= cap->issued;
if ((have & mask) == mask) {
dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
- " (mask %s)\n", ceph_ino(&ci->vfs_inode),
+ " (mask %s)\n", ceph_ino(&ci->netfs.inode),
ceph_cap_string(cap->issued),
ceph_cap_string(mask));
if (touch) {
@@ -919,7 +919,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
int touch)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
int r;
r = __ceph_caps_issued_mask(ci, mask, touch);
@@ -950,7 +950,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int ret;
spin_lock(&ci->i_ceph_lock);
@@ -969,8 +969,8 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref ||
- (S_ISREG(ci->vfs_inode.i_mode) &&
- ci->vfs_inode.i_data.nrpages))
+ (S_ISREG(ci->netfs.inode.i_mode) &&
+ ci->netfs.inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR;
@@ -993,11 +993,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
int want = 0;
/* use used_cutoff here, to keep dir's wanted caps longer */
@@ -1050,7 +1050,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
- if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ if (S_ISDIR(ci->netfs.inode.i_mode)) {
/* we want EXCL if holding caps of dir ops */
if (w & CEPH_CAP_ANY_DIR_OPS)
w |= CEPH_CAP_FILE_EXCL;
@@ -1116,9 +1116,9 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->netfs.inode);
- mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;
+ mdsc = ceph_inode_to_client(&ci->netfs.inode)->mdsc;
/* remove from inode's cap rbtree, and clear auth cap */
rb_erase(&cap->ci_node, &ci->i_caps);
@@ -1169,7 +1169,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
* keep i_snap_realm.
*/
if (ci->i_wr_ref == 0 && ci->i_snap_realm)
- ceph_change_snap_realm(&ci->vfs_inode, NULL);
+ ceph_change_snap_realm(&ci->netfs.inode, NULL);
__cap_delay_cancel(mdsc, ci);
}
@@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
lockdep_assert_held(&ci->i_ceph_lock);
- fsc = ceph_inode_to_client(&ci->vfs_inode);
+ fsc = ceph_inode_to_client(&ci->netfs.inode);
WARN_ON_ONCE(ci->i_auth_cap == cap &&
!list_empty(&ci->i_dirty_item) &&
!fsc->blocklisted &&
- !ceph_inode_is_shutdown(&ci->vfs_inode));
+ !ceph_inode_is_shutdown(&ci->netfs.inode));
__ceph_remove_cap(cap, queue_release);
}
@@ -1343,7 +1343,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
int flushing, u64 flush_tid, u64 oldest_flush_tid)
{
struct ceph_inode_info *ci = cap->ci;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int held, revoking;
lockdep_assert_held(&ci->i_ceph_lock);
@@ -1440,7 +1440,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
{
struct ceph_msg *msg;
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
if (!msg) {
@@ -1528,7 +1528,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
u64 oldest_flush_tid = 0;
@@ -1622,7 +1622,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL;
int mds;
@@ -1682,8 +1682,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
struct ceph_cap_flush **pcf)
{
struct ceph_mds_client *mdsc =
- ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
- struct inode *inode = &ci->vfs_inode;
+ ceph_sb_to_client(ci->netfs.inode.i_sb)->mdsc;
+ struct inode *inode = &ci->netfs.inode;
int was = ci->i_dirty_caps;
int dirty = 0;
@@ -1696,7 +1696,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return 0;
}
- dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->netfs.inode,
ceph_cap_string(mask), ceph_cap_string(was),
ceph_cap_string(was | mask));
ci->i_dirty_caps |= mask;
@@ -1712,7 +1712,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
ci->i_snap_realm->cached_context);
}
dout(" inode %p now dirty snapc %p auth cap %p\n",
- &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ &ci->netfs.inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
list_add(&ci->i_dirty_item, &session->s_cap_dirty);
@@ -1875,7 +1875,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
bool __ceph_should_report_size(struct ceph_inode_info *ci)
{
- loff_t size = i_size_read(&ci->vfs_inode);
+ loff_t size = i_size_read(&ci->netfs.inode);
/* mds will adjust max size according to the reported size */
if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
return false;
@@ -1900,7 +1900,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_cap *cap;
u64 flush_tid, oldest_flush_tid;
@@ -2467,7 +2467,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
int ret;
@@ -2560,7 +2560,7 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2610,7 +2610,7 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
pr_err("%p auth cap %p not mds%d ???\n",
- &ci->vfs_inode, cap, session->s_mds);
+ &ci->netfs.inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
continue;
}
@@ -2630,7 +2630,7 @@ void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
lockdep_assert_held(&ci->i_ceph_lock);
- dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
+ dout("%s %p flushing %s\n", __func__, &ci->netfs.inode,
ceph_cap_string(ci->i_flushing_caps));
if (!list_empty(&ci->i_cap_flush_list)) {
@@ -2673,10 +2673,10 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
}
if (got & CEPH_CAP_FILE_BUFFER) {
if (ci->i_wb_ref == 0)
- ihold(&ci->vfs_inode);
+ ihold(&ci->netfs.inode);
ci->i_wb_ref++;
dout("%s %p wb %d -> %d (?)\n", __func__,
- &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
+ &ci->netfs.inode, ci->i_wb_ref-1, ci->i_wb_ref);
}
}
@@ -3004,7 +3004,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
return ret;
}
- if (S_ISREG(ci->vfs_inode.i_mode) &&
+ if (S_ISREG(ci->netfs.inode.i_mode) &&
ci->i_inline_version != CEPH_INLINE_NONE &&
(_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
i_size_read(inode) > 0) {
@@ -3094,7 +3094,7 @@ enum put_cap_refs_mode {
static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
enum put_cap_refs_mode mode)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
int last = 0, put = 0, flushsnaps = 0, wake = 0;
bool check_flushsnaps = false;
@@ -3202,7 +3202,7 @@ void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_cap_snap *capsnap = NULL, *iter;
int put = 0;
bool last = false;
@@ -3698,7 +3698,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
session->s_mds,
&list_first_entry(&session->s_cap_flushing,
struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ i_flushing_item)->netfs.inode);
}
}
mdsc->num_cap_flushing--;
@@ -4345,7 +4345,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
break;
list_del_init(&ci->i_cap_delay_list);
- inode = igrab(&ci->vfs_inode);
+ inode = igrab(&ci->netfs.inode);
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
@@ -4373,10 +4373,11 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
while (!list_empty(&s->s_cap_dirty)) {
ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info,
i_dirty_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
+ ceph_wait_on_async_create(inode);
ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
@@ -4407,7 +4408,7 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci,
void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
bool already_opened = false;
int i;
@@ -4441,7 +4442,7 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
{
- struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb);
int bits = (fmode << 1) | 1;
bool is_closed = true;
int i;
@@ -4656,7 +4657,7 @@ int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invali
lockdep_assert_held(&ci->i_ceph_lock);
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
is_auth = (cap == ci->i_auth_cap);
__ceph_remove_cap(cap, false);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8c8226c0feac..da59e836a06e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -205,7 +205,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mount_options *opt =
- ceph_inode_to_client(&ci->vfs_inode)->mount_options;
+ ceph_inode_to_client(&ci->netfs.inode)->mount_options;
struct ceph_file_info *fi;
int ret;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b7e9cac3aeef..56c53ab3618e 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -176,7 +176,7 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
rb_insert_color(&frag->node, &ci->i_fragtree);
dout("get_or_create_frag added %llx.%llx frag %x\n",
- ceph_vinop(&ci->vfs_inode), f);
+ ceph_vinop(&ci->netfs.inode), f);
return frag;
}
@@ -457,10 +457,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
if (!ci)
return NULL;
- dout("alloc_inode %p\n", &ci->vfs_inode);
+ dout("alloc_inode %p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
- netfs_i_context_init(&ci->vfs_inode, &ceph_netfs_ops);
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
spin_lock_init(&ci->i_ceph_lock);
@@ -547,7 +547,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
- return &ci->vfs_inode;
+ return &ci->netfs.inode;
}
void ceph_free_inode(struct inode *inode)
@@ -1978,7 +1978,7 @@ static void ceph_inode_work(struct work_struct *work)
{
struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
i_work);
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
dout("writeback %p\n", inode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f5d110d90b77..33f517d549ce 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1564,7 +1564,7 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
p = session->s_caps.next;
while (p != &session->s_caps) {
cap = list_entry(p, struct ceph_cap, session_caps);
- inode = igrab(&cap->ci->vfs_inode);
+ inode = igrab(&cap->ci->netfs.inode);
if (!inode) {
p = p->next;
continue;
@@ -1622,7 +1622,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
int iputs;
dout("removing cap %p, ci is %p, inode is %p\n",
- cap, ci, &ci->vfs_inode);
+ cap, ci, &ci->netfs.inode);
spin_lock(&ci->i_ceph_lock);
iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 322ee5add942..864cdaa0d2bd 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -521,7 +521,7 @@ static bool has_new_snaps(struct ceph_snap_context *o,
static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap **pcapsnap)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_snap_context *old_snapc, *new_snapc;
struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
@@ -652,7 +652,7 @@ update_snapc:
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap)
{
- struct inode *inode = &ci->vfs_inode;
+ struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
BUG_ON(capsnap->writing);
@@ -712,7 +712,7 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
- struct inode *inode = igrab(&ci->vfs_inode);
+ struct inode *inode = igrab(&ci->netfs.inode);
if (!inode)
continue;
spin_unlock(&realm->inodes_with_caps_lock);
@@ -904,7 +904,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
struct ceph_inode_info, i_snap_flush_item);
- inode = &ci->vfs_inode;
+ inode = &ci->netfs.inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
ceph_flush_snaps(ci, &session);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b73b4f75462c..40140805bdcf 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -876,7 +876,7 @@ mempool_t *ceph_wb_pagevec_pool;
static void ceph_inode_init_once(void *foo)
{
struct ceph_inode_info *ci = foo;
- inode_init_once(&ci->vfs_inode);
+ inode_init_once(&ci->netfs.inode);
}
static int __init init_caches(void)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index dd7dac0f984a..f59dac66955b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -316,11 +316,7 @@ struct ceph_inode_xattrs_info {
* Ceph inode.
*/
struct ceph_inode_info {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode;
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
struct ceph_vino i_vino; /* ceph ino + snap */
spinlock_t i_ceph_lock;
@@ -436,7 +432,7 @@ struct ceph_inode_info {
static inline struct ceph_inode_info *
ceph_inode(const struct inode *inode)
{
- return container_of(inode, struct ceph_inode_info, vfs_inode);
+ return container_of(inode, struct ceph_inode_info, netfs.inode);
}
static inline struct ceph_fs_client *
@@ -1316,7 +1312,7 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
has_quota = __ceph_has_quota(ci, QUOTA_GET_ANY);
if (had_quota != has_quota)
- ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota);
+ ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 8c2dc2c762a4..f141f5246163 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,7 +57,7 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_string *pool_ns;
s64 pool = ci->i_layout.pool_id;
@@ -69,7 +69,7 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
- dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
+ dout("ceph_vxattrcb_layout %p\n", &ci->netfs.inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
@@ -161,7 +161,7 @@ static ssize_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
char *val, size_t size)
{
ssize_t ret;
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ci->i_layout.pool_id;
const char *pool_name;
@@ -313,7 +313,7 @@ static ssize_t ceph_vxattrcb_snap_btime(struct ceph_inode_info *ci, char *val,
static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "%pU", &fsc->client->fsid);
}
@@ -321,7 +321,7 @@ static ssize_t ceph_vxattrcb_cluster_fsid(struct ceph_inode_info *ci,
static ssize_t ceph_vxattrcb_client_id(struct ceph_inode_info *ci,
char *val, size_t size)
{
- struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
+ struct ceph_fs_client *fsc = ceph_sb_to_client(ci->netfs.inode.i_sb);
return ceph_fmt_xattr(val, size, "client%lld",
ceph_client_gid(fsc->client));
@@ -629,7 +629,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
}
dout("__set_xattr_val added %llx.%llx xattr %p %.*s=%.*s\n",
- ceph_vinop(&ci->vfs_inode), xattr, name_len, name, val_len, val);
+ ceph_vinop(&ci->netfs.inode), xattr, name_len, name, val_len, val);
return 0;
}
@@ -871,7 +871,7 @@ struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci)
struct ceph_buffer *old_blob = NULL;
void *dest;
- dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
+ dout("__build_xattrs_blob %p\n", &ci->netfs.inode);
if (ci->i_xattrs.dirty) {
int need = __get_required_blob_size(ci, 0, 0);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 1dd995efd5b8..2cfbac8bb965 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -162,6 +162,8 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface)
seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr);
else if (iface->sockaddr.ss_family == AF_INET6)
seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr);
+ if (!iface->is_active)
+ seq_puts(m, "\t\t[for-cleanup]\n");
}
static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
@@ -221,6 +223,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
+ struct cifs_server_iface *iface;
int c, i, j;
seq_puts(m,
@@ -456,11 +459,10 @@ skip_rdma:
if (ses->iface_count)
seq_printf(m, "\n\n\tServer interfaces: %zu",
ses->iface_count);
- for (j = 0; j < ses->iface_count; j++) {
- struct cifs_server_iface *iface;
-
- iface = &ses->iface_list[j];
- seq_printf(m, "\n\t%d)", j+1);
+ j = 0;
+ list_for_each_entry(iface, &ses->iface_list,
+ iface_head) {
+ seq_printf(m, "\n\t%d)", ++j);
cifs_dump_iface(m, iface);
if (is_ses_using_iface(ses, iface))
seq_puts(m, "\t\t[CONNECTED]\n");
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 12c872800326..8f2e003e0590 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -377,7 +377,7 @@ cifs_alloc_inode(struct super_block *sb)
cifs_inode->flags = 0;
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
- cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
+ cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
@@ -389,12 +389,12 @@ cifs_alloc_inode(struct super_block *sb)
* Can not set i_flags here - they get immediately overwritten to zero
* by the VFS.
*/
- /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */
+ /* cifs_inode->netfs.inode.i_flags = S_NOATIME | S_NOCMTIME; */
INIT_LIST_HEAD(&cifs_inode->openFileList);
INIT_LIST_HEAD(&cifs_inode->llist);
INIT_LIST_HEAD(&cifs_inode->deferred_closes);
spin_lock_init(&cifs_inode->deferred_lock);
- return &cifs_inode->vfs_inode;
+ return &cifs_inode->netfs.inode;
}
static void
@@ -1086,7 +1086,7 @@ struct file_system_type cifs_fs_type = {
};
MODULE_ALIAS_FS("cifs");
-static struct file_system_type smb3_fs_type = {
+struct file_system_type smb3_fs_type = {
.owner = THIS_MODULE,
.name = "smb3",
.init_fs_context = smb3_init_fs_context,
@@ -1418,7 +1418,7 @@ cifs_init_once(void *inode)
{
struct cifsInodeInfo *cifsi = inode;
- inode_init_once(&cifsi->vfs_inode);
+ inode_init_once(&cifsi->netfs.inode);
init_rwsem(&cifsi->lock_sem);
}
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index dd7e070ca243..b17be47a8e59 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -38,7 +38,7 @@ static inline unsigned long cifs_get_time(struct dentry *dentry)
return (unsigned long) dentry->d_fsdata;
}
-extern struct file_system_type cifs_fs_type;
+extern struct file_system_type cifs_fs_type, smb3_fs_type;
extern const struct address_space_operations cifs_addr_ops;
extern const struct address_space_operations cifs_addr_ops_smallbuf;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f873379066c7..a643c84ff1e9 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -80,6 +80,9 @@
#define SMB_DNS_RESOLVE_INTERVAL_MIN 120
#define SMB_DNS_RESOLVE_INTERVAL_DEFAULT 600
+/* smb multichannel query server interfaces interval in seconds */
+#define SMB_INTERFACE_POLL_INTERVAL 600
+
/* maximum number of PDUs in one compound */
#define MAX_COMPOUND 5
@@ -933,15 +936,67 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net)
#endif
struct cifs_server_iface {
+ struct list_head iface_head;
+ struct kref refcount;
size_t speed;
unsigned int rdma_capable : 1;
unsigned int rss_capable : 1;
+ unsigned int is_active : 1; /* unset if non existent */
struct sockaddr_storage sockaddr;
};
+/* release iface when last ref is dropped */
+static inline void
+release_iface(struct kref *ref)
+{
+ struct cifs_server_iface *iface = container_of(ref,
+ struct cifs_server_iface,
+ refcount);
+ list_del_init(&iface->iface_head);
+ kfree(iface);
+}
+
+/*
+ * compare two interfaces a and b
+ * return 0 if everything matches.
+ * return 1 if a has higher link speed, or rdma capable, or rss capable
+ * return -1 otherwise.
+ */
+static inline int
+iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b)
+{
+ int cmp_ret = 0;
+
+ WARN_ON(!a || !b);
+ if (a->speed == b->speed) {
+ if (a->rdma_capable == b->rdma_capable) {
+ if (a->rss_capable == b->rss_capable) {
+ cmp_ret = memcmp(&a->sockaddr, &b->sockaddr,
+ sizeof(a->sockaddr));
+ if (!cmp_ret)
+ return 0;
+ else if (cmp_ret > 0)
+ return 1;
+ else
+ return -1;
+ } else if (a->rss_capable > b->rss_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->rdma_capable > b->rdma_capable)
+ return 1;
+ else
+ return -1;
+ } else if (a->speed > b->speed)
+ return 1;
+ else
+ return -1;
+}
+
struct cifs_chan {
unsigned int in_reconnect : 1; /* if session setup in progress for this channel */
struct TCP_Server_Info *server;
+ struct cifs_server_iface *iface; /* interface in use */
__u8 signkey[SMB3_SIGN_KEY_SIZE];
};
@@ -993,7 +1048,7 @@ struct cifs_ses {
*/
spinlock_t iface_lock;
/* ========= begin: protected by iface_lock ======== */
- struct cifs_server_iface *iface_list;
+ struct list_head iface_list;
size_t iface_count;
unsigned long iface_last_update; /* jiffies */
/* ========= end: protected by iface_lock ======== */
@@ -1203,6 +1258,7 @@ struct cifs_tcon {
#ifdef CONFIG_CIFS_DFS_UPCALL
struct list_head ulist; /* cache update list */
#endif
+ struct delayed_work query_interfaces; /* query interfaces workqueue job */
};
/*
@@ -1479,20 +1535,16 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
#define CIFS_CACHE_RW_FLG (CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
#define CIFS_CACHE_RHW_FLG (CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)
-#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE))
+#define CIFS_CACHE_READ(cinode) ((cinode->oplock & CIFS_CACHE_READ_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE))
#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG)
-#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->vfs_inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE))
+#define CIFS_CACHE_WRITE(cinode) ((cinode->oplock & CIFS_CACHE_WRITE_FLG) || (CIFS_SB(cinode->netfs.inode.i_sb)->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE))
/*
* One of these for each file inode
*/
struct cifsInodeInfo {
- struct {
- /* These must be contiguous */
- struct inode vfs_inode; /* the VFS's inode record */
- struct netfs_i_context netfs_ctx; /* Netfslib context */
- };
+ struct netfs_inode netfs; /* Netfslib context and vfs inode */
bool can_cache_brlcks;
struct list_head llist; /* locks helb by this inode */
/*
@@ -1531,7 +1583,7 @@ struct cifsInodeInfo {
static inline struct cifsInodeInfo *
CIFS_I(struct inode *inode)
{
- return container_of(inode, struct cifsInodeInfo, vfs_inode);
+ return container_of(inode, struct cifsInodeInfo, netfs.inode);
}
static inline struct cifs_sb_info *
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 3b7366ec03c7..d59aebefa71c 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -636,6 +636,13 @@ cifs_chan_clear_need_reconnect(struct cifs_ses *ses,
bool
cifs_chan_needs_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server);
+bool
+cifs_chan_is_iface_active(struct cifs_ses *ses,
+ struct TCP_Server_Info *server);
+int
+cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
+int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d46702f5a663..386bb523c69e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -97,6 +97,10 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
if (!server->hostname)
return -EINVAL;
+ /* if server hostname isn't populated, there's nothing to do here */
+ if (server->hostname[0] == '\0')
+ return 0;
+
len = strlen(server->hostname) + 3;
unc = kmalloc(len, GFP_KERNEL);
@@ -141,6 +145,25 @@ requeue_resolve:
return rc;
}
+static void smb2_query_server_interfaces(struct work_struct *work)
+{
+ int rc;
+ struct cifs_tcon *tcon = container_of(work,
+ struct cifs_tcon,
+ query_interfaces.work);
+
+ /*
+ * query server network interfaces, in case they change
+ */
+ rc = SMB3_request_interfaces(0, tcon);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
+ __func__, rc);
+ }
+
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+}
static void cifs_resolve_server(struct work_struct *work)
{
@@ -213,7 +236,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
bool mark_smb_session)
{
struct TCP_Server_Info *pserver;
- struct cifs_ses *ses;
+ struct cifs_ses *ses, *nses;
struct cifs_tcon *tcon;
/*
@@ -227,7 +250,20 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
+ /* check if iface is still active */
+ if (!cifs_chan_is_iface_active(ses, server)) {
+ /*
+ * HACK: drop the lock before calling
+ * cifs_chan_update_iface to avoid deadlock
+ */
+ ses->ses_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ cifs_chan_update_iface(ses, server);
+ spin_lock(&cifs_tcp_ses_lock);
+ ses->ses_count--;
+ }
+
spin_lock(&ses->chan_lock);
if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server))
goto next_session;
@@ -1882,7 +1918,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
list_del_init(&ses->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- spin_lock(&ses->chan_lock);
chan_count = ses->chan_count;
/* close any extra channels */
@@ -1890,13 +1925,14 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
int i;
for (i = 1; i < chan_count; i++) {
- spin_unlock(&ses->chan_lock);
+ if (ses->chans[i].iface) {
+ kref_put(&ses->chans[i].iface->refcount, release_iface);
+ ses->chans[i].iface = NULL;
+ }
cifs_put_tcp_session(ses->chans[i].server, 0);
- spin_lock(&ses->chan_lock);
ses->chans[i].server = NULL;
}
}
- spin_unlock(&ses->chan_lock);
sesInfoFree(ses);
cifs_put_tcp_session(server, 0);
@@ -2266,6 +2302,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
list_del_init(&tcon->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
+ /* cancel polling of interfaces */
+ cancel_delayed_work_sync(&tcon->query_interfaces);
+
if (tcon->use_witness) {
int rc;
@@ -2503,6 +2542,12 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
+ /* schedule query interfaces poll */
+ INIT_DELAYED_WORK(&tcon->query_interfaces,
+ smb2_query_server_interfaces);
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -3978,10 +4023,16 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
+ struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
bool is_binding = false;
-
spin_lock(&cifs_tcp_ses_lock);
+ if (server->dstaddr.ss_family == AF_INET6)
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
+ else
+ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr);
+
if (ses->ses_status != SES_GOOD &&
ses->ses_status != SES_NEW &&
ses->ses_status != SES_NEED_RECON) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1618e0537d58..e64cda7a7610 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2004,7 +2004,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
bool fsuid_only)
{
struct cifsFileInfo *open_file = NULL;
- struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
@@ -2060,7 +2060,7 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags,
return rc;
}
- cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb);
+ cifs_sb = CIFS_SB(cifs_inode->netfs.inode.i_sb);
/* only filter by fsuid on multiuser mounts */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER))
@@ -4669,14 +4669,14 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
/* This inode is open for write at least once */
struct cifs_sb_info *cifs_sb;
- cifs_sb = CIFS_SB(cifsInode->vfs_inode.i_sb);
+ cifs_sb = CIFS_SB(cifsInode->netfs.inode.i_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO) {
/* since no page cache to corrupt on directio
we can change size safely */
return true;
}
- if (i_size_read(&cifsInode->vfs_inode) < end_of_file)
+ if (i_size_read(&cifsInode->netfs.inode) < end_of_file)
return true;
return false;
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index a638b29e9062..23ef56f55ce5 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -101,13 +101,13 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd);
+ cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd);
- cifsi->netfs_ctx.cache =
+ cifsi->netfs.cache =
fscache_acquire_cookie(tcon->fscache, 0,
&cifsi->uniqueid, sizeof(cifsi->uniqueid),
&cd, sizeof(cd),
- i_size_read(&cifsi->vfs_inode));
+ i_size_read(&cifsi->netfs.inode));
}
void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
@@ -131,7 +131,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
if (cookie) {
cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cookie);
fscache_relinquish_cookie(cookie, false);
- cifsi->netfs_ctx.cache = NULL;
+ cifsi->netfs.cache = NULL;
}
}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index 52355c0912ae..aa3b941a5555 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -52,16 +52,16 @@ void cifs_fscache_fill_coherency(struct inode *inode,
struct cifsInodeInfo *cifsi = CIFS_I(inode);
memset(cd, 0, sizeof(*cd));
- cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec);
- cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec);
- cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec);
- cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec);
+ cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
+ cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
+ cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec);
+ cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec);
}
static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode)
{
- return netfs_i_cookie(inode);
+ return netfs_i_cookie(&CIFS_I(inode)->netfs);
}
static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 2f9e7d2f81b6..81da81e18553 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -115,7 +115,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
__func__, cifs_i->uniqueid);
set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);
/* Invalidate fscache cookie */
- cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd);
+ cifs_fscache_fill_coherency(&cifs_i->netfs.inode, &cd);
fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0);
}
@@ -2499,7 +2499,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
u64 len)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
- struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->vfs_inode.i_sb);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_i->netfs.inode.i_sb);
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
struct TCP_Server_Info *server = tcon->ses->server;
struct cifsFileInfo *cfile;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 35962a1a23b9..0e84e6fcf8ab 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -75,6 +75,7 @@ sesInfoAlloc(void)
INIT_LIST_HEAD(&ret_buf->tcon_list);
mutex_init(&ret_buf->session_mutex);
spin_lock_init(&ret_buf->iface_lock);
+ INIT_LIST_HEAD(&ret_buf->iface_list);
spin_lock_init(&ret_buf->chan_lock);
}
return ret_buf;
@@ -83,6 +84,8 @@ sesInfoAlloc(void)
void
sesInfoFree(struct cifs_ses *buf_to_free)
{
+ struct cifs_server_iface *iface = NULL, *niface = NULL;
+
if (buf_to_free == NULL) {
cifs_dbg(FYI, "Null buffer passed to sesInfoFree\n");
return;
@@ -96,7 +99,11 @@ sesInfoFree(struct cifs_ses *buf_to_free)
kfree(buf_to_free->user_name);
kfree(buf_to_free->domainName);
kfree_sensitive(buf_to_free->auth_key.response);
- kfree(buf_to_free->iface_list);
+ spin_lock(&buf_to_free->iface_lock);
+ list_for_each_entry_safe(iface, niface, &buf_to_free->iface_list,
+ iface_head)
+ kref_put(&iface->refcount, release_iface);
+ spin_unlock(&buf_to_free->iface_lock);
kfree_sensitive(buf_to_free);
}
@@ -537,11 +544,11 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
if (oplock == OPLOCK_EXCLUSIVE) {
cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == OPLOCK_READ) {
cinode->oplock = CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else
cinode->oplock = 0;
}
@@ -1211,18 +1218,23 @@ static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void
.data = data,
.sb = NULL,
};
+ struct file_system_type **fs_type = (struct file_system_type *[]) {
+ &cifs_fs_type, &smb3_fs_type, NULL,
+ };
- iterate_supers_type(&cifs_fs_type, f, &sd);
-
- if (!sd.sb)
- return ERR_PTR(-EINVAL);
- /*
- * Grab an active reference in order to prevent automounts (DFS links)
- * of expiring and then freeing up our cifs superblock pointer while
- * we're doing failover.
- */
- cifs_sb_active(sd.sb);
- return sd.sb;
+ for (; *fs_type; fs_type++) {
+ iterate_supers_type(*fs_type, f, &sd);
+ if (sd.sb) {
+ /*
+ * Grab an active reference in order to prevent automounts (DFS links)
+ * of expiring and then freeing up our cifs superblock pointer while
+ * we're doing failover.
+ */
+ cifs_sb_active(sd.sb);
+ return sd.sb;
+ }
+ }
+ return ERR_PTR(-EINVAL);
}
static void __cifs_put_super(struct super_block *sb)
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 3b7915af1f62..02c8b2906196 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -58,7 +58,7 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface)
spin_lock(&ses->chan_lock);
for (i = 0; i < ses->chan_count; i++) {
- if (is_server_using_iface(ses->chans[i].server, iface)) {
+ if (ses->chans[i].iface == iface) {
spin_unlock(&ses->chan_lock);
return true;
}
@@ -81,6 +81,9 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
}
/* If we didn't find the channel, it is likely a bug */
+ if (server)
+ cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
+ server->conn_id);
WARN_ON(1);
return 0;
}
@@ -143,16 +146,24 @@ cifs_chan_needs_reconnect(struct cifs_ses *ses,
return CIFS_CHAN_NEEDS_RECONNECT(ses, chan_index);
}
+bool
+cifs_chan_is_iface_active(struct cifs_ses *ses,
+ struct TCP_Server_Info *server)
+{
+ unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
+
+ return ses->chans[chan_index].iface &&
+ ses->chans[chan_index].iface->is_active;
+}
+
/* returns number of channels added */
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
{
int old_chan_count, new_chan_count;
int left;
- int i = 0;
int rc = 0;
int tries = 0;
- struct cifs_server_iface *ifaces = NULL;
- size_t iface_count;
+ struct cifs_server_iface *iface = NULL, *niface = NULL;
spin_lock(&ses->chan_lock);
@@ -182,32 +193,16 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
spin_unlock(&ses->chan_lock);
/*
- * Make a copy of the iface list at the time and use that
- * instead so as to not hold the iface spinlock for opening
- * channels
- */
- spin_lock(&ses->iface_lock);
- iface_count = ses->iface_count;
- if (iface_count <= 0) {
- spin_unlock(&ses->iface_lock);
- cifs_dbg(VFS, "no iface list available to open channels\n");
- return 0;
- }
- ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces),
- GFP_ATOMIC);
- if (!ifaces) {
- spin_unlock(&ses->iface_lock);
- return 0;
- }
- spin_unlock(&ses->iface_lock);
-
- /*
* Keep connecting to same, fastest, iface for all channels as
* long as its RSS. Try next fastest one if not RSS or channel
* creation fails.
*/
+ spin_lock(&ses->iface_lock);
+ iface = list_first_entry(&ses->iface_list, struct cifs_server_iface,
+ iface_head);
+ spin_unlock(&ses->iface_lock);
+
while (left > 0) {
- struct cifs_server_iface *iface;
tries++;
if (tries > 3*ses->chan_max) {
@@ -216,31 +211,128 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
break;
}
- iface = &ifaces[i];
- if (is_ses_using_iface(ses, iface) && !iface->rss_capable) {
- i = (i+1) % iface_count;
- continue;
+ spin_lock(&ses->iface_lock);
+ if (!ses->iface_count) {
+ spin_unlock(&ses->iface_lock);
+ break;
}
- rc = cifs_ses_add_channel(cifs_sb, ses, iface);
- if (rc) {
- cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n",
- i, rc);
- i = (i+1) % iface_count;
- continue;
+ list_for_each_entry_safe_from(iface, niface, &ses->iface_list,
+ iface_head) {
+ /* skip ifaces that are unusable */
+ if (!iface->is_active ||
+ (is_ses_using_iface(ses, iface) &&
+ !iface->rss_capable)) {
+ continue;
+ }
+
+ /* take ref before unlock */
+ kref_get(&iface->refcount);
+
+ spin_unlock(&ses->iface_lock);
+ rc = cifs_ses_add_channel(cifs_sb, ses, iface);
+ spin_lock(&ses->iface_lock);
+
+ if (rc) {
+ cifs_dbg(VFS, "failed to open extra channel on iface:%pIS rc=%d\n",
+ &iface->sockaddr,
+ rc);
+ kref_put(&iface->refcount, release_iface);
+ continue;
+ }
+
+ cifs_dbg(FYI, "successfully opened new channel on iface:%pIS\n",
+ &iface->sockaddr);
+ break;
}
+ spin_unlock(&ses->iface_lock);
- cifs_dbg(FYI, "successfully opened new channel on iface#%d\n",
- i);
left--;
new_chan_count++;
}
- kfree(ifaces);
return new_chan_count - old_chan_count;
}
/*
+ * update the iface for the channel if necessary.
+ * will return 0 when iface is updated, 1 if removed, 2 otherwise
+ * Must be called with chan_lock held.
+ */
+int
+cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
+{
+ unsigned int chan_index;
+ struct cifs_server_iface *iface = NULL;
+ struct cifs_server_iface *old_iface = NULL;
+ int rc = 0;
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (!chan_index) {
+ spin_unlock(&ses->chan_lock);
+ return 0;
+ }
+
+ if (ses->chans[chan_index].iface) {
+ old_iface = ses->chans[chan_index].iface;
+ if (old_iface->is_active) {
+ spin_unlock(&ses->chan_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&ses->chan_lock);
+
+ spin_lock(&ses->iface_lock);
+ /* then look for a new one */
+ list_for_each_entry(iface, &ses->iface_list, iface_head) {
+ if (!iface->is_active ||
+ (is_ses_using_iface(ses, iface) &&
+ !iface->rss_capable)) {
+ continue;
+ }
+ kref_get(&iface->refcount);
+ }
+
+ if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ rc = 1;
+ iface = NULL;
+ cifs_dbg(FYI, "unable to find a suitable iface\n");
+ }
+
+ /* now drop the ref to the current iface */
+ if (old_iface && iface) {
+ kref_put(&old_iface->refcount, release_iface);
+ cifs_dbg(FYI, "replacing iface: %pIS with %pIS\n",
+ &old_iface->sockaddr,
+ &iface->sockaddr);
+ } else if (old_iface) {
+ kref_put(&old_iface->refcount, release_iface);
+ cifs_dbg(FYI, "releasing ref to iface: %pIS\n",
+ &old_iface->sockaddr);
+ } else {
+ WARN_ON(!iface);
+ cifs_dbg(FYI, "adding new iface: %pIS\n", &iface->sockaddr);
+ }
+ spin_unlock(&ses->iface_lock);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ ses->chans[chan_index].iface = iface;
+
+ /* No iface is found. if secondary chan, drop connection */
+ if (!iface && CIFS_SERVER_IS_CHAN(server))
+ ses->chans[chan_index].server = NULL;
+
+ spin_unlock(&ses->chan_lock);
+
+ if (!iface && CIFS_SERVER_IS_CHAN(server))
+ cifs_put_tcp_session(server, false);
+
+ return rc;
+}
+
+/*
* If server is a channel of ses, return the corresponding enclosing
* cifs_chan otherwise return NULL.
*/
@@ -301,7 +393,10 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
/* Auth */
ctx.domainauto = ses->domainAuto;
ctx.domainname = ses->domainName;
- ctx.server_hostname = ses->server->hostname;
+
+ /* no hostname for extra channels */
+ ctx.server_hostname = "";
+
ctx.username = ses->user_name;
ctx.password = ses->password;
ctx.sectype = ses->sectype;
@@ -349,6 +444,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
spin_unlock(&ses->chan_lock);
goto out;
}
+ chan->iface = iface;
ses->chan_count++;
atomic_set(&ses->chan_seq, 0);
@@ -378,6 +474,14 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
out:
if (rc && chan->server) {
+ /*
+ * we should avoid race with these delayed works before we
+ * remove this channel
+ */
+ cancel_delayed_work_sync(&chan->server->echo);
+ cancel_delayed_work_sync(&chan->server->resolve);
+ cancel_delayed_work_sync(&chan->server->reconnect);
+
spin_lock(&ses->chan_lock);
/* we rely on all bits beyond chan_count to be clear */
cifs_chan_clear_need_reconnect(ses, chan->server);
@@ -388,10 +492,9 @@ out:
*/
WARN_ON(ses->chan_count < 1);
spin_unlock(&ses->chan_lock);
- }
- if (rc && chan->server)
cifs_put_tcp_session(chan->server, 0);
+ }
return rc;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 98a76fa791c0..8802995b2d3d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -512,73 +512,41 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
static int
parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
size_t buf_len,
- struct cifs_server_iface **iface_list,
- size_t *iface_count)
+ struct cifs_ses *ses)
{
struct network_interface_info_ioctl_rsp *p;
struct sockaddr_in *addr4;
struct sockaddr_in6 *addr6;
struct iface_info_ipv4 *p4;
struct iface_info_ipv6 *p6;
- struct cifs_server_iface *info;
+ struct cifs_server_iface *info = NULL, *iface = NULL, *niface = NULL;
+ struct cifs_server_iface tmp_iface;
ssize_t bytes_left;
size_t next = 0;
int nb_iface = 0;
- int rc = 0;
-
- *iface_list = NULL;
- *iface_count = 0;
-
- /*
- * Fist pass: count and sanity check
- */
+ int rc = 0, ret = 0;
bytes_left = buf_len;
p = buf;
- while (bytes_left >= sizeof(*p)) {
- nb_iface++;
- next = le32_to_cpu(p->Next);
- if (!next) {
- bytes_left -= sizeof(*p);
- break;
- }
- p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
- bytes_left -= next;
- }
-
- if (!nb_iface) {
- cifs_dbg(VFS, "%s: malformed interface info\n", __func__);
- rc = -EINVAL;
- goto out;
- }
-
- /* Azure rounds the buffer size up 8, to a 16 byte boundary */
- if ((bytes_left > 8) || p->Next)
- cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-
+ spin_lock(&ses->iface_lock);
/*
- * Second pass: extract info to internal structure
+ * Go through iface_list and do kref_put to remove
+ * any unused ifaces. ifaces in use will be removed
+ * when the last user calls a kref_put on it
*/
-
- *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL);
- if (!*iface_list) {
- rc = -ENOMEM;
- goto out;
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ iface->is_active = 0;
+ kref_put(&iface->refcount, release_iface);
}
+ spin_unlock(&ses->iface_lock);
- info = *iface_list;
- bytes_left = buf_len;
- p = buf;
while (bytes_left >= sizeof(*p)) {
- info->speed = le64_to_cpu(p->LinkSpeed);
- info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
- info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
-
- cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count);
- cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
- cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__,
- le32_to_cpu(p->Capability));
+ memset(&tmp_iface, 0, sizeof(tmp_iface));
+ tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
+ tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
+ tmp_iface.rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
switch (p->Family) {
/*
@@ -587,7 +555,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
* conversion explicit in case either one changes.
*/
case INTERNETWORK:
- addr4 = (struct sockaddr_in *)&info->sockaddr;
+ addr4 = (struct sockaddr_in *)&tmp_iface.sockaddr;
p4 = (struct iface_info_ipv4 *)p->Buffer;
addr4->sin_family = AF_INET;
memcpy(&addr4->sin_addr, &p4->IPv4Address, 4);
@@ -599,7 +567,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
&addr4->sin_addr);
break;
case INTERNETWORKV6:
- addr6 = (struct sockaddr_in6 *)&info->sockaddr;
+ addr6 = (struct sockaddr_in6 *)&tmp_iface.sockaddr;
p6 = (struct iface_info_ipv6 *)p->Buffer;
addr6->sin6_family = AF_INET6;
memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16);
@@ -619,46 +587,96 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
goto next_iface;
}
- (*iface_count)++;
- info++;
+ /*
+ * The iface_list is assumed to be sorted by speed.
+ * Check if the new interface exists in that list.
+ * NEVER change iface. it could be in use.
+ * Add a new one instead
+ */
+ spin_lock(&ses->iface_lock);
+ iface = niface = NULL;
+ list_for_each_entry_safe(iface, niface, &ses->iface_list,
+ iface_head) {
+ ret = iface_cmp(iface, &tmp_iface);
+ if (!ret) {
+ /* just get a ref so that it doesn't get picked/freed */
+ iface->is_active = 1;
+ kref_get(&iface->refcount);
+ spin_unlock(&ses->iface_lock);
+ goto next_iface;
+ } else if (ret < 0) {
+ /* all remaining ifaces are slower */
+ kref_get(&iface->refcount);
+ break;
+ }
+ }
+ spin_unlock(&ses->iface_lock);
+
+ /* no match. insert the entry in the list */
+ info = kmalloc(sizeof(struct cifs_server_iface),
+ GFP_KERNEL);
+ if (!info) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ memcpy(info, &tmp_iface, sizeof(tmp_iface));
+
+ /* add this new entry to the list */
+ kref_init(&info->refcount);
+ info->is_active = 1;
+
+ cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, ses->iface_count);
+ cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
+ cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__,
+ le32_to_cpu(p->Capability));
+
+ spin_lock(&ses->iface_lock);
+ if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_add_tail(&info->iface_head, &iface->iface_head);
+ kref_put(&iface->refcount, release_iface);
+ } else
+ list_add_tail(&info->iface_head, &ses->iface_list);
+ spin_unlock(&ses->iface_lock);
+
+ ses->iface_count++;
+ ses->iface_last_update = jiffies;
next_iface:
+ nb_iface++;
next = le32_to_cpu(p->Next);
- if (!next)
+ if (!next) {
+ bytes_left -= sizeof(*p);
break;
+ }
p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
bytes_left -= next;
}
- if (!*iface_count) {
+ if (!nb_iface) {
+ cifs_dbg(VFS, "%s: malformed interface info\n", __func__);
rc = -EINVAL;
goto out;
}
-out:
- if (rc) {
- kfree(*iface_list);
- *iface_count = 0;
- *iface_list = NULL;
- }
- return rc;
-}
+ /* Azure rounds the buffer size up 8, to a 16 byte boundary */
+ if ((bytes_left > 8) || p->Next)
+ cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-static int compare_iface(const void *ia, const void *ib)
-{
- const struct cifs_server_iface *a = (struct cifs_server_iface *)ia;
- const struct cifs_server_iface *b = (struct cifs_server_iface *)ib;
- return a->speed == b->speed ? 0 : (a->speed > b->speed ? -1 : 1);
+ if (!ses->iface_count) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+out:
+ return rc;
}
-static int
+int
SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
{
int rc;
unsigned int ret_data_len = 0;
struct network_interface_info_ioctl_rsp *out_buf = NULL;
- struct cifs_server_iface *iface_list;
- size_t iface_count;
struct cifs_ses *ses = tcon->ses;
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
@@ -674,21 +692,10 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
goto out;
}
- rc = parse_server_interfaces(out_buf, ret_data_len,
- &iface_list, &iface_count);
+ rc = parse_server_interfaces(out_buf, ret_data_len, ses);
if (rc)
goto out;
- /* sort interfaces from fastest to slowest */
- sort(iface_list, iface_count, sizeof(*iface_list), compare_iface, NULL);
-
- spin_lock(&ses->iface_lock);
- kfree(ses->iface_list);
- ses->iface_list = iface_list;
- ses->iface_count = iface_count;
- ses->iface_last_update = jiffies;
- spin_unlock(&ses->iface_lock);
-
out:
kfree(out_buf);
return rc;
@@ -4260,15 +4267,15 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
cinode->oplock = CIFS_CACHE_RHW_FLG;
cifs_dbg(FYI, "Batch Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
cinode->oplock = CIFS_CACHE_RW_FLG;
cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else if (oplock == SMB2_OPLOCK_LEVEL_II) {
cinode->oplock = CIFS_CACHE_READ_FLG;
cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
} else
cinode->oplock = 0;
}
@@ -4307,7 +4314,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
cinode->oplock = new_oplock;
cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
- &cinode->vfs_inode);
+ &cinode->netfs.inode);
}
static void
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 0e8c85249579..c705de32e225 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -288,6 +288,9 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
mutex_unlock(&ses->session_mutex);
rc = -EHOSTDOWN;
goto failed;
+ } else if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ goto out;
}
} else {
mutex_unlock(&ses->session_mutex);
@@ -540,6 +543,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
struct TCP_Server_Info *server, unsigned int *total_len)
{
char *pneg_ctxt;
+ char *hostname = NULL;
unsigned int ctxt_len, neg_context_count;
if (*total_len > 200) {
@@ -567,16 +571,25 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
*total_len += ctxt_len;
pneg_ctxt += ctxt_len;
- ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
- server->hostname);
- *total_len += ctxt_len;
- pneg_ctxt += ctxt_len;
+ /*
+ * secondary channels don't have the hostname field populated
+ * use the hostname field in the primary channel instead
+ */
+ hostname = CIFS_SERVER_IS_CHAN(server) ?
+ server->primary_server->hostname : server->hostname;
+ if (hostname && (hostname[0] != 0)) {
+ ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
+ hostname);
+ *total_len += ctxt_len;
+ pneg_ctxt += ctxt_len;
+ neg_context_count = 3;
+ } else
+ neg_context_count = 2;
build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
*total_len += sizeof(struct smb2_posix_neg_context);
pneg_ctxt += sizeof(struct smb2_posix_neg_context);
-
- neg_context_count = 4;
+ neg_context_count++;
if (server->compress_algorithm) {
build_compression_ctxt((struct smb2_compression_capabilities_context *)
@@ -5151,6 +5164,8 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
data = &info;
size = sizeof(struct smb2_file_eof_info);
+ trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof));
+
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
0, 1, &data, &size);
diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h
index 2be5e0c8564d..6b88dc2e364f 100644
--- a/fs/cifs/trace.h
+++ b/fs/cifs/trace.h
@@ -121,6 +121,44 @@ DEFINE_SMB3_RW_DONE_EVENT(query_dir_done);
DEFINE_SMB3_RW_DONE_EVENT(zero_done);
DEFINE_SMB3_RW_DONE_EVENT(falloc_done);
+/* For logging successful set EOF (truncate) */
+DECLARE_EVENT_CLASS(smb3_eof_class,
+ TP_PROTO(unsigned int xid,
+ __u64 fid,
+ __u32 tid,
+ __u64 sesid,
+ __u64 offset),
+ TP_ARGS(xid, fid, tid, sesid, offset),
+ TP_STRUCT__entry(
+ __field(unsigned int, xid)
+ __field(__u64, fid)
+ __field(__u32, tid)
+ __field(__u64, sesid)
+ __field(__u64, offset)
+ ),
+ TP_fast_assign(
+ __entry->xid = xid;
+ __entry->fid = fid;
+ __entry->tid = tid;
+ __entry->sesid = sesid;
+ __entry->offset = offset;
+ ),
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx",
+ __entry->xid, __entry->sesid, __entry->tid, __entry->fid,
+ __entry->offset)
+)
+
+#define DEFINE_SMB3_EOF_EVENT(name) \
+DEFINE_EVENT(smb3_eof_class, smb3_##name, \
+ TP_PROTO(unsigned int xid, \
+ __u64 fid, \
+ __u32 tid, \
+ __u64 sesid, \
+ __u64 offset), \
+ TP_ARGS(xid, fid, tid, sesid, offset))
+
+DEFINE_SMB3_EOF_EVENT(set_eof);
+
/*
* For handle based calls other than read and write, and get/set info
*/
diff --git a/fs/exec.c b/fs/exec.c
index 0989fb8472a1..778123259e42 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1301,7 +1301,7 @@ int begin_new_exec(struct linux_binprm * bprm)
bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS
- exit_itimers(me->signal);
+ exit_itimers(me);
flush_itimer_signals();
#endif
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 76acc3721951..c6eaf7e9ea74 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -1198,7 +1198,9 @@ static int __exfat_rename(struct inode *old_parent_inode,
return -ENOENT;
}
- exfat_chain_dup(&olddir, &ei->dir);
+ exfat_chain_set(&olddir, EXFAT_I(old_parent_inode)->start_clu,
+ EXFAT_B_TO_CLU_ROUND_UP(i_size_read(old_parent_inode), sbi),
+ EXFAT_I(old_parent_inode)->flags);
dentry = ei->entry;
ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2c2f179b6977..43de293cef56 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -672,17 +672,14 @@ int ext2_empty_dir (struct inode * inode)
void *page_addr = NULL;
struct page *page = NULL;
unsigned long i, npages = dir_pages(inode);
- int dir_has_error = 0;
for (i = 0; i < npages; i++) {
char *kaddr;
ext2_dirent * de;
- page = ext2_get_page(inode, i, dir_has_error, &page_addr);
+ page = ext2_get_page(inode, i, 0, &page_addr);
- if (IS_ERR(page)) {
- dir_has_error = 1;
- continue;
- }
+ if (IS_ERR(page))
+ goto not_empty;
kaddr = page_addr;
de = (ext2_dirent *)kaddr;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 360ce3604a2d..e6b932219803 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1549,7 +1549,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
if (IS_ERR(raw_inode))
return -EIO;
- /* For fields not not tracking in the in-memory inode,
+ /* For fields not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
if (ei->i_state & EXT2_STATE_NEW)
memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3dce7d058985..84c0eb55071d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -829,7 +829,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
inode->i_ino, create);
return _ext4_get_block(inode, iblock, bh_result,
- EXT4_GET_BLOCKS_IO_CREATE_EXT);
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
}
/* Maximum number of blocks we map for direct IO at once. */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9f12f29bc346..9e06334771a3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4104,6 +4104,15 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = size >> bsbits;
start = start_off >> bsbits;
+ /*
+ * For tiny groups (smaller than 8MB) the chosen allocation
+ * alignment may be larger than group size. Make sure the
+ * alignment does not move allocation to a different group which
+ * makes mballoc fail assertions later.
+ */
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -4176,7 +4185,22 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
}
rcu_read_unlock();
- if (start + size <= ac->ac_o_ex.fe_logical &&
+ /*
+ * In this function "start" and "size" are normalized for better
+ * alignment and length such that we could preallocate more blocks.
+ * This normalization is done such that original request of
+ * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
+ * "size" boundaries.
+ * (Note fe_len can be relaxed since FS block allocation API does not
+ * provide gurantee on number of contiguous blocks allocation since that
+ * depends upon free space left, etc).
+ * In case of inode pa, later we use the allocated blocks
+ * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated
+ * range of goal/best blocks [start, size] to put it at the
+ * ac_o_ex.fe_logical extent of this inode.
+ * (See ext4_mb_use_inode_pa() for more details)
+ */
+ if (start + size <= ac->ac_o_ex.fe_logical ||
start > ac->ac_o_ex.fe_logical) {
ext4_msg(ac->ac_sb, KERN_ERR,
"start %lu, size %lu, fe_logical %lu",
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 7a5353a8cfd7..42f590518b4c 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -438,7 +438,7 @@ int ext4_ext_migrate(struct inode *inode)
/*
* Worst case we can touch the allocation bitmaps and a block
- * group descriptor block. We do need need to worry about
+ * group descriptor block. We do need to worry about
* credits for modifying the quota inode.
*/
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 47d0ca4c795b..db4ba99d1ceb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1929,7 +1929,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
struct dx_hash_info *hinfo)
{
unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
+ unsigned continued;
+ int count;
struct buffer_head *bh2;
ext4_lblk_t newblock;
u32 hash2;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 14695e2b5042..97fa7b4c645f 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -465,7 +465,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
/*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
- * end_page_writeback() cannot be called from ext4_bio_end_io() when IO
+ * end_page_writeback() cannot be called from ext4_end_bio() when IO
* on the first buffer finishes and we are still working on submitting
* the second buffer.
*/
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 90a941d20dff..8b70a4701293 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -54,6 +54,16 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
/*
+ * If the reserved GDT blocks is non-zero, the resize_inode feature
+ * should always be set.
+ */
+ if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+ !ext4_has_feature_resize_inode(sb)) {
+ ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
+ return -EFSCORRUPTED;
+ }
+
+ /*
* If we are not using the primary superblock/GDT copy don't resize,
* because the user tools have no way of handling this. Probably a
* bad time to do it anyways.
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 450c918d68fc..845f2f8aee5f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -87,7 +87,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
static int ext4_validate_options(struct fs_context *fc);
static int ext4_check_opt_consistency(struct fs_context *fc,
struct super_block *sb);
-static int ext4_apply_options(struct fs_context *fc, struct super_block *sb);
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb);
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param);
static int ext4_get_tree(struct fs_context *fc);
static int ext4_reconfigure(struct fs_context *fc);
@@ -1870,31 +1870,12 @@ ext4_sb_read_encoding(const struct ext4_super_block *es)
}
#endif
-static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
-{
-#ifdef CONFIG_FS_ENCRYPTION
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
-
- err = fscrypt_set_test_dummy_encryption(sb, arg,
- &sbi->s_dummy_enc_policy);
- if (err) {
- ext4_msg(sb, KERN_WARNING,
- "Error while setting test dummy encryption [%d]", err);
- return err;
- }
- ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
-#endif
- return 0;
-}
-
#define EXT4_SPEC_JQUOTA (1 << 0)
#define EXT4_SPEC_JQFMT (1 << 1)
#define EXT4_SPEC_DATAJ (1 << 2)
#define EXT4_SPEC_SB_BLOCK (1 << 3)
#define EXT4_SPEC_JOURNAL_DEV (1 << 4)
#define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5)
-#define EXT4_SPEC_DUMMY_ENCRYPTION (1 << 6)
#define EXT4_SPEC_s_want_extra_isize (1 << 7)
#define EXT4_SPEC_s_max_batch_time (1 << 8)
#define EXT4_SPEC_s_min_batch_time (1 << 9)
@@ -1911,7 +1892,7 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
struct ext4_fs_context {
char *s_qf_names[EXT4_MAXQUOTAS];
- char *test_dummy_enc_arg;
+ struct fscrypt_dummy_policy dummy_enc_policy;
int s_jquota_fmt; /* Format of quota to use */
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
@@ -1953,7 +1934,7 @@ static void ext4_fc_free(struct fs_context *fc)
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(ctx->s_qf_names[i]);
- kfree(ctx->test_dummy_enc_arg);
+ fscrypt_free_dummy_policy(&ctx->dummy_enc_policy);
kfree(ctx);
}
@@ -2029,6 +2010,29 @@ static int unnote_qf_name(struct fs_context *fc, int qtype)
}
#endif
+static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct ext4_fs_context *ctx)
+{
+ int err;
+
+ if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "test_dummy_encryption option not supported");
+ return -EINVAL;
+ }
+ err = fscrypt_parse_test_dummy_encryption(param,
+ &ctx->dummy_enc_policy);
+ if (err == -EINVAL) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Value of option \"%s\" is unrecognized", param->key);
+ } else if (err == -EEXIST) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ return err;
+}
+
#define EXT4_SET_CTX(name) \
static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
unsigned long flag) \
@@ -2291,29 +2295,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO;
return 0;
case Opt_test_dummy_encryption:
-#ifdef CONFIG_FS_ENCRYPTION
- if (param->type == fs_value_is_flag) {
- ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
- ctx->test_dummy_enc_arg = NULL;
- return 0;
- }
- if (*param->string &&
- !(!strcmp(param->string, "v1") ||
- !strcmp(param->string, "v2"))) {
- ext4_msg(NULL, KERN_WARNING,
- "Value of option \"%s\" is unrecognized",
- param->key);
- return -EINVAL;
- }
- ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION;
- ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size,
- GFP_KERNEL);
- return 0;
-#else
- ext4_msg(NULL, KERN_WARNING,
- "test_dummy_encryption option not supported");
- return -EINVAL;
-#endif
+ return ext4_parse_test_dummy_encryption(param, ctx);
case Opt_dax:
case Opt_dax_type:
#ifdef CONFIG_FS_DAX
@@ -2504,7 +2486,8 @@ parse_failed:
if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)
m_ctx->journal_ioprio = s_ctx->journal_ioprio;
- ret = ext4_apply_options(fc, sb);
+ ext4_apply_options(fc, sb);
+ ret = 0;
out_free:
if (fc) {
@@ -2673,11 +2656,11 @@ err_jquota_specified:
static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
struct super_block *sb)
{
-#ifdef CONFIG_FS_ENCRYPTION
const struct ext4_fs_context *ctx = fc->fs_private;
const struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
- if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION))
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy))
return 0;
if (!ext4_has_feature_encrypt(sb)) {
@@ -2691,14 +2674,46 @@ static int ext4_check_test_dummy_encryption(const struct fs_context *fc,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
- !sbi->s_dummy_enc_policy.policy) {
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
ext4_msg(NULL, KERN_WARNING,
- "Can't set test_dummy_encryption on remount");
+ "Can't set or change test_dummy_encryption on remount");
return -EINVAL;
}
-#endif /* CONFIG_FS_ENCRYPTION */
- return 0;
+ /* Also make sure s_mount_opts didn't contain a conflicting value. */
+ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy,
+ &ctx->dummy_enc_policy))
+ return 0;
+ ext4_msg(NULL, KERN_WARNING,
+ "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+ /*
+ * fscrypt_add_test_dummy_key() technically changes the super_block, so
+ * technically it should be delayed until ext4_apply_options() like the
+ * other changes. But since we never get here for remounts (see above),
+ * and this is the last chance to report errors, we do it here.
+ */
+ err = fscrypt_add_test_dummy_key(sb, &ctx->dummy_enc_policy);
+ if (err)
+ ext4_msg(NULL, KERN_WARNING,
+ "Error adding test dummy encryption key [%d]", err);
+ return err;
+}
+
+static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx,
+ struct super_block *sb)
+{
+ if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) ||
+ /* if already set, it was already verified to be the same */
+ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy))
+ return;
+ EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy;
+ memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy));
+ ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
}
static int ext4_check_opt_consistency(struct fs_context *fc,
@@ -2785,11 +2800,10 @@ fail_dax_change_remount:
return ext4_check_quota_consistency(fc, sb);
}
-static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
+static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
{
struct ext4_fs_context *ctx = fc->fs_private;
struct ext4_sb_info *sbi = fc->s_fs_info;
- int ret = 0;
sbi->s_mount_opt &= ~ctx->mask_s_mount_opt;
sbi->s_mount_opt |= ctx->vals_s_mount_opt;
@@ -2825,11 +2839,7 @@ static int ext4_apply_options(struct fs_context *fc, struct super_block *sb)
#endif
ext4_apply_quota_options(fc, sb);
-
- if (ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)
- ret = ext4_set_test_dummy_encryption(sb, ctx->test_dummy_enc_arg);
-
- return ret;
+ ext4_apply_test_dummy_encryption(ctx, sb);
}
@@ -4552,9 +4562,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err < 0)
goto failed_mount;
- err = ext4_apply_options(fc, sb);
- if (err < 0)
- goto failed_mount;
+ ext4_apply_options(fc, sb);
#if IS_ENABLED(CONFIG_UNICODE)
if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
@@ -5302,14 +5310,6 @@ no_journal:
err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
GFP_KERNEL);
}
- /*
- * Update the checksum after updating free space/inode
- * counters. Otherwise the superblock can have an incorrect
- * checksum in the buffer cache until it is written out and
- * e2fsprogs programs trying to open a file system immediately
- * after it is mounted can fail.
- */
- ext4_superblock_csum_set(sb);
if (!err)
err = percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb), GFP_KERNEL);
@@ -5367,6 +5367,14 @@ no_journal:
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+ /*
+ * Update the checksum after updating free space/inode counters and
+ * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect
+ * checksum in the buffer cache until it is written out and
+ * e2fsprogs programs trying to open a file system immediately
+ * after it is mounted can fail.
+ */
+ ext4_superblock_csum_set(sb);
if (needs_recovery) {
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
@@ -5898,7 +5906,6 @@ static void ext4_update_super(struct super_block *sb)
static int ext4_commit_super(struct super_block *sb)
{
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
- int error = 0;
if (!sbh)
return -EINVAL;
@@ -5907,6 +5914,13 @@ static int ext4_commit_super(struct super_block *sb)
ext4_update_super(sb);
+ lock_buffer(sbh);
+ /* Buffer got discarded which means block device got invalidated */
+ if (!buffer_mapped(sbh)) {
+ unlock_buffer(sbh);
+ return -EIO;
+ }
+
if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
/*
* Oh, dear. A previous attempt to write the
@@ -5921,17 +5935,21 @@ static int ext4_commit_super(struct super_block *sb)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- BUFFER_TRACE(sbh, "marking dirty");
- mark_buffer_dirty(sbh);
- error = __sync_dirty_buffer(sbh,
- REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
+ get_bh(sbh);
+ /* Clear potential dirty bit if it was journalled update */
+ clear_buffer_dirty(sbh);
+ sbh->b_end_io = end_buffer_write_sync;
+ submit_bh(REQ_OP_WRITE,
+ REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+ wait_on_buffer(sbh);
if (buffer_write_io_error(sbh)) {
ext4_msg(sb, KERN_ERR, "I/O error while writing "
"superblock");
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
+ return -EIO;
}
- return error;
+ return 0;
}
/*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 042325349098..564e28a1aa94 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1895,11 +1895,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
unlock_buffer(bs->bh);
ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
+ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
error = -ENOMEM;
if (s->base == NULL)
goto cleanup;
- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
s->first = ENTRY(header(s->base)+1);
header(s->base)->h_refcount = cpu_to_le32(1);
s->here = ENTRY(s->base + offset);
diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c
index be599f31d3c4..d84c5f6cc09d 100644
--- a/fs/f2fs/iostat.c
+++ b/fs/f2fs/iostat.c
@@ -91,8 +91,9 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
unsigned int cnt;
struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE];
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
+ unsigned long flags;
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
for (idx = 0; idx < MAX_IO_TYPE; idx++) {
for (io = 0; io < NR_PAGE_TYPE; io++) {
cnt = io_lat->bio_cnt[idx][io];
@@ -106,7 +107,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi)
io_lat->bio_cnt[idx][io] = 0;
}
}
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
trace_f2fs_iostat_latency(sbi, iostat_lat);
}
@@ -115,14 +116,15 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
{
unsigned long long iostat_diff[NR_IO_TYPE];
int i;
+ unsigned long flags;
if (time_is_after_jiffies(sbi->iostat_next_period))
return;
/* Need double check under the lock */
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irqsave(&sbi->iostat_lock, flags);
if (time_is_after_jiffies(sbi->iostat_next_period)) {
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
return;
}
sbi->iostat_next_period = jiffies +
@@ -133,7 +135,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi)
sbi->prev_rw_iostat[i];
sbi->prev_rw_iostat[i] = sbi->rw_iostat[i];
}
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
trace_f2fs_iostat(sbi, iostat_diff);
@@ -145,25 +147,27 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int i;
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irq(&sbi->iostat_lock);
for (i = 0; i < NR_IO_TYPE; i++) {
sbi->rw_iostat[i] = 0;
sbi->prev_rw_iostat[i] = 0;
}
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irq(&sbi->iostat_lock);
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irq(&sbi->iostat_lat_lock);
memset(io_lat, 0, sizeof(struct iostat_lat_info));
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irq(&sbi->iostat_lat_lock);
}
void f2fs_update_iostat(struct f2fs_sb_info *sbi,
enum iostat_type type, unsigned long long io_bytes)
{
+ unsigned long flags;
+
if (!sbi->iostat_enable)
return;
- spin_lock_bh(&sbi->iostat_lock);
+ spin_lock_irqsave(&sbi->iostat_lock, flags);
sbi->rw_iostat[type] += io_bytes;
if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO)
@@ -172,7 +176,7 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi,
if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO)
sbi->rw_iostat[APP_READ_IO] += io_bytes;
- spin_unlock_bh(&sbi->iostat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lock, flags);
f2fs_record_iostat(sbi);
}
@@ -185,6 +189,7 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
struct f2fs_sb_info *sbi = iostat_ctx->sbi;
struct iostat_lat_info *io_lat = sbi->iostat_io_lat;
int idx;
+ unsigned long flags;
if (!sbi->iostat_enable)
return;
@@ -202,12 +207,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx,
idx = WRITE_ASYNC_IO;
}
- spin_lock_bh(&sbi->iostat_lat_lock);
+ spin_lock_irqsave(&sbi->iostat_lat_lock, flags);
io_lat->sum_lat[idx][iotype] += ts_diff;
io_lat->bio_cnt[idx][iotype]++;
if (ts_diff > io_lat->peak_lat[idx][iotype])
io_lat->peak_lat[idx][iotype] = ts_diff;
- spin_unlock_bh(&sbi->iostat_lat_lock);
+ spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags);
}
void iostat_update_and_unbind_ctx(struct bio *bio, int rw)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index c549acb52ac4..bf00d5057abb 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -89,8 +89,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
if (test_opt(sbi, INLINE_XATTR))
set_inode_flag(inode, FI_INLINE_XATTR);
- if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(inode, FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(inode, FI_INLINE_DENTRY);
@@ -107,10 +105,6 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
f2fs_init_extent_tree(inode, NULL);
- stat_inc_inline_xattr(inode);
- stat_inc_inline_inode(inode);
- stat_inc_inline_dir(inode);
-
F2FS_I(inode)->i_flags =
f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
@@ -127,6 +121,14 @@ static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
set_compress_context(inode);
}
+ /* Should enable inline_data after compression set */
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
f2fs_set_inode_flags(inode);
trace_f2fs_new_inode(inode, 0);
@@ -325,6 +327,9 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
if (!is_extension_exist(name, ext[i], false))
continue;
+ /* Do not use inline_data with compression */
+ stat_dec_inline_inode(inode);
+ clear_inode_flag(inode, FI_INLINE_DATA);
set_compress_context(inode);
return;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 836c79a20afc..cf6f7fc83c08 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1450,7 +1450,9 @@ page_hit:
out_err:
ClearPageUptodate(page);
out_put_err:
- f2fs_handle_page_eio(sbi, page->index, NODE);
+ /* ENOENT comes from read_node_page which is not an error. */
+ if (err != -ENOENT)
+ f2fs_handle_page_eio(sbi, page->index, NODE);
f2fs_put_page(page, 1);
return ERR_PTR(err);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a21d8f1a56d1..05221366a16d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -120,6 +120,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
struct list_head *head)
{
assert_spin_locked(&wb->list_lock);
+ assert_spin_locked(&inode->i_lock);
list_move(&inode->i_io_list, head);
@@ -1365,9 +1366,9 @@ static int move_expired_inodes(struct list_head *delaying_queue,
inode = wb_inode(delaying_queue->prev);
if (inode_dirtied_after(inode, dirtied_before))
break;
+ spin_lock(&inode->i_lock);
list_move(&inode->i_io_list, &tmp);
moved++;
- spin_lock(&inode->i_lock);
inode->i_state |= I_SYNC_QUEUED;
spin_unlock(&inode->i_lock);
if (sb_is_blkdev_sb(inode->i_sb))
@@ -1383,7 +1384,12 @@ static int move_expired_inodes(struct list_head *delaying_queue,
goto out;
}
- /* Move inodes from one superblock together */
+ /*
+ * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue',
+ * we don't take inode->i_lock here because it is just a pointless overhead.
+ * Inode is already marked as I_SYNC_QUEUED so writeback list handling is
+ * fully under our control.
+ */
while (!list_empty(&tmp)) {
sb = wb_inode(tmp.prev)->i_sb;
list_for_each_prev_safe(pos, node, &tmp) {
@@ -1826,8 +1832,8 @@ static long writeback_sb_inodes(struct super_block *sb,
* We'll have another go at writing back this inode
* when we completed a full scan of b_io.
*/
- spin_unlock(&inode->i_lock);
requeue_io(inode, wb);
+ spin_unlock(&inode->i_lock);
trace_writeback_sb_inodes_requeue(inode);
continue;
}
@@ -2358,6 +2364,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
int dirtytime = 0;
+ struct bdi_writeback *wb = NULL;
trace_writeback_mark_inode_dirty(inode, flags);
@@ -2410,13 +2417,24 @@ void __mark_inode_dirty(struct inode *inode, int flags)
inode->i_state |= flags;
/*
+ * Grab inode's wb early because it requires dropping i_lock and we
+ * need to make sure following checks happen atomically with dirty
+ * list handling so that we don't move inodes under flush worker's
+ * hands.
+ */
+ if (!was_dirty) {
+ wb = locked_inode_to_wb_and_lock_list(inode);
+ spin_lock(&inode->i_lock);
+ }
+
+ /*
* If the inode is queued for writeback by flush worker, just
* update its dirty state. Once the flush worker is done with
* the inode it will place it on the appropriate superblock
* list, based upon its state.
*/
if (inode->i_state & I_SYNC_QUEUED)
- goto out_unlock_inode;
+ goto out_unlock;
/*
* Only add valid (hashed) inodes to the superblock's
@@ -2424,22 +2442,19 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
if (!S_ISBLK(inode->i_mode)) {
if (inode_unhashed(inode))
- goto out_unlock_inode;
+ goto out_unlock;
}
if (inode->i_state & I_FREEING)
- goto out_unlock_inode;
+ goto out_unlock;
/*
* If the inode was already on b_dirty/b_io/b_more_io, don't
* reposition it (that would break b_dirty time-ordering).
*/
if (!was_dirty) {
- struct bdi_writeback *wb;
struct list_head *dirty_list;
bool wakeup_bdi = false;
- wb = locked_inode_to_wb_and_lock_list(inode);
-
inode->dirtied_when = jiffies;
if (dirtytime)
inode->dirtied_time_when = jiffies;
@@ -2453,6 +2468,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
dirty_list);
spin_unlock(&wb->list_lock);
+ spin_unlock(&inode->i_lock);
trace_writeback_dirty_inode_enqueue(inode);
/*
@@ -2467,6 +2483,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
return;
}
}
+out_unlock:
+ if (wb)
+ spin_unlock(&wb->list_lock);
out_unlock_inode:
spin_unlock(&inode->i_lock);
}
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 9d3cf0111709..74920826d8f6 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -372,17 +372,22 @@ nomem:
return NULL;
}
+static inline bool fscache_cookie_is_dropped(struct fscache_cookie *cookie)
+{
+ return READ_ONCE(cookie->state) == FSCACHE_COOKIE_STATE_DROPPED;
+}
+
static void fscache_wait_on_collision(struct fscache_cookie *candidate,
struct fscache_cookie *wait_for)
{
enum fscache_cookie_state *statep = &wait_for->state;
- wait_var_event_timeout(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED,
+ wait_var_event_timeout(statep, fscache_cookie_is_dropped(wait_for),
20 * HZ);
- if (READ_ONCE(*statep) != FSCACHE_COOKIE_STATE_DROPPED) {
+ if (!fscache_cookie_is_dropped(wait_for)) {
pr_notice("Potential collision c=%08x old: c=%08x",
candidate->debug_id, wait_for->debug_id);
- wait_var_event(statep, READ_ONCE(*statep) == FSCACHE_COOKIE_STATE_DROPPED);
+ wait_var_event(statep, fscache_cookie_is_dropped(wait_for));
}
}
@@ -517,7 +522,14 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie)
}
fscache_see_cookie(cookie, fscache_cookie_see_active);
- fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE);
+ spin_lock(&cookie->lock);
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ __fscache_set_cookie_state(cookie,
+ FSCACHE_COOKIE_STATE_INVALIDATING);
+ else
+ __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE);
+ spin_unlock(&cookie->lock);
+ wake_up_cookie_state(cookie);
trace = fscache_access_lookup_cookie_end;
out:
@@ -752,6 +764,9 @@ again_locked:
spin_lock(&cookie->lock);
}
+ if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags))
+ fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end);
+
switch (state) {
case FSCACHE_COOKIE_STATE_RELINQUISHING:
fscache_see_cookie(cookie, fscache_cookie_see_relinquish);
@@ -1048,6 +1063,9 @@ void __fscache_invalidate(struct fscache_cookie *cookie,
return;
case FSCACHE_COOKIE_STATE_LOOKING_UP:
+ __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie);
+ set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags);
+ fallthrough;
case FSCACHE_COOKIE_STATE_CREATING:
spin_unlock(&cookie->lock);
_leave(" [look %x]", cookie->inval_counter);
diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c
index f2aa7dbad766..a058e0136bfe 100644
--- a/fs/fscache/volume.c
+++ b/fs/fscache/volume.c
@@ -143,7 +143,7 @@ static void fscache_wait_on_volume_collision(struct fscache_volume *candidate,
{
wait_var_event_timeout(&candidate->flags,
!fscache_is_acquire_pending(candidate), 20 * HZ);
- if (!fscache_is_acquire_pending(candidate)) {
+ if (fscache_is_acquire_pending(candidate)) {
pr_notice("Potential volume collision new=%08x old=%08x",
candidate->debug_id, collidee_debug_id);
fscache_stat(&fscache_n_volumes_collision);
@@ -182,7 +182,7 @@ static bool fscache_hash_volume(struct fscache_volume *candidate)
hlist_bl_add_head(&candidate->hash_link, h);
hlist_bl_unlock(h);
- if (test_bit(FSCACHE_VOLUME_ACQUIRE_PENDING, &candidate->flags))
+ if (fscache_is_acquire_pending(candidate))
fscache_wait_on_volume_collision(candidate, collidee_debug_id);
return true;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 62408047e8d7..02eb72351b15 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -600,41 +600,79 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
remove_inode_hugepages(inode, offset, LLONG_MAX);
}
+static void hugetlbfs_zero_partial_page(struct hstate *h,
+ struct address_space *mapping,
+ loff_t start,
+ loff_t end)
+{
+ pgoff_t idx = start >> huge_page_shift(h);
+ struct folio *folio;
+
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio)
+ return;
+
+ start = start & ~huge_page_mask(h);
+ end = end & ~huge_page_mask(h);
+ if (!end)
+ end = huge_page_size(h);
+
+ folio_zero_segment(folio, (size_t)start, (size_t)end);
+
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
loff_t hpage_size = huge_page_size(h);
loff_t hole_start, hole_end;
/*
- * For hole punch round up the beginning offset of the hole and
- * round down the end.
+ * hole_start and hole_end indicate the full pages within the hole.
*/
hole_start = round_up(offset, hpage_size);
hole_end = round_down(offset + len, hpage_size);
- if (hole_end > hole_start) {
- struct address_space *mapping = inode->i_mapping;
- struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ inode_lock(inode);
- inode_lock(inode);
+ /* protected by i_rwsem */
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+ inode_unlock(inode);
+ return -EPERM;
+ }
- /* protected by i_rwsem */
- if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
- inode_unlock(inode);
- return -EPERM;
- }
+ i_mmap_lock_write(mapping);
+
+ /* If range starts before first full page, zero partial page. */
+ if (offset < hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ offset, min(offset + len, hole_start));
- i_mmap_lock_write(mapping);
+ /* Unmap users of full pages in the hole. */
+ if (hole_end > hole_start) {
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
hole_start >> PAGE_SHIFT,
hole_end >> PAGE_SHIFT, 0);
- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
- inode_unlock(inode);
}
+ /* If range extends beyond last full page, zero partial page. */
+ if ((offset + len) > hole_end && (offset + len) > hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ hole_end, offset + len);
+
+ i_mmap_unlock_write(mapping);
+
+ /* Remove full pages from the file. */
+ if (hole_end > hole_start)
+ remove_inode_hugepages(inode, hole_start, hole_end);
+
+ inode_unlock(inode);
+
return 0;
}
diff --git a/fs/inode.c b/fs/inode.c
index 9d9b422504d1..bd4da9c5207e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -27,7 +27,7 @@
* Inode locking rules:
*
* inode->i_lock protects:
- * inode->i_state, inode->i_hash, __iget()
+ * inode->i_state, inode->i_hash, __iget(), inode->i_io_list
* Inode LRU list locks protect:
* inode->i_sb->s_inode_lru, inode->i_lru
* inode->i_sb->s_inode_list_lock protects:
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3aab4182fd89..e8e769be9ed0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -298,8 +298,8 @@ struct io_buffer_list {
/* below is for ring provided buffers */
__u16 buf_nr_pages;
__u16 nr_entries;
- __u32 head;
- __u32 mask;
+ __u16 head;
+ __u16 mask;
};
struct io_buffer {
@@ -576,7 +576,6 @@ struct io_close {
struct file *file;
int fd;
u32 file_slot;
- u32 flags;
};
struct io_timeout_data {
@@ -784,12 +783,6 @@ struct io_msg {
u32 len;
};
-struct io_nop {
- struct file *file;
- u64 extra1;
- u64 extra2;
-};
-
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -851,6 +844,7 @@ enum {
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
+ REQ_F_CQE32_INIT_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
@@ -920,6 +914,8 @@ enum {
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
/* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
+ /* ->extra1 and ->extra2 are initialised */
+ REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
};
struct async_poll {
@@ -994,7 +990,6 @@ struct io_kiocb {
struct io_msg msg;
struct io_xattr xattr;
struct io_socket sock;
- struct io_nop nop;
struct io_uring_cmd uring_cmd;
};
@@ -1121,7 +1116,6 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {
.audit_skip = 1,
.iopoll = 1,
- .buffer_select = 1,
},
[IORING_OP_READV] = {
.needs_file = 1,
@@ -1189,6 +1183,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.needs_async_setup = 1,
+ .ioprio = 1,
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_RECVMSG] = {
@@ -1197,6 +1192,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.needs_async_setup = 1,
+ .ioprio = 1,
.async_size = sizeof(struct io_async_msghdr),
},
[IORING_OP_TIMEOUT] = {
@@ -1272,6 +1268,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollout = 1,
.audit_skip = 1,
+ .ioprio = 1,
},
[IORING_OP_RECV] = {
.needs_file = 1,
@@ -1279,6 +1276,7 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.buffer_select = 1,
.audit_skip = 1,
+ .ioprio = 1,
},
[IORING_OP_OPENAT2] = {
},
@@ -1729,9 +1727,24 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return;
- /* don't recycle if we already did IO to this buffer */
- if (req->flags & REQ_F_PARTIAL_IO)
+ /*
+ * For legacy provided buffer mode, don't recycle if we already did
+ * IO to this buffer. For ring-mapped provided buffer mode, we should
+ * increment ring->head to explicitly monopolize the buffer to avoid
+ * multiple use.
+ */
+ if ((req->flags & REQ_F_BUFFER_SELECTED) &&
+ (req->flags & REQ_F_PARTIAL_IO))
+ return;
+
+ /*
+ * READV uses fields in `struct io_rw` (len/addr) to stash the selected
+ * buffer data. However if that buffer is recycled the original request
+ * data stored in addr is lost. Therefore forbid recycling for now.
+ */
+ if (req->opcode == IORING_OP_READV)
return;
+
/*
* We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
* the flag and hence ensure that bl->head doesn't get incremented.
@@ -1739,8 +1752,13 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
*/
if (req->flags & REQ_F_BUFFER_RING) {
if (req->buf_list) {
- req->buf_index = req->buf_list->bgid;
- req->flags &= ~REQ_F_BUFFER_RING;
+ if (req->flags & REQ_F_PARTIAL_IO) {
+ req->buf_list->head++;
+ req->buf_list = NULL;
+ } else {
+ req->buf_index = req->buf_list->bgid;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ }
}
return;
}
@@ -1969,7 +1987,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
- atomic_inc(&current->io_uring->inflight_tracked);
+ atomic_inc(&req->task->io_uring->inflight_tracked);
}
}
@@ -2441,94 +2459,66 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
return true;
}
-static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
+static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
{
struct io_uring_cqe *cqe;
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- WRITE_ONCE(cqe->user_data, user_data);
- WRITE_ONCE(cqe->res, res);
- WRITE_ONCE(cqe->flags, cflags);
- return true;
- }
- return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
-}
+ if (!(ctx->flags & IORING_SETUP_CQE32)) {
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, 0, 0);
-static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_uring_cqe *cqe;
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(*cqe));
+ return true;
+ }
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
- req->cqe.res, req->cqe.flags, 0, 0);
+ return io_cqring_event_overflow(ctx, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags,
+ 0, 0);
+ } else {
+ u64 extra1 = 0, extra2 = 0;
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- memcpy(cqe, &req->cqe, sizeof(*cqe));
- return true;
- }
- return io_cqring_event_overflow(ctx, req->cqe.user_data,
- req->cqe.res, req->cqe.flags, 0, 0);
-}
+ if (req->flags & REQ_F_CQE32_INIT) {
+ extra1 = req->extra1;
+ extra2 = req->extra2;
+ }
-static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx,
- struct io_kiocb *req)
-{
- struct io_uring_cqe *cqe;
- u64 extra1 = req->extra1;
- u64 extra2 = req->extra2;
+ trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags, extra1, extra2);
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
- req->cqe.res, req->cqe.flags, extra1, extra2);
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ cqe = io_get_cqe(ctx);
+ if (likely(cqe)) {
+ memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
+ WRITE_ONCE(cqe->big_cqe[0], extra1);
+ WRITE_ONCE(cqe->big_cqe[1], extra2);
+ return true;
+ }
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
- cqe = io_get_cqe(ctx);
- if (likely(cqe)) {
- memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
- cqe->big_cqe[0] = extra1;
- cqe->big_cqe[1] = extra2;
- return true;
+ return io_cqring_event_overflow(ctx, req->cqe.user_data,
+ req->cqe.res, req->cqe.flags,
+ extra1, extra2);
}
-
- return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res,
- req->cqe.flags, extra1, extra2);
-}
-
-static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
-{
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0);
- return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
}
-static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags,
- u64 extra1, u64 extra2)
+static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
+ s32 res, u32 cflags)
{
- struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
- if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
- return;
- if (req->flags & REQ_F_CQE_SKIP)
- return;
-
- trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags,
- extra1, extra2);
+ ctx->cq_extra++;
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
/*
* If we can't get a cq entry, userspace overflowed the
@@ -2537,23 +2527,17 @@ static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags
*/
cqe = io_get_cqe(ctx);
if (likely(cqe)) {
- WRITE_ONCE(cqe->user_data, req->cqe.user_data);
+ WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- WRITE_ONCE(cqe->big_cqe[0], extra1);
- WRITE_ONCE(cqe->big_cqe[1], extra2);
- return;
- }
- io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2);
-}
-
-static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
-{
- ctx->cq_extra++;
- trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
- return __io_fill_cqe(ctx, user_data, res, cflags);
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ WRITE_ONCE(cqe->big_cqe[0], 0);
+ WRITE_ONCE(cqe->big_cqe[1], 0);
+ }
+ return true;
+ }
+ return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
}
static void __io_req_complete_put(struct io_kiocb *req)
@@ -2590,16 +2574,11 @@ static void __io_req_complete_put(struct io_kiocb *req)
static void __io_req_complete_post(struct io_kiocb *req, s32 res,
u32 cflags)
{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe_req(req, res, cflags);
- __io_req_complete_put(req);
-}
-
-static void __io_req_complete_post32(struct io_kiocb *req, s32 res,
- u32 cflags, u64 extra1, u64 extra2)
-{
- if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe32_req(req, res, cflags, extra1, extra2);
+ if (!(req->flags & REQ_F_CQE_SKIP)) {
+ req->cqe.res = res;
+ req->cqe.flags = cflags;
+ __io_fill_cqe_req(req->ctx, req);
+ }
__io_req_complete_put(req);
}
@@ -2614,18 +2593,6 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
io_cqring_ev_posted(ctx);
}
-static void io_req_complete_post32(struct io_kiocb *req, s32 res,
- u32 cflags, u64 extra1, u64 extra2)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- spin_lock(&ctx->completion_lock);
- __io_req_complete_post32(req, res, cflags, extra1, extra2);
- io_commit_cqring(ctx);
- spin_unlock(&ctx->completion_lock);
- io_cqring_ev_posted(ctx);
-}
-
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
u32 cflags)
{
@@ -2643,19 +2610,6 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
-static inline void __io_req_complete32(struct io_kiocb *req,
- unsigned int issue_flags, s32 res,
- u32 cflags, u64 extra1, u64 extra2)
-{
- if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
- io_req_complete_state(req, res, cflags);
- req->extra1 = extra1;
- req->extra2 = extra2;
- } else {
- io_req_complete_post32(req, res, cflags, extra1, extra2);
- }
-}
-
static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
if (res < 0)
@@ -3202,12 +3156,8 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP)) {
- if (!(ctx->flags & IORING_SETUP_CQE32))
- __io_fill_cqe_req_filled(ctx, req);
- else
- __io_fill_cqe32_req_filled(ctx, req);
- }
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ __io_fill_cqe_req(ctx, req);
}
io_commit_cqring(ctx);
@@ -3326,7 +3276,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
nr_events++;
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
- __io_fill_cqe_req(req, req->cqe.res, io_put_kbuf(req, 0));
+
+ req->cqe.flags = io_put_kbuf(req, 0);
+ __io_fill_cqe_req(req->ctx, req);
}
if (unlikely(!nr_events))
@@ -3497,7 +3449,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
if (unlikely(res != req->cqe.res)) {
if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
+ req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
return true;
}
req_set_fail(req);
@@ -3547,7 +3499,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
kiocb_end_write(req);
if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
- req->flags |= REQ_F_REISSUE;
+ req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
return;
}
req->cqe.res = res;
@@ -3677,6 +3629,20 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int ret;
kiocb->ki_pos = READ_ONCE(sqe->off);
+ /* used for fixed read/write too - just read unconditionally */
+ req->buf_index = READ_ONCE(sqe->buf_index);
+
+ if (req->opcode == IORING_OP_READ_FIXED ||
+ req->opcode == IORING_OP_WRITE_FIXED) {
+ struct io_ring_ctx *ctx = req->ctx;
+ u16 index;
+
+ if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+ return -EFAULT;
+ index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
+ req->imu = ctx->user_bufs[index];
+ io_req_set_rsrc_node(req, ctx, 0);
+ }
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -3689,12 +3655,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kiocb->ki_ioprio = get_current_ioprio();
}
- req->imu = NULL;
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->rw.flags = READ_ONCE(sqe->rw_flags);
- /* used for fixed read/write too - just read unconditionally */
- req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -3826,20 +3789,9 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter
static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
unsigned int issue_flags)
{
- struct io_mapped_ubuf *imu = req->imu;
- u16 index, buf_index = req->buf_index;
-
- if (likely(!imu)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(buf_index >= ctx->nr_user_bufs))
- return -EFAULT;
- io_req_set_rsrc_node(req, ctx, issue_flags);
- index = array_index_nospec(buf_index, ctx->nr_user_bufs);
- imu = READ_ONCE(ctx->user_bufs[index]);
- req->imu = imu;
- }
- return __io_import_fixed(req, rw, iter, imu);
+ if (WARN_ON_ONCE(!req->imu))
+ return -EFAULT;
+ return __io_import_fixed(req, rw, iter, req->imu);
}
static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -3876,19 +3828,17 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
{
struct io_uring_buf_ring *br = bl->buf_ring;
struct io_uring_buf *buf;
- __u32 head = bl->head;
+ __u16 head = bl->head;
- if (unlikely(smp_load_acquire(&br->tail) == head)) {
- io_ring_submit_unlock(req->ctx, issue_flags);
+ if (unlikely(smp_load_acquire(&br->tail) == head))
return NULL;
- }
head &= bl->mask;
if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
buf = &br->bufs[head];
} else {
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
- int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1;
+ int index = head / IO_BUFFER_LIST_BUF_PER_PAGE;
buf = page_address(bl->buf_pages[index]);
buf += off;
}
@@ -3898,7 +3848,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->buf_list = bl;
req->buf_index = buf->bid;
- if (issue_flags & IO_URING_F_UNLOCKED) {
+ if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
/*
* If we came in unlocked, we have no choice but to consume the
* buffer here. This does mean it'll be pinned until the IO
@@ -4376,18 +4326,19 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(ret < 0))
return ret;
} else {
+ rw = req->async_data;
+ s = &rw->s;
+
/*
* Safe and required to re-import if we're using provided
* buffers, as we dropped the selected one before retry.
*/
- if (req->flags & REQ_F_BUFFER_SELECT) {
+ if (io_do_buffer_select(req)) {
ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
if (unlikely(ret < 0))
return ret;
}
- rw = req->async_data;
- s = &rw->s;
/*
* We come here from an earlier attempt, restore our state to
* match in case it doesn't. It's cheap enough that we don't
@@ -5079,10 +5030,18 @@ void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
req->uring_cmd.task_work_cb = task_work_cb;
req->io_task_work.func = io_uring_cmd_work;
- io_req_task_prio_work_add(req);
+ io_req_task_work_add(req);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
+static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
+ u64 extra1, u64 extra2)
+{
+ req->extra1 = extra1;
+ req->extra2 = extra2;
+ req->flags |= REQ_F_CQE32_INIT;
+}
+
/*
* Called by consumers of io_uring_cmd, if they originally returned
* -EIOCBQUEUED upon receiving the command.
@@ -5093,10 +5052,10 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
if (ret < 0)
req_set_fail(req);
+
if (req->ctx->flags & IORING_SETUP_CQE32)
- __io_req_complete32(req, 0, ret, 0, res2, 0);
- else
- io_req_complete(req, ret);
+ io_req_set_cqe32_extra(req, res2, 0);
+ io_req_complete(req, ret);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
@@ -5115,7 +5074,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req,
{
struct io_uring_cmd *ioucmd = &req->uring_cmd;
- if (sqe->rw_flags)
+ if (sqe->rw_flags || sqe->__pad1)
return -EINVAL;
ioucmd->cmd = sqe->cmd;
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
@@ -5258,14 +5217,6 @@ done:
static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- /*
- * If the ring is setup with CQE32, relay back addr/addr
- */
- if (req->ctx->flags & IORING_SETUP_CQE32) {
- req->nop.extra1 = READ_ONCE(sqe->addr);
- req->nop.extra2 = READ_ONCE(sqe->addr2);
- }
-
return 0;
}
@@ -5274,23 +5225,7 @@ static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
*/
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
- unsigned int cflags;
- void __user *buf;
-
- if (req->flags & REQ_F_BUFFER_SELECT) {
- size_t len = 1;
-
- buf = io_buffer_select(req, &len, issue_flags);
- if (!buf)
- return -ENOBUFS;
- }
-
- cflags = io_put_kbuf(req, issue_flags);
- if (!(req->ctx->flags & IORING_SETUP_CQE32))
- __io_req_complete(req, issue_flags, 0, cflags);
- else
- __io_req_complete32(req, issue_flags, 0, cflags,
- req->nop.extra1, req->nop.extra2);
+ __io_req_complete(req, issue_flags, 0, 0);
return 0;
}
@@ -5988,18 +5923,14 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (sqe->off || sqe->addr || sqe->len || sqe->buf_index)
+ if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
req->close.fd = READ_ONCE(sqe->fd);
req->close.file_slot = READ_ONCE(sqe->file_index);
- req->close.flags = READ_ONCE(sqe->close_flags);
- if (req->close.flags & ~IORING_CLOSE_FD_AND_FILE_SLOT)
- return -EINVAL;
- if (!(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT) &&
- req->close.file_slot && req->close.fd)
+ if (req->close.file_slot && req->close.fd)
return -EINVAL;
return 0;
@@ -6015,8 +5946,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
if (req->close.file_slot) {
ret = io_close_fixed(req, issue_flags);
- if (ret || !(req->close.flags & IORING_CLOSE_FD_AND_FILE_SLOT))
- goto err;
+ goto err;
}
spin_lock(&files->file_lock);
@@ -6158,14 +6088,12 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(sqe->file_index))
- return -EINVAL;
- if (unlikely(sqe->addr2 || sqe->file_index))
+ if (unlikely(sqe->file_index || sqe->addr2))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
- sr->flags = READ_ONCE(sqe->addr2);
+ sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
@@ -6396,14 +6324,12 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(sqe->file_index))
- return -EINVAL;
- if (unlikely(sqe->addr2 || sqe->file_index))
+ if (unlikely(sqe->file_index || sqe->addr2))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
- sr->flags = READ_ONCE(sqe->addr2);
+ sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
@@ -7037,7 +6963,8 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
+static void __io_poll_execute(struct io_kiocb *req, int mask,
+ __poll_t __maybe_unused events)
{
req->cqe.res = mask;
/*
@@ -7046,7 +6973,6 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
* CPU. We want to avoid pulling in req->apoll->events for that
* case.
*/
- req->apoll_events = events;
if (req->opcode == IORING_OP_POLL_ADD)
req->io_task_work.func = io_poll_task_func;
else
@@ -7197,6 +7123,8 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
io_init_poll_iocb(poll, mask, io_poll_wake);
poll->file = req->file;
+ req->apoll_events = poll->events;
+
ipt->pt._key = mask;
ipt->req = req;
ipt->error = 0;
@@ -7227,8 +7155,11 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (mask) {
/* can't multishot if failed, just queue the event we've got */
- if (unlikely(ipt->error || !ipt->nr_entries))
+ if (unlikely(ipt->error || !ipt->nr_entries)) {
poll->events |= EPOLLONESHOT;
+ req->apoll_events |= EPOLLONESHOT;
+ ipt->error = 0;
+ }
__io_poll_execute(req, mask, poll->events);
return 0;
}
@@ -7290,6 +7221,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
mask |= EPOLLEXCLUSIVE;
if (req->flags & REQ_F_POLLED) {
apoll = req->apoll;
+ kfree(apoll->double_poll);
} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
!list_empty(&ctx->apoll_cache)) {
apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
@@ -7475,7 +7407,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
io_req_set_refcount(req);
- req->apoll_events = poll->events = io_poll_parse_events(sqe, flags);
+ poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -7488,6 +7420,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
ipt.pt._qproc = io_poll_queue_proc;
ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
+ if (!ret && ipt.error)
+ req_set_fail(req);
ret = ret ?: ipt.error;
if (ret)
__io_req_complete(req, issue_flags, ret, 0);
@@ -8047,6 +7981,9 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
struct file *file;
int ret, fd;
+ if (!req->ctx->file_data)
+ return -ENXIO;
+
for (done = 0; done < req->rsrc_update.nr_args; done++) {
if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
ret = -EFAULT;
@@ -8063,8 +8000,8 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
if (ret < 0)
break;
if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
- ret = -EFAULT;
__io_close_fixed(req, issue_flags, ret);
+ ret = -EFAULT;
break;
}
}
@@ -8773,6 +8710,7 @@ static void io_queue_async(struct io_kiocb *req, int ret)
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
+ io_kbuf_recycle(req, 0);
io_queue_iowq(req, NULL);
break;
case IO_APOLL_OK:
@@ -9788,11 +9726,19 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
+ unsigned nr = ctx->nr_user_files;
int ret;
if (!ctx->file_data)
return -ENXIO;
+
+ /*
+ * Quiesce may unlock ->uring_lock, and while it's not held
+ * prevent new requests using the table.
+ */
+ ctx->nr_user_files = 0;
ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
+ ctx->nr_user_files = nr;
if (!ret)
__io_sqe_files_unregister(ctx);
return ret;
@@ -10690,12 +10636,19 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
+ unsigned nr = ctx->nr_user_bufs;
int ret;
if (!ctx->buf_data)
return -ENXIO;
+ /*
+ * Quiesce may unlock ->uring_lock, and while it's not held
+ * prevent new requests using the table.
+ */
+ ctx->nr_user_bufs = 0;
ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
+ ctx->nr_user_bufs = nr;
if (!ret)
__io_sqe_buffers_unregister(ctx);
return ret;
@@ -12986,7 +12939,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_ring *br;
struct io_uring_buf_reg reg;
- struct io_buffer_list *bl;
+ struct io_buffer_list *bl, *free_bl = NULL;
struct page **pages;
int nr_pages;
@@ -13002,6 +12955,10 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
+ /* cannot disambiguate full vs empty due to head/tail size */
+ if (reg.ring_entries >= 65536)
+ return -EINVAL;
+
if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
int ret = io_init_bl_list(ctx);
if (ret)
@@ -13014,7 +12971,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
- bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return -ENOMEM;
}
@@ -13023,7 +12980,7 @@ static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
struct_size(br, bufs, reg.ring_entries),
&nr_pages);
if (IS_ERR(pages)) {
- kfree(bl);
+ kfree(free_bl);
return PTR_ERR(pages);
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e49bb0938376..e9c308ae475f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2114,7 +2114,7 @@ out:
/**
* jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
- * @page: to try and free
+ * @folio: Folio to detach data from.
*
* For all the buffers on this page,
* if they are fully written out ordered data, move them onto BUF_CLEAN
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index e6f4ccc12f49..353f047e783c 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -6490,6 +6490,7 @@ int smb2_write(struct ksmbd_work *work)
goto out;
}
+ ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
writethrough = true;
@@ -6505,10 +6506,6 @@ int smb2_write(struct ksmbd_work *work)
data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
le16_to_cpu(req->DataOffset));
- ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
- if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
- writethrough = true;
-
ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n",
fp->filp->f_path.dentry, offset, length);
err = ksmbd_vfs_write(work, fp, data_buf, length, &offset,
@@ -7703,7 +7700,7 @@ int smb2_ioctl(struct ksmbd_work *work)
{
struct file_zero_data_information *zero_data;
struct ksmbd_file *fp;
- loff_t off, len;
+ loff_t off, len, bfz;
if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
ksmbd_debug(SMB,
@@ -7720,19 +7717,26 @@ int smb2_ioctl(struct ksmbd_work *work)
zero_data =
(struct file_zero_data_information *)&req->Buffer[0];
- fp = ksmbd_lookup_fd_fast(work, id);
- if (!fp) {
- ret = -ENOENT;
+ off = le64_to_cpu(zero_data->FileOffset);
+ bfz = le64_to_cpu(zero_data->BeyondFinalZero);
+ if (off > bfz) {
+ ret = -EINVAL;
goto out;
}
- off = le64_to_cpu(zero_data->FileOffset);
- len = le64_to_cpu(zero_data->BeyondFinalZero) - off;
+ len = bfz - off;
+ if (len) {
+ fp = ksmbd_lookup_fd_fast(work, id);
+ if (!fp) {
+ ret = -ENOENT;
+ goto out;
+ }
- ret = ksmbd_vfs_zero_data(work, fp, off, len);
- ksmbd_fd_put(work, fp);
- if (ret < 0)
- goto out;
+ ret = ksmbd_vfs_zero_data(work, fp, off, len);
+ ksmbd_fd_put(work, fp);
+ if (ret < 0)
+ goto out;
+ }
break;
}
case FSCTL_QUERY_ALLOCATED_RANGES:
@@ -7806,14 +7810,24 @@ int smb2_ioctl(struct ksmbd_work *work)
src_off = le64_to_cpu(dup_ext->SourceFileOffset);
dst_off = le64_to_cpu(dup_ext->TargetFileOffset);
length = le64_to_cpu(dup_ext->ByteCount);
- cloned = vfs_clone_file_range(fp_in->filp, src_off, fp_out->filp,
- dst_off, length, 0);
+ /*
+ * XXX: It is not clear if FSCTL_DUPLICATE_EXTENTS_TO_FILE
+ * should fall back to vfs_copy_file_range(). This could be
+ * beneficial when re-exporting nfs/smb mount, but note that
+ * this can result in partial copy that returns an error status.
+ * If/when FSCTL_DUPLICATE_EXTENTS_TO_FILE_EX is implemented,
+ * fall back to vfs_copy_file_range(), should be avoided when
+ * the flag DUPLICATE_EXTENTS_DATA_EX_SOURCE_ATOMIC is set.
+ */
+ cloned = vfs_clone_file_range(fp_in->filp, src_off,
+ fp_out->filp, dst_off, length, 0);
if (cloned == -EXDEV || cloned == -EOPNOTSUPP) {
ret = -EOPNOTSUPP;
goto dup_ext_out;
} else if (cloned != length) {
cloned = vfs_copy_file_range(fp_in->filp, src_off,
- fp_out->filp, dst_off, length, 0);
+ fp_out->filp, dst_off,
+ length, 0);
if (cloned != length) {
if (cloned < 0)
ret = cloned;
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index d035e060c2f0..35b55ee94fe5 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -5,16 +5,6 @@
*
* Author(s): Long Li <longli@microsoft.com>,
* Hyunchul Lee <hyc.lee@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#define SUBMOD_NAME "smb_direct"
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 8fef9de787d3..143bba4e4db8 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -230,7 +230,7 @@ static int ksmbd_kthread_fn(void *p)
break;
}
ret = kernel_accept(iface->ksmbd_socket, &client_sk,
- O_NONBLOCK);
+ SOCK_NONBLOCK);
mutex_unlock(&iface->sock_release_lock);
if (ret) {
if (ret == -EAGAIN)
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index dcdd07c6efff..05efcdf7a4a7 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -1015,7 +1015,9 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
off, len);
- return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len);
+ return vfs_fallocate(fp->filp,
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
+ off, len);
}
int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
@@ -1046,7 +1048,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
*out_count = 0;
end = start + length;
while (start < end && *out_count < in_count) {
- extent_start = f->f_op->llseek(f, start, SEEK_DATA);
+ extent_start = vfs_llseek(f, start, SEEK_DATA);
if (extent_start < 0) {
if (extent_start != -ENXIO)
ret = (int)extent_start;
@@ -1056,7 +1058,7 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
if (extent_start >= end)
break;
- extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE);
+ extent_end = vfs_llseek(f, extent_start, SEEK_HOLE);
if (extent_end < 0) {
if (extent_end != -ENXIO)
ret = (int)extent_end;
@@ -1777,6 +1779,10 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
ret = vfs_copy_file_range(src_fp->filp, src_off,
dst_fp->filp, dst_off, len, 0);
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = generic_copy_file_range(src_fp->filp, src_off,
+ dst_fp->filp, dst_off,
+ len, 0);
if (ret < 0)
return ret;
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 0a22a2faf552..e1c4617de771 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file)
}
}
-static int nlm_unlock_files(struct nlm_file *file)
+static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner)
{
struct file_lock lock;
@@ -184,6 +184,7 @@ static int nlm_unlock_files(struct nlm_file *file)
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
+ lock.fl_owner = owner;
if (file->f_file[O_RDONLY] &&
vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL))
goto out_err;
@@ -225,7 +226,7 @@ again:
if (match(lockhost, host)) {
spin_unlock(&flctx->flc_lock);
- if (nlm_unlock_files(file))
+ if (nlm_unlock_files(file, fl->fl_owner))
return 1;
goto again;
}
@@ -282,11 +283,10 @@ nlm_file_inuse(struct nlm_file *file)
static void nlm_close_files(struct nlm_file *file)
{
- struct file *f;
-
- for (f = file->f_file[0]; f <= file->f_file[1]; f++)
- if (f)
- nlmsvc_ops->fclose(f);
+ if (file->f_file[O_RDONLY])
+ nlmsvc_ops->fclose(file->f_file[O_RDONLY]);
+ if (file->f_file[O_WRONLY])
+ nlmsvc_ops->fclose(file->f_file[O_WRONLY]);
}
/*
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 8742d22dfd2b..0ce535852151 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -155,7 +155,7 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
void netfs_readahead(struct readahead_control *ractl)
{
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(ractl->mapping->host);
+ struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
int ret;
_enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));
@@ -215,7 +215,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
{
struct address_space *mapping = folio_file_mapping(folio);
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(mapping->host);
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
int ret;
_enter("%lx", folio_index(folio));
@@ -297,6 +297,7 @@ zero_out:
/**
* netfs_write_begin - Helper to prepare for writing
+ * @ctx: The netfs context
* @file: The file to read from
* @mapping: The mapping to read from
* @pos: File position at which the write will begin
@@ -318,20 +319,21 @@ zero_out:
* conflicting writes once the folio is grabbed and locked. It is passed a
* pointer to the fsdata cookie that gets returned to the VM to be passed to
* write_end. It is permitted to sleep. It should return 0 if the request
- * should go ahead; unlock the folio and return -EAGAIN to cause the folio to
- * be regot; or return an error.
+ * should go ahead or it may return an error. It may also unlock and put the
+ * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
+ * will cause the folio to be re-got and the process to be retried.
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
*
* This is usable whether or not caching is enabled.
*/
-int netfs_write_begin(struct file *file, struct address_space *mapping,
+int netfs_write_begin(struct netfs_inode *ctx,
+ struct file *file, struct address_space *mapping,
loff_t pos, unsigned int len, struct folio **_folio,
void **_fsdata)
{
struct netfs_io_request *rreq;
- struct netfs_i_context *ctx = netfs_i_context(file_inode(file ));
struct folio *folio;
unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
pgoff_t index = pos >> PAGE_SHIFT;
@@ -347,13 +349,13 @@ retry:
if (ctx->ops->check_write_begin) {
/* Allow the netfs (eg. ceph) to flush conflicts. */
- ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata);
+ ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
if (ret < 0) {
trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
- if (ret == -EAGAIN)
- goto retry;
goto error;
}
+ if (!folio)
+ goto retry;
}
if (folio_test_uptodate(folio))
@@ -415,8 +417,10 @@ have_folio_no_wait:
error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
- folio_unlock(folio);
- folio_put(folio);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index b7b0e3d18d9e..43fac1b14e40 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -91,7 +91,7 @@ static inline void netfs_stat_d(atomic_t *stat)
/*
* Miscellaneous functions.
*/
-static inline bool netfs_is_cache_enabled(struct netfs_i_context *ctx)
+static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
{
#if IS_ENABLED(CONFIG_FSCACHE)
struct fscache_cookie *cookie = ctx->cache;
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e86107b30ba4..e17cdf53f6a7 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -18,7 +18,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
{
static atomic_t debug_ids;
struct inode *inode = file ? file_inode(file) : mapping->host;
- struct netfs_i_context *ctx = netfs_i_context(inode);
+ struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
int ret;
@@ -75,10 +75,10 @@ static void netfs_free_request(struct work_struct *work)
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
- netfs_clear_subrequests(rreq, false);
- if (rreq->netfs_priv)
- rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ netfs_clear_subrequests(rreq, false);
+ if (rreq->netfs_ops->free_request)
+ rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
kfree(rreq);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c8520284dda7..c1eda73254e1 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -288,6 +288,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
rv = NFS4_OK;
break;
case -ENOENT:
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
/* Embrace your forgetfulness! */
rv = NFS4ERR_NOMATCHING_LAYOUT;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a8ecdd527662..0c4e8dd6aa96 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2124,6 +2124,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
}
goto out;
}
+ file->f_mode |= FMODE_CAN_ODIRECT;
err = nfs_finish_open(ctx, ctx->dentry, file, open_flags);
trace_nfs_atomic_open_exit(dir, ctx, open_flags, err);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 03d3a270eff4..e88f6b18445e 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -93,6 +93,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_file_set_open_context(filp, ctx);
nfs_fscache_open_file(inode, filp);
err = 0;
+ filp->f_mode |= FMODE_CAN_ODIRECT;
out_put_ctx:
put_nfs_open_context(ctx);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c0fdcf8c0032..bb0e84a46d61 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4012,22 +4012,29 @@ static int _nfs4_discover_trunking(struct nfs_server *server,
}
page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
- if (page == NULL || locations == NULL)
- goto out;
+ if (!locations)
+ goto out_free;
+ locations->fattr = nfs_alloc_fattr();
+ if (!locations->fattr)
+ goto out_free_2;
status = nfs4_proc_get_locations(server, fhandle, locations, page,
cred);
if (status)
- goto out;
+ goto out_free_3;
for (i = 0; i < locations->nlocations; i++)
test_fs_location_for_trunking(&locations->locations[i], clp,
server);
-out:
- if (page)
- __free_page(page);
+out_free_3:
+ kfree(locations->fattr);
+out_free_2:
kfree(locations);
+out_free:
+ __free_page(page);
return status;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2540b35ec187..9bab3e9c702a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2753,5 +2753,6 @@ again:
goto again;
nfs_put_client(clp);
+ module_put_and_kthread_exit(0);
return 0;
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 68a87be3e6f9..41a9b6b58fb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -469,6 +469,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
pnfs_clear_lseg_state(lseg, lseg_list);
pnfs_clear_layoutreturn_info(lo);
pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
!test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
pnfs_clear_layoutreturn_waitbit(lo);
@@ -1917,8 +1918,9 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
{
- if (atomic_dec_and_test(&lo->plh_outstanding))
- wake_up_var(&lo->plh_outstanding);
+ if (atomic_dec_and_test(&lo->plh_outstanding) &&
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
}
static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
@@ -2025,11 +2027,11 @@ lookup_again:
* If the layout segment list is empty, but there are outstanding
* layoutget calls, then they might be subject to a layoutrecall.
*/
- if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) &&
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
- lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
- !atomic_read(&lo->plh_outstanding)));
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN,
+ TASK_KILLABLE));
if (IS_ERR(lseg))
goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
@@ -2152,6 +2154,12 @@ lookup_again:
case -ERECALLCONFLICT:
case -EAGAIN:
break;
+ case -ENODATA:
+ /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
+ pnfs_layout_set_fail_bit(
+ lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ goto out_put_layout_hdr;
default:
if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
@@ -2407,7 +2415,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget;
}
- if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo))
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
+ !pnfs_is_first_layoutget(lo))
goto out_forget;
if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 07f11489e4e9..f331f067691b 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -105,6 +105,7 @@ enum {
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
NFS_LAYOUT_HASHED, /* The layout visible */
+ NFS_LAYOUT_DRAIN,
};
enum layoutdriver_policy_flags {
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index f172412447f5..9cb2d590c036 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -309,11 +309,12 @@ nfsd_file_put(struct nfsd_file *nf)
if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) {
nfsd_file_flush(nf);
nfsd_file_put_noref(nf);
- } else {
+ } else if (nf->nf_file) {
nfsd_file_put_noref(nf);
- if (nf->nf_file)
- nfsd_file_schedule_laundrette();
- }
+ nfsd_file_schedule_laundrette();
+ } else
+ nfsd_file_put_noref(nf);
+
if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
nfsd_file_gc();
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 61b2aae81abb..2acea7792bb2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -470,6 +470,15 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen,
return nfserr_bad_xdr;
}
}
+ if (bmval[1] & FATTR4_WORD1_TIME_CREATE) {
+ struct timespec64 ts;
+
+ /* No Linux filesystem supports setting this attribute. */
+ bmval[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ status = nfsd4_decode_nfstime4(argp, &ts);
+ if (status)
+ return status;
+ }
if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
u32 set_it;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 847b482155ae..9a8b09afc173 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -465,7 +465,8 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
#define NFSD_WRITEABLE_ATTRS_WORD1 \
(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
- | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_CREATE \
+ | FATTR4_WORD1_TIME_MODIFY_SET)
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \
FATTR4_WORD2_SECURITY_LABEL
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 840e3af63a6f..d79db56475d4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -577,6 +577,7 @@ out_err:
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
u64 dst_pos, u64 count)
{
+ ssize_t ret;
/*
* Limit copy to 4MB to prevent indefinitely blocking an nfsd
@@ -587,7 +588,12 @@ ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
* limit like this and pipeline multiple COPY requests.
*/
count = min_t(u64, count, 1 << 22);
- return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+ ret = vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+
+ if (ret == -EOPNOTSUPP || ret == -EXDEV)
+ ret = generic_copy_file_range(src, src_pos, dst, dst_pos,
+ count, 0);
+ return ret;
}
__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -1173,6 +1179,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
nfsd_copy_write_verifier(verf, nn);
err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
since);
+ err = nfserrno(err2);
break;
case -EINVAL:
err = nfserr_notsupp;
@@ -1180,8 +1187,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
default:
nfsd_reset_write_verifier(nn);
trace_nfsd_writeverf_reset(nn, rqstp, err2);
+ err = nfserrno(err2);
}
- err = nfserrno(err2);
} else
nfsd_copy_write_verifier(verf, nn);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 1344f7d475d3..aecda4fc95f5 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -198,6 +198,9 @@ static inline int nilfs_acl_chmod(struct inode *inode)
static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
{
+ if (S_ISLNK(inode->i_mode))
+ return 0;
+
inode->i_mode &= ~current_umask();
return 0;
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c2255b440df9..b08ce0d821a7 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1513,8 +1513,15 @@ static int fanotify_test_fid(struct dentry *dentry)
return 0;
}
-static int fanotify_events_supported(struct path *path, __u64 mask)
+static int fanotify_events_supported(struct fsnotify_group *group,
+ struct path *path, __u64 mask,
+ unsigned int flags)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+ /* Strict validation of events in non-dir inode mask with v5.17+ APIs */
+ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
+ (mask & FAN_RENAME);
+
/*
* Some filesystems such as 'proc' acquire unusual locks when opening
* files. For them fanotify permission events have high chances of
@@ -1526,6 +1533,16 @@ static int fanotify_events_supported(struct path *path, __u64 mask)
if (mask & FANOTIFY_PERM_EVENTS &&
path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
return -EINVAL;
+
+ /*
+ * We shouldn't have allowed setting dirent events and the directory
+ * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
+ * but because we always allowed it, error only when using new APIs.
+ */
+ if (strict_dir_events && mark_type == FAN_MARK_INODE &&
+ !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
+ return -ENOTDIR;
+
return 0;
}
@@ -1672,7 +1689,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
if (flags & FAN_MARK_ADD) {
- ret = fanotify_events_supported(&path, mask);
+ ret = fanotify_events_supported(group, &path, mask, flags);
if (ret)
goto path_put_and_out;
}
@@ -1695,19 +1712,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
else
mnt = path.mnt;
- /*
- * FAN_RENAME is not allowed on non-dir (for now).
- * We shouldn't have allowed setting any dirent events in mask of
- * non-dir, but because we always allowed it, error only if group
- * was initialized with the new flag FAN_REPORT_TARGET_FID.
- */
- ret = -ENOTDIR;
- if (inode && !S_ISDIR(inode->i_mode) &&
- ((mask & FAN_RENAME) ||
- ((mask & FANOTIFY_DIRENT_EVENTS) &&
- FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID))))
- goto path_put_and_out;
-
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 4de597a83b88..52615e6090e1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -592,8 +592,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
a = (ATTR_RECORD*)((u8*)ctx->attr +
le32_to_cpu(ctx->attr->length));
for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_allocated))
+ u8 *mrec_end = (u8 *)ctx->mrec +
+ le32_to_cpu(ctx->mrec->bytes_allocated);
+ u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+ a->name_length * sizeof(ntfschar);
+ if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end ||
+ name_end > mrec_end)
break;
ctx->attr = a;
if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 337527571461..740b64238312 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -277,7 +277,6 @@ enum ocfs2_mount_options
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
- OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
{
- return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)
- || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER));
+ return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
}
static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 0b0ae3ebb0cf..da7718cef735 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
int i, ret = -ENOSPC;
if ((preferred >= 0) && (preferred < si->si_num_slots)) {
- if (!si->si_slots[preferred].sl_valid ||
- !si->si_slots[preferred].sl_node_num) {
+ if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
- if (!si->si_slots[i].sl_valid ||
- !si->si_slots[i].sl_node_num) {
+ if (!si->si_slots[i].sl_valid) {
ret = i;
break;
}
@@ -456,30 +454,24 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- if (ocfs2_mount_local(osb))
- /* use slot 0 directly in local mode */
- slot = 0;
- else {
- /* search for ourselves first and take the slot if it already
- * exists. Perhaps we need to mark this in a variable for our
- * own journal recovery? Possibly not, though we certainly
- * need to warn to the user */
- slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ /* search for ourselves first and take the slot if it already
+ * exists. Perhaps we need to mark this in a variable for our
+ * own journal recovery? Possibly not, though we certainly
+ * need to warn to the user */
+ slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ if (slot < 0) {
+ /* if no slot yet, then just take 1st available
+ * one. */
+ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot < 0) {
- /* if no slot yet, then just take 1st available
- * one. */
- slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
- if (slot < 0) {
- spin_unlock(&osb->osb_lock);
- mlog(ML_ERROR, "no free slots available!\n");
- status = -EINVAL;
- goto bail;
- }
- } else
- printk(KERN_INFO "ocfs2: Slot %d on device (%s) was "
- "already allocated to this node!\n",
- slot, osb->dev_str);
- }
+ spin_unlock(&osb->osb_lock);
+ mlog(ML_ERROR, "no free slots available!\n");
+ status = -EINVAL;
+ goto bail;
+ }
+ } else
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f7298816d8d9..438be028935d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -172,7 +172,6 @@ enum {
Opt_dir_resv_level,
Opt_journal_async_commit,
Opt_err_cont,
- Opt_nocluster,
Opt_err,
};
@@ -206,7 +205,6 @@ static const match_table_t tokens = {
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_err_cont, "errors=continue"},
- {Opt_nocluster, "nocluster"},
{Opt_err, NULL}
};
@@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
- tmp = OCFS2_MOUNT_NOCLUSTER;
- if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
- ret = -EINVAL;
- mlog(ML_ERROR, "Cannot change nocluster option on remount\n");
- goto out;
- }
-
tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
OCFS2_MOUNT_HB_NONE;
if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
@@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
}
if (ocfs2_userspace_stack(osb) &&
- !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
OCFS2_STACK_LABEL_LEN)) {
mlog(ML_ERROR,
@@ -1137,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
"ordered");
- if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
- !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT))
- printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted "
- "without cluster aware mode.\n", osb->dev_str);
-
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
@@ -1452,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_journal_async_commit:
mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
break;
- case Opt_nocluster:
- mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER;
- break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1566,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
seq_printf(s, ",journal_async_commit");
- if (opts & OCFS2_MOUNT_NOCLUSTER)
- seq_printf(s, ",nocluster");
-
return 0;
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e0a2e0468ee7..1ce5c9698393 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1003,6 +1003,9 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
{
+ if (!IS_POSIXACL(inode))
+ return -EOPNOTSUPP;
+
return ovl_xattr_get(dentry, inode, handler->name, buffer, size);
}
@@ -1018,6 +1021,9 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
struct posix_acl *acl = NULL;
int err;
+ if (!IS_POSIXACL(inode))
+ return -EOPNOTSUPP;
+
/* Check that everything is OK before copy-up */
if (value) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
@@ -1960,6 +1966,20 @@ static struct dentry *ovl_get_root(struct super_block *sb,
return root;
}
+static bool ovl_has_idmapped_layers(struct ovl_fs *ofs)
+{
+
+ unsigned int i;
+ const struct vfsmount *mnt;
+
+ for (i = 0; i < ofs->numlayer; i++) {
+ mnt = ofs->layers[i].mnt;
+ if (mnt && is_idmapped_mnt(mnt))
+ return true;
+ }
+ return false;
+}
+
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { };
@@ -2129,7 +2149,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
ovl_trusted_xattr_handlers;
sb->s_fs_info = ofs;
- sb->s_flags |= SB_POSIXACL;
+ if (ovl_has_idmapped_layers(ofs))
+ pr_warn("POSIX ACLs are not yet supported with idmapped layers, mounting without ACL support.\n");
+ else
+ sb->s_flags |= SB_POSIXACL;
sb->s_iflags |= SB_I_SKIP_SYNC;
err = -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a74aef99bd3d..09d1307959d0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -79,6 +79,7 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
#include "../internal.h" /* ugh */
#include <linux/uaccess.h>
@@ -425,9 +426,11 @@ EXPORT_SYMBOL(mark_info_dirty);
int dquot_acquire(struct dquot *dquot)
{
int ret = 0, ret2 = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
if (ret < 0)
@@ -458,6 +461,7 @@ int dquot_acquire(struct dquot *dquot)
smp_mb__before_atomic();
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -469,9 +473,11 @@ EXPORT_SYMBOL(dquot_acquire);
int dquot_commit(struct dquot *dquot)
{
int ret = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
if (!clear_dquot_dirty(dquot))
goto out_lock;
/* Inactive dquot can be only if there was error during read/init
@@ -481,6 +487,7 @@ int dquot_commit(struct dquot *dquot)
else
ret = -EIO;
out_lock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -492,9 +499,11 @@ EXPORT_SYMBOL(dquot_commit);
int dquot_release(struct dquot *dquot)
{
int ret = 0, ret2 = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
/* Check whether we are not racing with some other dqget() */
if (dquot_is_busy(dquot))
goto out_dqlock;
@@ -510,6 +519,7 @@ int dquot_release(struct dquot *dquot)
}
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_dqlock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index b1b1cdfee9d3..397da0236607 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1263,6 +1263,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
count, fl);
file_end_write(out.file);
} else {
+ if (out.file->f_flags & O_NONBLOCK)
+ fl |= SPLICE_F_NONBLOCK;
+
retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
}
@@ -1397,28 +1400,6 @@ ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
}
EXPORT_SYMBOL(generic_copy_file_range);
-static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
-{
- /*
- * Although we now allow filesystems to handle cross sb copy, passing
- * a file of the wrong filesystem type to filesystem driver can result
- * in an attempt to dereference the wrong type of ->private_data, so
- * avoid doing that until we really have a good reason. NFS defines
- * several different file_system_type structures, but they all end up
- * using the same ->copy_file_range() function pointer.
- */
- if (file_out->f_op->copy_file_range &&
- file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
- return file_out->f_op->copy_file_range(file_in, pos_in,
- file_out, pos_out,
- len, flags);
-
- return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
-}
-
/*
* Performs necessary checks before doing a file copy
*
@@ -1440,6 +1421,24 @@ static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
if (ret)
return ret;
+ /*
+ * We allow some filesystems to handle cross sb copy, but passing
+ * a file of the wrong filesystem type to filesystem driver can result
+ * in an attempt to dereference the wrong type of ->private_data, so
+ * avoid doing that until we really have a good reason.
+ *
+ * nfs and cifs define several different file_system_type structures
+ * and several different sets of file_operations, but they all end up
+ * using the same ->copy_file_range() function pointer.
+ */
+ if (file_out->f_op->copy_file_range) {
+ if (file_in->f_op->copy_file_range !=
+ file_out->f_op->copy_file_range)
+ return -EXDEV;
+ } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
+ return -EXDEV;
+ }
+
/* Don't touch certain kinds of inodes */
if (IS_IMMUTABLE(inode_out))
return -EPERM;
@@ -1505,26 +1504,41 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
file_start_write(file_out);
/*
- * Try cloning first, this is supported by more file systems, and
- * more efficient if both clone and copy are supported (e.g. NFS).
+ * Cloning is supported by more file systems, so we implement copy on
+ * same sb using clone, but for filesystems where both clone and copy
+ * are supported (e.g. nfs,cifs), we only call the copy method.
*/
+ if (file_out->f_op->copy_file_range) {
+ ret = file_out->f_op->copy_file_range(file_in, pos_in,
+ file_out, pos_out,
+ len, flags);
+ goto done;
+ }
+
if (file_in->f_op->remap_file_range &&
file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
- loff_t cloned;
-
- cloned = file_in->f_op->remap_file_range(file_in, pos_in,
+ ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
min_t(loff_t, MAX_RW_COUNT, len),
REMAP_FILE_CAN_SHORTEN);
- if (cloned > 0) {
- ret = cloned;
+ if (ret > 0)
goto done;
- }
}
- ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
- WARN_ON_ONCE(ret == -EOPNOTSUPP);
+ /*
+ * We can get here for same sb copy of filesystems that do not implement
+ * ->copy_file_range() in case filesystem does not support clone or in
+ * case filesystem supports clone but rejected the clone request (e.g.
+ * because it was not block aligned).
+ *
+ * In both cases, fall back to kernel copy so we are able to maintain a
+ * consistent story about which filesystems support copy_file_range()
+ * and which filesystems do not, that will allow userspace tools to
+ * make consistent desicions w.r.t using copy_file_range().
+ */
+ ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
+ flags);
+
done:
if (ret > 0) {
fsnotify_access(file_in);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e112b5424cdb..881a306ee247 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -71,7 +71,8 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
* Otherwise, make sure the count is also block-aligned, having
* already confirmed the starting offsets' block alignment.
*/
- if (pos_in + count == size_in) {
+ if (pos_in + count == size_in &&
+ (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
bcount = ALIGN(size_in, bs) - pos_in;
} else {
if (!IS_ALIGNED(count, bs))
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index de7252715b12..81d26abf486f 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -553,7 +553,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
*
* Only one instances directory is allowed.
*
- * The instances directory is special as it allows for mkdir and rmdir to
+ * The instances directory is special as it allows for mkdir and rmdir
* to be done by userspace. When a mkdir or rmdir is performed, the inode
* locks are released and the methods passed in (@mkdir and @rmdir) are
* called without locks and with the name of the directory being created
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e943370107d0..de86f5b2859f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -192,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg)
}
static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned long real_address,
unsigned int flags,
unsigned long reason,
unsigned int features)
{
struct uffd_msg msg;
+
msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT;
- if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
- address &= PAGE_MASK;
- msg.arg.pagefault.address = address;
+ msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
+ real_address : address;
+
/*
* These flags indicate why the userfault occurred:
* - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
@@ -488,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
- ctx->features);
+ uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
+ reason, ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 836ab1b8ed7b..224649a76cbb 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -50,7 +50,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp);
-STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
+STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args);
/*
* Internal routines when attribute list is more than one block.
@@ -393,16 +393,10 @@ xfs_attr_sf_addname(
* It won't fit in the shortform, transform to a leaf block. GROT:
* another possible req'mt for a double-split btree op.
*/
- error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp);
+ error = xfs_attr_shortform_to_leaf(args);
if (error)
return error;
- /*
- * Prevent the leaf buffer from being unlocked so that a concurrent AIL
- * push cannot grab the half-baked leaf buffer and run into problems
- * with the write verifier.
- */
- xfs_trans_bhold(args->trans, attr->xattri_leaf_bp);
attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
out:
trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
@@ -447,11 +441,9 @@ xfs_attr_leaf_addname(
/*
* Use the leaf buffer we may already hold locked as a result of
- * a sf-to-leaf conversion. The held buffer is no longer valid
- * after this call, regardless of the result.
+ * a sf-to-leaf conversion.
*/
- error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp);
- attr->xattri_leaf_bp = NULL;
+ error = xfs_attr_leaf_try_add(args);
if (error == -ENOSPC) {
error = xfs_attr3_leaf_to_node(args);
@@ -497,8 +489,6 @@ xfs_attr_node_addname(
struct xfs_da_args *args = attr->xattri_da_args;
int error;
- ASSERT(!attr->xattri_leaf_bp);
-
error = xfs_attr_node_addname_find_attr(attr);
if (error)
return error;
@@ -997,9 +987,11 @@ xfs_attr_set(
/*
* We have no control over the attribute names that userspace passes us
* to remove, so we have to allow the name lookup prior to attribute
- * removal to fail as well.
+ * removal to fail as well. Preserve the logged flag, since we need
+ * to pass that through to the logging code.
*/
- args->op_flags = XFS_DA_OP_OKNOENT;
+ args->op_flags = XFS_DA_OP_OKNOENT |
+ (args->op_flags & XFS_DA_OP_LOGGED);
if (args->value) {
XFS_STATS_INC(mp, xs_attr_set);
@@ -1213,24 +1205,14 @@ xfs_attr_restore_rmt_blk(
*/
STATIC int
xfs_attr_leaf_try_add(
- struct xfs_da_args *args,
- struct xfs_buf *bp)
+ struct xfs_da_args *args)
{
+ struct xfs_buf *bp;
int error;
- /*
- * If the caller provided a buffer to us, it is locked and held in
- * the transaction because it just did a shortform to leaf conversion.
- * Hence we don't need to read it again. Otherwise read in the leaf
- * buffer.
- */
- if (bp) {
- xfs_trans_bhold_release(args->trans, bp);
- } else {
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
- if (error)
- return error;
- }
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
/*
* Look up the xattr name to set the insertion point for the new xattr.
@@ -1439,12 +1421,11 @@ static int
xfs_attr_node_try_addname(
struct xfs_attr_intent *attr)
{
- struct xfs_da_args *args = attr->xattri_da_args;
struct xfs_da_state *state = attr->xattri_da_state;
struct xfs_da_state_blk *blk;
int error;
- trace_xfs_attr_node_addname(args);
+ trace_xfs_attr_node_addname(state->args);
blk = &state->path.blk[state->path.active-1];
ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index e329da3e7afa..dfb47fa63c6d 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -28,16 +28,6 @@ struct xfs_attr_list_context;
*/
#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
-static inline bool xfs_has_larp(struct xfs_mount *mp)
-{
-#ifdef DEBUG
- /* Logged xattrs require a V5 super for log_incompat */
- return xfs_has_crc(mp) && xfs_globals.larp;
-#else
- return false;
-#endif
-}
-
/*
* Kernel-internal version of the attrlist cursor.
*/
@@ -525,11 +515,6 @@ struct xfs_attr_intent {
*/
struct xfs_attri_log_nameval *xattri_nameval;
- /*
- * Used by xfs_attr_set to hold a leaf buffer across a transaction roll
- */
- struct xfs_buf *xattri_leaf_bp;
-
/* Used to keep track of current state of delayed operation */
enum xfs_delattr_state xattri_dela_state;
@@ -624,7 +609,7 @@ static inline enum xfs_delattr_state
xfs_attr_init_replace_state(struct xfs_da_args *args)
{
args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
- if (xfs_has_larp(args->dp->i_mount))
+ if (args->op_flags & XFS_DA_OP_LOGGED)
return xfs_attr_init_remove_state(args);
return xfs_attr_init_add_state(args);
}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 15a990409463..8f47396f8dd2 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -289,6 +289,23 @@ xfs_attr3_leaf_verify_entry(
return NULL;
}
+/*
+ * Validate an attribute leaf block.
+ *
+ * Empty leaf blocks can occur under the following circumstances:
+ *
+ * 1. setxattr adds a new extended attribute to a file;
+ * 2. The file has zero existing attributes;
+ * 3. The attribute is too large to fit in the attribute fork;
+ * 4. The attribute is small enough to fit in a leaf block;
+ * 5. A log flush occurs after committing the transaction that creates
+ * the (empty) leaf block; and
+ * 6. The filesystem goes down after the log flush but before the new
+ * attribute can be committed to the leaf block.
+ *
+ * Hence we need to ensure that we don't fail the validation purely
+ * because the leaf is empty.
+ */
static xfs_failaddr_t
xfs_attr3_leaf_verify(
struct xfs_buf *bp)
@@ -311,15 +328,6 @@ xfs_attr3_leaf_verify(
return fa;
/*
- * Empty leaf blocks should never occur; they imply the existence of a
- * software bug that needs fixing. xfs_repair also flags them as a
- * corruption that needs fixing, so we should never let these go to
- * disk.
- */
- if (ichdr.count == 0)
- return __this_address;
-
- /*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
*/
@@ -922,14 +930,10 @@ xfs_attr_shortform_getvalue(
return -ENOATTR;
}
-/*
- * Convert from using the shortform to the leaf. On success, return the
- * buffer so that we can keep it locked until we're totally done with it.
- */
+/* Convert from using the shortform to the leaf format. */
int
xfs_attr_shortform_to_leaf(
- struct xfs_da_args *args,
- struct xfs_buf **leaf_bp)
+ struct xfs_da_args *args)
{
struct xfs_inode *dp;
struct xfs_attr_shortform *sf;
@@ -991,7 +995,6 @@ xfs_attr_shortform_to_leaf(
sfe = xfs_attr_sf_nextentry(sfe);
}
error = 0;
- *leaf_bp = bp;
out:
kmem_free(tmpbuffer);
return error;
@@ -1530,7 +1533,7 @@ xfs_attr3_leaf_add_work(
if (tmp)
entry->flags |= XFS_ATTR_LOCAL;
if (args->op_flags & XFS_DA_OP_REPLACE) {
- if (!xfs_has_larp(mp))
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
(args->index2 <= args->index)) {
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index efa757f1e912..368f4d9fa1d5 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -49,8 +49,7 @@ void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
-int xfs_attr_shortform_to_leaf(struct xfs_da_args *args,
- struct xfs_buf **leaf_bp);
+int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_sf_removename(struct xfs_da_args *args);
int xfs_attr_sf_findname(struct xfs_da_args *args,
struct xfs_attr_sf_entry **sfep,
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index d33b7686a0b3..ffa3df5b2893 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -92,6 +92,7 @@ typedef struct xfs_da_args {
#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
+#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
@@ -101,7 +102,8 @@ typedef struct xfs_da_args {
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
{ XFS_DA_OP_NOTIME, "NOTIME" }, \
{ XFS_DA_OP_REMOVE, "REMOVE" }, \
- { XFS_DA_OP_RECOVERY, "RECOVERY" }
+ { XFS_DA_OP_RECOVERY, "RECOVERY" }, \
+ { XFS_DA_OP_LOGGED, "LOGGED" }
/*
* Storage for holding state during Btree searches and split/join ops.
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 4a28c2d77070..5077a7ad5646 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -413,18 +413,20 @@ xfs_attr_create_intent(
struct xfs_mount *mp = tp->t_mountp;
struct xfs_attri_log_item *attrip;
struct xfs_attr_intent *attr;
+ struct xfs_da_args *args;
ASSERT(count == 1);
- if (!xfs_sb_version_haslogxattrs(&mp->m_sb))
- return NULL;
-
/*
* Each attr item only performs one attribute operation at a time, so
* this is a list of one
*/
attr = list_first_entry_or_null(items, struct xfs_attr_intent,
xattri_list);
+ args = attr->xattri_da_args;
+
+ if (!(args->op_flags & XFS_DA_OP_LOGGED))
+ return NULL;
/*
* Create a buffer to store the attribute name and value. This buffer
@@ -432,8 +434,6 @@ xfs_attr_create_intent(
* and the lower level xattr log items.
*/
if (!attr->xattri_nameval) {
- struct xfs_da_args *args = attr->xattri_da_args;
-
/*
* Transfer our reference to the name/value buffer to the
* deferred work state structure.
@@ -576,7 +576,7 @@ xfs_attri_item_recover(
struct xfs_trans_res tres;
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
- int error, ret = 0;
+ int error;
int total;
int local;
struct xfs_attrd_log_item *done_item = NULL;
@@ -617,7 +617,10 @@ xfs_attri_item_recover(
args->namelen = nv->name.i_len;
args->hashval = xfs_da_hashname(args->name, args->namelen);
args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
- args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT;
+ args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
+ XFS_DA_OP_LOGGED;
+
+ ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
switch (attr->xattri_op_flags) {
case XFS_ATTRI_OP_FLAGS_SET:
@@ -652,29 +655,32 @@ xfs_attri_item_recover(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- ret = xfs_xattri_finish_update(attr, done_item);
- if (ret == -EAGAIN) {
- /* There's more work to do, so add it to this transaction */
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error == -EAGAIN) {
+ /*
+ * There's more work to do, so add the intent item to this
+ * transaction so that we can continue it later.
+ */
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
- } else
- error = ret;
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ if (error)
+ goto out_unlock;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ return 0;
+ }
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
}
error = xfs_defer_ops_capture_and_commit(tp, capture_list);
-
out_unlock:
- if (attr->xattri_leaf_bp)
- xfs_buf_relse(attr->xattri_leaf_bp);
-
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
out:
- if (ret != -EAGAIN)
- xfs_attr_free_item(attr);
+ xfs_attr_free_item(attr);
return error;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 52be58372c63..85e1a26c92e8 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -686,6 +686,8 @@ xfs_can_free_eofblocks(
* forever.
*/
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_sb.sb_rextsize > 1)
+ end_fsb = roundup_64(end_fsb, mp->m_sb.sb_rextsize);
last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
if (last_fsb <= end_fsb)
return false;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 5269354b1b69..2609825d53ee 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -440,7 +440,7 @@ xfs_inodegc_queue_all(
for_each_online_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
}
}
@@ -1841,8 +1841,8 @@ void
xfs_inodegc_worker(
struct work_struct *work)
{
- struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc,
- work);
+ struct xfs_inodegc *gc = container_of(to_delayed_work(work),
+ struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
@@ -1862,19 +1862,29 @@ xfs_inodegc_worker(
}
/*
- * Force all currently queued inode inactivation work to run immediately and
- * wait for the work to finish.
+ * Expedite all pending inodegc work to run immediately. This does not wait for
+ * completion of the work.
*/
void
-xfs_inodegc_flush(
+xfs_inodegc_push(
struct xfs_mount *mp)
{
if (!xfs_is_inodegc_enabled(mp))
return;
+ trace_xfs_inodegc_push(mp, __return_address);
+ xfs_inodegc_queue_all(mp);
+}
+/*
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
+ */
+void
+xfs_inodegc_flush(
+ struct xfs_mount *mp)
+{
+ xfs_inodegc_push(mp);
trace_xfs_inodegc_flush(mp, __return_address);
-
- xfs_inodegc_queue_all(mp);
flush_workqueue(mp->m_inodegc_wq);
}
@@ -2014,6 +2024,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
+ unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
spin_lock(&ip->i_flags_lock);
@@ -2025,19 +2036,26 @@ xfs_inodegc_queue(
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
- put_cpu_ptr(gc);
- if (!xfs_is_inodegc_enabled(mp))
+ /*
+ * We queue the work while holding the current CPU so that the work
+ * is scheduled to run on this CPU.
+ */
+ if (!xfs_is_inodegc_enabled(mp)) {
+ put_cpu_ptr(gc);
return;
-
- if (xfs_inodegc_want_queue_work(ip, items)) {
- trace_xfs_inodegc_queue(mp, __return_address);
- queue_work(mp->m_inodegc_wq, &gc->work);
}
+ if (xfs_inodegc_want_queue_work(ip, items))
+ queue_delay = 0;
+
+ trace_xfs_inodegc_queue(mp, __return_address);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
+ put_cpu_ptr(gc);
+
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
- flush_work(&gc->work);
+ flush_delayed_work(&gc->work);
}
}
@@ -2054,7 +2072,7 @@ xfs_inodegc_cpu_dead(
unsigned int count = 0;
dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
- cancel_work_sync(&dead_gc->work);
+ cancel_delayed_work_sync(&dead_gc->work);
if (llist_empty(&dead_gc->list))
return;
@@ -2073,12 +2091,12 @@ xfs_inodegc_cpu_dead(
llist_add_batch(first, last, &gc->list);
count += READ_ONCE(gc->items);
WRITE_ONCE(gc->items, count);
- put_cpu_ptr(gc);
if (xfs_is_inodegc_enabled(mp)) {
trace_xfs_inodegc_queue(mp, __return_address);
- queue_work(mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
}
+ put_cpu_ptr(gc);
}
/*
@@ -2173,7 +2191,7 @@ xfs_inodegc_shrinker_scan(
unsigned int h = READ_ONCE(gc->shrinker_hits);
WRITE_ONCE(gc->shrinker_hits, h + 1);
- queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+ mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
no_items = false;
}
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 2e4cfddf8b8e..6cd180721659 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp);
void xfs_inodegc_worker(struct work_struct *work);
+void xfs_inodegc_push(struct xfs_mount *mp);
void xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 52d6f2c7d58b..3e1c62ffa4f7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -132,6 +132,26 @@ xfs_ilock_attr_map_shared(
}
/*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
+ * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
+ * to set in lock_flags.
+ */
+static inline void
+xfs_lock_flags_assert(
+ uint lock_flags)
+{
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+ (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+}
+
+/*
* In addition to i_rwsem in the VFS inode, the xfs inode contains 2
* multi-reader locks: invalidate_lock and the i_lock. This routine allows
* various combinations of the locks to be obtained.
@@ -168,18 +188,7 @@ xfs_ilock(
{
trace_xfs_ilock(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
down_write_nested(&VFS_I(ip)->i_rwsem,
@@ -222,18 +231,7 @@ xfs_ilock_nowait(
{
trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL) {
if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
@@ -291,19 +289,7 @@ xfs_iunlock(
xfs_inode_t *ip,
uint lock_flags)
{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
- (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
- ASSERT(lock_flags != 0);
+ xfs_lock_flags_assert(lock_flags);
if (lock_flags & XFS_IOLOCK_EXCL)
up_write(&VFS_I(ip)->i_rwsem);
@@ -379,8 +365,8 @@ xfs_isilocked(
}
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
- return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
- (lock_flags & XFS_IOLOCK_SHARED));
+ return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
+ (lock_flags & XFS_MMAPLOCK_SHARED));
}
if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5a364a7d58fd..0d67ff8a8961 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1096,7 +1096,8 @@ xfs_flags2diflags2(
{
uint64_t di_flags2 =
(ip->i_diflags2 & (XFS_DIFLAG2_REFLINK |
- XFS_DIFLAG2_BIGTIME));
+ XFS_DIFLAG2_BIGTIME |
+ XFS_DIFLAG2_NREXT64));
if (xflags & FS_XFLAG_DAX)
di_flags2 |= XFS_DIFLAG2_DAX;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 1e972f884a81..ae904b21e9cc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2092,8 +2092,6 @@ xlog_dealloc_log(
xlog_in_core_t *iclog, *next_iclog;
int i;
- xlog_cil_destroy(log);
-
/*
* Cycle all the iclogbuf locks to make sure all log IO completion
* is done before we tear down these buffers.
@@ -2105,6 +2103,13 @@ xlog_dealloc_log(
iclog = iclog->ic_next;
}
+ /*
+ * Destroy the CIL after waiting for iclog IO completion because an
+ * iclog EIO error will try to shut down the log, which accesses the
+ * CIL to wake up the waiters.
+ */
+ xlog_cil_destroy(log);
+
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ba5d42abf66e..d2eaebd85abf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -61,7 +61,7 @@ struct xfs_error_cfg {
*/
struct xfs_inodegc {
struct llist_head list;
- struct work_struct work;
+ struct delayed_work work;
/* approximate count of inodes in the list */
unsigned int items;
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 74ac9ca9e119..392cb39cc10c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -454,9 +454,12 @@ xfs_qm_scall_getquota(
struct xfs_dquot *dqp;
int error;
- /* Flush inodegc work at the start of a quota reporting scan. */
+ /*
+ * Expedite pending inodegc work at the start of a quota reporting
+ * scan but don't block waiting for it to complete.
+ */
if (id == 0)
- xfs_inodegc_flush(mp);
+ xfs_inodegc_push(mp);
/*
* Try to get the dquot. We don't want it allocated on disk, so don't
@@ -498,7 +501,7 @@ xfs_qm_scall_getquota_next(
/* Flush inodegc work at the start of a quota reporting scan. */
if (*id == 0)
- xfs_inodegc_flush(mp);
+ xfs_inodegc_push(mp);
error = xfs_qm_dqget_next(mp, *id, type, &dqp);
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ed18160e6181..aa977c7ea370 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -797,8 +797,11 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
- /* Wait for whatever inactivations are in progress. */
- xfs_inodegc_flush(mp);
+ /*
+ * Expedite background inodegc but don't wait. We do not want to block
+ * here waiting hours for a billion extent file to be truncated.
+ */
+ xfs_inodegc_push(mp);
statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
@@ -1074,7 +1077,7 @@ xfs_inodegc_init_percpu(
gc = per_cpu_ptr(mp->m_inodegc, cpu);
init_llist_head(&gc->list);
gc->items = 0;
- INIT_WORK(&gc->work, xfs_inodegc_worker);
+ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
}
return 0;
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index d32026585c1b..0fa1b7a2918c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -240,6 +240,7 @@ DEFINE_EVENT(xfs_fs_class, name, \
TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
TP_ARGS(mp, caller_ip))
DEFINE_FS_EVENT(xfs_inodegc_flush);
+DEFINE_FS_EVENT(xfs_inodegc_push);
DEFINE_FS_EVENT(xfs_inodegc_start);
DEFINE_FS_EVENT(xfs_inodegc_stop);
DEFINE_FS_EVENT(xfs_inodegc_queue);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 35e13e125ec6..c325a28b89a8 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -68,6 +68,18 @@ xfs_attr_rele_log_assist(
xlog_drop_incompat_feat(mp->m_log);
}
+static inline bool
+xfs_attr_want_log_assist(
+ struct xfs_mount *mp)
+{
+#ifdef DEBUG
+ /* Logged xattrs require a V5 super for log_incompat */
+ return xfs_has_crc(mp) && xfs_globals.larp;
+#else
+ return false;
+#endif
+}
+
/*
* Set or remove an xattr, having grabbed the appropriate logging resources
* prior to calling libxfs.
@@ -80,11 +92,14 @@ xfs_attr_change(
bool use_logging = false;
int error;
- if (xfs_has_larp(mp)) {
+ ASSERT(!(args->op_flags & XFS_DA_OP_LOGGED));
+
+ if (xfs_attr_want_log_assist(mp)) {
error = xfs_attr_grab_log_assist(mp);
if (error)
return error;
+ args->op_flags |= XFS_DA_OP_LOGGED;
use_logging = true;
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index bcb21aea990a..053299758deb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -110,15 +110,51 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
}
}
-static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
+static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct super_block *sb = inode->i_sb;
loff_t isize;
- /* All I/Os should always be within the file maximum size */
+ /*
+ * All blocks are always mapped below EOF. If reading past EOF,
+ * act as if there is a hole up to the file maximum size.
+ */
+ mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ isize = i_size_read(inode);
+ if (iomap->offset >= isize) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+ iomap->length = isize - iomap->offset;
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ trace_zonefs_iomap_begin(inode, iomap);
+
+ return 0;
+}
+
+static const struct iomap_ops zonefs_read_iomap_ops = {
+ .iomap_begin = zonefs_read_iomap_begin,
+};
+
+static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ loff_t isize;
+
+ /* All write I/Os should always be within the file maximum size */
if (WARN_ON_ONCE(offset + length > zi->i_max_size))
return -EIO;
@@ -128,7 +164,7 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* operation.
*/
if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
- (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)))
+ !(flags & IOMAP_DIRECT)))
return -EIO;
/*
@@ -137,47 +173,44 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* write pointer) and unwriten beyond.
*/
mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
isize = i_size_read(inode);
- if (offset >= isize)
+ if (iomap->offset >= isize) {
iomap->type = IOMAP_UNWRITTEN;
- else
+ iomap->length = zi->i_max_size - iomap->offset;
+ } else {
iomap->type = IOMAP_MAPPED;
- if (flags & IOMAP_WRITE)
- length = zi->i_max_size - offset;
- else
- length = min(length, isize - offset);
+ iomap->length = isize - iomap->offset;
+ }
mutex_unlock(&zi->i_truncate_mutex);
- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
- iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset;
- iomap->bdev = inode->i_sb->s_bdev;
- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
-
trace_zonefs_iomap_begin(inode, iomap);
return 0;
}
-static const struct iomap_ops zonefs_iomap_ops = {
- .iomap_begin = zonefs_iomap_begin,
+static const struct iomap_ops zonefs_write_iomap_ops = {
+ .iomap_begin = zonefs_write_iomap_begin,
};
static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
- return iomap_read_folio(folio, &zonefs_iomap_ops);
+ return iomap_read_folio(folio, &zonefs_read_iomap_ops);
}
static void zonefs_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &zonefs_iomap_ops);
+ iomap_readahead(rac, &zonefs_read_iomap_ops);
}
/*
* Map blocks for page writeback. This is used only on conventional zone files,
* which implies that the page range can only be within the fixed inode size.
*/
-static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
@@ -191,12 +224,12 @@ static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
offset < wpc->iomap.offset + wpc->iomap.length)
return 0;
- return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+ return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
}
static const struct iomap_writeback_ops zonefs_writeback_ops = {
- .map_blocks = zonefs_map_blocks,
+ .map_blocks = zonefs_write_map_blocks,
};
static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
@@ -226,7 +259,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis,
return -EINVAL;
}
- return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
+ return iomap_swapfile_activate(sis, swap_file, span,
+ &zonefs_read_iomap_ops);
}
static const struct address_space_operations zonefs_file_aops = {
@@ -647,7 +681,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
@@ -899,7 +933,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (append)
ret = zonefs_file_dio_append(iocb, from);
else
- ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
+ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
&zonefs_write_dio_ops, 0, NULL, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
@@ -948,7 +982,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
if (ret <= 0)
goto inode_unlock;
- ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
if (ret > 0)
iocb->ki_pos += ret;
else if (ret == -EIO)
@@ -1041,7 +1075,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
goto inode_unlock;
}
file_accessed(iocb->ki_filp);
- ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
+ ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
&zonefs_read_dio_ops, 0, NULL, 0);
} else {
ret = generic_file_read_iter(iocb, to);
@@ -1085,7 +1119,8 @@ static int zonefs_seq_file_write_open(struct inode *inode)
if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
- if (wro > sbi->s_max_wro_seq_files) {
+ if (sbi->s_max_wro_seq_files
+ && wro > sbi->s_max_wro_seq_files) {
atomic_dec(&sbi->s_wro_seq_files);
ret = -EBUSY;
goto unlock;
@@ -1760,12 +1795,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
atomic_set(&sbi->s_wro_seq_files, 0);
sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
- if (!sbi->s_max_wro_seq_files &&
- sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
- zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
- sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
- }
-
atomic_set(&sbi->s_active_seq_files, 0);
sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
@@ -1790,6 +1819,14 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
zonefs_info(sb, "Mounting %u zones",
blkdev_nr_zones(sb->s_bdev->bd_disk));
+ if (!sbi->s_max_wro_seq_files &&
+ !sbi->s_max_active_seq_files &&
+ sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+ zonefs_info(sb,
+ "No open and active zone limits. Ignoring explicit_open mount option\n");
+ sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ }
+
/* Create root directory inode */
ret = -ENOMEM;
inode = new_inode(sb);