aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/vfs_file.c2
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/super.c8
-rw-r--r--fs/aio.c27
-rw-r--r--fs/block_dev.c17
-rw-r--r--fs/btrfs/async-thread.c57
-rw-r--r--fs/btrfs/async-thread.h2
-rw-r--r--fs/btrfs/backref.c35
-rw-r--r--fs/btrfs/ctree.c4
-rw-r--r--fs/btrfs/ctree.h18
-rw-r--r--fs/btrfs/dev-replace.c3
-rw-r--r--fs/btrfs/disk-io.c92
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c298
-rw-r--r--fs/btrfs/extent_io.c25
-rw-r--r--fs/btrfs/free-space-cache.c57
-rw-r--r--fs/btrfs/inode.c38
-rw-r--r--fs/btrfs/ioctl.c52
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/raid56.c87
-rw-r--r--fs/btrfs/raid56.h10
-rw-r--r--fs/btrfs/reada.c4
-rw-r--r--fs/btrfs/relocation.c33
-rw-r--r--fs/btrfs/scrub.c358
-rw-r--r--fs/btrfs/super.c14
-rw-r--r--fs/btrfs/transaction.c15
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-defrag.c3
-rw-r--r--fs/btrfs/tree-log.c344
-rw-r--r--fs/btrfs/volumes.c100
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/ceph/addr.c8
-rw-r--r--fs/ceph/caps.c8
-rw-r--r--fs/ceph/file.c14
-rw-r--r--fs/ceph/mds_client.c59
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/snap.c7
-rw-r--r--fs/ceph/super.c3
-rw-r--r--fs/cifs/cifs_ioctl.h42
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifspdu.h14
-rw-r--r--fs/cifs/cifssmb.c7
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/ioctl.c48
-rw-r--r--fs/cifs/smb2pdu.c7
-rw-r--r--fs/cifs/transport.c2
-rw-r--r--fs/coda/upcall.c6
-rw-r--r--fs/coredump.c46
-rw-r--r--fs/dax.c261
-rw-r--r--fs/dcache.c14
-rw-r--r--fs/debugfs/file.c14
-rw-r--r--fs/drop_caches.c10
-rw-r--r--fs/ecryptfs/crypto.c3
-rw-r--r--fs/ecryptfs/dentry.c16
-rw-r--r--fs/ext2/file.c10
-rw-r--r--fs/ext2/inode.c1
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/file.c68
-rw-r--r--fs/ext4/indirect.c1
-rw-r--r--fs/ext4/inode.c12
-rw-r--r--fs/ext4/super.c4
-rw-r--r--fs/fs-writeback.c218
-rw-r--r--fs/gfs2/glock.c348
-rw-r--r--fs/gfs2/glops.c38
-rw-r--r--fs/gfs2/incore.h15
-rw-r--r--fs/gfs2/lock_dlm.c12
-rw-r--r--fs/gfs2/lops.c6
-rw-r--r--fs/gfs2/meta_io.c6
-rw-r--r--fs/gfs2/meta_io.h2
-rw-r--r--fs/gfs2/quota.c22
-rw-r--r--fs/gfs2/rgrp.c10
-rw-r--r--fs/gfs2/super.c6
-rw-r--r--fs/gfs2/trace_gfs2.h34
-rw-r--r--fs/gfs2/trans.c4
-rw-r--r--fs/hfs/bnode.c9
-rw-r--r--fs/hfs/brec.c20
-rw-r--r--fs/hfs/super.c4
-rw-r--r--fs/hfsplus/bnode.c3
-rw-r--r--fs/hfsplus/options.c4
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hugetlbfs/inode.c302
-rw-r--r--fs/inode.c50
-rw-r--r--fs/internal.h3
-rw-r--r--fs/kernfs/dir.c23
-rw-r--r--fs/lockd/svc.c8
-rw-r--r--fs/locks.c1
-rw-r--r--fs/namei.c29
-rw-r--r--fs/nfs/blocklayout/blocklayout.h19
-rw-r--r--fs/nfs/blocklayout/dev.c9
-rw-r--r--fs/nfs/blocklayout/extent_tree.c19
-rw-r--r--fs/nfs/callback.c10
-rw-r--r--fs/nfs/callback_proc.c9
-rw-r--r--fs/nfs/client.c113
-rw-r--r--fs/nfs/delegation.c29
-rw-r--r--fs/nfs/delegation.h3
-rw-r--r--fs/nfs/dir.c20
-rw-r--r--fs/nfs/file.c21
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c424
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h5
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c82
-rw-r--r--fs/nfs/inode.c61
-rw-r--r--fs/nfs/internal.h20
-rw-r--r--fs/nfs/nfs3xdr.c1
-rw-r--r--fs/nfs/nfs42.h2
-rw-r--r--fs/nfs/nfs42xdr.c5
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4file.c32
-rw-r--r--fs/nfs/nfs4idmap.c14
-rw-r--r--fs/nfs/nfs4proc.c136
-rw-r--r--fs/nfs/nfs4state.c12
-rw-r--r--fs/nfs/nfs4trace.h61
-rw-r--r--fs/nfs/nfs4xdr.c75
-rw-r--r--fs/nfs/pagelist.c4
-rw-r--r--fs/nfs/pnfs.c227
-rw-r--r--fs/nfs/pnfs.h48
-rw-r--r--fs/nfs/pnfs_nfs.c88
-rw-r--r--fs/nfs/super.c7
-rw-r--r--fs/nfs/write.c36
-rw-r--r--fs/nfs_common/grace.c23
-rw-r--r--fs/nfsd/blocklayoutxdr.c2
-rw-r--r--fs/nfsd/blocklayoutxdr.h15
-rw-r--r--fs/nfsd/export.c73
-rw-r--r--fs/nfsd/export.h1
-rw-r--r--fs/nfsd/idmap.h4
-rw-r--r--fs/nfsd/netns.h1
-rw-r--r--fs/nfsd/nfs2acl.c10
-rw-r--r--fs/nfsd/nfs3acl.c4
-rw-r--r--fs/nfsd/nfs4acl.c8
-rw-r--r--fs/nfsd/nfs4callback.c122
-rw-r--r--fs/nfsd/nfs4idmap.c3
-rw-r--r--fs/nfsd/nfs4proc.c30
-rw-r--r--fs/nfsd/nfs4recover.c18
-rw-r--r--fs/nfsd/nfs4state.c180
-rw-r--r--fs/nfsd/nfs4xdr.c158
-rw-r--r--fs/nfsd/nfssvc.c17
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nfsd/vfs.c6
-rw-r--r--fs/nfsd/vfs.h6
-rw-r--r--fs/notify/dnotify/dnotify.c14
-rw-r--r--fs/notify/fanotify/fanotify_user.c8
-rw-r--r--fs/notify/fdinfo.c3
-rw-r--r--fs/notify/fsnotify.c11
-rw-r--r--fs/notify/fsnotify.h21
-rw-r--r--fs/notify/inode_mark.c40
-rw-r--r--fs/notify/mark.c113
-rw-r--r--fs/notify/vfsmount_mark.c19
-rw-r--r--fs/nsfs.c3
-rw-r--r--fs/ntfs/super.c21
-rw-r--r--fs/ocfs2/acl.c26
-rw-r--r--fs/ocfs2/alloc.c148
-rw-r--r--fs/ocfs2/aops.c54
-rw-r--r--fs/ocfs2/buffer_head_io.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c69
-rw-r--r--fs/ocfs2/dir.c70
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c78
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c22
-rw-r--r--fs/ocfs2/dlm/dlmthread.c10
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/extent_map.c22
-rw-r--r--fs/ocfs2/file.c53
-rw-r--r--fs/ocfs2/inode.c49
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/journal.c32
-rw-r--r--fs/ocfs2/localalloc.c3
-rw-r--r--fs/ocfs2/move_extents.c8
-rw-r--r--fs/ocfs2/namei.c96
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/quota_local.c3
-rw-r--r--fs/ocfs2/refcounttree.c81
-rw-r--r--fs/ocfs2/suballoc.c96
-rw-r--r--fs/ocfs2/super.c73
-rw-r--r--fs/ocfs2/super.h8
-rw-r--r--fs/ocfs2/xattr.c51
-rw-r--r--fs/overlayfs/super.c6
-rw-r--r--fs/proc/array.c5
-rw-r--r--fs/proc/base.c113
-rw-r--r--fs/proc/generic.c44
-rw-r--r--fs/proc/page.c65
-rw-r--r--fs/proc/task_mmu.c292
-rw-r--r--fs/quota/dquot.c16
-rw-r--r--fs/reiserfs/super.c8
-rw-r--r--fs/seq_file.c112
-rw-r--r--fs/super.c175
-rw-r--r--fs/ufs/Makefile2
-rw-r--r--fs/ufs/balloc.c8
-rw-r--r--fs/ufs/inode.c948
-rw-r--r--fs/ufs/super.c36
-rw-r--r--fs/ufs/truncate.c523
-rw-r--r--fs/ufs/ufs.h13
-rw-r--r--fs/userfaultfd.c1330
-rw-r--r--fs/xfs/Makefile2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c6
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr.c2
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c9
-rw-r--r--fs/xfs/libxfs/xfs_bit.c (renamed from fs/xfs/xfs_bit.c)0
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c1
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c5
-rw-r--r--fs/xfs/libxfs/xfs_btree.c10
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c32
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h11
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c36
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c7
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c17
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_format.h22
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c7
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_sb.c27
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c4
-rw-r--r--fs/xfs/xfs_aops.c9
-rw-r--r--fs/xfs/xfs_bmap_util.c87
-rw-r--r--fs/xfs/xfs_buf.c9
-rw-r--r--fs/xfs/xfs_buf.h1
-rw-r--r--fs/xfs/xfs_buf_item.c26
-rw-r--r--fs/xfs/xfs_buf_item.h2
-rw-r--r--fs/xfs/xfs_dir2_readdir.c11
-rw-r--r--fs/xfs/xfs_dquot.c10
-rw-r--r--fs/xfs/xfs_extfree_item.c105
-rw-r--r--fs/xfs/xfs_extfree_item.h26
-rw-r--r--fs/xfs/xfs_file.c81
-rw-r--r--fs/xfs/xfs_fsops.c6
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_inode.c141
-rw-r--r--fs/xfs/xfs_inode.h85
-rw-r--r--fs/xfs/xfs_inode_item.c11
-rw-r--r--fs/xfs/xfs_iops.c8
-rw-r--r--fs/xfs/xfs_itable.c3
-rw-r--r--fs/xfs/xfs_log.c87
-rw-r--r--fs/xfs/xfs_log.h1
-rw-r--r--fs/xfs/xfs_log_cil.c8
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c216
-rw-r--r--fs/xfs/xfs_mount.c28
-rw-r--r--fs/xfs/xfs_rtalloc.c57
-rw-r--r--fs/xfs/xfs_super.c20
-rw-r--r--fs/xfs/xfs_symlink.c9
-rw-r--r--fs/xfs/xfs_trace.h35
-rw-r--r--fs/xfs/xfs_trans.c15
-rw-r--r--fs/xfs/xfs_trans.h9
-rw-r--r--fs/xfs/xfs_trans_extfree.c32
-rw-r--r--fs/xfs/xfs_trans_priv.h15
249 files changed, 7861 insertions, 4407 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 8aa56bb6e861..6caca025019d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -52,7 +52,7 @@ enum {
/* Options that take integer arguments */
Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
/* String options */
- Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
+ Opt_uname, Opt_remotename, Opt_cache, Opt_cachetag,
/* Options that take no arguments */
Opt_nodevmap,
/* Cache options */
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 1ef16bd8280b..3abc447783aa 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -381,7 +381,7 @@ static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct p9_fid *fid = iocb->ki_filp->private_data;
- int ret, err;
+ int ret, err = 0;
p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n",
iov_iter_count(to), iocb->ki_pos);
diff --git a/fs/Makefile b/fs/Makefile
index 09e051fefc5b..f79cf4043e60 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3f89c9e05b40..5b50c4ca43a7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -18,6 +18,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/writeback.h>
+#include <linux/blkdev.h>
#include "affs.h"
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
@@ -352,18 +353,19 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* blocks, we will have to change it.
*/
- size = sb->s_bdev->bd_inode->i_size >> 9;
+ size = i_size_read(sb->s_bdev->bd_inode) >> 9;
pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
- i = 512;
- j = 4096;
+ i = bdev_logical_block_size(sb->s_bdev);
+ j = PAGE_SIZE;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
+
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
sbi->s_root_block = root_block;
if (root_block < 0)
diff --git a/fs/aio.c b/fs/aio.c
index 480440f4701f..155f84253f33 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx)
}
}
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
-{
- vma->vm_flags |= VM_DONTEXPAND;
- vma->vm_ops = &generic_file_vm_ops;
- return 0;
-}
-
-static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mremap(struct vm_area_struct *vma)
{
+ struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i, res = -EINVAL;
@@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
return res;
}
+static const struct vm_operations_struct aio_ring_vm_ops = {
+ .mremap = aio_ring_mremap,
+#if IS_ENABLED(CONFIG_MMU)
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = filemap_page_mkwrite,
+#endif
+};
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ vma->vm_flags |= VM_DONTEXPAND;
+ vma->vm_ops = &aio_ring_vm_ops;
+ return 0;
+}
+
static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
- .mremap = aio_ring_remap,
};
#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 198243717da5..22ea424ee741 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
* accessible at this address.
*/
long bdev_direct_access(struct block_device *bdev, sector_t sector,
- void **addr, unsigned long *pfn, long size)
+ void __pmem **addr, unsigned long *pfn, long size)
{
long avail;
const struct block_device_operations *ops = bdev->bd_disk->fops;
@@ -462,7 +463,7 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
sector += get_start_sect(bdev);
if (sector % (PAGE_SIZE / 512))
return -EINVAL;
- avail = ops->direct_access(bdev, sector, addr, pfn, size);
+ avail = ops->direct_access(bdev, sector, addr, pfn);
if (!avail)
return -ERANGE;
return min(avail, size);
@@ -1769,7 +1770,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
{
struct inode *inode, *old_inode = NULL;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
struct address_space *mapping = inode->i_mapping;
@@ -1781,13 +1782,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
- * inode_sb_list_lock. We cannot iput the inode now as we can
+ * s_inode_list_lock We cannot iput the inode now as we can
* be holding the last reference and we cannot iput it under
- * inode_sb_list_lock. So we keep the reference and iput it
+ * s_inode_list_lock. So we keep the reference and iput it
* later.
*/
iput(old_inode);
@@ -1795,8 +1796,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
func(I_BDEV(inode), arg);
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
iput(old_inode);
}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 1ce06c849a86..3e36e4adc4a3 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -42,8 +42,14 @@ struct __btrfs_workqueue {
/* Thresholding related variants */
atomic_t pending;
- int max_active;
- int current_max;
+
+ /* Up limit of concurrency workers */
+ int limit_active;
+
+ /* Current number of concurrency workers */
+ int current_active;
+
+ /* Threshold to change current_active */
int thresh;
unsigned int count;
spinlock_t thres_lock;
@@ -88,7 +94,7 @@ BTRFS_WORK_HELPER(scrubnc_helper);
BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -96,26 +102,31 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
if (!ret)
return NULL;
- ret->max_active = max_active;
+ ret->limit_active = limit_active;
atomic_set(&ret->pending, 0);
if (thresh == 0)
thresh = DFT_THRESHOLD;
/* For low threshold, disabling threshold is a better choice */
if (thresh < DFT_THRESHOLD) {
- ret->current_max = max_active;
+ ret->current_active = limit_active;
ret->thresh = NO_THRESHOLD;
} else {
- ret->current_max = 1;
+ /*
+ * For threshold-able wq, let its concurrency grow on demand.
+ * Use minimal max_active at alloc time to reduce resource
+ * usage.
+ */
+ ret->current_active = 1;
ret->thresh = thresh;
}
if (flags & WQ_HIGHPRI)
ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
- ret->max_active,
- "btrfs", name);
+ ret->current_active, "btrfs",
+ name);
else
ret->normal_wq = alloc_workqueue("%s-%s", flags,
- ret->max_active, "btrfs",
+ ret->current_active, "btrfs",
name);
if (!ret->normal_wq) {
kfree(ret);
@@ -134,7 +145,7 @@ __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
- int max_active,
+ int limit_active,
int thresh)
{
struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -143,14 +154,14 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
return NULL;
ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
- max_active, thresh);
+ limit_active, thresh);
if (!ret->normal) {
kfree(ret);
return NULL;
}
if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ ret->high = __btrfs_alloc_workqueue(name, flags, limit_active,
thresh);
if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
@@ -180,7 +191,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
*/
static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
- int new_max_active;
+ int new_current_active;
long pending;
int need_change = 0;
@@ -197,7 +208,7 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
wq->count %= (wq->thresh / 4);
if (!wq->count)
goto out;
- new_max_active = wq->current_max;
+ new_current_active = wq->current_active;
/*
* pending may be changed later, but it's OK since we really
@@ -205,19 +216,19 @@ static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
*/
pending = atomic_read(&wq->pending);
if (pending > wq->thresh)
- new_max_active++;
+ new_current_active++;
if (pending < wq->thresh / 2)
- new_max_active--;
- new_max_active = clamp_val(new_max_active, 1, wq->max_active);
- if (new_max_active != wq->current_max) {
+ new_current_active--;
+ new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
+ if (new_current_active != wq->current_active) {
need_change = 1;
- wq->current_max = new_max_active;
+ wq->current_active = new_current_active;
}
out:
spin_unlock(&wq->thres_lock);
if (need_change) {
- workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ workqueue_set_max_active(wq->normal_wq, wq->current_active);
}
}
@@ -351,13 +362,13 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
kfree(wq);
}
-void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
if (!wq)
return;
- wq->normal->max_active = max;
+ wq->normal->limit_active = limit_active;
if (wq->high)
- wq->high->max_active = max;
+ wq->high->limit_active = limit_active;
}
void btrfs_set_work_high_priority(struct btrfs_work *work)
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index b0b093b6afec..ad4d0647d1a6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -69,7 +69,7 @@ BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
unsigned int flags,
- int max_active,
+ int limit_active,
int thresh);
void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
btrfs_func_t func,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 802fabb30e15..ecbc63d3143e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -206,10 +206,33 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
return -ENOMEM;
ref->root_id = root_id;
- if (key)
+ if (key) {
ref->key_for_search = *key;
- else
+ /*
+ * We can often find data backrefs with an offset that is too
+ * large (>= LLONG_MAX, maximum allowed file offset) due to
+ * underflows when subtracting a file's offset with the data
+ * offset of its corresponding extent data item. This can
+ * happen for example in the clone ioctl.
+ * So if we detect such case we set the search key's offset to
+ * zero to make sure we will find the matching file extent item
+ * at add_all_parents(), otherwise we will miss it because the
+ * offset taken form the backref is much larger then the offset
+ * of the file extent item. This can make us scan a very large
+ * number of file extent items, but at least it will not make
+ * us miss any.
+ * This is an ugly workaround for a behaviour that should have
+ * never existed, but it does and a fix for the clone ioctl
+ * would touch a lot of places, cause backwards incompatibility
+ * and would not fix the problem for extents cloned with older
+ * kernels.
+ */
+ if (ref->key_for_search.type == BTRFS_EXTENT_DATA_KEY &&
+ ref->key_for_search.offset >= LLONG_MAX)
+ ref->key_for_search.offset = 0;
+ } else {
memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+ }
ref->inode_list = NULL;
ref->level = level;
@@ -632,7 +655,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_tree_ref *ref;
ref = btrfs_delayed_node_to_tree_ref(node);
- ret = __add_prelim_ref(prefs, ref->root, NULL,
+ ret = __add_prelim_ref(prefs, 0, NULL,
ref->level + 1, ref->parent,
node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
@@ -664,11 +687,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
struct btrfs_delayed_data_ref *ref;
ref = btrfs_delayed_node_to_data_ref(node);
-
- key.objectid = ref->objectid;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = ref->offset;
- ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+ ret = __add_prelim_ref(prefs, 0, NULL, 0,
ref->parent, node->bytenr,
node->ref_mod * sgn, GFP_ATOMIC);
break;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 54114b4887dd..5f745eadf77d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1159,8 +1159,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
return ret;
+ }
}
if (buf == root->node) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aac314e14188..938efe33be80 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1300,7 +1300,7 @@ struct btrfs_block_group_cache {
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
- unsigned int ro:1;
+ unsigned int ro;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
@@ -1518,12 +1518,6 @@ struct btrfs_fs_info {
*/
struct mutex ordered_operations_mutex;
- /*
- * Same as ordered_operations_mutex except this is for ordered extents
- * and not the operations.
- */
- struct mutex ordered_extent_flush_mutex;
-
struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
@@ -3437,6 +3431,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start,
struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache);
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3495,9 +3491,9 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
@@ -4073,6 +4069,7 @@ __cold
void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
+const char *btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
@@ -4185,8 +4182,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 564a7de17d99..e54dd5905cee 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -183,8 +183,7 @@ no_valid_dev_replace_entry_found:
}
out:
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5e307bd0471a..0d98aee34fee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1730,6 +1730,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
+ bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
return 0;
}
@@ -2613,7 +2614,6 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->ordered_operations_mutex);
- mutex_init(&fs_info->ordered_extent_flush_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2955,8 +2955,9 @@ retry_root_backup:
if (fs_info->fs_devices->missing_devices >
fs_info->num_tolerated_disk_barrier_failures &&
!(sb->s_flags & MS_RDONLY)) {
- printk(KERN_WARNING "BTRFS: "
- "too many missing devices, writeable mount is not allowed\n");
+ pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n",
+ fs_info->fs_devices->missing_devices,
+ fs_info->num_tolerated_disk_barrier_failures);
goto fail_sysfs;
}
@@ -3442,6 +3443,26 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
return 0;
}
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
+{
+ if ((flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
+ ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
+ return 0;
+
+ if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ return 1;
+
+ if (flags & BTRFS_BLOCK_GROUP_RAID6)
+ return 2;
+
+ pr_warn("BTRFS: unknown raid type: %llu\n", flags);
+ return 0;
+}
+
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info)
{
@@ -3451,13 +3472,12 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
BTRFS_BLOCK_GROUP_SYSTEM,
BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
- int num_types = 4;
int i;
int c;
int num_tolerated_disk_barrier_failures =
(int)fs_info->fs_devices->num_devices;
- for (i = 0; i < num_types; i++) {
+ for (i = 0; i < ARRAY_SIZE(types); i++) {
struct btrfs_space_info *tmp;
sinfo = NULL;
@@ -3475,44 +3495,21 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
down_read(&sinfo->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
- if (!list_empty(&sinfo->block_groups[c])) {
- u64 flags;
-
- btrfs_get_block_group_info(
- &sinfo->block_groups[c], &space);
- if (space.total_bytes == 0 ||
- space.used_bytes == 0)
- continue;
- flags = space.flags;
- /*
- * return
- * 0: if dup, single or RAID0 is configured for
- * any of metadata, system or data, else
- * 1: if RAID5 is configured, or if RAID1 or
- * RAID10 is configured and only two mirrors
- * are used, else
- * 2: if RAID6 is configured, else
- * num_mirrors - 1: if RAID1 or RAID10 is
- * configured and more than
- * 2 mirrors are used.
- */
- if (num_tolerated_disk_barrier_failures > 0 &&
- ((flags & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID0)) ||
- ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
- == 0)))
- num_tolerated_disk_barrier_failures = 0;
- else if (num_tolerated_disk_barrier_failures > 1) {
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID10)) {
- num_tolerated_disk_barrier_failures = 1;
- } else if (flags &
- BTRFS_BLOCK_GROUP_RAID6) {
- num_tolerated_disk_barrier_failures = 2;
- }
- }
- }
+ u64 flags;
+
+ if (list_empty(&sinfo->block_groups[c]))
+ continue;
+
+ btrfs_get_block_group_info(&sinfo->block_groups[c],
+ &space);
+ if (space.total_bytes == 0 || space.used_bytes == 0)
+ continue;
+ flags = space.flags;
+
+ num_tolerated_disk_barrier_failures = min(
+ num_tolerated_disk_barrier_failures,
+ btrfs_get_num_tolerated_disk_barrier_failures(
+ flags));
}
up_read(&sinfo->groups_sem);
}
@@ -3763,6 +3760,15 @@ void close_ctree(struct btrfs_root *root)
cancel_work_sync(&fs_info->async_reclaim_work);
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+ /*
+ * If the cleaner thread is stopped and there are
+ * block groups queued for removal, the deletion will be
+ * skipped when we quit the cleaner thread.
+ */
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_delete_unused_bgs(root->fs_info);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
ret = btrfs_commit_super(root);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index d4cbfeeeedd4..bdfb479ea859 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -139,6 +139,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
+int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_calc_num_tolerated_disk_barrier_failures(
struct btrfs_fs_info *fs_info);
int __init btrfs_end_io_wq_init(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 07204bf601ed..5411f0ab5683 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1316,8 +1316,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline u32 extent_data_ref_count(struct btrfs_root *root,
- struct btrfs_path *path,
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
@@ -1883,10 +1882,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static int btrfs_issue_discard(struct block_device *bdev,
- u64 start, u64 len)
+#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+ u64 *discarded_bytes)
{
- return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+ int j, ret = 0;
+ u64 bytes_left, end;
+ u64 aligned_start = ALIGN(start, 1 << 9);
+
+ if (WARN_ON(start != aligned_start)) {
+ len -= aligned_start - start;
+ len = round_down(len, 1 << 9);
+ start = aligned_start;
+ }
+
+ *discarded_bytes = 0;
+
+ if (!len)
+ return 0;
+
+ end = start + len;
+ bytes_left = len;
+
+ /* Skip any superblocks on this device. */
+ for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+ u64 sb_start = btrfs_sb_offset(j);
+ u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+ u64 size = sb_start - start;
+
+ if (!in_range(sb_start, start, bytes_left) &&
+ !in_range(sb_end, start, bytes_left) &&
+ !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+ continue;
+
+ /*
+ * Superblock spans beginning of range. Adjust start and
+ * try again.
+ */
+ if (sb_start <= start) {
+ start += sb_end - start;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ continue;
+ }
+
+ if (size) {
+ ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += size;
+ else if (ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ start = sb_end;
+ if (start > end) {
+ bytes_left = 0;
+ break;
+ }
+ bytes_left = end - start;
+ }
+
+ if (bytes_left) {
+ ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+ GFP_NOFS, 0);
+ if (!ret)
+ *discarded_bytes += bytes_left;
+ }
+ return ret;
}
int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1907,14 +1973,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+ u64 bytes;
if (!stripe->dev->can_discard)
continue;
ret = btrfs_issue_discard(stripe->dev->bdev,
stripe->physical,
- stripe->length);
+ stripe->length,
+ &bytes);
if (!ret)
- discarded_bytes += stripe->length;
+ discarded_bytes += bytes;
else if (ret != -EOPNOTSUPP)
break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
@@ -6062,20 +6130,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *block_group, *tmp;
+ struct list_head *deleted_bgs;
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
- if (trans->aborted)
- return 0;
-
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
unpin = &fs_info->freed_extents[0];
- while (1) {
+ while (!trans->aborted) {
mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
@@ -6094,6 +6161,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
+ /*
+ * Transaction is finished. We don't need the lock anymore. We
+ * do need to clean up the block groups in case of a transaction
+ * abort.
+ */
+ deleted_bgs = &trans->transaction->deleted_bgs;
+ list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+ u64 trimmed = 0;
+
+ ret = -EROFS;
+ if (!trans->aborted)
+ ret = btrfs_discard_extent(root,
+ block_group->key.objectid,
+ block_group->key.offset,
+ &trimmed);
+
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group_trimming(block_group);
+ btrfs_put_block_group(block_group);
+
+ if (ret) {
+ const char *errstr = btrfs_decode_error(ret);
+ btrfs_warn(fs_info,
+ "Discard failed while removing blockgroup: errno=%d %s\n",
+ ret, errstr);
+ }
+ }
+
return 0;
}
@@ -6349,7 +6444,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
} else {
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(root, path, iref));
+ extent_data_ref_count(path, iref));
if (iref) {
BUG_ON(path->slots[0] != extent_slot);
} else {
@@ -7567,9 +7662,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
/*
* finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
* returns the tree buffer or an ERR_PTR on error.
*/
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -8723,14 +8815,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 min_allocable_bytes;
int ret = -ENOSPC;
-
/*
* We need some metadata space and system metadata space for
* allocating chunks in some corner cases until we force to set
@@ -8747,6 +8838,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
spin_lock(&cache->lock);
if (cache->ro) {
+ cache->ro++;
ret = 0;
goto out;
}
@@ -8758,7 +8850,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
- cache->ro = 1;
+ cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
@@ -8768,7 +8860,7 @@ out:
return ret;
}
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
@@ -8776,8 +8868,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
u64 alloc_flags;
int ret;
- BUG_ON(cache->ro);
-
again:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
@@ -8820,7 +8910,7 @@ again:
goto out;
}
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -8828,7 +8918,7 @@ again:
CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
- ret = set_block_group_ro(cache, 0);
+ ret = inc_block_group_ro(cache, 0);
out:
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
alloc_flags = update_block_group_flags(root, cache->flags);
@@ -8891,7 +8981,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
return free_bytes;
}
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_space_info *sinfo = cache->space_info;
@@ -8901,11 +8991,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
- num_bytes = cache->key.offset - cache->reserved - cache->pinned -
- cache->bytes_super - btrfs_block_group_used(&cache->item);
- sinfo->bytes_readonly -= num_bytes;
- cache->ro = 0;
- list_del_init(&cache->ro_list);
+ if (!--cache->ro) {
+ num_bytes = cache->key.offset - cache->reserved -
+ cache->pinned - cache->bytes_super -
+ btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ list_del_init(&cache->ro_list);
+ }
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -9421,7 +9513,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid)) {
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
} else if (btrfs_block_group_used(&cache->item) == 0) {
spin_lock(&info->unused_bgs_lock);
/* Should always be true but just in case. */
@@ -9449,11 +9541,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_RAID0],
list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
list_for_each_entry(cache,
&space_info->block_groups[BTRFS_RAID_SINGLE],
list)
- set_block_group_ro(cache, 1);
+ inc_block_group_ro(cache, 1);
}
init_global_block_rsv(info);
@@ -9834,6 +9926,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* currently running transaction might finish and a new one start,
* allowing for new block groups to be created that can reuse the same
* physical device locations unless we take this special care.
+ *
+ * There may also be an implicit trim operation if the file system
+ * is mounted with -odiscard. The same protections must remain
+ * in place until the extents have been discarded completely when
+ * the transaction commit has completed.
*/
remove_em = (atomic_read(&block_group->trimming) == 0);
/*
@@ -9908,6 +10005,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
u64 start, end;
+ int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
@@ -9941,7 +10039,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
/* We don't want to force the issue, only flip if it's ok. */
- ret = set_block_group_ro(block_group, 0);
+ ret = inc_block_group_ro(block_group, 0);
up_write(&space_info->groups_sem);
if (ret < 0) {
ret = 0;
@@ -9955,7 +10053,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
/* 1 for btrfs_orphan_reserve_metadata() */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
ret = PTR_ERR(trans);
goto next;
}
@@ -9982,14 +10080,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- btrfs_set_block_group_rw(root, block_group);
+ btrfs_dec_block_group_ro(root, block_group);
goto end_trans;
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
@@ -10007,12 +10105,39 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
+ /* DISCARD can flip during remount */
+ trimming = btrfs_test_opt(root, DISCARD);
+
+ /* Implicit trim during transaction commit. */
+ if (trimming)
+ btrfs_get_block_group_trimming(block_group);
+
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
ret = btrfs_remove_chunk(trans, root,
block_group->key.objectid);
+
+ if (ret) {
+ if (trimming)
+ btrfs_put_block_group_trimming(block_group);
+ goto end_trans;
+ }
+
+ /*
+ * If we're not mounted with -odiscard, we can just forget
+ * about this block group. Otherwise we'll need to wait
+ * until transaction commit to do the actual discard.
+ */
+ if (trimming) {
+ WARN_ON(!list_empty(&block_group->bg_list));
+ spin_lock(&trans->transaction->deleted_bgs_lock);
+ list_move(&block_group->bg_list,
+ &trans->transaction->deleted_bgs);
+ spin_unlock(&trans->transaction->deleted_bgs_lock);
+ btrfs_get_block_group(block_group);
+ }
end_trans:
btrfs_end_transaction(trans, root);
next:
@@ -10066,10 +10191,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
return unpin_extent_range(root, start, end, false);
}
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space. Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time. We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses. For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+ u64 minlen, u64 *trimmed)
+{
+ u64 start = 0, len = 0;
+ int ret;
+
+ *trimmed = 0;
+
+ /* Not writeable = nothing to do. */
+ if (!device->writeable)
+ return 0;
+
+ /* No free space = nothing to do. */
+ if (device->total_bytes <= device->bytes_used)
+ return 0;
+
+ ret = 0;
+
+ while (1) {
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+ struct btrfs_transaction *trans;
+ u64 bytes;
+
+ ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+ if (ret)
+ return ret;
+
+ down_read(&fs_info->commit_root_sem);
+
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ret = find_free_dev_extent_start(trans, device, minlen, start,
+ &start, &len);
+ if (trans)
+ btrfs_put_transaction(trans);
+
+ if (ret) {
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+ if (ret == -ENOSPC)
+ ret = 0;
+ break;
+ }
+
+ ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+ up_read(&fs_info->commit_root_sem);
+ mutex_unlock(&fs_info->chunk_mutex);
+
+ if (ret)
+ break;
+
+ start += len;
+ *trimmed += bytes;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
+ struct btrfs_device *device;
+ struct list_head *devices;
u64 group_trimmed;
u64 start;
u64 end;
@@ -10124,6 +10338,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
cache = next_block_group(fs_info->tree_root, cache);
}
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ devices = &root->fs_info->fs_devices->alloc_list;
+ list_for_each_entry(device, devices, dev_alloc_list) {
+ ret = btrfs_trim_free_extents(device, range->minlen,
+ &group_trimmed);
+ if (ret)
+ break;
+
+ trimmed += group_trimmed;
+ }
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
range->len = trimmed;
return ret;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68b12bbc709f..f1018cfbfefa 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2723,6 +2723,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
btrfs_bio->csum = NULL;
btrfs_bio->csum_allocated = NULL;
btrfs_bio->end_io = NULL;
+
+#ifdef CONFIG_BLK_CGROUP
+ /* FIXME, put this into bio_clone_bioset */
+ if (bio->bi_css)
+ bio_associate_blkcg(new, bio->bi_css);
+#endif
}
return new;
}
@@ -2783,6 +2789,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
}
static int submit_extent_page(int rw, struct extent_io_tree *tree,
+ struct writeback_control *wbc,
struct page *page, sector_t sector,
size_t size, unsigned long offset,
struct block_device *bdev,
@@ -2817,6 +2824,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
}
bio = NULL;
} else {
+ if (wbc)
+ wbc_account_io(wbc, page, page_size);
return 0;
}
}
@@ -2829,6 +2838,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_add_page(bio, page, page_size, offset);
bio->bi_end_io = end_io_func;
bio->bi_private = tree;
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, page, page_size);
+ }
if (bio_ret)
*bio_ret = bio;
@@ -3039,7 +3052,7 @@ static int __do_readpage(struct extent_io_tree *tree,
}
pnr -= page->index;
- ret = submit_extent_page(rw, tree, page,
+ ret = submit_extent_page(rw, tree, NULL, page,
sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
@@ -3434,7 +3447,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
page->index, cur, end);
}
- ret = submit_extent_page(write_flags, tree, page,
+ ret = submit_extent_page(write_flags, tree, wbc, page,
sector, iosize, pg_offset,
bdev, &epd->bio, max_nr,
end_bio_extent_writepage,
@@ -3738,7 +3751,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(rw, tree, p, offset >> 9,
+ ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
0, epd->bio_flags, bio_flags);
@@ -4603,9 +4616,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
{
struct extent_buffer *eb = NULL;
- eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
- if (eb == NULL)
- return NULL;
+ eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
eb->len = len;
eb->fs_info = fs_info;
@@ -4863,7 +4874,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return NULL;
for (i = 0; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS);
+ p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
if (!p)
goto free_eb;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index fb5a6b1c62a6..abe3a66bd3ba 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3272,35 +3272,23 @@ next:
return ret;
}
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
- u64 *trimmed, u64 start, u64 end, u64 minlen)
+void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache)
{
- int ret;
+ atomic_inc(&cache->trimming);
+}
- *trimmed = 0;
+void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
+{
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ bool cleanup;
spin_lock(&block_group->lock);
- if (block_group->removed) {
- spin_unlock(&block_group->lock);
- return 0;
- }
- atomic_inc(&block_group->trimming);
+ cleanup = (atomic_dec_and_test(&block_group->trimming) &&
+ block_group->removed);
spin_unlock(&block_group->lock);
- ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
- if (ret)
- goto out;
-
- ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
-out:
- spin_lock(&block_group->lock);
- if (atomic_dec_and_test(&block_group->trimming) &&
- block_group->removed) {
- struct extent_map_tree *em_tree;
- struct extent_map *em;
-
- spin_unlock(&block_group->lock);
-
+ if (cleanup) {
lock_chunks(block_group->fs_info->chunk_root);
em_tree = &block_group->fs_info->mapping_tree.map_tree;
write_lock(&em_tree->lock);
@@ -3324,10 +3312,31 @@ out:
* this block group have left 1 entry each one. Free them.
*/
__btrfs_remove_free_space_cache(block_group->free_space_ctl);
- } else {
+ }
+}
+
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ int ret;
+
+ *trimmed = 0;
+
+ spin_lock(&block_group->lock);
+ if (block_group->removed) {
spin_unlock(&block_group->lock);
+ return 0;
}
+ btrfs_get_block_group_trimming(block_group);
+ spin_unlock(&block_group->lock);
+
+ ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+ if (ret)
+ goto out;
+ ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+out:
+ btrfs_put_block_group_trimming(block_group);
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f924d9a62700..a0fa7253a2d7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3658,6 +3658,35 @@ cache_index:
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * We don't persist the id of the transaction where an unlink operation
+ * against the inode was last made. So here we assume the inode might
+ * have been evicted, and therefore the exact value of last_unlink_trans
+ * lost, and set it to last_trans to avoid metadata inconsistencies
+ * between the inode and its parent if the inode is fsync'ed and the log
+ * replayed. For example, in the scenario:
+ *
+ * touch mydir/foo
+ * ln mydir/foo mydir/bar
+ * sync
+ * unlink mydir/bar
+ * echo 2 > /proc/sys/vm/drop_caches # evicts inode
+ * xfs_io -c fsync mydir/foo
+ * <power failure>
+ * mount fs, triggers fsync log replay
+ *
+ * We must make sure that when we fsync our inode foo we also log its
+ * parent inode, otherwise after log replay the parent still has the
+ * dentry with the "bar" name but our inode foo has a link count of 1
+ * and doesn't have an inode ref with the name "bar" anymore.
+ *
+ * Setting last_unlink_trans to last_trans is a pessimistic approach,
+ * but it guarantees correctness at the expense of ocassional full
+ * transaction commits on fsync if our inode is a directory, or if our
+ * inode is not a directory, logging its parent unnecessarily.
+ */
+ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
path->slots[0]++;
if (inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
@@ -6880,8 +6909,7 @@ out:
trace_btrfs_get_extent(root, em);
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
if (trans) {
ret = btrfs_end_transaction(trans, root);
if (!err)
@@ -7958,7 +7986,11 @@ out:
static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
u64 first_sector, gfp_t gfp_flags)
{
- return btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
+ struct bio *bio;
+ bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
+ if (bio)
+ bio_associate_current(bio);
+ return bio;
}
static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0770c91586ca..0adf5422fce9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1030,6 +1030,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
struct extent_map *em;
int ret = 1;
bool next_mergeable = true;
+ bool prev_mergeable = true;
/*
* make sure that once we start defragging an extent, we keep on
@@ -1050,13 +1051,16 @@ static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
goto out;
}
+ if (!*defrag_end)
+ prev_mergeable = false;
+
next_mergeable = defrag_check_next_extent(inode, em);
/*
* we hit a real extent, if it is big or the next extent is not a
* real extent, don't bother defragging it
*/
if (!compress && (*last_len == 0 || *last_len >= thresh) &&
- (em->len >= thresh || !next_mergeable))
+ (em->len >= thresh || (!next_mergeable && !prev_mergeable)))
ret = 0;
out:
/*
@@ -1933,6 +1937,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
u64 found_transid;
struct extent_buffer *leaf;
struct btrfs_ioctl_search_header sh;
+ struct btrfs_key test;
unsigned long item_off;
unsigned long item_len;
int nritems;
@@ -2016,12 +2021,17 @@ static noinline int copy_to_sk(struct btrfs_root *root,
}
advance_key:
ret = 0;
- if (key->offset < (u64)-1 && key->offset < sk->max_offset)
+ test.objectid = sk->max_objectid;
+ test.type = sk->max_type;
+ test.offset = sk->max_offset;
+ if (btrfs_comp_cpu_keys(key, &test) >= 0)
+ ret = 1;
+ else if (key->offset < (u64)-1)
key->offset++;
- else if (key->type < (u8)-1 && key->type < sk->max_type) {
+ else if (key->type < (u8)-1) {
key->offset = 0;
key->type++;
- } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
+ } else if (key->objectid < (u64)-1) {
key->offset = 0;
key->type = 0;
key->objectid++;
@@ -2842,8 +2852,7 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
swap(inode1, inode2);
mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
- if (inode1 != inode2)
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
}
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -2861,8 +2870,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
swap(loff1, loff2);
}
lock_extent_range(inode1, loff1, len);
- if (inode1 != inode2)
- lock_extent_range(inode2, loff2, len);
+ lock_extent_range(inode2, loff2, len);
}
struct cmp_pages {
@@ -3787,13 +3795,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
goto out_fput;
if (!same_inode) {
- if (inode < src) {
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- }
+ btrfs_double_inode_lock(src, inode);
} else {
mutex_lock(&src->i_mutex);
}
@@ -3843,8 +3845,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
lock_extent_range(src, lock_start, lock_len);
} else {
- lock_extent_range(src, off, len);
- lock_extent_range(inode, destoff, len);
+ btrfs_double_extent_lock(src, off, inode, destoff, len);
}
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
@@ -3855,9 +3856,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
} else {
- unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
- unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
- destoff + len - 1);
+ btrfs_double_extent_unlock(src, off, inode, destoff, len);
}
/*
* Truncate page cache pages so that future reads will see the cloned
@@ -3866,17 +3865,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
truncate_inode_pages_range(&inode->i_data, destoff,
PAGE_CACHE_ALIGN(destoff + len) - 1);
out_unlock:
- if (!same_inode) {
- if (inode < src) {
- mutex_unlock(&src->i_mutex);
- mutex_unlock(&inode->i_mutex);
- } else {
- mutex_unlock(&inode->i_mutex);
- mutex_unlock(&src->i_mutex);
- }
- } else {
+ if (!same_inode)
+ btrfs_double_inode_unlock(src, inode);
+ else
mutex_unlock(&src->i_mutex);
- }
out_fput:
fdput(src_file);
out_drop_write:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f8229ef1b46d..d7e6baf1b205 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -241,6 +241,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
*/
void btrfs_tree_lock(struct extent_buffer *eb)
{
+ WARN_ON(eb->lock_owner == current->pid);
again:
wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0a02e24900aa..fcf7265ca46f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -61,9 +61,10 @@
#define RBIO_CACHE_SIZE 1024
enum btrfs_rbio_ops {
- BTRFS_RBIO_WRITE = 0,
- BTRFS_RBIO_READ_REBUILD = 1,
- BTRFS_RBIO_PARITY_SCRUB = 2,
+ BTRFS_RBIO_WRITE,
+ BTRFS_RBIO_READ_REBUILD,
+ BTRFS_RBIO_PARITY_SCRUB,
+ BTRFS_RBIO_REBUILD_MISSING,
};
struct btrfs_raid_bio {
@@ -602,6 +603,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
cur->operation == BTRFS_RBIO_PARITY_SCRUB)
return 0;
+ if (last->operation == BTRFS_RBIO_REBUILD_MISSING ||
+ cur->operation == BTRFS_RBIO_REBUILD_MISSING)
+ return 0;
+
return 1;
}
@@ -793,7 +798,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
if (next->operation == BTRFS_RBIO_READ_REBUILD)
async_read_rebuild(next);
- else if (next->operation == BTRFS_RBIO_WRITE) {
+ else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ steal_rbio(rbio, next);
+ async_read_rebuild(next);
+ } else if (next->operation == BTRFS_RBIO_WRITE) {
steal_rbio(rbio, next);
async_rmw_stripe(next);
} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
@@ -1805,7 +1813,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
faila = rbio->faila;
failb = rbio->failb;
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
spin_lock_irq(&rbio->bio_list_lock);
set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
spin_unlock_irq(&rbio->bio_list_lock);
@@ -1830,7 +1839,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
@@ -1939,7 +1949,8 @@ pstripe:
* if we're rebuilding a read, we have to use
* pages from the bio list
*/
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
+ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
(stripe == faila || stripe == failb)) {
page = page_in_rbio(rbio, stripe, pagenr, 0);
} else {
@@ -1961,6 +1972,8 @@ cleanup_io:
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
rbio_orig_end_io(rbio, err);
+ } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
+ rbio_orig_end_io(rbio, err);
} else if (err == 0) {
rbio->faila = -1;
rbio->failb = -1;
@@ -2096,7 +2109,8 @@ out:
return 0;
cleanup:
- if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
+ if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+ rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
rbio_orig_end_io(rbio, -EIO);
return -EIO;
}
@@ -2227,8 +2241,9 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
return rbio;
}
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
- struct page *page, u64 logical)
+/* Used for both parity scrub and missing. */
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+ u64 logical)
{
int stripe_offset;
int index;
@@ -2662,3 +2677,55 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
if (!lock_stripe_add(rbio))
async_scrub_parity(rbio);
}
+
+/* The following code is used for dev replace of a missing RAID 5/6 device. */
+
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 length)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = alloc_rbio(root, bbio, length);
+ if (IS_ERR(rbio))
+ return NULL;
+
+ rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
+ bio_list_add(&rbio->bio_list, bio);
+ /*
+ * This is a special bio which is used to hold the completion handler
+ * and make the scrub rbio is similar to the other types
+ */
+ ASSERT(!bio->bi_iter.bi_size);
+
+ rbio->faila = find_logical_bio_stripe(rbio, bio);
+ if (rbio->faila == -1) {
+ BUG();
+ kfree(rbio);
+ return NULL;
+ }
+
+ return rbio;
+}
+
+static void missing_raid56_work(struct btrfs_work *work)
+{
+ struct btrfs_raid_bio *rbio;
+
+ rbio = container_of(work, struct btrfs_raid_bio, work);
+ __raid56_parity_recover(rbio);
+}
+
+static void async_missing_raid56(struct btrfs_raid_bio *rbio)
+{
+ btrfs_init_work(&rbio->work, btrfs_rmw_helper,
+ missing_raid56_work, NULL, NULL);
+
+ btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
+}
+
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
+{
+ if (!lock_stripe_add(rbio))
+ async_missing_raid56(rbio);
+}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 2b5d7977d83b..8b694699d502 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -48,15 +48,21 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len);
+void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
+ u64 logical);
+
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
struct btrfs_bio *bbio, u64 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
-void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
- struct page *page, u64 logical);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
+struct btrfs_raid_bio *
+raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio,
+ struct btrfs_bio *bbio, u64 length);
+void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
+
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 0e7beea92b4c..4645cd16d5ba 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -328,6 +328,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
struct btrfs_device *prev_dev;
u32 blocksize;
u64 length;
+ int real_stripes;
int nzones = 0;
int i;
unsigned long index = logical >> PAGE_CACHE_SHIFT;
@@ -369,7 +370,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
goto error;
}
- for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
+ real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
+ for (nzones = 0; nzones < real_stripes; ++nzones) {
struct reada_zone *zone;
dev = bbio->stripes[nzones].dev;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 88cbb5995667..303babeef505 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2523,8 +2523,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
* counted. return -ENOENT if the block is root of reloc tree.
*/
static noinline_for_stack
-struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
- struct backref_node *node)
+struct btrfs_root *select_one_root(struct backref_node *node)
{
struct backref_node *next;
struct btrfs_root *root;
@@ -2912,7 +2911,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
return 0;
BUG_ON(node->processed);
- root = select_one_root(trans, node);
+ root = select_one_root(node);
if (root == ERR_PTR(-ENOENT)) {
update_processed_blocks(rc, node);
goto out;
@@ -3755,8 +3754,7 @@ out:
* helper to find next unprocessed extent
*/
static noinline_for_stack
-int find_next_extent(struct btrfs_trans_handle *trans,
- struct reloc_control *rc, struct btrfs_path *path,
+int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
struct btrfs_key *extent_key)
{
struct btrfs_key key;
@@ -3951,7 +3949,7 @@ restart:
continue;
}
- ret = find_next_extent(trans, rc, path, &key);
+ ret = find_next_extent(rc, path, &key);
if (ret < 0)
err = ret;
if (ret != 0)
@@ -3976,6 +3974,10 @@ restart:
sizeof(struct btrfs_extent_item_v0));
ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
&path_change);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
else
@@ -4140,7 +4142,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_key key;
- u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+ u64 objectid;
int err = 0;
root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
@@ -4215,14 +4217,12 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!rc->block_group);
- if (!rc->block_group->ro) {
- ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
- if (ret) {
- err = ret;
- goto out;
- }
- rw = 1;
+ ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
+ if (ret) {
+ err = ret;
+ goto out;
}
+ rw = 1;
path = btrfs_alloc_path();
if (!path) {
@@ -4294,7 +4294,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
if (err && rw)
- btrfs_set_block_group_rw(extent_root, rc->block_group);
+ btrfs_dec_block_group_ro(extent_root, rc->block_group);
iput(rc->data_inode);
btrfs_put_block_group(rc->block_group);
kfree(rc);
@@ -4594,8 +4594,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
* called before creating snapshot. it calculates metadata reservation
* requried for relocating tree blocks in the snapshot
*/
-void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
- struct btrfs_pending_snapshot *pending,
+void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve)
{
struct btrfs_root *root;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 9c146d8307b5..a39f5d1144e8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -125,6 +125,7 @@ struct scrub_block {
/* It is for the data with checksum */
unsigned int data_corrected:1;
};
+ struct btrfs_work work;
};
/* Used for the chunks with parity stripe such RAID5/6 */
@@ -332,11 +333,14 @@ static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
}
}
-static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
+}
+static void scrub_pause_off(struct btrfs_fs_info *fs_info)
+{
mutex_lock(&fs_info->scrub_lock);
__scrub_blocked_if_needed(fs_info);
atomic_dec(&fs_info->scrubs_paused);
@@ -345,6 +349,12 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->scrub_pause_wait);
}
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ scrub_pause_on(fs_info);
+ scrub_pause_off(fs_info);
+}
+
/*
* used for workers that require transaction commits (i.e., for the
* NOCOW case)
@@ -2074,21 +2084,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
sbio = sctx->bios[sctx->curr];
sctx->curr = -1;
scrub_pending_bio_inc(sctx);
-
- if (!sbio->bio->bi_bdev) {
- /*
- * this case should not happen. If btrfs_map_block() is
- * wrong, it could happen for dev-replace operations on
- * missing devices when no mirrors are available, but in
- * this case it should already fail the mount.
- * This case is handled correctly (but _very_ slowly).
- */
- printk_ratelimited(KERN_WARNING
- "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
- bio_io_error(sbio->bio);
- } else {
- btrfsic_submit_bio(READ, sbio->bio);
- }
+ btrfsic_submit_bio(READ, sbio->bio);
}
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
@@ -2165,6 +2161,134 @@ again:
return 0;
}
+static void scrub_missing_raid56_end_io(struct bio *bio)
+{
+ struct scrub_block *sblock = bio->bi_private;
+ struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info;
+
+ if (bio->bi_error)
+ sblock->no_io_error_seen = 0;
+
+ btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
+}
+
+static void scrub_missing_raid56_worker(struct btrfs_work *work)
+{
+ struct scrub_block *sblock = container_of(work, struct scrub_block, work);
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ unsigned int is_metadata;
+ unsigned int have_csum;
+ u8 *csum;
+ u64 generation;
+ u64 logical;
+ struct btrfs_device *dev;
+
+ is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
+ have_csum = sblock->pagev[0]->have_csum;
+ csum = sblock->pagev[0]->csum;
+ generation = sblock->pagev[0]->generation;
+ logical = sblock->pagev[0]->logical;
+ dev = sblock->pagev[0]->dev;
+
+ if (sblock->no_io_error_seen) {
+ scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+ have_csum, csum, generation,
+ sctx->csum_size);
+ }
+
+ if (!sblock->no_io_error_seen) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.read_errors++;
+ spin_unlock(&sctx->stat_lock);
+ printk_ratelimited_in_rcu(KERN_ERR
+ "BTRFS: I/O error rebulding logical %llu for dev %s\n",
+ logical, rcu_str_deref(dev->name));
+ } else if (sblock->header_error || sblock->checksum_error) {
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
+ printk_ratelimited_in_rcu(KERN_ERR
+ "BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+ logical, rcu_str_deref(dev->name));
+ } else {
+ scrub_write_block_to_dev_replace(sblock);
+ }
+
+ scrub_block_put(sblock);
+
+ if (sctx->is_dev_replace &&
+ atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+ mutex_lock(&sctx->wr_ctx.wr_lock);
+ scrub_wr_submit(sctx);
+ mutex_unlock(&sctx->wr_ctx.wr_lock);
+ }
+
+ scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_missing_raid56_pages(struct scrub_block *sblock)
+{
+ struct scrub_ctx *sctx = sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ u64 length = sblock->page_count * PAGE_SIZE;
+ u64 logical = sblock->pagev[0]->logical;
+ struct btrfs_bio *bbio;
+ struct bio *bio;
+ struct btrfs_raid_bio *rbio;
+ int ret;
+ int i;
+
+ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+ &bbio, 0, 1);
+ if (ret || !bbio || !bbio->raid_map)
+ goto bbio_out;
+
+ if (WARN_ON(!sctx->is_dev_replace ||
+ !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
+ /*
+ * We shouldn't be scrubbing a missing device. Even for dev
+ * replace, we should only get here for RAID 5/6. We either
+ * managed to mount something with no mirrors remaining or
+ * there's a bug in scrub_remap_extent()/btrfs_map_block().
+ */
+ goto bbio_out;
+ }
+
+ bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
+ if (!bio)
+ goto bbio_out;
+
+ bio->bi_iter.bi_sector = logical >> 9;
+ bio->bi_private = sblock;
+ bio->bi_end_io = scrub_missing_raid56_end_io;
+
+ rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length);
+ if (!rbio)
+ goto rbio_out;
+
+ for (i = 0; i < sblock->page_count; i++) {
+ struct scrub_page *spage = sblock->pagev[i];
+
+ raid56_add_scrub_pages(rbio, spage->page, spage->logical);
+ }
+
+ btrfs_init_work(&sblock->work, btrfs_scrub_helper,
+ scrub_missing_raid56_worker, NULL, NULL);
+ scrub_block_get(sblock);
+ scrub_pending_bio_inc(sctx);
+ raid56_submit_missing_rbio(rbio);
+ return;
+
+rbio_out:
+ bio_put(bio);
+bbio_out:
+ btrfs_put_bbio(bbio);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.malloc_errors++;
+ spin_unlock(&sctx->stat_lock);
+}
+
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum, int force,
@@ -2228,19 +2352,27 @@ leave_nomem:
}
WARN_ON(sblock->page_count == 0);
- for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev[index];
- int ret;
+ if (dev->missing) {
+ /*
+ * This case should only be hit for RAID 5/6 device replace. See
+ * the comment in scrub_missing_raid56_pages() for details.
+ */
+ scrub_missing_raid56_pages(sblock);
+ } else {
+ for (index = 0; index < sblock->page_count; index++) {
+ struct scrub_page *spage = sblock->pagev[index];
+ int ret;
- ret = scrub_add_page_to_rd_bio(sctx, spage);
- if (ret) {
- scrub_block_put(sblock);
- return ret;
+ ret = scrub_add_page_to_rd_bio(sctx, spage);
+ if (ret) {
+ scrub_block_put(sblock);
+ return ret;
+ }
}
- }
- if (force)
- scrub_submit(sctx);
+ if (force)
+ scrub_submit(sctx);
+ }
/* last one frees, either here or in bio completion for last page */
scrub_block_put(sblock);
@@ -2551,6 +2683,11 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
u8 csum[BTRFS_CSUM_SIZE];
u32 blocksize;
+ if (dev->missing) {
+ scrub_parity_mark_sectors_error(sparity, logical, len);
+ return 0;
+ }
+
if (flags & BTRFS_EXTENT_FLAG_DATA) {
blocksize = sctx->sectorsize;
} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
@@ -2689,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
sparity->nsectors))
goto out;
- length = sparity->logic_end - sparity->logic_start + 1;
+ length = sparity->logic_end - sparity->logic_start;
ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
sparity->logic_start,
&length, &bbio, 0, 1);
@@ -2712,8 +2849,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
goto rbio_out;
list_for_each_entry(spage, &sparity->spages, list)
- raid56_parity_add_scrub_pages(rbio, spage->page,
- spage->logical);
+ raid56_add_scrub_pages(rbio, spage->page, spage->logical);
scrub_pending_bio_inc(sctx);
raid56_parity_submit_scrub_rbio(rbio);
@@ -2761,6 +2897,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_root *csum_root = fs_info->csum_root;
struct btrfs_extent_item *extent;
+ struct btrfs_bio *bbio = NULL;
u64 flags;
int ret;
int slot;
@@ -2770,6 +2907,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
u64 extent_logical;
u64 extent_physical;
u64 extent_len;
+ u64 mapped_length;
struct btrfs_device *extent_dev;
struct scrub_parity *sparity;
int nsectors;
@@ -2843,6 +2981,10 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
}
btrfs_item_key_to_cpu(l, &key, slot);
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
if (key.type == BTRFS_METADATA_ITEM_KEY)
bytes = root->nodesize;
else
@@ -2851,11 +2993,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
if (key.objectid + bytes <= logic_start)
goto next;
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
- if (key.objectid > logic_end) {
+ if (key.objectid >= logic_end) {
stop_loop = 1;
break;
}
@@ -2868,11 +3006,12 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
flags = btrfs_extent_flags(l, extent);
generation = btrfs_extent_generation(l, extent);
- if (key.objectid < logic_start &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
- btrfs_err(fs_info,
- "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
- key.objectid, logic_start);
+ if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ (key.objectid < logic_start ||
+ key.objectid + bytes >
+ logic_start + map->stripe_len)) {
+ btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
+ key.objectid, logic_start);
goto next;
}
again:
@@ -2892,10 +3031,21 @@ again:
scrub_parity_mark_sectors_data(sparity, extent_logical,
extent_len);
- scrub_remap_extent(fs_info, extent_logical,
- extent_len, &extent_physical,
- &extent_dev,
- &extent_mirror_num);
+ mapped_length = extent_len;
+ ret = btrfs_map_block(fs_info, READ, extent_logical,
+ &mapped_length, &bbio, 0);
+ if (!ret) {
+ if (!bbio || mapped_length < extent_len)
+ ret = -EIO;
+ }
+ if (ret) {
+ btrfs_put_bbio(bbio);
+ goto out;
+ }
+ extent_physical = bbio->stripes[0].physical;
+ extent_mirror_num = bbio->mirror_num;
+ extent_dev = bbio->stripes[0].dev;
+ btrfs_put_bbio(bbio);
ret = btrfs_lookup_csums_range(csum_root,
extent_logical,
@@ -2910,10 +3060,12 @@ again:
extent_dev, flags,
generation,
extent_mirror_num);
+
+ scrub_free_csums(sctx);
+
if (ret)
goto out;
- scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
logic_start += map->stripe_len;
@@ -2942,7 +3094,7 @@ next:
out:
if (ret < 0)
scrub_parity_mark_sectors_error(sparity, logic_start,
- logic_end - logic_start + 1);
+ logic_end - logic_start);
scrub_parity_put(sparity);
scrub_submit(sctx);
mutex_lock(&sctx->wr_ctx.wr_lock);
@@ -3091,22 +3243,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
ret = 0;
while (physical < physical_end) {
- /* for raid56, we skip parity stripe */
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- ret = get_raid56_logic_offset(physical, num,
- map, &logical, &stripe_logical);
- logical += base;
- if (ret) {
- stripe_logical += base;
- stripe_end = stripe_logical + increment - 1;
- ret = scrub_raid56_parity(sctx, map, scrub_dev,
- ppath, stripe_logical,
- stripe_end);
- if (ret)
- goto out;
- goto skip;
- }
- }
/*
* canceled?
*/
@@ -3131,6 +3267,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
scrub_blocked_if_needed(fs_info);
}
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ ret = get_raid56_logic_offset(physical, num, map,
+ &logical,
+ &stripe_logical);
+ logical += base;
+ if (ret) {
+ /* it is parity strip */
+ stripe_logical += base;
+ stripe_end = stripe_logical + increment;
+ ret = scrub_raid56_parity(sctx, map, scrub_dev,
+ ppath, stripe_logical,
+ stripe_end);
+ if (ret)
+ goto out;
+ goto skip;
+ }
+ }
+
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
@@ -3175,6 +3329,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
}
btrfs_item_key_to_cpu(l, &key, slot);
+ if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+ key.type != BTRFS_METADATA_ITEM_KEY)
+ goto next;
+
if (key.type == BTRFS_METADATA_ITEM_KEY)
bytes = root->nodesize;
else
@@ -3183,10 +3341,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (key.objectid + bytes <= logical)
goto next;
- if (key.type != BTRFS_EXTENT_ITEM_KEY &&
- key.type != BTRFS_METADATA_ITEM_KEY)
- goto next;
-
if (key.objectid >= logical + map->stripe_len) {
/* out of this device extent */
if (key.objectid >= logic_end)
@@ -3199,8 +3353,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
flags = btrfs_extent_flags(l, extent);
generation = btrfs_extent_generation(l, extent);
- if (key.objectid < logical &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
+ (key.objectid < logical ||
+ key.objectid + bytes >
+ logical + map->stripe_len)) {
btrfs_err(fs_info,
"scrub: tree block %llu spanning "
"stripes, ignored. logical=%llu",
@@ -3234,9 +3390,11 @@ again:
&extent_dev,
&extent_mirror_num);
- ret = btrfs_lookup_csums_range(csum_root, logical,
- logical + map->stripe_len - 1,
- &sctx->csum_list, 1);
+ ret = btrfs_lookup_csums_range(csum_root,
+ extent_logical,
+ extent_logical +
+ extent_len - 1,
+ &sctx->csum_list, 1);
if (ret)
goto out;
@@ -3244,10 +3402,12 @@ again:
extent_physical, extent_dev, flags,
generation, extent_mirror_num,
extent_logical - logical + physical);
+
+ scrub_free_csums(sctx);
+
if (ret)
goto out;
- scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -3265,7 +3425,7 @@ loop:
if (ret && physical < physical_end) {
stripe_logical += base;
stripe_end = stripe_logical +
- increment - 1;
+ increment;
ret = scrub_raid56_parity(sctx,
map, scrub_dev, ppath,
stripe_logical,
@@ -3320,7 +3480,6 @@ out:
static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
- u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset, u64 length,
u64 dev_offset, int is_dev_replace)
{
@@ -3371,10 +3530,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
struct btrfs_root *root = sctx->dev_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 length;
- u64 chunk_tree;
- u64 chunk_objectid;
u64 chunk_offset;
- int ret;
+ int ret = 0;
int slot;
struct extent_buffer *l;
struct btrfs_key key;
@@ -3402,8 +3559,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
- if (ret)
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
break;
+ }
+ } else {
+ ret = 0;
}
}
@@ -3430,8 +3593,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (found_key.offset + length <= start)
goto skip;
- chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
- chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
/*
@@ -3445,12 +3606,27 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ /*
+ * we need call btrfs_inc_block_group_ro() with scrubs_paused,
+ * to avoid deadlock caused by:
+ * btrfs_inc_block_group_ro()
+ * -> btrfs_wait_for_commit()
+ * -> btrfs_commit_transaction()
+ * -> btrfs_scrub_pause()
+ */
+ scrub_pause_on(fs_info);
+ ret = btrfs_inc_block_group_ro(root, cache);
+ scrub_pause_off(fs_info);
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
- ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
- chunk_offset, length, found_key.offset,
- is_dev_replace);
+ ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
+ found_key.offset, is_dev_replace);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3470,8 +3646,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_inc(&fs_info->scrubs_paused);
- wake_up(&fs_info->scrub_pause_wait);
+
+ scrub_pause_on(fs_info);
/*
* must be called before we decrease @scrub_paused.
@@ -3482,11 +3658,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
atomic_read(&sctx->workers_pending) == 0);
atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
- mutex_lock(&fs_info->scrub_lock);
- __scrub_blocked_if_needed(fs_info);
- atomic_dec(&fs_info->scrubs_paused);
- mutex_unlock(&fs_info->scrub_lock);
- wake_up(&fs_info->scrub_pause_wait);
+ scrub_pause_off(fs_info);
+
+ btrfs_dec_block_group_ro(root, cache);
btrfs_put_block_group(cache);
if (ret)
@@ -3510,11 +3684,7 @@ skip:
btrfs_free_path(path);
- /*
- * ret can still be 1 from search_slot or next_leaf,
- * that's not an error
- */
- return ret < 0 ? ret : 0;
+ return ret;
}
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6bad63379a4c..2b07b3581781 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -69,7 +69,7 @@ static struct file_system_type btrfs_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
-static const char *btrfs_decode_error(int errno)
+const char *btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -1033,6 +1033,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_flags |= MS_POSIXACL;
#endif
sb->s_flags |= MS_I_VERSION;
+ sb->s_iflags |= SB_I_CGROUPWB;
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
printk(KERN_ERR "BTRFS: open_ctree failed\n");
@@ -1650,6 +1651,17 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
sb->s_flags |= MS_RDONLY;
+ /*
+ * Setting MS_RDONLY will put the cleaner thread to
+ * sleep at the next loop if it's already active.
+ * If it's already asleep, we'll leave unused block
+ * groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ mutex_lock(&root->fs_info->cleaner_mutex);
+ btrfs_delete_unused_bgs(fs_info);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
btrfs_pause_balance(fs_info);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f5021fcb154e..8f259b3a66b3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -258,6 +258,8 @@ loop:
mutex_init(&cur_trans->cache_write_mutex);
cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
+ INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+ spin_lock_init(&cur_trans->deleted_bgs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -1301,7 +1303,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
btrfs_set_skip_qgroup(trans, objectid);
- btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+ btrfs_reloc_pre_snapshot(pending, &to_reserve);
if (to_reserve > 0) {
pending->error = btrfs_block_rsv_add(root,
@@ -1638,9 +1640,7 @@ static void do_async_commit(struct work_struct *work)
* Tell lockdep about it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
- rwsem_acquire_read(
- &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
+ __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS);
current->journal_info = ac->newtrans;
@@ -1679,9 +1679,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
* async commit thread will be the one to unlock it.
*/
if (ac->newtrans->type & __TRANS_FREEZABLE)
- rwsem_release(
- &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 1, _THIS_IP_);
+ __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS);
schedule_work(&ac->work);
@@ -1893,8 +1891,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
spin_unlock(&root->fs_info->trans_lock);
wait_for_commit(root, prev_trans);
+ ret = prev_trans->aborted;
btrfs_put_transaction(prev_trans);
+ if (ret)
+ goto cleanup_transaction;
} else {
spin_unlock(&root->fs_info->trans_lock);
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index eb09c2067fa8..edc2fbc262d7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -74,6 +74,8 @@ struct btrfs_transaction {
*/
struct mutex cache_write_mutex;
spinlock_t dirty_bgs_lock;
+ struct list_head deleted_bgs;
+ spinlock_t deleted_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
int dirty_bg_run;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a4b9c8b2d35a..f31db4325339 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -115,8 +115,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
}
out:
- if (path)
- btrfs_free_path(path);
+ btrfs_free_path(path);
if (ret == -EAGAIN) {
if (root->defrag_max.objectid > root->defrag_progress.objectid)
goto done;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c45431e69ab..1bbaace73383 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_log_ctx *ctx)
{
- int index;
- int ret;
+ int ret = 0;
mutex_lock(&root->log_mutex);
+
if (root->log_root) {
if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
goto out;
}
+
if (!root->log_start_pid) {
- root->log_start_pid = current->pid;
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
} else if (root->log_start_pid != current->pid) {
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
+ } else {
+ mutex_lock(&root->fs_info->tree_log_mutex);
+ if (!root->fs_info->log_root_tree)
+ ret = btrfs_init_log_root_tree(trans, root->fs_info);
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
- atomic_inc(&root->log_batch);
- atomic_inc(&root->log_writers);
- if (ctx) {
- index = root->log_transid % 2;
- list_add_tail(&ctx->list, &root->log_ctxs[index]);
- ctx->log_transid = root->log_transid;
- }
- mutex_unlock(&root->log_mutex);
- return 0;
- }
-
- ret = 0;
- mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree)
- ret = btrfs_init_log_root_tree(trans, root->fs_info);
- mutex_unlock(&root->fs_info->tree_log_mutex);
- if (ret)
- goto out;
-
- if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
+
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
}
- clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
- root->log_start_pid = current->pid;
+
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
if (ctx) {
- index = root->log_transid % 2;
+ int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
}
+
out:
mutex_unlock(&root->log_mutex);
return ret;
@@ -731,12 +722,66 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
&ordered_sums, 0);
if (ret)
goto out;
+ /*
+ * Now delete all existing cums in the csum root that
+ * cover our range. We do this because we can have an
+ * extent that is completely referenced by one file
+ * extent item and partially referenced by another
+ * file extent item (like after using the clone or
+ * extent_same ioctls). In this case if we end up doing
+ * the replay of the one that partially references the
+ * extent first, and we do not do the csum deletion
+ * below, we can get 2 csum items in the csum tree that
+ * overlap each other. For example, imagine our log has
+ * the two following file extent items:
+ *
+ * key (257 EXTENT_DATA 409600)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 20480 nr 20480 ram 102400
+ *
+ * key (257 EXTENT_DATA 819200)
+ * extent data disk byte 12845056 nr 102400
+ * extent data offset 0 nr 102400 ram 102400
+ *
+ * Where the second one fully references the 100K extent
+ * that starts at disk byte 12845056, and the log tree
+ * has a single csum item that covers the entire range
+ * of the extent:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ *
+ * After the first file extent item is replayed, the
+ * csum tree gets the following csum item:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which covers the 20K sub-range starting at offset 20K
+ * of our extent. Now when we replay the second file
+ * extent item, if we do not delete existing csum items
+ * that cover any of its blocks, we end up getting two
+ * csum items in our csum tree that overlap each other:
+ *
+ * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
+ * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
+ *
+ * Which is a problem, because after this anyone trying
+ * to lookup up for the checksum of any block of our
+ * extent starting at an offset of 40K or higher, will
+ * end up looking at the second csum item only, which
+ * does not contain the checksum for any block starting
+ * at offset 40K or higher of our extent.
+ */
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
if (!ret)
+ ret = btrfs_del_csums(trans,
+ root->fs_info->csum_root,
+ sums->bytenr,
+ sums->len);
+ if (!ret)
ret = btrfs_csum_file_blocks(trans,
root->fs_info->csum_root,
sums);
@@ -1549,9 +1594,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
*/
static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path,
u64 dirid, u64 index,
- char *name, int name_len, u8 type,
+ char *name, int name_len,
struct btrfs_key *location)
{
struct inode *inode;
@@ -1613,6 +1657,9 @@ static bool name_in_log_ref(struct btrfs_root *log_root,
* not exist in the FS, it is skipped. fsyncs on directories
* do not force down inodes inside that directory, just changes to the
* names or unlinks in a directory.
+ *
+ * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
+ * non-existing inode) and 1 if the name was replayed.
*/
static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -1631,6 +1678,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
int exists;
int ret = 0;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
+ bool name_added = false;
dir = read_one_inode(root, key->objectid);
if (!dir)
@@ -1708,6 +1756,8 @@ out:
}
kfree(name);
iput(dir);
+ if (!ret && name_added)
+ ret = 1;
return ret;
insert:
@@ -1719,10 +1769,12 @@ insert:
goto out;
}
btrfs_release_path(path);
- ret = insert_one_name(trans, root, path, key->objectid, key->offset,
- name, name_len, log_type, &log_key);
+ ret = insert_one_name(trans, root, key->objectid, key->offset,
+ name, name_len, &log_key);
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
+ if (!ret)
+ name_added = true;
update_size = false;
ret = 0;
goto out;
@@ -1740,12 +1792,13 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- int ret;
+ int ret = 0;
u32 item_size = btrfs_item_size_nr(eb, slot);
struct btrfs_dir_item *di;
int name_len;
unsigned long ptr;
unsigned long ptr_end;
+ struct btrfs_path *fixup_path = NULL;
ptr = btrfs_item_ptr_offset(eb, slot);
ptr_end = ptr + item_size;
@@ -1755,12 +1808,59 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
- if (ret)
- return ret;
+ if (ret < 0)
+ break;
ptr = (unsigned long)(di + 1);
ptr += name_len;
+
+ /*
+ * If this entry refers to a non-directory (directories can not
+ * have a link count > 1) and it was added in the transaction
+ * that was not committed, make sure we fixup the link count of
+ * the inode it the entry points to. Otherwise something like
+ * the following would result in a directory pointing to an
+ * inode with a wrong link that does not account for this dir
+ * entry:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * touch testdir/bar
+ * sync
+ *
+ * ln testdir/bar testdir/bar_link
+ * ln testdir/foo testdir/foo_link
+ * xfs_io -c "fsync" testdir/bar
+ *
+ * <power failure>
+ *
+ * mount fs, log replay happens
+ *
+ * File foo would remain with a link count of 1 when it has two
+ * entries pointing to it in the directory testdir. This would
+ * make it impossible to ever delete the parent directory has
+ * it would result in stale dentries that can never be deleted.
+ */
+ if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
+ struct btrfs_key di_key;
+
+ if (!fixup_path) {
+ fixup_path = btrfs_alloc_path();
+ if (!fixup_path) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ btrfs_dir_item_key_to_cpu(eb, di, &di_key);
+ ret = link_to_fixup_dir(trans, root, fixup_path,
+ di_key.objectid);
+ if (ret)
+ break;
+ }
+ ret = 0;
}
- return 0;
+ btrfs_free_path(fixup_path);
+ return ret;
}
/*
@@ -2535,8 +2635,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static void wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2561,8 +2660,7 @@ static void wait_log_commit(struct btrfs_trans_handle *trans,
atomic_read(&root->log_commit[index]));
}
-static void wait_for_writer(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
{
DEFINE_WAIT(wait);
@@ -2642,7 +2740,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, log_transid);
+ wait_log_commit(root, log_transid);
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
@@ -2651,7 +2749,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, log_transid - 1);
+ wait_log_commit(root, log_transid - 1);
while (1) {
int batch = atomic_read(&root->log_batch);
@@ -2662,7 +2760,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
}
- wait_for_writer(trans, root);
+ wait_for_writer(root);
if (batch == atomic_read(&root->log_batch))
break;
}
@@ -2759,7 +2857,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
mark);
btrfs_wait_logged_extents(trans, log, log_transid);
- wait_log_commit(trans, log_root_tree,
+ wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
if (!ret)
@@ -2770,11 +2868,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
- wait_log_commit(trans, log_root_tree,
+ wait_log_commit(log_root_tree,
root_log_ctx.log_transid - 1);
}
- wait_for_writer(trans, log_root_tree);
+ wait_for_writer(log_root_tree);
/*
* now that we've moved on to the tree of log tree roots,
@@ -4904,6 +5002,94 @@ next_dir_inode:
return ret;
}
+static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct btrfs_log_ctx *ctx)
+{
+ int ret;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ const u64 ino = btrfs_ino(inode);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+
+ key.objectid = ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ u32 cur_offset = 0;
+ u32 item_size;
+ unsigned long ptr;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
+ if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
+ break;
+
+ item_size = btrfs_item_size_nr(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
+ while (cur_offset < item_size) {
+ struct btrfs_key inode_key;
+ struct inode *dir_inode;
+
+ inode_key.type = BTRFS_INODE_ITEM_KEY;
+ inode_key.offset = 0;
+
+ if (key.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)
+ (ptr + cur_offset);
+ inode_key.objectid = btrfs_inode_extref_parent(
+ leaf, extref);
+ cur_offset += sizeof(*extref);
+ cur_offset += btrfs_inode_extref_name_len(leaf,
+ extref);
+ } else {
+ inode_key.objectid = key.offset;
+ cur_offset = item_size;
+ }
+
+ dir_inode = btrfs_iget(root->fs_info->sb, &inode_key,
+ root, NULL);
+ /* If parent inode was deleted, skip it. */
+ if (IS_ERR(dir_inode))
+ continue;
+
+ ret = btrfs_log_inode(trans, root, dir_inode,
+ LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ iput(dir_inode);
+ if (ret)
+ goto out;
+ }
+ path->slots[0]++;
+ }
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
/*
* helper function around btrfs_log_inode to make sure newly created
* parent directories also end up in the log. A minimal inode and backref
@@ -4923,9 +5109,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
- const struct dentry * const first_parent = parent;
- const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
- last_committed);
bool log_dentries = false;
struct inode *orig_inode = inode;
@@ -4986,6 +5169,53 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
log_dentries = true;
+ /*
+ * On unlink we must make sure all our current and old parent directores
+ * inodes are fully logged. This is to prevent leaving dangling
+ * directory index entries in directories that were our parents but are
+ * not anymore. Not doing this results in old parent directory being
+ * impossible to delete after log replay (rmdir will always fail with
+ * error -ENOTEMPTY).
+ *
+ * Example 1:
+ *
+ * mkdir testdir
+ * touch testdir/foo
+ * ln testdir/foo testdir/bar
+ * sync
+ * unlink testdir/bar
+ * xfs_io -c fsync testdir/foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * If we don't log the parent directory (testdir), after log replay the
+ * directory still has an entry pointing to the file inode using the bar
+ * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
+ * the file inode has a link count of 1.
+ *
+ * Example 2:
+ *
+ * mkdir testdir
+ * touch foo
+ * ln foo testdir/foo2
+ * ln foo testdir/foo3
+ * sync
+ * unlink testdir/foo3
+ * xfs_io -c fsync foo
+ * <power failure>
+ * mount fs, triggers log replay
+ *
+ * Similar as the first example, after log replay the parent directory
+ * testdir still has an entry pointing to the inode file with name foo3
+ * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
+ * and has a link count of 2.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ ret = btrfs_log_all_parents(trans, orig_inode, ctx);
+ if (ret)
+ goto end_trans;
+ }
+
while (1) {
if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
break;
@@ -4994,23 +5224,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (root != BTRFS_I(inode)->root)
break;
- /*
- * On unlink we must make sure our immediate parent directory
- * inode is fully logged. This is to prevent leaving dangling
- * directory index entries and a wrong directory inode's i_size.
- * Not doing so can result in a directory being impossible to
- * delete after log replay (rmdir will always fail with error
- * -ENOTEMPTY).
- */
- if (did_unlink && parent == first_parent)
- inode_only = LOG_INODE_ALL;
- else
- inode_only = LOG_INODE_EXISTS;
-
- if (BTRFS_I(inode)->generation >
- root->fs_info->last_trans_committed ||
- inode_only == LOG_INODE_ALL) {
- ret = btrfs_log_inode(trans, root, inode, inode_only,
+ if (BTRFS_I(inode)->generation > last_committed) {
+ ret = btrfs_log_inode(trans, root, inode,
+ LOG_INODE_EXISTS,
0, LLONG_MAX, ctx);
if (ret)
goto end_trans;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 762476f5f08d..6fc735869c18 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1116,15 +1116,18 @@ out:
return ret;
}
-static int contains_pending_extent(struct btrfs_trans_handle *trans,
+static int contains_pending_extent(struct btrfs_transaction *transaction,
struct btrfs_device *device,
u64 *start, u64 len)
{
+ struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
struct extent_map *em;
- struct list_head *search_list = &trans->transaction->pending_chunks;
+ struct list_head *search_list = &fs_info->pinned_chunks;
int ret = 0;
u64 physical_start = *start;
+ if (transaction)
+ search_list = &transaction->pending_chunks;
again:
list_for_each_entry(em, search_list, list) {
struct map_lookup *map;
@@ -1159,8 +1162,8 @@ again:
}
}
}
- if (search_list == &trans->transaction->pending_chunks) {
- search_list = &trans->root->fs_info->pinned_chunks;
+ if (search_list != &fs_info->pinned_chunks) {
+ search_list = &fs_info->pinned_chunks;
goto again;
}
@@ -1169,12 +1172,13 @@ again:
/*
- * find_free_dev_extent - find free space in the specified device
- * @device: the device which we search the free space in
- * @num_bytes: the size of the free space that we need
- * @start: store the start of the free space.
- * @len: the size of the free space. that we find, or the size of the max
- * free space if we don't find suitable free space
+ * find_free_dev_extent_start - find free space in the specified device
+ * @device: the device which we search the free space in
+ * @num_bytes: the size of the free space that we need
+ * @search_start: the position from which to begin the search
+ * @start: store the start of the free space.
+ * @len: the size of the free space. that we find, or the size
+ * of the max free space if we don't find suitable free space
*
* this uses a pretty simple search, the expectation is that it is
* called very infrequently and that a given device has a small number
@@ -1188,9 +1192,9 @@ again:
* But if we don't find suitable free space, it is used to store the size of
* the max free space.
*/
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device, u64 num_bytes,
- u64 *start, u64 *len)
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 search_start, u64 *start, u64 *len)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
@@ -1200,19 +1204,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
u64 max_hole_start;
u64 max_hole_size;
u64 extent_end;
- u64 search_start;
u64 search_end = device->total_bytes;
int ret;
int slot;
struct extent_buffer *l;
- /* FIXME use last free of some kind */
-
- /* we don't want to overwrite the superblock on the drive,
- * so we make sure to start at an offset of at least 1MB
- */
- search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1273,7 +1269,7 @@ again:
* Have to check before we set max_hole_start, otherwise
* we could end up sending back this offset anyway.
*/
- if (contains_pending_extent(trans, device,
+ if (contains_pending_extent(transaction, device,
&search_start,
hole_size)) {
if (key.offset >= search_start) {
@@ -1322,7 +1318,7 @@ next:
if (search_end > search_start) {
hole_size = search_end - search_start;
- if (contains_pending_extent(trans, device, &search_start,
+ if (contains_pending_extent(transaction, device, &search_start,
hole_size)) {
btrfs_release_path(path);
goto again;
@@ -1348,6 +1344,24 @@ out:
return ret;
}
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *len)
+{
+ struct btrfs_root *root = device->dev_root;
+ u64 search_start;
+
+ /* FIXME use last free of some kind */
+
+ /*
+ * we don't want to overwrite the superblock on the drive,
+ * so we make sure to start at an offset of at least 1MB
+ */
+ search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
+ return find_free_dev_extent_start(trans->transaction, device,
+ num_bytes, search_start, start, len);
+}
+
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 start, u64 *dev_extent_len)
@@ -2755,9 +2769,7 @@ out:
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_root *root,
- u64 chunk_objectid,
- u64 chunk_offset)
+static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
{
struct btrfs_root *extent_root;
struct btrfs_trans_handle *trans;
@@ -2785,7 +2797,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
return -ENOSPC;
/* step one, relocate all the extents inside this chunk */
+ btrfs_scrub_pause(root);
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
+ btrfs_scrub_continue(root);
if (ret)
return ret;
@@ -2855,7 +2869,6 @@ again:
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(chunk_root,
- found_key.objectid,
found_key.offset);
if (ret == -ENOSPC)
failed++;
@@ -3375,7 +3388,6 @@ again:
}
ret = btrfs_relocate_chunk(chunk_root,
- found_key.objectid,
found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
@@ -3573,23 +3585,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
} while (read_seqretry(&fs_info->profiles_lock, seq));
if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
- int num_tolerated_disk_barrier_failures;
- u64 target = bctl->sys.target;
-
- num_tolerated_disk_barrier_failures =
- btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
- if (num_tolerated_disk_barrier_failures > 0 &&
- (target &
- (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
- num_tolerated_disk_barrier_failures = 0;
- else if (num_tolerated_disk_barrier_failures > 1 &&
- (target &
- (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
- num_tolerated_disk_barrier_failures = 1;
-
- fs_info->num_tolerated_disk_barrier_failures =
- num_tolerated_disk_barrier_failures;
+ fs_info->num_tolerated_disk_barrier_failures = min(
+ btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
+ btrfs_get_num_tolerated_disk_barrier_failures(
+ bctl->sys.target));
}
ret = insert_balance_item(fs_info->tree_root, bctl);
@@ -4077,7 +4076,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
u64 length;
- u64 chunk_objectid;
u64 chunk_offset;
int ret;
int slot;
@@ -4154,11 +4152,10 @@ again:
break;
}
- chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
btrfs_release_path(path);
- ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
+ ret = btrfs_relocate_chunk(root, chunk_offset);
mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
if (ret && ret != -ENOSPC)
goto done;
@@ -4200,7 +4197,8 @@ again:
u64 start = new_size;
u64 len = old_size - new_size;
- if (contains_pending_extent(trans, device, &start, len)) {
+ if (contains_pending_extent(trans->transaction, device,
+ &start, len)) {
unlock_chunks(root);
checked_pending_chunks = true;
failed = 0;
@@ -5071,9 +5069,7 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
* and the stripes
*/
sizeof(u64) * (total_stripes),
- GFP_NOFS);
- if (!bbio)
- return NULL;
+ GFP_NOFS|__GFP_NOFAIL);
atomic_set(&bbio->error, 0);
atomic_set(&bbio->refs, 1);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 95842a909e7f..2ca784a14e84 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -453,6 +453,9 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent_start(struct btrfs_transaction *transaction,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 search_start, u64 *start, u64 *max_avail);
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 890c50971a69..9d23e788d1df 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
- if (rc < 0)
+ if (rc < 0 && rc != ENOENT)
goto unlock;
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
@@ -717,8 +717,10 @@ static int ceph_writepages_start(struct address_space *mapping,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
- if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
pr_warn("writepage_start %p on forced umount\n", inode);
+ truncate_pagecache(inode, 0);
+ mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
}
if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@ -1593,7 +1595,7 @@ out:
return err;
}
-static struct vm_operations_struct ceph_vmops = {
+static const struct vm_operations_struct ceph_vmops = {
.fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
};
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ddd5e9471290..27b566874bc1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2413,6 +2413,14 @@ again:
goto out_unlock;
}
+ if (!__ceph_is_any_caps(ci) &&
+ ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("get_cap_refs %p forced umount\n", inode);
+ *err = -EIO;
+ ret = 1;
+ goto out_unlock;
+ }
+
dout("get_cap_refs %p have %s needed %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8b79d87eaf46..0c62868b5c56 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -136,7 +136,6 @@ int ceph_open(struct inode *inode, struct file *file)
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
struct ceph_file_info *cf = file->private_data;
- struct inode *parent_inode = NULL;
int err;
int flags, fmode, wanted;
@@ -210,10 +209,7 @@ int ceph_open(struct inode *inode, struct file *file)
ihold(inode);
req->r_num_caps = 1;
- if (flags & O_CREAT)
- parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
if (!err)
err = ceph_init_file(inode, file, req->r_fmode);
ceph_mdsc_put_request(req);
@@ -279,7 +275,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (err)
goto out_req;
- if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
+ if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
if (d_unhashed(dentry)) {
@@ -956,6 +952,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
+ if (iocb->ki_flags & IOCB_APPEND) {
+ err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
+ if (err < 0)
+ goto out;
+ }
+
err = generic_write_checks(iocb, from);
if (err <= 0)
goto out;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6aa07af67603..51cb02da75d9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2107,7 +2107,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
msg = create_request_message(mdsc, req, mds, drop_cap_releases);
if (IS_ERR(msg)) {
req->r_err = PTR_ERR(msg);
- complete_request(mdsc, req);
return PTR_ERR(msg);
}
req->r_request = msg;
@@ -2135,7 +2134,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *session = NULL;
int mds = -1;
- int err = -EAGAIN;
+ int err = 0;
if (req->r_err || req->r_got_result) {
if (req->r_aborted)
@@ -2149,6 +2148,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
err = -EIO;
goto finish;
}
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+ dout("do_request forced umount\n");
+ err = -EIO;
+ goto finish;
+ }
put_request_session(req);
@@ -2196,13 +2200,15 @@ static int __do_request(struct ceph_mds_client *mdsc,
out_session:
ceph_put_mds_session(session);
+finish:
+ if (err) {
+ dout("__do_request early error %d\n", err);
+ req->r_err = err;
+ complete_request(mdsc, req);
+ __unregister_request(mdsc, req);
+ }
out:
return err;
-
-finish:
- req->r_err = err;
- complete_request(mdsc, req);
- goto out;
}
/*
@@ -2289,8 +2295,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
if (req->r_err) {
err = req->r_err;
- __unregister_request(mdsc, req);
- dout("do_request early error %d\n", err);
goto out;
}
@@ -2411,7 +2415,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&mdsc->mutex);
goto out;
}
- if (req->r_got_safe && !head->safe) {
+ if (req->r_got_safe) {
pr_warn("got unsafe after safe on %llu from mds%d\n",
tid, mds);
mutex_unlock(&mdsc->mutex);
@@ -2520,8 +2524,7 @@ out_err:
if (err) {
req->r_err = err;
} else {
- req->r_reply = msg;
- ceph_msg_get(msg);
+ req->r_reply = ceph_msg_get(msg);
req->r_got_result = true;
}
} else {
@@ -3555,7 +3558,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
u64 want_tid, want_flush, want_snap;
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return;
dout("sync\n");
@@ -3584,7 +3587,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
*/
static bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
- if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
+ if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return true;
return atomic_read(&mdsc->num_sessions) == 0;
}
@@ -3643,6 +3646,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
dout("stopped\n");
}
+void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
+{
+ struct ceph_mds_session *session;
+ int mds;
+
+ dout("force umount\n");
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ if (!session)
+ continue;
+ mutex_unlock(&mdsc->mutex);
+ mutex_lock(&session->s_mutex);
+ __close_session(mdsc, session);
+ if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ cleanup_session_requests(mdsc, session);
+ remove_session_caps(session);
+ }
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, mds);
+ }
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+}
+
static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
{
dout("stop\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 762757e6cebf..f575eafe2261 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -366,6 +366,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
+extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 233d906aec02..4aa7122a8d38 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
return 0;
}
- if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
- ceph_get_snap_context(ceph_empty_snapc);
- snapc = ceph_empty_snapc;
- goto done;
- }
-
/* alloc new snap context */
err = -ENOMEM;
if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
realm->ino, realm, snapc, snapc->seq,
(unsigned int) snapc->num_snaps);
-done:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
return 0;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d1c833c321b9..f446afada328 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
- seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
+ seq_show_option(m, "snapdirname", fsopt->snapdir_name);
return 0;
}
@@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb)
if (!fsc)
return;
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
+ ceph_mdsc_force_umount(fsc->mdsc);
return;
}
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
new file mode 100644
index 000000000000..0065256881d8
--- /dev/null
+++ b/fs/cifs/cifs_ioctl.h
@@ -0,0 +1,42 @@
+/*
+ * fs/cifs/cifs_ioctl.h
+ *
+ * Structure definitions for io control for cifs/smb3
+ *
+ * Copyright (c) 2015 Steve French <steve.french@primarydata.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU Lesser General Public License for more details.
+ *
+ */
+
+struct smb_mnt_fs_info {
+ __u32 version; /* 0001 */
+ __u16 protocol_id;
+ __u16 tcon_flags;
+ __u32 vol_serial_number;
+ __u32 vol_create_time;
+ __u32 share_caps;
+ __u32 share_flags;
+ __u32 sector_flags;
+ __u32 optimal_sector_size;
+ __u32 max_bytes_chunk;
+ __u32 fs_attributes;
+ __u32 max_path_component;
+ __u32 device_type;
+ __u32 device_characteristics;
+ __u32 maximal_access;
+ __u64 cifs_posix_caps;
+} __packed;
+
+#define CIFS_IOCTL_MAGIC 0xCF
+#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
+#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
+#define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a9fb6b53126..6a1119e87fbb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
struct sockaddr *srcaddr;
srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr;
- seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string);
+ seq_show_option(s, "vers", tcon->ses->server->vals->version_string);
cifs_show_security(s, tcon->ses);
cifs_show_cache_flavor(s, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
seq_puts(s, ",multiuser");
else if (tcon->ses->user_name)
- seq_printf(s, ",username=%s", tcon->ses->user_name);
+ seq_show_option(s, "username", tcon->ses->user_name);
if (tcon->ses->domainName)
- seq_printf(s, ",domain=%s", tcon->ses->domainName);
+ seq_show_option(s, "domain", tcon->ses->domainName);
if (srcaddr->sa_family != AF_UNSPEC) {
struct sockaddr_in *saddr4;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a782b22904e4..27aea110e923 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.06"
+#define CIFS_VERSION "2.07"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 47b030da0781..f5b87303ce46 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2245,6 +2245,20 @@ typedef struct {
#define FILE_DEVICE_VIRTUAL_DISK 0x00000024
#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028
+/* Device Characteristics */
+#define FILE_REMOVABLE_MEDIA 0x00000001
+#define FILE_READ_ONLY_DEVICE 0x00000002
+#define FILE_FLOPPY_DISKETTE 0x00000004
+#define FILE_WRITE_ONCE_MEDIA 0x00000008
+#define FILE_REMOTE_DEVICE 0x00000010
+#define FILE_DEVICE_IS_MOUNTED 0x00000020
+#define FILE_VIRTUAL_VOLUME 0x00000040
+#define FILE_DEVICE_SECURE_OPEN 0x00000100
+#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000
+#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000
+#define FILE_PORTABLE_DEVICE 0x00004000
+#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000
+
typedef struct {
__le32 DeviceType;
__le32 DeviceCharacteristics;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 672ef35c9f73..90b4f9f7de66 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -696,7 +696,9 @@ cifs_echo_callback(struct mid_q_entry *mid)
{
struct TCP_Server_Info *server = mid->callback_data;
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, 1, CIFS_ECHO_OP);
}
@@ -1572,7 +1574,9 @@ cifs_readv_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &rdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, 1, 0);
}
@@ -2032,6 +2036,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
WRITE_RSP *smb = (WRITE_RSP *)mid->resp_buf;
@@ -2068,7 +2073,9 @@ cifs_writev_callback(struct mid_q_entry *mid)
}
queue_work(cifsiod_wq, &wdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(tcon->ses->server, 1, 0);
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3f50cee79df9..e2a6af1508af 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3216,7 +3216,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
return VM_FAULT_LOCKED;
}
-static struct vm_operations_struct cifs_file_vm_ops = {
+static const struct vm_operations_struct cifs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = cifs_page_mkwrite,
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 49b8b6e41a18..c63f5227b681 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -31,12 +31,9 @@
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#include "cifs_ioctl.h"
#include <linux/btrfs.h>
-#define CIFS_IOCTL_MAGIC 0xCF
-#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
-#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4)
-
static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
unsigned long srcfd, u64 off, u64 len, u64 destoff,
bool dup_extents)
@@ -135,6 +132,43 @@ out_drop_write:
return rc;
}
+static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon,
+ void __user *arg)
+{
+ int rc = 0;
+ struct smb_mnt_fs_info *fsinf;
+
+ fsinf = kzalloc(sizeof(struct smb_mnt_fs_info), GFP_KERNEL);
+ if (fsinf == NULL)
+ return -ENOMEM;
+
+ fsinf->version = 1;
+ fsinf->protocol_id = tcon->ses->server->vals->protocol_id;
+ fsinf->device_characteristics =
+ le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics);
+ fsinf->device_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
+ fsinf->fs_attributes = le32_to_cpu(tcon->fsAttrInfo.Attributes);
+ fsinf->max_path_component =
+ le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength);
+#ifdef CONFIG_CIFS_SMB2
+ fsinf->vol_serial_number = tcon->vol_serial_number;
+ fsinf->vol_create_time = le64_to_cpu(tcon->vol_create_time);
+ fsinf->share_flags = tcon->share_flags;
+ fsinf->share_caps = le32_to_cpu(tcon->capabilities);
+ fsinf->sector_flags = tcon->ss_flags;
+ fsinf->optimal_sector_size = tcon->perf_sector_size;
+ fsinf->max_bytes_chunk = tcon->max_bytes_chunk;
+ fsinf->maximal_access = tcon->maximal_access;
+#endif /* SMB2 */
+ fsinf->cifs_posix_caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+
+ if (copy_to_user(arg, fsinf, sizeof(struct smb_mnt_fs_info)))
+ rc = -EFAULT;
+
+ kfree(fsinf);
+ return rc;
+}
+
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
struct inode *inode = file_inode(filep);
@@ -148,8 +182,6 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
xid = get_xid();
- cifs_dbg(FYI, "ioctl file %p cmd %u arg %lu\n", filep, command, arg);
-
cifs_sb = CIFS_SB(inode->i_sb);
switch (command) {
@@ -228,6 +260,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
else
rc = -EOPNOTSUPP;
break;
+ case CIFS_IOC_GET_MNT_INFO:
+ tcon = tlink_tcon(pSMBFile->tlink);
+ rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg);
+ break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
break;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b8b4f08ee094..070fb2ad85ce 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1626,7 +1626,9 @@ smb2_echo_callback(struct mid_q_entry *mid)
if (mid->mid_state == MID_RESPONSE_RECEIVED)
credits_received = le16_to_cpu(smb2->hdr.CreditRequest);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, credits_received, CIFS_ECHO_OP);
}
@@ -1810,7 +1812,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_fail_inc(tcon, SMB2_READ_HE);
queue_work(cifsiod_wq, &rdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(server, credits_received, 0);
}
@@ -1938,6 +1942,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
{
struct cifs_writedata *wdata = mid->callback_data;
struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
unsigned int written;
struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
unsigned int credits_received = 1;
@@ -1977,7 +1982,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
queue_work(cifsiod_wq, &wdata->work);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
add_credits(tcon->ses->server, credits_received, 0);
}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 126f46b887cc..2a24c524fb9a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -644,7 +644,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
}
spin_unlock(&GlobalMid_Lock);
+ mutex_lock(&server->srv_mutex);
DeleteMidQEntry(mid);
+ mutex_unlock(&server->srv_mutex);
return rc;
}
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 9b1ffaa0572e..f6c6c8adbc01 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -353,7 +353,7 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
char *result;
insize = max_t(unsigned int,
- INSIZE(readlink), OUTSIZE(readlink)+ *length + 1);
+ INSIZE(readlink), OUTSIZE(readlink)+ *length);
UPARG(CODA_READLINK);
inp->coda_readlink.VFid = *fid;
@@ -361,8 +361,8 @@ int venus_readlink(struct super_block *sb, struct CodaFid *fid,
error = coda_upcall(coda_vcp(sb), insize, &outsize, inp);
if (!error) {
retlen = outp->coda_readlink.count;
- if ( retlen > *length )
- retlen = *length;
+ if (retlen >= *length)
+ retlen = *length - 1;
*length = retlen;
result = (char *)outp + (long)outp->coda_readlink.data;
memcpy(buffer, result, retlen);
diff --git a/fs/coredump.c b/fs/coredump.c
index c5ecde6f3eed..a8f75640ac86 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -513,10 +513,10 @@ void do_coredump(const siginfo_t *siginfo)
const struct cred *old_cred;
struct cred *cred;
int retval = 0;
- int flag = 0;
int ispipe;
struct files_struct *displaced;
- bool need_nonrelative = false;
+ /* require nonrelative corefile path and be extra careful */
+ bool need_suid_safe = false;
bool core_dumped = false;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
@@ -550,9 +550,8 @@ void do_coredump(const siginfo_t *siginfo)
*/
if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
/* Setuid core dump mode */
- flag = O_EXCL; /* Stop rewrite attacks */
cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
- need_nonrelative = true;
+ need_suid_safe = true;
}
retval = coredump_wait(siginfo->si_signo, &core_state);
@@ -633,7 +632,7 @@ void do_coredump(const siginfo_t *siginfo)
if (cprm.limit < binfmt->min_coredump)
goto fail_unlock;
- if (need_nonrelative && cn.corename[0] != '/') {
+ if (need_suid_safe && cn.corename[0] != '/') {
printk(KERN_WARNING "Pid %d(%s) can only dump core "\
"to fully qualified path!\n",
task_tgid_vnr(current), current->comm);
@@ -641,8 +640,35 @@ void do_coredump(const siginfo_t *siginfo)
goto fail_unlock;
}
+ /*
+ * Unlink the file if it exists unless this is a SUID
+ * binary - in that case, we're running around with root
+ * privs and don't want to unlink another user's coredump.
+ */
+ if (!need_suid_safe) {
+ mm_segment_t old_fs;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ /*
+ * If it doesn't exist, that's fine. If there's some
+ * other problem, we'll catch it at the filp_open().
+ */
+ (void) sys_unlink((const char __user *)cn.corename);
+ set_fs(old_fs);
+ }
+
+ /*
+ * There is a race between unlinking and creating the
+ * file, but if that causes an EEXIST here, that's
+ * fine - another process raced with us while creating
+ * the corefile, and the other process won. To userspace,
+ * what matters is that at least one of the two processes
+ * writes its coredump successfully, not which one.
+ */
cprm.file = filp_open(cn.corename,
- O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+ O_CREAT | 2 | O_NOFOLLOW |
+ O_LARGEFILE | O_EXCL,
0600);
if (IS_ERR(cprm.file))
goto fail_unlock;
@@ -659,11 +685,15 @@ void do_coredump(const siginfo_t *siginfo)
if (!S_ISREG(inode->i_mode))
goto close_fail;
/*
- * Dont allow local users get cute and trick others to coredump
- * into their pre-created files.
+ * Don't dump core if the filesystem changed owner or mode
+ * of the file during file creation. This is an issue when
+ * a process dumps core while its cwd is e.g. on a vfat
+ * filesystem.
*/
if (!uid_eq(inode->i_uid, current_fsuid()))
goto close_fail;
+ if ((inode->i_mode & 0677) != 0600)
+ goto close_fail;
if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
goto close_fail;
if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
diff --git a/fs/dax.c b/fs/dax.c
index a7f77e1fa18c..93bf2f990ace 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -17,12 +17,14 @@
#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
+#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
@@ -34,7 +36,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
might_sleep();
do {
- void *addr;
+ void __pmem *addr;
unsigned long pfn;
long count;
@@ -46,10 +48,7 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
if (pgsz > count)
pgsz = count;
- if (pgsz < PAGE_SIZE)
- memset(addr, 0, pgsz);
- else
- clear_page(addr);
+ clear_pmem(addr, pgsz);
addr += pgsz;
size -= pgsz;
count -= pgsz;
@@ -59,26 +58,29 @@ int dax_clear_blocks(struct inode *inode, sector_t block, long size)
}
} while (size);
+ wmb_pmem();
return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
-static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
+ unsigned blkbits)
{
unsigned long pfn;
sector_t sector = bh->b_blocknr << (blkbits - 9);
return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
}
-static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
- loff_t end)
+/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
+static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
+ loff_t pos, loff_t end)
{
loff_t final = end - pos + first; /* The final byte of the buffer */
if (first > 0)
- memset(addr, 0, first);
+ clear_pmem(addr, first);
if (final < size)
- memset(addr + final, 0, size - final);
+ clear_pmem(addr + final, size - final);
}
static bool buffer_written(struct buffer_head *bh)
@@ -106,14 +108,15 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
loff_t pos = start;
loff_t max = start;
loff_t bh_max = start;
- void *addr;
+ void __pmem *addr;
bool hole = false;
+ bool need_wmb = false;
if (iov_iter_rw(iter) != WRITE)
end = min(end, i_size_read(inode));
while (pos < end) {
- unsigned len;
+ size_t len;
if (pos == max) {
unsigned blkbits = inode->i_blkbits;
sector_t block = pos >> blkbits;
@@ -145,19 +148,23 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
retval = dax_get_addr(bh, &addr, blkbits);
if (retval < 0)
break;
- if (buffer_unwritten(bh) || buffer_new(bh))
+ if (buffer_unwritten(bh) || buffer_new(bh)) {
dax_new_buf(addr, retval, first, pos,
end);
+ need_wmb = true;
+ }
addr += first;
size = retval - first;
}
max = min(pos + size, end);
}
- if (iov_iter_rw(iter) == WRITE)
- len = copy_from_iter_nocache(addr, max - pos, iter);
- else if (!hole)
- len = copy_to_iter(addr, max - pos, iter);
+ if (iov_iter_rw(iter) == WRITE) {
+ len = copy_from_iter_pmem(addr, max - pos, iter);
+ need_wmb = true;
+ } else if (!hole)
+ len = copy_to_iter((void __force *)addr, max - pos,
+ iter);
else
len = iov_iter_zero(max - pos, iter);
@@ -168,6 +175,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
addr += len;
}
+ if (need_wmb)
+ wmb_pmem();
+
return (pos == start) ? retval : pos - start;
}
@@ -260,11 +270,13 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
static int copy_user_bh(struct page *to, struct buffer_head *bh,
unsigned blkbits, unsigned long vaddr)
{
- void *vfrom, *vto;
+ void __pmem *vfrom;
+ void *vto;
+
if (dax_get_addr(bh, &vfrom, blkbits) < 0)
return -EIO;
vto = kmap_atomic(to);
- copy_user_page(vto, vfrom, vaddr, to);
+ copy_user_page(vto, (void __force *)vfrom, vaddr, to);
kunmap_atomic(vto);
return 0;
}
@@ -272,16 +284,13 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
- void *addr;
+ void __pmem *addr;
unsigned long pfn;
pgoff_t size;
int error;
- i_mmap_lock_read(mapping);
-
/*
* Check truncate didn't happen while we were allocating a block.
* If it did, this block may or may not be still allocated to the
@@ -303,14 +312,14 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
goto out;
}
- if (buffer_unwritten(bh) || buffer_new(bh))
- clear_page(addr);
+ if (buffer_unwritten(bh) || buffer_new(bh)) {
+ clear_pmem(addr, PAGE_SIZE);
+ wmb_pmem();
+ }
error = vm_insert_mixed(vma, vaddr, pfn);
out:
- i_mmap_unlock_read(mapping);
-
return error;
}
@@ -372,15 +381,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* from a read fault and we've raced with a truncate
*/
error = -EIO;
- goto unlock_page;
+ goto unlock;
}
+ } else {
+ i_mmap_lock_write(mapping);
}
error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */
if (error)
- goto unlock_page;
+ goto unlock;
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -391,8 +402,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO;
if (error)
- goto unlock_page;
+ goto unlock;
} else {
+ i_mmap_unlock_write(mapping);
return dax_load_hole(mapping, page, vmf);
}
}
@@ -404,17 +416,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
else
clear_user_highpage(new_page, vaddr);
if (error)
- goto unlock_page;
+ goto unlock;
vmf->page = page;
if (!page) {
- i_mmap_lock_read(mapping);
/* Check we didn't race with truncate */
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
PAGE_SHIFT;
if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
error = -EIO;
- goto out;
+ goto unlock;
}
}
return VM_FAULT_LOCKED;
@@ -450,6 +460,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
}
+ if (!page)
+ i_mmap_unlock_write(mapping);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
@@ -458,11 +470,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
- unlock_page:
+ unlock:
if (page) {
unlock_page(page);
page_cache_release(page);
+ } else {
+ i_mmap_unlock_write(mapping);
}
+
goto out;
}
EXPORT_SYMBOL(__dax_fault);
@@ -494,6 +509,177 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
}
EXPORT_SYMBOL_GPL(dax_fault);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head bh;
+ unsigned blkbits = inode->i_blkbits;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ long length;
+ void __pmem *kaddr;
+ pgoff_t size, pgoff;
+ sector_t block, sector;
+ unsigned long pfn;
+ int result = 0;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED))
+ return VM_FAULT_FALLBACK;
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start)
+ return VM_FAULT_FALLBACK;
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+
+ pgoff = linear_page_index(vma, pmd_addr);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size)
+ return VM_FAULT_SIGBUS;
+ /* If the PMD would cover blocks out of the file */
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ return VM_FAULT_FALLBACK;
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+ bh.b_size = PMD_SIZE;
+ i_mmap_lock_write(mapping);
+ length = get_block(inode, block, &bh, write);
+ if (length)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * If the filesystem isn't willing to tell us the length of a hole,
+ * just fall back to PTEs. Calling get_block 512 times in a loop
+ * would be silly.
+ */
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+ goto fallback;
+
+ if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ int i;
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+ wmb_pmem();
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ result |= VM_FAULT_MAJOR;
+ }
+
+ /*
+ * If we allocated new storage, make sure no process has any
+ * zero pages covering this hole
+ */
+ if (buffer_new(&bh)) {
+ i_mmap_unlock_write(mapping);
+ unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+ i_mmap_lock_write(mapping);
+ }
+
+ /*
+ * If a truncate happened while we were allocating blocks, we may
+ * leave blocks allocated to the file that are beyond EOF. We can't
+ * take i_mutex here, so just leave them hanging; they'll be freed
+ * when the file is deleted.
+ */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ goto fallback;
+
+ if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ spinlock_t *ptl;
+ pmd_t entry;
+ struct page *zero_page = get_huge_zero_page();
+
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_none(*pmd)) {
+ spin_unlock(ptl);
+ goto fallback;
+ }
+
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
+ result = VM_FAULT_NOPAGE;
+ spin_unlock(ptl);
+ } else {
+ sector = bh.b_blocknr << (blkbits - 9);
+ length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+ bh.b_size);
+ if (length < 0) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ goto fallback;
+
+ result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ }
+
+ out:
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+ i_mmap_unlock_write(mapping);
+
+ return result;
+
+ fallback:
+ count_vm_event(THP_FAULT_FALLBACK);
+ result = VM_FAULT_FALLBACK;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+ complete_unwritten);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
@@ -548,11 +734,12 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
if (err < 0)
return err;
if (buffer_written(&bh)) {
- void *addr;
+ void __pmem *addr;
err = dax_get_addr(&bh, &addr, inode->i_blkbits);
if (err < 0)
return err;
- memset(addr + offset, 0, length);
+ clear_pmem(addr + offset, length);
+ wmb_pmem();
}
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 9b5fe503f6cb..5c33aeb0f68f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2718,7 +2718,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* This helper attempts to cope with remotely renamed directories
*
* It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock
+ * dentry->d_parent->d_inode->i_mutex, and rename_lock
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
@@ -2744,7 +2744,6 @@ out_unalias:
__d_move(alias, dentry, false);
ret = 0;
out_err:
- spin_unlock(&inode->i_lock);
if (m2)
mutex_unlock(m2);
if (m1)
@@ -2790,10 +2789,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
if (S_ISDIR(inode->i_mode)) {
struct dentry *new = __d_find_any_alias(inode);
if (unlikely(new)) {
+ /* The reference to new ensures it remains an alias */
+ spin_unlock(&inode->i_lock);
write_seqlock(&rename_lock);
if (unlikely(d_ancestor(new, dentry))) {
write_sequnlock(&rename_lock);
- spin_unlock(&inode->i_lock);
dput(new);
new = ERR_PTR(-ELOOP);
pr_warn_ratelimited(
@@ -2812,7 +2812,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
} else {
__d_move(new, dentry, false);
write_sequnlock(&rename_lock);
- spin_unlock(&inode->i_lock);
security_d_instantiate(new, inode);
}
iput(inode);
@@ -2926,6 +2925,13 @@ restart:
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
+ /* Escaped? */
+ if (dentry != vfsmnt->mnt_root) {
+ bptr = *buffer;
+ blen = *buflen;
+ error = 3;
+ break;
+ }
/* Global root? */
if (mnt != parent) {
dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 284f9aa0028b..6c55ade071c3 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -435,8 +435,8 @@ struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
-static ssize_t read_file_bool(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[3];
u32 *val = file->private_data;
@@ -449,9 +449,10 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
buf[2] = 0x00;
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
}
+EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
-static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
+ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
{
char buf[32];
size_t buf_size;
@@ -468,10 +469,11 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
return count;
}
+EXPORT_SYMBOL_GPL(debugfs_write_file_bool);
static const struct file_operations fops_bool = {
- .read = read_file_bool,
- .write = write_file_bool,
+ .read = debugfs_read_file_bool,
+ .write = debugfs_write_file_bool,
.open = simple_open,
.llseek = default_llseek,
};
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 5718cb9f7273..d72d52b90433 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
struct inode *inode, *toput_inode = NULL;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -27,13 +27,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
+
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
toput_inode = inode;
- spin_lock(&inode_sb_list_lock);
+
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
iput(toput_inode);
}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 97315f2f6816..80d6901493cf 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -258,8 +258,7 @@ void ecryptfs_destroy_mount_crypt_stat(
&mount_crypt_stat->global_auth_tok_list,
mount_crypt_stat_list) {
list_del(&auth_tok->mount_crypt_stat_list);
- if (auth_tok->global_auth_tok_key
- && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
+ if (!(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
key_put(auth_tok->global_auth_tok_key);
kmem_cache_free(ecryptfs_global_auth_tok_cache, auth_tok);
}
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 8db0b464483f..63cd2c147221 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -45,20 +45,20 @@
static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- int rc;
-
- if (!(lower_dentry->d_flags & DCACHE_OP_REVALIDATE))
- return 1;
+ int rc = 1;
if (flags & LOOKUP_RCU)
return -ECHILD;
- rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+ if (lower_dentry->d_flags & DCACHE_OP_REVALIDATE)
+ rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+
if (d_really_is_positive(dentry)) {
- struct inode *lower_inode =
- ecryptfs_inode_to_lower(d_inode(dentry));
+ struct inode *inode = d_inode(dentry);
- fsstack_copy_attr_all(d_inode(dentry), lower_inode);
+ fsstack_copy_attr_all(inode, ecryptfs_inode_to_lower(inode));
+ if (!inode->i_nlink)
+ return 0;
}
return rc;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3b57c9f83c9b..1982c3f11aec 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
@@ -31,6 +32,12 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return dax_fault(vma, vmf, ext2_get_block, NULL);
}
+static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+}
+
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
@@ -38,6 +45,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
+ .pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -49,7 +57,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
#else
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a3a404c5df2e..c60a248c640c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -25,6 +25,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 32071f5c1c26..fd1f28be5296 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2272,6 +2272,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc313ac5d3fa..113837e7ba98 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
@@ -195,7 +196,7 @@ out:
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
@@ -206,17 +207,74 @@ static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
- /* Is this the right get_block? */
+ int result;
+ handle_t *handle = NULL;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_fault(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
+}
+
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ int result;
+ handle_t *handle = NULL;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ bool write = flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ ext4_chunk_trans_blocks(inode,
+ PMD_SIZE / PAGE_SIZE));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_pmd_fault(vma, addr, pmd, flags,
+ ext4_get_block_dax, ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
+ return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -244,7 +302,7 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 4f6ac499f09e..2468261748b2 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/dax.h>
#include <linux/uio.h>
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 29f1af7c2cab..612fbcf76b5c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
@@ -3020,6 +3021,17 @@ static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_NO_LOCK);
}
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
+ if (create)
+ flags |= EXT4_GET_BLOCKS_CREATE;
+ ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result, flags);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee3878262a49..a63c7b0a10cf 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1776,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
}
if (sbi->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+ seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
if (sbi->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+ seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
#endif
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5fa588e933d5..587ac08eabb6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,8 +53,6 @@ struct wb_writeback_work {
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
unsigned int auto_free:1; /* free on completion */
- unsigned int single_wait:1;
- unsigned int single_done:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
@@ -88,7 +86,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60;
static inline struct inode *wb_inode(struct list_head *head)
{
- return list_entry(head, struct inode, i_wb_list);
+ return list_entry(head, struct inode, i_io_list);
}
/*
@@ -125,22 +123,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb)
}
/**
- * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
* @inode: inode to be moved
* @wb: target bdi_writeback
* @head: one of @wb->b_{dirty|io|more_io}
*
- * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
* Returns %true if @inode is the first occupant of the !dirty_time IO
* lists; otherwise, %false.
*/
-static bool inode_wb_list_move_locked(struct inode *inode,
+static bool inode_io_list_move_locked(struct inode *inode,
struct bdi_writeback *wb,
struct list_head *head)
{
assert_spin_locked(&wb->list_lock);
- list_move(&inode->i_wb_list, head);
+ list_move(&inode->i_io_list, head);
/* dirty_time doesn't count as dirty_io until expiration */
if (head != &wb->b_dirty_time)
@@ -151,19 +149,19 @@ static bool inode_wb_list_move_locked(struct inode *inode,
}
/**
- * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
* @inode: inode to be removed
* @wb: bdi_writeback @inode is being removed from
*
* Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
* clear %WB_has_dirty_io if all are empty afterwards.
*/
-static void inode_wb_list_del_locked(struct inode *inode,
+static void inode_io_list_del_locked(struct inode *inode,
struct bdi_writeback *wb)
{
assert_spin_locked(&wb->list_lock);
- list_del_init(&inode->i_wb_list);
+ list_del_init(&inode->i_io_list);
wb_io_lists_depopulated(wb);
}
@@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb)
static void wb_queue_work(struct bdi_writeback *wb,
struct wb_writeback_work *work)
{
- trace_writeback_queue(wb->bdi, work);
+ trace_writeback_queue(wb, work);
spin_lock_bh(&wb->work_lock);
- if (!test_bit(WB_registered, &wb->state)) {
- if (work->single_wait)
- work->single_done = 1;
+ if (!test_bit(WB_registered, &wb->state))
goto out_unlock;
- }
if (work->done)
atomic_inc(&work->done->cnt);
list_add_tail(&work->list, &wb->work_list);
@@ -351,7 +346,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
/*
* Once I_FREEING is visible under i_lock, the eviction path owns
- * the inode and we shouldn't modify ->i_wb_list.
+ * the inode and we shouldn't modify ->i_io_list.
*/
if (unlikely(inode->i_state & I_FREEING))
goto skip_switch;
@@ -390,16 +385,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* is always correct including from ->b_dirty_time. The transfer
* preserves @inode->dirtied_when ordering.
*/
- if (!list_empty(&inode->i_wb_list)) {
+ if (!list_empty(&inode->i_io_list)) {
struct inode *pos;
- inode_wb_list_del_locked(inode, old_wb);
+ inode_io_list_del_locked(inode, old_wb);
inode->i_wb = new_wb;
- list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+ list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
if (time_after_eq(inode->dirtied_when,
pos->dirtied_when))
break;
- inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+ inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
} else {
inode->i_wb = new_wb;
}
@@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
/**
* inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion
+ * @inode: inode to test for congestion (may be NULL)
* @cong_bits: mask of WB_[a]sync_congested bits to test
*
* Tests whether @inode is congested. @cong_bits is the mask of congestion
@@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io);
* determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
* associated with @inode is congested; otherwise, the root wb's congestion
* state is used.
+ *
+ * @inode is allowed to be NULL as this function is often called on
+ * mapping->host which is NULL for the swapper space.
*/
int inode_congested(struct inode *inode, int cong_bits)
{
@@ -738,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits)
EXPORT_SYMBOL_GPL(inode_congested);
/**
- * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
- * @bdi: bdi the work item was issued to
- * @work: work item to wait for
- *
- * Wait for the completion of @work which was issued to one of @bdi's
- * bdi_writeback's. The caller must have set @work->single_wait before
- * issuing it. This wait operates independently fo
- * wb_wait_for_completion() and also disables automatic freeing of @work.
- */
-static void wb_wait_for_single_work(struct backing_dev_info *bdi,
- struct wb_writeback_work *work)
-{
- if (WARN_ON_ONCE(!work->single_wait))
- return;
-
- wait_event(bdi->wb_waitq, work->single_done);
-
- /*
- * Paired with smp_wmb() in wb_do_writeback() and ensures that all
- * modifications to @work prior to assertion of ->single_done is
- * visible to the caller once this function returns.
- */
- smp_rmb();
-}
-
-/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
@@ -792,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
}
/**
- * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
- * @wb: target bdi_writeback
- * @base_work: source wb_writeback_work
- *
- * Try to make a clone of @base_work and issue it to @wb. If cloning
- * succeeds, %true is returned; otherwise, @base_work is issued directly
- * and %false is returned. In the latter case, the caller is required to
- * wait for @base_work's completion using wb_wait_for_single_work().
- *
- * A clone is auto-freed on completion. @base_work never is.
- */
-static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
- struct wb_writeback_work *base_work)
-{
- struct wb_writeback_work *work;
-
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- *work = *base_work;
- work->auto_free = 1;
- work->single_wait = 0;
- } else {
- work = base_work;
- work->auto_free = 0;
- work->single_wait = 1;
- }
- work->single_done = 0;
- wb_queue_work(wb, work);
- return work != base_work;
-}
-
-/**
* bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
* @bdi: target backing_dev_info
* @base_work: wb_writeback_work to issue
@@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
- long nr_pages = base_work->nr_pages;
- int next_blkcg_id = 0;
+ int next_memcg_id = 0;
struct bdi_writeback *wb;
struct wb_iter iter;
might_sleep();
restart:
rcu_read_lock();
- bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+ bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+ DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
+ struct wb_writeback_work fallback_work;
+ struct wb_writeback_work *work;
+ long nr_pages;
+
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
@@ -855,13 +799,30 @@ restart:
if (skip_if_busy && writeback_in_progress(wb))
continue;
- base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
- if (!wb_clone_and_queue_work(wb, base_work)) {
- next_blkcg_id = wb->blkcg_css->id + 1;
- rcu_read_unlock();
- wb_wait_for_single_work(bdi, base_work);
- goto restart;
+ nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 1;
+ wb_queue_work(wb, work);
+ continue;
}
+
+ /* alloc failed, execute synchronously using on-stack fallback */
+ work = &fallback_work;
+ *work = *base_work;
+ work->nr_pages = nr_pages;
+ work->auto_free = 0;
+ work->done = &fallback_work_done;
+
+ wb_queue_work(wb, work);
+
+ next_memcg_id = wb->memcg_css->id + 1;
+ rcu_read_unlock();
+ wb_wait_for_completion(bdi, &fallback_work_done);
+ goto restart;
}
rcu_read_unlock();
}
@@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) {
base_work->auto_free = 0;
- base_work->single_wait = 0;
- base_work->single_done = 0;
wb_queue_work(&bdi->wb, base_work);
}
}
@@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
*/
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
- trace_writeback_nowork(wb->bdi);
+ trace_writeback_nowork(wb);
wb_wakeup(wb);
return;
}
@@ -954,19 +913,19 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
* We just wake up the flusher thread. It will perform background
* writeback as soon as there is no other work to do.
*/
- trace_writeback_wake_background(wb->bdi);
+ trace_writeback_wake_background(wb);
wb_wakeup(wb);
}
/*
* Remove the inode from the writeback list it is on.
*/
-void inode_wb_list_del(struct inode *inode)
+void inode_io_list_del(struct inode *inode)
{
struct bdi_writeback *wb;
wb = inode_to_wb_and_lock_list(inode);
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
}
@@ -988,7 +947,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
if (time_before(inode->dirtied_when, tail->dirtied_when))
inode->dirtied_when = jiffies;
}
- inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
+ inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}
/*
@@ -996,7 +955,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
*/
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
- inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
+ inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}
static void inode_sync_complete(struct inode *inode)
@@ -1055,7 +1014,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
if (older_than_this &&
inode_dirtied_after(inode, *older_than_this))
break;
- list_move(&inode->i_wb_list, &tmp);
+ list_move(&inode->i_io_list, &tmp);
moved++;
if (flags & EXPIRE_DIRTY_ATIME)
set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
@@ -1078,7 +1037,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
list_for_each_prev_safe(pos, node, &tmp) {
inode = wb_inode(pos);
if (inode->i_sb == sb)
- list_move(&inode->i_wb_list, dispatch_queue);
+ list_move(&inode->i_io_list, dispatch_queue);
}
}
out:
@@ -1232,10 +1191,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
inode->dirtied_when = jiffies;
- inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
+ inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
}
}
@@ -1378,7 +1337,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* touch it. See comment above for explanation.
*/
if (!(inode->i_state & I_DIRTY_ALL))
- inode_wb_list_del_locked(inode, wb);
+ inode_io_list_del_locked(inode, wb);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
out:
@@ -1421,6 +1380,10 @@ static long writeback_chunk_size(struct bdi_writeback *wb,
* Write a portion of b_io inodes which belong to @sb.
*
* Return the number of pages and/or inodes written.
+ *
+ * NOTE! This is called with wb->list_lock held, and will
+ * unlock and relock that for each inode it ends up doing
+ * IO for.
*/
static long writeback_sb_inodes(struct super_block *sb,
struct bdi_writeback *wb,
@@ -1583,12 +1546,15 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
.range_cyclic = 1,
.reason = reason,
};
+ struct blk_plug plug;
+ blk_start_plug(&plug);
spin_lock(&wb->list_lock);
if (list_empty(&wb->b_io))
queue_io(wb, &work);
__writeback_inodes_wb(wb, &work);
spin_unlock(&wb->list_lock);
+ blk_finish_plug(&plug);
return nr_pages - work.nr_pages;
}
@@ -1616,10 +1582,12 @@ static long wb_writeback(struct bdi_writeback *wb,
unsigned long oldest_jif;
struct inode *inode;
long progress;
+ struct blk_plug plug;
oldest_jif = jiffies;
work->older_than_this = &oldest_jif;
+ blk_start_plug(&plug);
spin_lock(&wb->list_lock);
for (;;) {
/*
@@ -1657,14 +1625,14 @@ static long wb_writeback(struct bdi_writeback *wb,
} else if (work->for_background)
oldest_jif = jiffies;
- trace_writeback_start(wb->bdi, work);
+ trace_writeback_start(wb, work);
if (list_empty(&wb->b_io))
queue_io(wb, work);
if (work->sb)
progress = writeback_sb_inodes(work->sb, wb, work);
else
progress = __writeback_inodes_wb(wb, work);
- trace_writeback_written(wb->bdi, work);
+ trace_writeback_written(wb, work);
wb_update_bandwidth(wb, wb_start);
@@ -1689,7 +1657,7 @@ static long wb_writeback(struct bdi_writeback *wb,
* we'll just busyloop.
*/
if (!list_empty(&wb->b_more_io)) {
- trace_writeback_wait(wb->bdi, work);
+ trace_writeback_wait(wb, work);
inode = wb_inode(wb->b_more_io.prev);
spin_lock(&inode->i_lock);
spin_unlock(&wb->list_lock);
@@ -1699,6 +1667,7 @@ static long wb_writeback(struct bdi_writeback *wb,
}
}
spin_unlock(&wb->list_lock);
+ blk_finish_plug(&plug);
return nr_pages - work->nr_pages;
}
@@ -1794,26 +1763,14 @@ static long wb_do_writeback(struct bdi_writeback *wb)
set_bit(WB_writeback_running, &wb->state);
while ((work = get_next_work_item(wb)) != NULL) {
struct wb_completion *done = work->done;
- bool need_wake_up = false;
- trace_writeback_exec(wb->bdi, work);
+ trace_writeback_exec(wb, work);
wrote += wb_writeback(wb, work);
- if (work->single_wait) {
- WARN_ON_ONCE(work->auto_free);
- /* paired w/ rmb in wb_wait_for_single_work() */
- smp_wmb();
- work->single_done = 1;
- need_wake_up = true;
- } else if (work->auto_free) {
+ if (work->auto_free)
kfree(work);
- }
-
if (done && atomic_dec_and_test(&done->cnt))
- need_wake_up = true;
-
- if (need_wake_up)
wake_up_all(&wb->bdi->wb_waitq);
}
@@ -2088,7 +2045,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
else
dirty_list = &wb->b_dirty_time;
- wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+ wakeup_bdi = inode_io_list_move_locked(inode, wb,
dirty_list);
spin_unlock(&wb->list_lock);
@@ -2111,6 +2068,15 @@ out_unlock_inode:
}
EXPORT_SYMBOL(__mark_inode_dirty);
+/*
+ * The @s_sync_lock is used to serialise concurrent sync operations
+ * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
+ * Concurrent callers will block on the s_sync_lock rather than doing contending
+ * walks. The queueing maintains sync(2) required behaviour as all the IO that
+ * has been issued up to the time this function is enter is guaranteed to be
+ * completed by the time we have gained the lock and waited for all IO that is
+ * in progress regardless of the order callers are granted the lock.
+ */
static void wait_sb_inodes(struct super_block *sb)
{
struct inode *inode, *old_inode = NULL;
@@ -2121,7 +2087,8 @@ static void wait_sb_inodes(struct super_block *sb)
*/
WARN_ON(!rwsem_is_locked(&sb->s_umount));
- spin_lock(&inode_sb_list_lock);
+ mutex_lock(&sb->s_sync_lock);
+ spin_lock(&sb->s_inode_list_lock);
/*
* Data integrity sync. Must wait for all pages under writeback,
@@ -2141,14 +2108,14 @@ static void wait_sb_inodes(struct super_block *sb)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
- * inode_sb_list_lock. We cannot iput the inode now as we can
+ * s_inode_list_lock. We cannot iput the inode now as we can
* be holding the last reference and we cannot iput it under
- * inode_sb_list_lock. So we keep the reference and iput it
+ * s_inode_list_lock. So we keep the reference and iput it
* later.
*/
iput(old_inode);
@@ -2158,10 +2125,11 @@ static void wait_sb_inodes(struct super_block *sb)
cond_resched();
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
iput(old_inode);
+ mutex_unlock(&sb->s_sync_lock);
}
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a38e38f7b6fc..9bd1244caf38 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -34,6 +34,7 @@
#include <linux/percpu.h>
#include <linux/list_sort.h>
#include <linux/lockref.h>
+#include <linux/rhashtable.h>
#include "gfs2.h"
#include "incore.h"
@@ -50,9 +51,8 @@
#include "trace_gfs2.h"
struct gfs2_glock_iter {
- int hash; /* hash bucket index */
- unsigned nhash; /* Index within current bucket */
struct gfs2_sbd *sdp; /* incore superblock */
+ struct rhashtable_iter hti; /* rhashtable iterator */
struct gfs2_glock *gl; /* current glock struct */
loff_t last_pos; /* last position */
};
@@ -70,44 +70,19 @@ static DEFINE_SPINLOCK(lru_lock);
#define GFS2_GL_HASH_SHIFT 15
#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
-#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
-static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
-static struct dentry *gfs2_root;
-
-/**
- * gl_hash() - Turn glock number into hash bucket number
- * @lock: The glock number
- *
- * Returns: The number of the corresponding hash bucket
- */
-
-static unsigned int gl_hash(const struct gfs2_sbd *sdp,
- const struct lm_lockname *name)
-{
- unsigned int h;
-
- h = jhash(&name->ln_number, sizeof(u64), 0);
- h = jhash(&name->ln_type, sizeof(unsigned int), h);
- h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
- h &= GFS2_GL_HASH_MASK;
-
- return h;
-}
-
-static inline void spin_lock_bucket(unsigned int hash)
-{
- hlist_bl_lock(&gl_hash_table[hash]);
-}
+static struct rhashtable_params ht_parms = {
+ .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
+ .key_len = sizeof(struct lm_lockname),
+ .key_offset = offsetof(struct gfs2_glock, gl_name),
+ .head_offset = offsetof(struct gfs2_glock, gl_node),
+};
-static inline void spin_unlock_bucket(unsigned int hash)
-{
- hlist_bl_unlock(&gl_hash_table[hash]);
-}
+static struct rhashtable gl_hash_table;
-static void gfs2_glock_dealloc(struct rcu_head *rcu)
+void gfs2_glock_free(struct gfs2_glock *gl)
{
- struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (gl->gl_ops->go_flags & GLOF_ASPACE) {
kmem_cache_free(gfs2_glock_aspace_cachep, gl);
@@ -115,13 +90,6 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
kfree(gl->gl_lksb.sb_lvbptr);
kmem_cache_free(gfs2_glock_cachep, gl);
}
-}
-
-void gfs2_glock_free(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
-
- call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
wake_up(&sdp->sd_glock_wait);
}
@@ -192,7 +160,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
if (lockref_put_or_lock(&gl->gl_lockref))
@@ -202,9 +170,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
- spin_lock_bucket(gl->gl_hash);
- hlist_bl_del_rcu(&gl->gl_list);
- spin_unlock_bucket(gl->gl_hash);
+ rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
trace_gfs2_glock_put(gl);
@@ -212,33 +178,6 @@ void gfs2_glock_put(struct gfs2_glock *gl)
}
/**
- * search_bucket() - Find struct gfs2_glock by lock number
- * @bucket: the bucket to search
- * @name: The lock name
- *
- * Returns: NULL, or the struct gfs2_glock with the requested number
- */
-
-static struct gfs2_glock *search_bucket(unsigned int hash,
- const struct gfs2_sbd *sdp,
- const struct lm_lockname *name)
-{
- struct gfs2_glock *gl;
- struct hlist_bl_node *h;
-
- hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
- if (!lm_name_equal(&gl->gl_name, name))
- continue;
- if (gl->gl_sbd != sdp)
- continue;
- if (lockref_get_not_dead(&gl->gl_lockref))
- return gl;
- }
-
- return NULL;
-}
-
-/**
* may_grant - check if its ok to grant a new lock
* @gl: The glock
* @gh: The lock request which we wish to grant
@@ -506,7 +445,7 @@ __releases(&gl->gl_spin)
__acquires(&gl->gl_spin)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned int lck_flags = gh ? gh->gh_flags : 0;
int ret;
@@ -628,7 +567,7 @@ out_unlock:
static void delete_work_func(struct work_struct *work)
{
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip;
struct inode *inode;
u64 no_addr = gl->gl_name.ln_number;
@@ -704,15 +643,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
struct gfs2_glock **glp)
{
struct super_block *s = sdp->sd_vfs;
- struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
- struct gfs2_glock *gl, *tmp;
- unsigned int hash = gl_hash(sdp, &name);
+ struct lm_lockname name = { .ln_number = number,
+ .ln_type = glops->go_type,
+ .ln_sbd = sdp };
+ struct gfs2_glock *gl, *tmp = NULL;
struct address_space *mapping;
struct kmem_cache *cachep;
+ int ret, tries = 0;
- rcu_read_lock();
- gl = search_bucket(hash, sdp, &name);
- rcu_read_unlock();
+ gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+ if (gl && !lockref_get_not_dead(&gl->gl_lockref))
+ gl = NULL;
*glp = gl;
if (gl)
@@ -739,14 +680,13 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
}
atomic_inc(&sdp->sd_glock_disposal);
- gl->gl_sbd = sdp;
+ gl->gl_node.next = NULL;
gl->gl_flags = 0;
gl->gl_name = name;
gl->gl_lockref.count = 1;
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
- gl->gl_hash = hash;
gl->gl_ops = glops;
gl->gl_dstamp = ktime_set(0, 0);
preempt_disable();
@@ -771,22 +711,34 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->writeback_index = 0;
}
- spin_lock_bucket(hash);
- tmp = search_bucket(hash, sdp, &name);
- if (tmp) {
- spin_unlock_bucket(hash);
- kfree(gl->gl_lksb.sb_lvbptr);
- kmem_cache_free(cachep, gl);
- atomic_dec(&sdp->sd_glock_disposal);
- gl = tmp;
- } else {
- hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
- spin_unlock_bucket(hash);
+again:
+ ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
+ ht_parms);
+ if (ret == 0) {
+ *glp = gl;
+ return 0;
}
- *glp = gl;
+ if (ret == -EEXIST) {
+ ret = 0;
+ tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
+ if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
+ if (++tries < 100) {
+ cond_resched();
+ goto again;
+ }
+ tmp = NULL;
+ ret = -ENOMEM;
+ }
+ } else {
+ WARN_ON_ONCE(ret);
+ }
+ kfree(gl->gl_lksb.sb_lvbptr);
+ kmem_cache_free(cachep, gl);
+ atomic_dec(&sdp->sd_glock_disposal);
+ *glp = tmp;
- return 0;
+ return ret;
}
/**
@@ -928,7 +880,7 @@ __releases(&gl->gl_spin)
__acquires(&gl->gl_spin)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct list_head *insert_pt = NULL;
struct gfs2_holder *gh2;
int try_futile = 0;
@@ -1006,7 +958,7 @@ trap_recursive:
int gfs2_glock_nq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
int error = 0;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
@@ -1313,7 +1265,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
@@ -1462,31 +1414,26 @@ static struct shrinker glock_shrinker = {
*
*/
-static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
- unsigned int hash)
+static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
- struct hlist_bl_head *head = &gl_hash_table[hash];
- struct hlist_bl_node *pos;
+ struct rhash_head *pos, *next;
+ const struct bucket_table *tbl;
+ int i;
rcu_read_lock();
- hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
- if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
- examiner(gl);
+ tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
+ for (i = 0; i < tbl->size; i++) {
+ rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) {
+ if ((gl->gl_name.ln_sbd == sdp) &&
+ lockref_get_not_dead(&gl->gl_lockref))
+ examiner(gl);
+ }
}
rcu_read_unlock();
cond_resched();
}
-static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
-{
- unsigned x;
-
- for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
- examine_bucket(examiner, sdp, x);
-}
-
-
/**
* thaw_glock - thaw out a glock which has an unprocessed reply waiting
* @gl: The glock to thaw
@@ -1569,7 +1516,7 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
int ret;
ret = gfs2_truncatei_resume(ip);
- gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+ gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
spin_lock(&gl->gl_spin);
clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -1733,17 +1680,17 @@ static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glock *gl = iter_ptr;
- seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n",
+ seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n",
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number,
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
- (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
- (long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
- (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
- (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
- (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT],
+ (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]);
return 0;
}
@@ -1776,11 +1723,10 @@ static const char *gfs2_stype[] = {
static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
{
- struct gfs2_glock_iter *gi = seq->private;
- struct gfs2_sbd *sdp = gi->sdp;
- unsigned index = gi->hash >> 3;
- unsigned subindex = gi->hash & 0x07;
- s64 value;
+ struct gfs2_sbd *sdp = seq->private;
+ loff_t pos = *(loff_t *)iter_ptr;
+ unsigned index = pos >> 3;
+ unsigned subindex = pos & 0x07;
int i;
if (index == 0 && subindex != 0)
@@ -1791,12 +1737,12 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
for_each_possible_cpu(i) {
const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i);
- if (index == 0) {
- value = i;
- } else {
- value = lkstats->lkstats[index - 1].stats[subindex];
- }
- seq_printf(seq, " %15lld", (long long)value);
+
+ if (index == 0)
+ seq_printf(seq, " %15u", i);
+ else
+ seq_printf(seq, " %15llu", (unsigned long long)lkstats->
+ lkstats[index - 1].stats[subindex]);
}
seq_putc(seq, '\n');
return 0;
@@ -1804,20 +1750,24 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr)
int __init gfs2_glock_init(void)
{
- unsigned i;
- for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
- INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
- }
+ int ret;
+
+ ret = rhashtable_init(&gl_hash_table, &ht_parms);
+ if (ret < 0)
+ return ret;
glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_FREEZABLE, 0);
- if (!glock_workqueue)
+ if (!glock_workqueue) {
+ rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
+ }
gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
WQ_MEM_RECLAIM | WQ_FREEZABLE,
0);
if (!gfs2_delete_workqueue) {
destroy_workqueue(glock_workqueue);
+ rhashtable_destroy(&gl_hash_table);
return -ENOMEM;
}
@@ -1829,72 +1779,41 @@ int __init gfs2_glock_init(void)
void gfs2_glock_exit(void)
{
unregister_shrinker(&glock_shrinker);
+ rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
destroy_workqueue(gfs2_delete_workqueue);
}
-static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
+static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
- return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
- struct gfs2_glock, gl_list);
-}
-
-static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
-{
- return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
- struct gfs2_glock, gl_list);
-}
-
-static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
-{
- struct gfs2_glock *gl;
-
do {
- gl = gi->gl;
- if (gl) {
- gi->gl = glock_hash_next(gl);
- gi->nhash++;
- } else {
- if (gi->hash >= GFS2_GL_HASH_SIZE) {
- rcu_read_unlock();
- return 1;
- }
- gi->gl = glock_hash_chain(gi->hash);
- gi->nhash = 0;
- }
- while (gi->gl == NULL) {
- gi->hash++;
- if (gi->hash >= GFS2_GL_HASH_SIZE) {
- rcu_read_unlock();
- return 1;
- }
- gi->gl = glock_hash_chain(gi->hash);
- gi->nhash = 0;
+ gi->gl = rhashtable_walk_next(&gi->hti);
+ if (IS_ERR(gi->gl)) {
+ if (PTR_ERR(gi->gl) == -EAGAIN)
+ continue;
+ gi->gl = NULL;
}
/* Skip entries for other sb and dead entries */
- } while (gi->sdp != gi->gl->gl_sbd ||
- __lockref_is_dead(&gi->gl->gl_lockref));
-
- return 0;
+ } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) ||
+ __lockref_is_dead(&gi->gl->gl_lockref)));
}
static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
+ int ret;
if (gi->last_pos <= *pos)
- n = gi->nhash + (*pos - gi->last_pos);
- else
- gi->hash = 0;
+ n = (*pos - gi->last_pos);
- gi->nhash = 0;
- rcu_read_lock();
+ ret = rhashtable_walk_start(&gi->hti);
+ if (ret)
+ return NULL;
do {
- if (gfs2_glock_iter_next(gi))
- return NULL;
- } while (n--);
+ gfs2_glock_iter_next(gi);
+ } while (gi->gl && n--);
gi->last_pos = *pos;
return gi->gl;
@@ -1907,9 +1826,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
(*pos)++;
gi->last_pos = *pos;
- if (gfs2_glock_iter_next(gi))
- return NULL;
-
+ gfs2_glock_iter_next(gi);
return gi->gl;
}
@@ -1917,9 +1834,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
struct gfs2_glock_iter *gi = seq->private;
- if (gi->gl)
- rcu_read_unlock();
gi->gl = NULL;
+ rhashtable_walk_stop(&gi->hti);
}
static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1930,26 +1846,19 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct gfs2_glock_iter *gi = seq->private;
-
- gi->hash = *pos;
+ preempt_disable();
if (*pos >= GFS2_NR_SBSTATS)
return NULL;
- preempt_disable();
- return SEQ_START_TOKEN;
+ return pos;
}
static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
- struct gfs2_glock_iter *gi = seq->private;
(*pos)++;
- gi->hash++;
- if (gi->hash >= GFS2_NR_SBSTATS) {
- preempt_enable();
+ if (*pos >= GFS2_NR_SBSTATS)
return NULL;
- }
- return SEQ_START_TOKEN;
+ return pos;
}
static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr)
@@ -1987,14 +1896,28 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
if (ret == 0) {
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
+
gi->sdp = inode->i_private;
+ gi->last_pos = 0;
seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
+ gi->gl = NULL;
+ ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
}
return ret;
}
+static int gfs2_glocks_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+
+ gi->gl = NULL;
+ rhashtable_walk_exit(&gi->hti);
+ return seq_release_private(inode, file);
+}
+
static int gfs2_glstats_open(struct inode *inode, struct file *file)
{
int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
@@ -2003,21 +1926,22 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file)
struct seq_file *seq = file->private_data;
struct gfs2_glock_iter *gi = seq->private;
gi->sdp = inode->i_private;
+ gi->last_pos = 0;
seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
if (seq->buf)
seq->size = GFS2_SEQ_GOODSIZE;
+ gi->gl = NULL;
+ ret = rhashtable_walk_init(&gl_hash_table, &gi->hti);
}
return ret;
}
static int gfs2_sbstats_open(struct inode *inode, struct file *file)
{
- int ret = seq_open_private(file, &gfs2_sbstats_seq_ops,
- sizeof(struct gfs2_glock_iter));
+ int ret = seq_open(file, &gfs2_sbstats_seq_ops);
if (ret == 0) {
struct seq_file *seq = file->private_data;
- struct gfs2_glock_iter *gi = seq->private;
- gi->sdp = inode->i_private;
+ seq->private = inode->i_private; /* sdp */
}
return ret;
}
@@ -2027,7 +1951,7 @@ static const struct file_operations gfs2_glocks_fops = {
.open = gfs2_glocks_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = gfs2_glocks_release,
};
static const struct file_operations gfs2_glstats_fops = {
@@ -2035,7 +1959,7 @@ static const struct file_operations gfs2_glstats_fops = {
.open = gfs2_glstats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = gfs2_glocks_release,
};
static const struct file_operations gfs2_sbstats_fops = {
@@ -2043,7 +1967,7 @@ static const struct file_operations gfs2_sbstats_fops = {
.open = gfs2_sbstats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release,
};
int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fa3fa5e94553..1f6c9c3fe5cb 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -32,13 +32,15 @@ struct workqueue_struct *gfs2_freeze_wq;
static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
{
- fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
+ fs_err(gl->gl_name.ln_sbd,
+ "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page "
+ "state 0x%lx\n",
bh, (unsigned long long)bh->b_blocknr, bh->b_state,
bh->b_page->mapping, bh->b_page->flags);
- fs_err(gl->gl_sbd, "AIL glock %u:%llu mapping %p\n",
+ fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n",
gl->gl_name.ln_type, gl->gl_name.ln_number,
gfs2_glock2aspace(gl));
- gfs2_lm_withdraw(gl->gl_sbd, "AIL error\n");
+ gfs2_lm_withdraw(gl->gl_name.ln_sbd, "AIL error\n");
}
/**
@@ -52,7 +54,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
unsigned int nr_revokes)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct list_head *head = &gl->gl_ail_list;
struct gfs2_bufdata *bd, *tmp;
struct buffer_head *bh;
@@ -80,7 +82,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync,
static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_trans tr;
memset(&tr, 0, sizeof(tr));
@@ -109,7 +111,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned int revokes = atomic_read(&gl->gl_ail_count);
unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64);
int ret;
@@ -139,7 +141,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
static void rgrp_go_sync(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd;
int error;
@@ -179,7 +181,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd = gl->gl_object;
@@ -218,7 +220,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
- gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH);
+ gfs2_log_flush(gl->gl_name.ln_sbd, gl, NORMAL_FLUSH);
filemap_fdatawrite(metamapping);
if (ip) {
struct address_space *mapping = ip->i_inode.i_mapping;
@@ -252,7 +254,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
{
struct gfs2_inode *ip = gl->gl_object;
- gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+ gfs2_assert_withdraw(gl->gl_name.ln_sbd, !atomic_read(&gl->gl_ail_count));
if (flags & DIO_METADATA) {
struct address_space *mapping = gfs2_glock2aspace(gl);
@@ -264,9 +266,9 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
}
}
- if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) {
- gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH);
- gl->gl_sbd->sd_rindex_uptodate = 0;
+ if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) {
+ gfs2_log_flush(gl->gl_name.ln_sbd, NULL, NORMAL_FLUSH);
+ gl->gl_name.ln_sbd->sd_rindex_uptodate = 0;
}
if (ip && S_ISREG(ip->i_inode.i_mode))
truncate_inode_pages(ip->i_inode.i_mapping, 0);
@@ -281,7 +283,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
static int inode_go_demote_ok(const struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_holder *gh;
if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
@@ -416,7 +418,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
static int inode_go_lock(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
@@ -477,7 +479,7 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
static void freeze_go_sync(struct gfs2_glock *gl)
{
int error = 0;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (gl->gl_state == LM_ST_SHARED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
@@ -500,7 +502,7 @@ static void freeze_go_sync(struct gfs2_glock *gl)
static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
struct gfs2_glock *j_gl = ip->i_gl;
struct gfs2_log_header_host head;
@@ -545,7 +547,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
{
struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (!remote || (sdp->sd_vfs->s_flags & MS_RDONLY))
return;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a1ec7c20e498..121ed08d9d9f 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -22,6 +22,7 @@
#include <linux/ktime.h>
#include <linux/percpu.h>
#include <linux/lockref.h>
+#include <linux/rhashtable.h>
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
@@ -203,13 +204,15 @@ enum {
};
struct lm_lockname {
+ struct gfs2_sbd *ln_sbd;
u64 ln_number;
unsigned int ln_type;
};
#define lm_name_equal(name1, name2) \
- (((name1)->ln_number == (name2)->ln_number) && \
- ((name1)->ln_type == (name2)->ln_type))
+ (((name1)->ln_number == (name2)->ln_number) && \
+ ((name1)->ln_type == (name2)->ln_type) && \
+ ((name1)->ln_sbd == (name2)->ln_sbd))
struct gfs2_glock_operations {
@@ -241,7 +244,7 @@ enum {
};
struct gfs2_lkstats {
- s64 stats[GFS2_NR_LKSTATS];
+ u64 stats[GFS2_NR_LKSTATS];
};
enum {
@@ -327,7 +330,6 @@ enum {
struct gfs2_glock {
struct hlist_bl_node gl_list;
- struct gfs2_sbd *gl_sbd;
unsigned long gl_flags; /* GLF_... */
struct lm_lockname gl_name;
@@ -341,7 +343,6 @@ struct gfs2_glock {
gl_req:2, /* State in last dlm request */
gl_reply:8; /* Last reply from the dlm */
- unsigned int gl_hash;
unsigned long gl_demote_time; /* time of first demote request */
long gl_hold_time;
struct list_head gl_holders;
@@ -367,7 +368,7 @@ struct gfs2_glock {
loff_t end;
} gl_vm;
};
- struct rcu_head gl_rcu;
+ struct rhash_head gl_node;
};
#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -835,7 +836,7 @@ static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which)
{
- const struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
preempt_disable();
this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++;
preempt_enable();
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 641383a9c1bb..284c1542783e 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -31,7 +31,7 @@ extern struct workqueue_struct *gfs2_control_wq;
*
* @delta is the difference between the current rtt sample and the
* running average srtt. We add 1/8 of that to the srtt in order to
- * update the current srtt estimate. The varience estimate is a bit
+ * update the current srtt estimate. The variance estimate is a bit
* more complicated. We subtract the abs value of the @delta from
* the current variance estimate and add 1/4 of that to the running
* total.
@@ -80,7 +80,7 @@ static inline void gfs2_update_reply_times(struct gfs2_glock *gl)
preempt_disable();
rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp));
- lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+ lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */
gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */
preempt_enable();
@@ -108,7 +108,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
dstamp = gl->gl_dstamp;
gl->gl_dstamp = ktime_get_real();
irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp));
- lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats);
+ lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats);
gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */
gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */
preempt_enable();
@@ -253,7 +253,7 @@ static void gfs2_reverse_hex(char *c, u64 value)
static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
unsigned int flags)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
int req;
u32 lkf;
char strname[GDLM_STRNAME_BYTES] = "";
@@ -281,7 +281,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
static void gdlm_put_lock(struct gfs2_glock *gl)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int lvb_needs_unlock = 0;
int error;
@@ -319,7 +319,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
static void gdlm_cancel(struct gfs2_glock *gl)
{
- struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+ struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 92324ac58290..d5369a109781 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -70,7 +70,7 @@ static bool buffer_is_rgrp(const struct gfs2_bufdata *bd)
static void maybe_release_space(struct gfs2_bufdata *bd)
{
struct gfs2_glock *gl = bd->bd_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_rgrpd *rgd = gl->gl_object;
unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
struct gfs2_bitmap *bi = rgd->rd_bits + index;
@@ -578,7 +578,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
static void gfs2_meta_sync(struct gfs2_glock *gl)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
int error;
if (mapping == NULL)
@@ -588,7 +588,7 @@ static void gfs2_meta_sync(struct gfs2_glock *gl)
error = filemap_fdatawait(mapping);
if (error)
- gfs2_io_error(gl->gl_sbd);
+ gfs2_io_error(gl->gl_name.ln_sbd);
}
static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index b984a6e190bc..0e1d4be5865a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -114,7 +114,7 @@ const struct address_space_operations gfs2_rgrp_aops = {
struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct page *page;
struct buffer_head *bh;
unsigned int shift;
@@ -200,7 +200,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head **bhp)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct buffer_head *bh;
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
@@ -362,7 +362,7 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct buffer_head *first_bh, *bh;
u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
sdp->sd_sb.sb_bsize_shift;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index ac5d8027d335..8ca161567a93 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -44,7 +44,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
{
struct inode *inode = mapping->host;
if (mapping->a_ops == &gfs2_meta_aops)
- return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+ return (((struct gfs2_glock *)mapping) - 1)->gl_name.ln_sbd;
else if (mapping->a_ops == &gfs2_rgrp_aops)
return container_of(mapping, struct gfs2_sbd, sd_aspace);
else
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 9b61f92fcfdf..3a31226531ea 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -119,7 +119,7 @@ static void gfs2_qd_dispose(struct list_head *list)
while (!list_empty(list)) {
qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
- sdp = qd->qd_gl->gl_sbd;
+ sdp = qd->qd_gl->gl_name.ln_sbd;
list_del(&qd->qd_lru);
@@ -302,7 +302,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
static void qd_hold(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
lockref_get(&qd->qd_lockref);
}
@@ -367,7 +367,7 @@ static void slot_put(struct gfs2_quota_data *qd)
static int bh_get(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
unsigned int block, offset;
struct buffer_head *bh;
@@ -414,7 +414,7 @@ fail:
static void bh_put(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
mutex_lock(&sdp->sd_quota_mutex);
gfs2_assert(sdp, qd->qd_bh_count);
@@ -486,7 +486,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
static void qd_unlock(struct gfs2_quota_data *qd)
{
- gfs2_assert_warn(qd->qd_gl->gl_sbd,
+ gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd,
test_bit(QDF_LOCKED, &qd->qd_flags));
clear_bit(QDF_LOCKED, &qd->qd_flags);
bh_put(qd);
@@ -614,7 +614,7 @@ static int sort_qd(const void *a, const void *b)
static void do_qc(struct gfs2_quota_data *qd, s64 change)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
struct gfs2_quota_change *qc = qd->qd_bh_qc;
s64 x;
@@ -831,7 +831,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
- struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned int data_blocks, ind_blocks;
@@ -922,7 +922,7 @@ out:
gfs2_glock_dq_uninit(&ghs[qx]);
mutex_unlock(&ip->i_inode.i_mutex);
kfree(ghs);
- gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH);
+ gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
return error;
}
@@ -954,7 +954,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
struct gfs2_holder *q_gh)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_holder i_gh;
int error;
@@ -1037,7 +1037,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
static int need_sync(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
struct gfs2_tune *gt = &sdp->sd_tune;
s64 value;
unsigned int num, den;
@@ -1125,7 +1125,7 @@ out:
static int print_message(struct gfs2_quota_data *qd, char *type)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
fs_info(sdp, "quota %s for %s %u\n",
type,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c6c62321dfd6..475985d14758 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1860,13 +1860,13 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
{
const struct gfs2_glock *gl = rgd->rd_gl;
- const struct gfs2_sbd *sdp = gl->gl_sbd;
+ const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_lkstats *st;
- s64 r_dcount, l_dcount;
- s64 l_srttb, a_srttb = 0;
+ u64 r_dcount, l_dcount;
+ u64 l_srttb, a_srttb = 0;
s64 srttb_diff;
- s64 sqr_diff;
- s64 var;
+ u64 sqr_diff;
+ u64 var;
int cpu, nonzero = 0;
preempt_disable();
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445947e1..894fb01a91da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
if (is_ancestor(root, sdp->sd_master_dir))
seq_puts(s, ",meta");
if (args->ar_lockproto[0])
- seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+ seq_show_option(s, "lockproto", args->ar_lockproto);
if (args->ar_locktable[0])
- seq_printf(s, ",locktable=%s", args->ar_locktable);
+ seq_show_option(s, "locktable", args->ar_locktable);
if (args->ar_hostdata[0])
- seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+ seq_show_option(s, "hostdata", args->ar_hostdata);
if (args->ar_spectator)
seq_puts(s, ",spectator");
if (args->ar_localflocks)
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 20c007d747ab..49ac55da4e33 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -104,7 +104,7 @@ TRACE_EVENT(gfs2_glock_state_change,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gl->gl_name.ln_number;
__entry->gltype = gl->gl_name.ln_type;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -140,7 +140,7 @@ TRACE_EVENT(gfs2_glock_put,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->gltype = gl->gl_name.ln_type;
__entry->glnum = gl->gl_name.ln_number;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -174,7 +174,7 @@ TRACE_EVENT(gfs2_demote_rq,
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->gltype = gl->gl_name.ln_type;
__entry->glnum = gl->gl_name.ln_number;
__entry->cur_state = glock_trace_state(gl->gl_state);
@@ -209,7 +209,7 @@ TRACE_EVENT(gfs2_promote,
),
TP_fast_assign(
- __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
__entry->first = first;
@@ -239,7 +239,7 @@ TRACE_EVENT(gfs2_glock_queue,
),
TP_fast_assign(
- __entry->dev = gh->gh_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gh->gh_gl->gl_name.ln_number;
__entry->gltype = gh->gh_gl->gl_name.ln_type;
__entry->queue = queue;
@@ -267,18 +267,18 @@ TRACE_EVENT(gfs2_glock_lock_time,
__field( int, status )
__field( char, flags )
__field( s64, tdiff )
- __field( s64, srtt )
- __field( s64, srttvar )
- __field( s64, srttb )
- __field( s64, srttvarb )
- __field( s64, sirt )
- __field( s64, sirtvar )
- __field( s64, dcount )
- __field( s64, qcount )
+ __field( u64, srtt )
+ __field( u64, srttvar )
+ __field( u64, srttb )
+ __field( u64, srttvarb )
+ __field( u64, sirt )
+ __field( u64, sirtvar )
+ __field( u64, dcount )
+ __field( u64, qcount )
),
TP_fast_assign(
- __entry->dev = gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->glnum = gl->gl_name.ln_number;
__entry->gltype = gl->gl_name.ln_type;
__entry->status = gl->gl_lksb.sb_status;
@@ -333,7 +333,7 @@ TRACE_EVENT(gfs2_pin,
),
TP_fast_assign(
- __entry->dev = bd->bd_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = bd->bd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->pin = pin;
__entry->len = bd->bd_bh->b_size;
__entry->block = bd->bd_bh->b_blocknr;
@@ -449,7 +449,7 @@ TRACE_EVENT(gfs2_bmap,
),
TP_fast_assign(
- __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->lblock = lblock;
__entry->pblock = buffer_mapped(bh) ? bh->b_blocknr : 0;
__entry->inum = ip->i_no_addr;
@@ -489,7 +489,7 @@ TRACE_EVENT(gfs2_block_alloc,
),
TP_fast_assign(
- __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev;
+ __entry->dev = rgd->rd_gl->gl_name.ln_sbd->sd_vfs->s_dev;
__entry->start = block;
__entry->inum = ip->i_no_addr;
__entry->len = len;
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 88bff2430669..b95d0d625f32 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -158,7 +158,7 @@ static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl,
void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
{
struct gfs2_trans *tr = current->journal_info;
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct address_space *mapping = bh->b_page->mapping;
struct gfs2_inode *ip = GFS2_I(mapping->host);
struct gfs2_bufdata *bd;
@@ -224,7 +224,7 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
{
- struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct gfs2_bufdata *bd;
lock_buffer(bh);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index d3fa6bd9503e..221719eac5de 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -288,7 +288,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -398,11 +397,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
- //int i;
+ int i;
- //for (i = 0; i < node->tree->pages_per_bnode; i++)
- // if (node->page[i])
- // page_cache_release(node->page[i]);
+ for (i = 0; i < node->tree->pages_per_bnode; i++)
+ if (node->page[i])
+ page_cache_release(node->page[i]);
kfree(node);
}
diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c
index 9f4ee7f52026..6fc766df0461 100644
--- a/fs/hfs/brec.c
+++ b/fs/hfs/brec.c
@@ -131,13 +131,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update parent key if we inserted a key
+ * at the start of the node and it is not the new node
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -166,9 +169,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -366,6 +366,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 55c03b9e9070..4574fdd3d421 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -136,9 +136,9 @@ static int hfs_show_options(struct seq_file *seq, struct dentry *root)
struct hfs_sb_info *sbi = HFS_SB(root->d_sb);
if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f))
- seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator);
+ seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4);
if (sbi->s_type != cpu_to_be32(0x3f3f3f3f))
- seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type);
+ seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4);
seq_printf(seq, ",uid=%u,gid=%u",
from_kuid_munged(&init_user_ns, sbi->s_uid),
from_kgid_munged(&init_user_ns, sbi->s_gid));
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 759708fd9331..63924662aaf3 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -454,7 +454,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
page_cache_release(page);
goto fail;
}
- page_cache_release(page);
node->page[i] = page;
}
@@ -566,13 +565,11 @@ node_error:
void hfs_bnode_free(struct hfs_bnode *node)
{
-#if 0
int i;
for (i = 0; i < node->tree->pages_per_bnode; i++)
if (node->page[i])
page_cache_release(node->page[i]);
-#endif
kfree(node);
}
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c90b72ee676d..bb806e58c977 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb);
if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
- seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
+ seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4);
if (sbi->type != HFSPLUS_DEF_CR_TYPE)
- seq_printf(seq, ",type=%.4s", (char *)&sbi->type);
+ seq_show_option_n(seq, "type", (char *)&sbi->type, 4);
seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask,
from_kuid_munged(&init_user_ns, sbi->uid),
from_kgid_munged(&init_user_ns, sbi->gid));
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 059597b23f67..2ac99db3750e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
size_t offset = strlen(root_ino) + 1;
if (strlen(root_path) > offset)
- seq_printf(seq, ",%s", root_path + offset);
+ seq_show_option(seq, root_path + offset, NULL);
if (append)
seq_puts(seq, ",append");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 973c24ce59ad..316adb968b65 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+ vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+ index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+ mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
static void huge_pagevec_release(struct pagevec *pvec)
{
int i;
@@ -293,26 +317,61 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch. There are subtle differences in operation for each case.
+
+ * truncation is indicated by end of range being LLONG_MAX
+ * In this case, we first scan the range and release found pages.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ * maps and global counts.
+ * hole punch is indicated if end is not LLONG_MAX
+ * In the hole punch case we scan the range and release found pages.
+ * Only when releasing a page is the associated region/reserv map
+ * deleted. The region/reserv map for ranges without associated
+ * pages are not modified.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+ loff_t lend)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
+ const pgoff_t end = lend >> huge_page_shift(h);
+ struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
+ long lookup_nr = PAGEVEC_SIZE;
+ bool truncate_op = (lend == LLONG_MAX);
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec, 0);
next = start;
- while (1) {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (next < end) {
+ /*
+ * Make sure to never grab more pages that we
+ * might possibly need.
+ */
+ if (end - next < lookup_nr)
+ lookup_nr = end - next;
+
+ /*
+ * This pagevec_lookup() may return pages past 'end',
+ * so we must check for page->index > end.
+ */
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
if (next == start)
break;
next = start;
@@ -321,26 +380,69 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ u32 hash;
+
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, next, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
lock_page(page);
+ if (page->index >= end) {
+ unlock_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ next = end; /* we are done */
+ break;
+ }
+
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped. Do nothing in this race case. In the
+ * normal case page is not mapped.
+ */
+ if (!page_mapped(page)) {
+ bool rsv_on_error = !PagePrivate(page);
+ /*
+ * We must free the huge page and remove
+ * from page cache (remove_huge_page) BEFORE
+ * removing the region/reserve map
+ * (hugetlb_unreserve_pages). In rare out
+ * of memory conditions, removal of the
+ * region/reserve map could fail. Before
+ * free'ing the page, note PagePrivate which
+ * is used in case of error.
+ */
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(
+ inode, next,
+ next + 1, 1)))
+ hugetlb_fix_reserve_counts(
+ inode, rsv_on_error);
+ }
+ }
+
if (page->index > next)
next = page->index;
+
++next;
- truncate_huge_page(page);
unlock_page(page);
- freed++;
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
}
- BUG_ON(!lstart && mapping->nrpages);
- hugetlb_unreserve_pages(inode, start, freed);
+
+ if (truncate_op)
+ (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
- truncate_hugepages(inode, 0);
+ remove_inode_hugepages(inode, 0, LLONG_MAX);
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
@@ -349,11 +451,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
}
static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
{
struct vm_area_struct *vma;
- vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
unsigned long v_offset;
/*
@@ -362,13 +468,20 @@ hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
* which overlap the truncated area starting at pgoff,
* and no vma on a 32-bit arch can span beyond the 4GB.
*/
- if (vma->vm_pgoff < pgoff)
- v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
else
v_offset = 0;
- unmap_hugepage_range(vma, vma->vm_start + v_offset,
- vma->vm_end, NULL);
+ if (end) {
+ end = ((end - start) << PAGE_SHIFT) +
+ vma->vm_start + v_offset;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ } else
+ end = vma->vm_end;
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
}
}
@@ -384,12 +497,164 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
- hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
- truncate_hugepages(inode, offset);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
return 0;
}
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct hstate *h = hstate_inode(inode);
+ loff_t hpage_size = huge_page_size(h);
+ loff_t hole_start, hole_end;
+
+ /*
+ * For hole punch round up the beginning offset of the hole and
+ * round down the end.
+ */
+ hole_start = round_up(offset, hpage_size);
+ hole_end = round_down(offset + len, hpage_size);
+
+ if (hole_end > hole_start) {
+ struct address_space *mapping = inode->i_mapping;
+
+ mutex_lock(&inode->i_mutex);
+ i_mmap_lock_write(mapping);
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT);
+ i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
+ struct vm_area_struct pseudo_vma;
+ struct mm_struct *mm = current->mm;
+ loff_t hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
+ pgoff_t start, index, end;
+ int error;
+ u32 hash;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return hugetlbfs_punch_hole(inode, offset, len);
+
+ /*
+ * Default preallocate case.
+ * For this range, start is rounded down and end is rounded up
+ * as well as being converted to page offsets.
+ */
+ start = offset >> hpage_shift;
+ end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ /*
+ * Initialize a pseudo vma as this is required by the huge page
+ * allocation routines. If NUMA is configured, use page index
+ * as input to create an allocation policy.
+ */
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ pseudo_vma.vm_file = file;
+
+ for (index = start; index < end; index++) {
+ /*
+ * This is supposed to be the vaddr where the page is being
+ * faulted in, but we have no vaddr here.
+ */
+ struct page *page;
+ unsigned long addr;
+ int avoid_reserve = 0;
+
+ cond_resched();
+
+ /*
+ * fallocate(2) manpage permits EINTR; we may have been
+ * interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /* Set numa allocation policy based on index */
+ hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+ /* addr is the offset within the file (zero based) */
+ addr = index * hpage_size;
+
+ /* mutex taken here, fault path and hole punch */
+ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+ index, addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /* See if already present in mapping to avoid alloc/free */
+ page = find_get_page(mapping, index);
+ if (page) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ continue;
+ }
+
+ /* Allocate page and add to page cache */
+ page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ if (IS_ERR(page)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ error = PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, addr, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (unlikely(error)) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * page_put due to reference from alloc_huge_page()
+ * unlock_page because locked by add_to_page_cache()
+ */
+ put_page(page);
+ unlock_page(page);
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -701,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
- .llseek = default_llseek,
+ .llseek = default_llseek,
+ .fallocate = hugetlbfs_fallocate,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/fs/inode.c b/fs/inode.c
index d30640f7a193..78a17b8859e1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -28,16 +28,16 @@
* inode->i_state, inode->i_hash, __iget()
* Inode LRU list locks protect:
* inode->i_sb->s_inode_lru, inode->i_lru
- * inode_sb_list_lock protects:
- * sb->s_inodes, inode->i_sb_list
+ * inode->i_sb->s_inode_list_lock protects:
+ * inode->i_sb->s_inodes, inode->i_sb_list
* bdi->wb.list_lock protects:
- * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
+ * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
*
* Lock ordering:
*
- * inode_sb_list_lock
+ * inode->i_sb->s_inode_list_lock
* inode->i_lock
* Inode LRU list locks
*
@@ -45,7 +45,7 @@
* inode->i_lock
*
* inode_hash_lock
- * inode_sb_list_lock
+ * inode->i_sb->s_inode_list_lock
* inode->i_lock
*
* iunique_lock
@@ -57,8 +57,6 @@ static unsigned int i_hash_shift __read_mostly;
static struct hlist_head *inode_hashtable __read_mostly;
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
-
/*
* Empty aops. Can be used for the cases where the user does not
* define any of the address_space operations.
@@ -359,7 +357,7 @@ void inode_init_once(struct inode *inode)
memset(inode, 0, sizeof(*inode));
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_devices);
- INIT_LIST_HEAD(&inode->i_wb_list);
+ INIT_LIST_HEAD(&inode->i_io_list);
INIT_LIST_HEAD(&inode->i_lru);
address_space_init_once(&inode->i_data);
i_size_ordered_init(inode);
@@ -426,18 +424,18 @@ static void inode_lru_list_del(struct inode *inode)
*/
void inode_sb_list_add(struct inode *inode)
{
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&inode->i_sb->s_inode_list_lock);
list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&inode->i_sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
static inline void inode_sb_list_del(struct inode *inode)
{
if (!list_empty(&inode->i_sb_list)) {
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&inode->i_sb->s_inode_list_lock);
list_del_init(&inode->i_sb_list);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&inode->i_sb->s_inode_list_lock);
}
}
@@ -527,8 +525,8 @@ static void evict(struct inode *inode)
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(!list_empty(&inode->i_lru));
- if (!list_empty(&inode->i_wb_list))
- inode_wb_list_del(inode);
+ if (!list_empty(&inode->i_io_list))
+ inode_io_list_del(inode);
inode_sb_list_del(inode);
@@ -577,6 +575,7 @@ static void dispose_list(struct list_head *head)
list_del_init(&inode->i_lru);
evict(inode);
+ cond_resched();
}
}
@@ -594,7 +593,8 @@ void evict_inodes(struct super_block *sb)
struct inode *inode, *next;
LIST_HEAD(dispose);
- spin_lock(&inode_sb_list_lock);
+again:
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
@@ -609,8 +609,20 @@ void evict_inodes(struct super_block *sb)
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
+
+ /*
+ * We can have a ton of inodes to evict at unmount time given
+ * enough memory, check to see if we need to go to sleep for a
+ * bit so we don't livelock.
+ */
+ if (need_resched()) {
+ spin_unlock(&sb->s_inode_list_lock);
+ cond_resched();
+ dispose_list(&dispose);
+ goto again;
+ }
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
}
@@ -631,7 +643,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
struct inode *inode, *next;
LIST_HEAD(dispose);
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
@@ -654,7 +666,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
@@ -890,7 +902,7 @@ struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
- spin_lock_prefetch(&inode_sb_list_lock);
+ spin_lock_prefetch(&sb->s_inode_list_lock);
inode = new_inode_pseudo(sb);
if (inode)
diff --git a/fs/internal.h b/fs/internal.h
index 4d5af583ab03..71859c4d0b41 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -112,14 +112,13 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *);
/*
* inode.c
*/
-extern spinlock_t inode_sb_list_lock;
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
extern void inode_add_lru(struct inode *inode);
/*
* fs-writeback.c
*/
-extern void inode_wb_list_del(struct inode *inode);
+extern void inode_io_list_del(struct inode *inode);
extern long get_nr_dirty_inodes(void);
extern void evict_inodes(struct super_block *);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 2d48d28e1640..91e004518237 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
}
/**
+ * kernfs_path_len - determine the length of the full path of a given node
+ * @kn: kernfs_node of interest
+ *
+ * The returned length doesn't include the space for the terminating '\0'.
+ */
+size_t kernfs_path_len(struct kernfs_node *kn)
+{
+ size_t len = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+ do {
+ len += strlen(kn->name) + 1;
+ kn = kn->parent;
+ } while (kn && kn->parent);
+
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+ return len;
+}
+
+/**
* kernfs_path - build full path of a given node
* @kn: kernfs_node of interest
* @buf: buffer to copy @kn's name into
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 55505cbe11af..d678bcc3cbcb 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -322,6 +322,11 @@ out_rqst:
return error;
}
+static struct svc_serv_ops lockd_sv_ops = {
+ .svo_shutdown = svc_rpcb_cleanup,
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+};
+
static struct svc_serv *lockd_create_svc(void)
{
struct svc_serv *serv;
@@ -350,7 +355,7 @@ static struct svc_serv *lockd_create_svc(void)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
return ERR_PTR(-ENOMEM);
@@ -586,6 +591,7 @@ static int lockd_init_net(struct net *net)
INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
INIT_LIST_HEAD(&ln->lockd_manager.list);
+ ln->lockd_manager.block_opens = false;
spin_lock_init(&ln->nsm_clnt_lock);
return 0;
}
diff --git a/fs/locks.c b/fs/locks.c
index d3d558ba4da7..2a54c800a223 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1568,6 +1568,7 @@ int fcntl_getlease(struct file *filp)
* desired lease.
* @dentry: dentry to check
* @arg: type of lease that we're trying to acquire
+ * @flags: current lock flags
*
* Check to see if there's an existing open fd on this file that would
* conflict with the lease we're trying to set.
diff --git a/fs/namei.c b/fs/namei.c
index 1c2105ed20c5..726d211db484 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -560,6 +560,24 @@ static int __nd_alloc_stack(struct nameidata *nd)
return 0;
}
+/**
+ * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
+ * @path: nameidate to verify
+ *
+ * Rename can sometimes move a file or directory outside of a bind
+ * mount, path_connected allows those cases to be detected.
+ */
+static bool path_connected(const struct path *path)
+{
+ struct vfsmount *mnt = path->mnt;
+
+ /* Only bind mounts can have disconnected paths */
+ if (mnt->mnt_root == mnt->mnt_sb->s_root)
+ return true;
+
+ return is_subdir(path->dentry, mnt->mnt_root);
+}
+
static inline int nd_alloc_stack(struct nameidata *nd)
{
if (likely(nd->depth != EMBEDDED_LEVELS))
@@ -1296,6 +1314,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
return -ECHILD;
nd->path.dentry = parent;
nd->seq = seq;
+ if (unlikely(!path_connected(&nd->path)))
+ return -ENOENT;
break;
} else {
struct mount *mnt = real_mount(nd->path.mnt);
@@ -1396,7 +1416,7 @@ static void follow_mount(struct path *path)
}
}
-static void follow_dotdot(struct nameidata *nd)
+static int follow_dotdot(struct nameidata *nd)
{
if (!nd->root.mnt)
set_root(nd);
@@ -1412,6 +1432,8 @@ static void follow_dotdot(struct nameidata *nd)
/* rare case of legitimate dget_parent()... */
nd->path.dentry = dget_parent(nd->path.dentry);
dput(old);
+ if (unlikely(!path_connected(&nd->path)))
+ return -ENOENT;
break;
}
if (!follow_up(&nd->path))
@@ -1419,6 +1441,7 @@ static void follow_dotdot(struct nameidata *nd)
}
follow_mount(&nd->path);
nd->inode = nd->path.dentry->d_inode;
+ return 0;
}
/*
@@ -1634,7 +1657,7 @@ static inline int handle_dots(struct nameidata *nd, int type)
if (nd->flags & LOOKUP_RCU) {
return follow_dotdot_rcu(nd);
} else
- follow_dotdot(nd);
+ return follow_dotdot(nd);
}
return 0;
}
@@ -2415,7 +2438,7 @@ done:
/**
* path_mountpoint - look up a path to be umounted
- * @nameidata: lookup context
+ * @nd: lookup context
* @flags: lookup flags
* @path: pointer to container for result
*
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 92dca9e90d8d..c556640dcf3b 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -46,13 +46,6 @@
struct pnfs_block_dev;
-enum pnfs_block_volume_type {
- PNFS_BLOCK_VOLUME_SIMPLE = 0,
- PNFS_BLOCK_VOLUME_SLICE = 1,
- PNFS_BLOCK_VOLUME_CONCAT = 2,
- PNFS_BLOCK_VOLUME_STRIPE = 3,
-};
-
#define PNFS_BLOCK_MAX_UUIDS 4
#define PNFS_BLOCK_MAX_DEVICES 64
@@ -117,13 +110,6 @@ struct pnfs_block_dev {
struct pnfs_block_dev_map *map);
};
-enum exstate4 {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
- PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
-};
-
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
union {
@@ -134,15 +120,12 @@ struct pnfs_block_extent {
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
- enum exstate4 be_state; /* the state of this extent */
+ enum pnfs_block_extent_state be_state; /* the state of this extent */
#define EXTENT_WRITTEN 1
#define EXTENT_COMMITTING 2
unsigned int be_tag;
};
-/* on the wire size of the extent */
-#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
-
struct pnfs_block_layout {
struct pnfs_layout_hdr bl_layout;
struct rb_root bl_ext_rw;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e535599a0719..a861bbdfe577 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -22,7 +22,7 @@ bl_free_device(struct pnfs_block_dev *dev)
kfree(dev->children);
} else {
if (dev->bdev)
- blkdev_put(dev->bdev, FMODE_READ);
+ blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
}
}
@@ -65,6 +65,11 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
return -EIO;
p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+ if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+ pr_info("signature too long: %d\n",
+ b->simple.sigs[i].sig_len);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
if (!p)
@@ -195,7 +200,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
if (!dev)
return -EIO;
- d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+ d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
if (IS_ERR(d->bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 31d0b5e53dfd..c59a59c37f3d 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -462,6 +462,12 @@ out:
return err;
}
+static size_t ext_tree_layoutupdate_size(size_t count)
+{
+ return sizeof(__be32) /* number of entries */ +
+ PNFS_BLOCK_EXTENT_SIZE * count;
+}
+
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
size_t buffer_size)
{
@@ -489,7 +495,7 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
continue;
(*count)++;
- if (*count * BL_EXTENT_SIZE > buffer_size) {
+ if (ext_tree_layoutupdate_size(*count) > buffer_size) {
/* keep counting.. */
ret = -ENOSPC;
continue;
@@ -530,7 +536,7 @@ retry:
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
- buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ buffer_size = ext_tree_layoutupdate_size(count);
count = 0;
arg->layoutupdate_pages =
@@ -549,17 +555,14 @@ retry:
}
*start_p = cpu_to_be32(count);
- arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
- __be32 *p = start_p;
+ void *p = start_p, *end = p + arg->layoutupdate_len;
int i = 0;
- for (p = start_p;
- p < start_p + arg->layoutupdate_len;
- p += PAGE_SIZE) {
+ for ( ; p < end; p += PAGE_SIZE)
arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
- }
}
dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 682529c00996..75f7c0a7538a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -162,10 +162,6 @@ nfs41_callback_up(struct svc_serv *serv)
spin_lock_init(&serv->sv_cb_lock);
init_waitqueue_head(&serv->sv_cb_waitq);
rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
- if (IS_ERR(rqstp)) {
- svc_xprt_put(serv->sv_bc_xprt);
- serv->sv_bc_xprt = NULL;
- }
dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
return rqstp;
}
@@ -308,6 +304,10 @@ err_bind:
return ret;
}
+static struct svc_serv_ops nfs_cb_sv_ops = {
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+};
+
static struct svc_serv *nfs_callback_create_svc(int minorversion)
{
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
@@ -333,7 +333,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 29e3c1b011b7..b85cf7a30232 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -40,8 +40,11 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
inode = nfs_delegation_find_inode(cps->clp, &args->fh);
- if (inode == NULL)
+ if (inode == NULL) {
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, NULL,
+ -ntohl(res->status));
goto out;
+ }
nfsi = NFS_I(inode);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
@@ -60,6 +63,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
res->status = 0;
out_iput:
rcu_read_unlock();
+ trace_nfs4_cb_getattr(cps->clp, &args->fh, inode, -ntohl(res->status));
iput(inode);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
@@ -194,6 +198,7 @@ unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
pnfs_put_layout_hdr(lo);
+ trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
iput(ino);
out:
return rv;
@@ -554,7 +559,7 @@ __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy,
status = htonl(NFS4_OK);
nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
- nfs41_server_notify_target_slotid_update(cps->clp);
+ nfs41_notify_server(cps->clp);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4a90c9bb3135..57c5a02f6213 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -20,6 +20,7 @@
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/metrics.h>
@@ -285,116 +286,6 @@ void nfs_put_client(struct nfs_client *clp)
}
EXPORT_SYMBOL_GPL(nfs_put_client);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/*
- * Test if two ip6 socket addresses refer to the same socket by
- * comparing relevant fields. The padding bytes specifically, are not
- * compared. sin6_flowinfo is not compared because it only affects QoS
- * and sin6_scope_id is only compared if the address is "link local"
- * because "link local" addresses need only be unique to a specific
- * link. Conversely, ordinary unicast addresses might have different
- * sin6_scope_id.
- *
- * The caller should ensure both socket addresses are AF_INET6.
- */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
- if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
- return 0;
- else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- return sin1->sin6_scope_id == sin2->sin6_scope_id;
-
- return 1;
-}
-#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- return 0;
-}
-#endif
-
-/*
- * Test if two ip4 socket addresses refer to the same socket, by
- * comparing relevant fields. The padding bytes specifically, are
- * not compared.
- *
- * The caller should ensure both socket addresses are AF_INET.
- */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
- return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
- return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
- (sin1->sin6_port == sin2->sin6_port);
-}
-
-static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
- return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
- (sin1->sin_port == sin2->sin_port);
-}
-
-#if defined(CONFIG_NFS_V4_1)
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (sa1->sa_family != sa2->sa_family)
- return 0;
-
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_match_ipaddr4(sa1, sa2);
- case AF_INET6:
- return nfs_sockaddr_match_ipaddr6(sa1, sa2);
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
-#endif /* CONFIG_NFS_V4_1 */
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, including the port number.
- */
-static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (sa1->sa_family != sa2->sa_family)
- return 0;
-
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_cmp_ip4(sa1, sa2);
- case AF_INET6:
- return nfs_sockaddr_cmp_ip6(sa1, sa2);
- }
- return 0;
-}
-
/*
* Find an nfs_client on the list that matches the initialisation data
* that is supplied.
@@ -421,7 +312,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
if (clp->cl_minorversion != data->minorversion)
continue;
/* Match the full socket address */
- if (!nfs_sockaddr_cmp(sap, clap))
+ if (!rpc_cmp_addr_port(sap, clap))
continue;
atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 029d688a969f..2714ef835bdd 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -175,7 +175,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
if (delegation->inode != NULL) {
nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
- delegation->maxsize = res->maxsize;
+ delegation->pagemod_limit = res->pagemod_limit;
oldcred = delegation->cred;
delegation->cred = get_rpccred(cred);
clear_bit(NFS_DELEGATION_NEED_RECLAIM,
@@ -337,7 +337,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
return -ENOMEM;
nfs4_stateid_copy(&delegation->stateid, &res->delegation);
delegation->type = res->delegation_type;
- delegation->maxsize = res->maxsize;
+ delegation->pagemod_limit = res->pagemod_limit;
delegation->change_attr = inode->i_version;
delegation->cred = get_rpccred(cred);
delegation->inode = inode;
@@ -900,3 +900,28 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode,
rcu_read_unlock();
return ret;
}
+
+/**
+ * nfs4_delegation_flush_on_close - Check if we must flush file on close
+ * @inode: inode to check
+ *
+ * This function checks the number of outstanding writes to the file
+ * against the delegation 'space_limit' field to see if
+ * the spec requires us to flush the file on close.
+ */
+bool nfs4_delegation_flush_on_close(const struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_delegation *delegation;
+ bool ret = true;
+
+ rcu_read_lock();
+ delegation = rcu_dereference(nfsi->delegation);
+ if (delegation == NULL || !(delegation->type & FMODE_WRITE))
+ goto out;
+ if (nfsi->nrequests < delegation->pagemod_limit)
+ ret = false;
+out:
+ rcu_read_unlock();
+ return ret;
+}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index e3c20a3ccc93..a44829173e57 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -18,7 +18,7 @@ struct nfs_delegation {
struct inode *inode;
nfs4_stateid stateid;
fmode_t type;
- loff_t maxsize;
+ unsigned long pagemod_limit;
__u64 change_attr;
unsigned long flags;
spinlock_t lock;
@@ -61,6 +61,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
int nfs4_check_delegation(struct inode *inode, fmode_t flags);
+bool nfs4_delegation_flush_on_close(const struct inode *inode);
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 547308a5ec6f..3d8e4ffa0a33 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -583,26 +583,19 @@ out_nopages:
}
static
-void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++)
put_page(pages[i]);
}
-static
-void nfs_readdir_free_large_page(void *ptr, struct page **pages,
- unsigned int npages)
-{
- nfs_readdir_free_pagearray(pages, npages);
-}
-
/*
* nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_large_page
+ * to nfs_readdir_free_pagearray
*/
static
-int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
@@ -615,7 +608,7 @@ int nfs_readdir_large_page(struct page **pages, unsigned int npages)
return 0;
out_freepages:
- nfs_readdir_free_pagearray(pages, i);
+ nfs_readdir_free_pages(pages, i);
return -ENOMEM;
}
@@ -623,7 +616,6 @@ static
int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
{
struct page *pages[NFS_MAX_READDIR_PAGES];
- void *pages_ptr = NULL;
struct nfs_entry entry;
struct file *file = desc->file;
struct nfs_cache_array *array;
@@ -653,7 +645,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
memset(array, 0, sizeof(struct nfs_cache_array));
array->eof_index = -1;
- status = nfs_readdir_large_page(pages, array_size);
+ status = nfs_readdir_alloc_pages(pages, array_size);
if (status < 0)
goto out_release_array;
do {
@@ -671,7 +663,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
}
} while (array->eof_index < 0);
- nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+ nfs_readdir_free_pages(pages, array_size);
out_release_array:
nfs_readdir_release_array(page);
out_label_free:
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cc4fa1ed61fc..c0f9b1ed12b9 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -82,7 +82,8 @@ nfs_file_release(struct inode *inode, struct file *filp)
dprintk("NFS: release(%pD2)\n", filp);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
- return nfs_release(inode, filp);
+ nfs_file_clear_open_context(filp);
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_file_release);
@@ -141,7 +142,7 @@ EXPORT_SYMBOL_GPL(nfs_file_llseek);
/*
* Flush all dirty pages, and check for write errors.
*/
-int
+static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
@@ -152,17 +153,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
if ((file->f_mode & FMODE_WRITE) == 0)
return 0;
- /*
- * If we're holding a write delegation, then just start the i/o
- * but don't wait for completion (or send a commit).
- */
- if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
- return filemap_fdatawrite(file->f_mapping);
-
/* Flush writes to the server and return any errors */
return vfs_fsync(file, 0);
}
-EXPORT_SYMBOL_GPL(nfs_file_flush);
ssize_t
nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
@@ -644,12 +637,10 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.page_mkwrite = nfs_vm_page_mkwrite,
};
-static int nfs_need_sync_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode)
{
struct nfs_open_context *ctx;
- if (IS_SYNC(inode) || (filp->f_flags & O_DSYNC))
- return 1;
ctx = nfs_file_open_context(filp);
if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
nfs_ctx_key_to_expire(ctx))
@@ -699,8 +690,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (result > 0)
written = result;
- /* Return error values for O_DSYNC and IS_SYNC() */
- if (result >= 0 && nfs_need_sync_write(file, inode)) {
+ /* Return error values */
+ if (result >= 0 && nfs_need_check_write(file, inode)) {
int err = vfs_fsync(file, 0);
if (err < 0)
result = err;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index b3289d701eea..fbc5a56de875 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -34,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
ffl = kzalloc(sizeof(*ffl), gfp_flags);
if (ffl) {
INIT_LIST_HEAD(&ffl->error_list);
+ INIT_LIST_HEAD(&ffl->mirrors);
return &ffl->generic_hdr;
} else
return NULL;
@@ -135,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id)
return 0;
}
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+ const struct nfs4_ff_layout_mirror *m2)
+{
+ int i, j;
+
+ if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+ return false;
+ for (i = 0; i < m1->fh_versions_cnt; i++) {
+ bool found_fh = false;
+ for (j = 0; j < m2->fh_versions_cnt; i++) {
+ if (nfs_compare_fh(&m1->fh_versions[i],
+ &m2->fh_versions[j]) == 0) {
+ found_fh = true;
+ break;
+ }
+ }
+ if (!found_fh)
+ return false;
+ }
+ return true;
+}
+
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ struct nfs4_ff_layout_mirror *pos;
+ struct inode *inode = lo->plh_inode;
+
+ spin_lock(&inode->i_lock);
+ list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+ if (mirror->mirror_ds != pos->mirror_ds)
+ continue;
+ if (!ff_mirror_match_fh(mirror, pos))
+ continue;
+ if (atomic_inc_not_zero(&pos->ref)) {
+ spin_unlock(&inode->i_lock);
+ return pos;
+ }
+ }
+ list_add(&mirror->mirrors, &ff_layout->mirrors);
+ mirror->layout = lo;
+ spin_unlock(&inode->i_lock);
+ return mirror;
+}
+
+static void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ struct inode *inode;
+ if (mirror->layout == NULL)
+ return;
+ inode = mirror->layout->plh_inode;
+ spin_lock(&inode->i_lock);
+ list_del(&mirror->mirrors);
+ spin_unlock(&inode->i_lock);
+ mirror->layout = NULL;
+}
+
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+
+ mirror = kzalloc(sizeof(*mirror), gfp_flags);
+ if (mirror != NULL) {
+ spin_lock_init(&mirror->lock);
+ atomic_set(&mirror->ref, 1);
+ INIT_LIST_HEAD(&mirror->mirrors);
+ }
+ return mirror;
+}
+
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ ff_layout_remove_mirror(mirror);
+ kfree(mirror->fh_versions);
+ if (mirror->cred)
+ put_rpccred(mirror->cred);
+ nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+ kfree(mirror);
+}
+
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
+ ff_layout_free_mirror(mirror);
+}
+
static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
int i;
@@ -144,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
/* normally mirror_ds is freed in
* .free_deviceid_node but we still do it here
* for .alloc_lseg error path */
- if (fls->mirror_array[i]) {
- kfree(fls->mirror_array[i]->fh_versions);
- nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
- kfree(fls->mirror_array[i]);
- }
+ ff_layout_put_mirror(fls->mirror_array[i]);
}
kfree(fls->mirror_array);
fls->mirror_array = NULL;
@@ -181,6 +267,65 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
}
}
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
+{
+ u64 end1, end2;
+
+ if (l1->iomode != l2->iomode)
+ return l1->iomode != IOMODE_READ;
+ end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+ end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+ if (end1 < l2->offset)
+ return false;
+ if (end2 < l1->offset)
+ return true;
+ return l2->offset <= l1->offset;
+}
+
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+ struct pnfs_layout_segment *old)
+{
+ u64 new_end, old_end;
+
+ if (new->pls_range.iomode != old->pls_range.iomode)
+ return false;
+ old_end = pnfs_calc_offset_end(old->pls_range.offset,
+ old->pls_range.length);
+ if (old_end < new->pls_range.offset)
+ return false;
+ new_end = pnfs_calc_offset_end(new->pls_range.offset,
+ new->pls_range.length);
+ if (new_end < old->pls_range.offset)
+ return false;
+
+ /* Mergeable: copy info from 'old' to 'new' */
+ if (new_end < old_end)
+ new_end = old_end;
+ if (new->pls_range.offset < old->pls_range.offset)
+ new->pls_range.offset = old->pls_range.offset;
+ new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+ new_end);
+ if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+ set_bit(NFS_LSEG_ROC, &new->pls_flags);
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+ set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
+ return true;
+}
+
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ ff_lseg_range_is_after,
+ ff_lseg_merge,
+ free_me);
+}
+
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
int i, j;
@@ -246,6 +391,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
for (i = 0; i < fls->mirror_array_cnt; i++) {
+ struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid devid;
struct nfs4_deviceid_node *idnode;
u32 ds_count;
@@ -262,17 +408,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
if (ds_count != 1)
goto out_err_free;
- fls->mirror_array[i] =
- kzalloc(sizeof(struct nfs4_ff_layout_mirror),
- gfp_flags);
+ fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
if (fls->mirror_array[i] == NULL) {
rc = -ENOMEM;
goto out_err_free;
}
- spin_lock_init(&fls->mirror_array[i]->lock);
fls->mirror_array[i]->ds_count = ds_count;
- fls->mirror_array[i]->lseg = &fls->generic_hdr;
/* deviceid */
rc = decode_deviceid(&stream, &devid);
@@ -338,6 +480,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
if (rc)
goto out_err_free;
+ mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+ if (mirror != fls->mirror_array[i]) {
+ ff_layout_free_mirror(fls->mirror_array[i]);
+ fls->mirror_array[i] = mirror;
+ }
+
dprintk("%s: uid %d gid %d\n", __func__,
fls->mirror_array[i]->uid,
fls->mirror_array[i]->gid);
@@ -379,21 +527,9 @@ static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
- int i;
dprintk("--> %s\n", __func__);
- for (i = 0; i < fls->mirror_array_cnt; i++) {
- if (fls->mirror_array[i]) {
- nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
- fls->mirror_array[i]->mirror_ds = NULL;
- if (fls->mirror_array[i]->cred) {
- put_rpccred(fls->mirror_array[i]->cred);
- fls->mirror_array[i]->cred = NULL;
- }
- }
- }
-
if (lseg->pls_range.iomode == IOMODE_RW) {
struct nfs4_flexfile_layout *ffl;
struct inode *inode;
@@ -419,48 +555,44 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
}
static void
-nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
/* first IO request? */
if (atomic_inc_return(&timer->n_ops) == 1) {
- timer->start_time = ktime_get();
+ timer->start_time = now;
}
}
static ktime_t
-nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
- ktime_t start, now;
+ ktime_t start;
if (atomic_dec_return(&timer->n_ops) < 0)
WARN_ON_ONCE(1);
- now = ktime_get();
start = timer->start_time;
timer->start_time = now;
return ktime_sub(now, start);
}
-static ktime_t
-nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
-{
- return ktime_sub(ktime_get(), task->tk_start);
-}
-
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
- struct nfs4_ff_layoutstat *layoutstat)
+ struct nfs4_ff_layoutstat *layoutstat,
+ ktime_t now)
{
static const ktime_t notime = {0};
- ktime_t now = ktime_get();
+ s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
- nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
+ nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
if (ktime_equal(mirror->start_time, notime))
mirror->start_time = now;
if (ktime_equal(mirror->last_report_time, notime))
mirror->last_report_time = now;
+ if (layoutstats_timer != 0)
+ report_interval = (s64)layoutstats_timer * 1000LL;
if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
- FF_LAYOUTSTATS_REPORT_INTERVAL) {
+ report_interval) {
mirror->last_report_time = now;
return true;
}
@@ -482,35 +614,39 @@ static void
nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
__u64 requested,
__u64 completed,
- ktime_t time_completed)
+ ktime_t time_completed,
+ ktime_t time_started)
{
struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+ ktime_t completion_time = ktime_sub(time_completed, time_started);
ktime_t timer;
iostat->ops_completed++;
iostat->bytes_completed += completed;
iostat->bytes_not_delivered += requested - completed;
- timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
+ timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
iostat->total_busy_time =
ktime_add(iostat->total_busy_time, timer);
iostat->aggregate_completion_time =
- ktime_add(iostat->aggregate_completion_time, time_completed);
+ ktime_add(iostat->aggregate_completion_time,
+ completion_time);
}
static void
-nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
- __u64 requested)
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat);
+ report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+ pnfs_report_layoutstat(inode, GFP_KERNEL);
}
static void
@@ -522,23 +658,24 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
spin_lock(&mirror->lock);
nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
requested, completed,
- nfs4_ff_layout_calc_completion_time(task));
+ ktime_get(), task->tk_start);
spin_unlock(&mirror->lock);
}
static void
-nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
- __u64 requested)
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+ struct nfs4_ff_layout_mirror *mirror,
+ __u64 requested, ktime_t now)
{
bool report;
spin_lock(&mirror->lock);
- report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat);
+ report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now);
nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
spin_unlock(&mirror->lock);
if (report)
- pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
+ pnfs_report_layoutstat(inode, GFP_NOIO);
}
static void
@@ -553,8 +690,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
spin_lock(&mirror->lock);
nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
- requested, completed,
- nfs4_ff_layout_calc_completion_time(task));
+ requested, completed, ktime_get(), task->tk_start);
spin_unlock(&mirror->lock);
}
@@ -728,8 +864,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
/* no lseg means that pnfs is not in use, so no mirroring here */
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
nfs_pageio_reset_write_mds(pgio);
return 1;
}
@@ -931,18 +1065,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
if (task->tk_status >= 0)
return 0;
- if (task->tk_status != -EJUKEBOX) {
+ switch (task->tk_status) {
+ /* File access problems. Don't mark the device as unavailable */
+ case -EACCES:
+ case -ESTALE:
+ case -EISDIR:
+ case -EBADHANDLE:
+ case -ELOOP:
+ case -ENOSPC:
+ break;
+ case -EJUKEBOX:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ goto out_retry;
+ default:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
- if (ff_layout_has_available_ds(lseg))
- return -NFS4ERR_RESET_TO_PNFS;
- else
- return -NFS4ERR_RESET_TO_MDS;
}
-
- if (task->tk_status == -EJUKEBOX)
- nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ /* FIXME: Need to prevent infinite looping here. */
+ return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
task->tk_status = 0;
rpc_restart_call(task);
rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
@@ -972,15 +1114,41 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
int idx, u64 offset, u64 length,
- u32 status, int opnum)
+ u32 status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
int err;
+ if (status == 0) {
+ switch (error) {
+ case -ETIMEDOUT:
+ case -EPFNOSUPPORT:
+ case -EPROTONOSUPPORT:
+ case -EOPNOTSUPP:
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EADDRINUSE:
+ case -ENOBUFS:
+ case -EPIPE:
+ case -EPERM:
+ status = NFS4ERR_NXIO;
+ break;
+ case -EACCES:
+ status = NFS4ERR_ACCESS;
+ break;
+ default:
+ return;
+ }
+ }
+
mirror = FF_LAYOUT_COMP(lseg, idx);
err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
mirror, offset, length, status, opnum,
GFP_NOIO);
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -989,16 +1157,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_read(hdr, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
- hdr->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && hdr->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
hdr->args.offset, hdr->args.count,
- hdr->res.op_status, OP_READ);
+ hdr->res.op_status, OP_READ,
+ task->tk_status);
err = ff_layout_async_handle_error(task, hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
@@ -1010,8 +1176,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
pnfs_read_resend_pnfs(hdr);
return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- inode = hdr->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, hdr->lseg);
ff_layout_reset_read(hdr);
return task->tk_status;
case -EAGAIN:
@@ -1061,9 +1225,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
static int ff_layout_read_prepare_common(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- nfs4_ff_layout_stat_io_start_read(
+ nfs4_ff_layout_stat_io_start_read(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count);
+ hdr->args.count,
+ task->tk_start);
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
@@ -1163,32 +1328,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_write(hdr, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
- hdr->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && hdr->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
hdr->args.offset, hdr->args.count,
- hdr->res.op_status, OP_WRITE);
+ hdr->res.op_status, OP_WRITE,
+ task->tk_status);
err = ff_layout_async_handle_error(task, hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+ ff_layout_reset_write(hdr, true);
+ return task->tk_status;
case -NFS4ERR_RESET_TO_MDS:
- inode = hdr->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, hdr->lseg);
- if (err == -NFS4ERR_RESET_TO_PNFS) {
- pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
- ff_layout_reset_write(hdr, true);
- } else {
- pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
- ff_layout_reset_write(hdr, false);
- }
+ pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+ ff_layout_reset_write(hdr, false);
return task->tk_status;
case -EAGAIN:
rpc_restart_call_prepare(task);
@@ -1199,34 +1358,35 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
hdr->res.verf->committed == NFS_DATA_SYNC)
ff_layout_set_layoutcommit(hdr);
+ /* zero out fattr since we don't care DS attr at all */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
return 0;
}
static int ff_layout_commit_done_cb(struct rpc_task *task,
struct nfs_commit_data *data)
{
- struct inode *inode;
int err;
trace_nfs4_pnfs_commit_ds(data, task->tk_status);
- if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
- data->res.op_status = NFS4ERR_NXIO;
- if (task->tk_status < 0 && data->res.op_status)
+ if (task->tk_status < 0)
ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
data->args.offset, data->args.count,
- data->res.op_status, OP_COMMIT);
+ data->res.op_status, OP_COMMIT,
+ task->tk_status);
err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
data->lseg, data->ds_commit_index);
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ pnfs_set_retry_layoutget(data->lseg->pls_layout);
+ pnfs_generic_prepare_to_resend_writes(data);
+ return -EAGAIN;
case -NFS4ERR_RESET_TO_MDS:
- inode = data->lseg->pls_layout->plh_inode;
- pnfs_error_mark_layout_for_return(inode, data->lseg);
- if (err == -NFS4ERR_RESET_TO_PNFS)
- pnfs_set_retry_layoutget(data->lseg->pls_layout);
- else
- pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+ pnfs_clear_retry_layoutget(data->lseg->pls_layout);
pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
@@ -1244,9 +1404,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
static int ff_layout_write_prepare_common(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
- nfs4_ff_layout_stat_io_start_write(
+ nfs4_ff_layout_stat_io_start_write(hdr->inode,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
- hdr->args.count);
+ hdr->args.count,
+ task->tk_start);
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
@@ -1325,9 +1486,9 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
static void ff_layout_commit_prepare_common(struct rpc_task *task,
struct nfs_commit_data *cdata)
{
- nfs4_ff_layout_stat_io_start_write(
+ nfs4_ff_layout_stat_io_start_write(cdata->inode,
FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
- 0);
+ 0, task->tk_start);
}
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
@@ -1842,53 +2003,55 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr,
*start = cpu_to_be32((xdr->p - start - 1) * 4);
}
-static bool
+static int
ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
- struct pnfs_layout_segment *pls,
- int *dev_count, int dev_limit)
+ struct pnfs_layout_hdr *lo,
+ int dev_limit)
{
+ struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *dev;
struct nfs42_layoutstat_devinfo *devinfo;
- int i;
+ int i = 0;
- for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
- if (*dev_count >= dev_limit)
+ list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+ if (i >= dev_limit)
break;
- mirror = FF_LAYOUT_COMP(pls, i);
- if (!mirror || !mirror->mirror_ds)
+ if (!mirror->mirror_ds)
+ continue;
+ /* mirror refcount put in cleanup_layoutstats */
+ if (!atomic_inc_not_zero(&mirror->ref))
continue;
- dev = FF_LAYOUT_DEVID_NODE(pls, i);
- devinfo = &args->devinfo[*dev_count];
+ dev = &mirror->mirror_ds->id_node;
+ devinfo = &args->devinfo[i];
memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
- devinfo->offset = pls->pls_range.offset;
- devinfo->length = pls->pls_range.length;
- /* well, we don't really know if IO is continuous or not! */
- devinfo->read_count = mirror->read_stat.io_stat.bytes_completed;
+ devinfo->offset = 0;
+ devinfo->length = NFS4_MAX_UINT64;
+ devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
- devinfo->write_count = mirror->write_stat.io_stat.bytes_completed;
+ devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
devinfo->layout_type = LAYOUT_FLEX_FILES;
devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
devinfo->layout_private = mirror;
- /* lseg refcount put in cleanup_layoutstats */
- pnfs_get_lseg(pls);
- ++(*dev_count);
+ i++;
}
-
- return *dev_count < dev_limit;
+ return i;
}
static int
ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
- struct pnfs_layout_segment *pls;
+ struct nfs4_flexfile_layout *ff_layout;
+ struct nfs4_ff_layout_mirror *mirror;
int dev_count = 0;
spin_lock(&args->inode->i_lock);
- list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
- dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
+ ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+ list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+ if (atomic_read(&mirror->ref) != 0)
+ dev_count ++;
}
spin_unlock(&args->inode->i_lock);
/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
@@ -1897,20 +2060,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
dev_count = PNFS_LAYOUTSTATS_MAXDEV;
}
- args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
+ args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
if (!args->devinfo)
return -ENOMEM;
- dev_count = 0;
spin_lock(&args->inode->i_lock);
- list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
- if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
- PNFS_LAYOUTSTATS_MAXDEV)) {
- break;
- }
- }
+ args->num_dev = ff_layout_mirror_prepare_stats(args,
+ &ff_layout->generic_hdr, dev_count);
spin_unlock(&args->inode->i_lock);
- args->num_dev = dev_count;
return 0;
}
@@ -1924,7 +2081,7 @@ ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
for (i = 0; i < data->args.num_dev; i++) {
mirror = data->args.devinfo[i].layout_private;
data->args.devinfo[i].layout_private = NULL;
- pnfs_put_lseg(mirror->lseg);
+ ff_layout_put_mirror(mirror);
}
}
@@ -1936,6 +2093,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.free_layout_hdr = ff_layout_free_layout_hdr,
.alloc_lseg = ff_layout_alloc_lseg,
.free_lseg = ff_layout_free_lseg,
+ .add_lseg = ff_layout_add_lseg,
.pg_read_ops = &ff_layout_pg_read_ops,
.pg_write_ops = &ff_layout_pg_write_ops,
.get_ds_info = ff_layout_get_ds_info,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index f92f9a0a856b..68cc0d9828f9 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -67,7 +67,8 @@ struct nfs4_ff_layoutstat {
};
struct nfs4_ff_layout_mirror {
- struct pnfs_layout_segment *lseg; /* back pointer */
+ struct pnfs_layout_hdr *layout;
+ struct list_head mirrors;
u32 ds_count;
u32 efficiency;
struct nfs4_ff_layout_ds *mirror_ds;
@@ -77,6 +78,7 @@ struct nfs4_ff_layout_mirror {
u32 uid;
u32 gid;
struct rpc_cred *cred;
+ atomic_t ref;
spinlock_t lock;
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
@@ -95,6 +97,7 @@ struct nfs4_ff_layout_segment {
struct nfs4_flexfile_layout {
struct pnfs_layout_hdr generic_hdr;
struct pnfs_ds_commit_info commit_info;
+ struct list_head mirrors;
struct list_head error_list; /* nfs4_ff_layout_ds_err */
};
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index f13e1969eedd..e125e55de86d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -172,6 +172,32 @@ out_err:
return NULL;
}
+static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg,
+ struct nfs4_deviceid_node *devid)
+{
+ nfs4_mark_deviceid_unavailable(devid);
+ if (!ff_layout_has_available_ds(lseg))
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
+}
+
+static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg,
+ struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror == NULL || mirror->mirror_ds == NULL) {
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
+ return false;
+ }
+ if (mirror->mirror_ds->ds == NULL) {
+ struct nfs4_deviceid_node *devid;
+ devid = &mirror->mirror_ds->id_node;
+ ff_layout_mark_devid_invalid(lseg, devid);
+ return false;
+ }
+ return true;
+}
+
static u64
end_offset(u64 start, u64 len)
{
@@ -336,16 +362,10 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
{
struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
struct nfs_fh *fh = NULL;
- struct nfs4_deviceid_node *devid;
- if (mirror == NULL || mirror->mirror_ds == NULL ||
- mirror->mirror_ds->ds == NULL) {
- printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
+ if (!ff_layout_mirror_valid(lseg, mirror)) {
+ pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n",
__func__, mirror_idx);
- if (mirror && mirror->mirror_ds) {
- devid = &mirror->mirror_ds->id_node;
- pnfs_generic_mark_devid_invalid(devid);
- }
goto out;
}
@@ -368,14 +388,9 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
unsigned int max_payload;
rpc_authflavor_t flavor;
- if (mirror == NULL || mirror->mirror_ds == NULL ||
- mirror->mirror_ds->ds == NULL) {
- printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+ if (!ff_layout_mirror_valid(lseg, mirror)) {
+ pr_err_ratelimited("NFS: %s: No data server for offset index %d\n",
__func__, ds_idx);
- if (mirror && mirror->mirror_ds) {
- devid = &mirror->mirror_ds->id_node;
- pnfs_generic_mark_devid_invalid(devid);
- }
goto out;
}
@@ -500,16 +515,19 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
range->offset, range->length))
continue;
/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
- * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
+ * + array length + deviceid(NFS4_DEVICEID4_SIZE)
+ * + status(4) + opnum(4)
*/
p = xdr_reserve_space(xdr,
- 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+ 28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
if (unlikely(!p))
return -ENOBUFS;
p = xdr_encode_hyper(p, err->offset);
p = xdr_encode_hyper(p, err->length);
p = xdr_encode_opaque_fixed(p, &err->stateid,
NFS4_STATEID_SIZE);
+ /* Encode 1 error */
+ *p++ = cpu_to_be32(1);
p = xdr_encode_opaque_fixed(p, &err->deviceid,
NFS4_DEVICEID4_SIZE);
*p++ = cpu_to_be32(err->status);
@@ -525,11 +543,11 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
return 0;
}
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_deviceid_node *devid;
- int idx;
+ u32 idx;
for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
@@ -543,6 +561,32 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
return false;
}
+static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *devid;
+ u32 idx;
+
+ for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ if (!mirror || !mirror->mirror_ds)
+ return false;
+ devid = &mirror->mirror_ds->id_node;
+ if (ff_layout_test_devid_unavailable(devid))
+ return false;
+ }
+
+ return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
+}
+
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ if (lseg->pls_range.iomode == IOMODE_READ)
+ return ff_read_layout_has_available_ds(lseg);
+ /* Note: RW layout needs all mirrors available */
+ return ff_rw_layout_has_available_ds(lseg);
+}
+
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
"retries a request before it attempts further "
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0adc7d245b3d..326d9e10d833 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -504,7 +504,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct nfs_fattr *fattr;
- int error = -ENOMEM;
+ int error = 0;
nfs_inc_stats(inode, NFSIOS_VFSSETATTR);
@@ -513,15 +513,14 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
if (attr->ia_valid & ATTR_SIZE) {
- loff_t i_size;
-
BUG_ON(!S_ISREG(inode->i_mode));
- i_size = i_size_read(inode);
- if (attr->ia_size == i_size)
+ error = inode_newsize_ok(inode, attr->ia_size);
+ if (error)
+ return error;
+
+ if (attr->ia_size == i_size_read(inode))
attr->ia_valid &= ~ATTR_SIZE;
- else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
- return -ETXTBSY;
}
/* Optimization: if the end result is no change, don't RPC */
@@ -536,8 +535,11 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
nfs_sync_inode(inode);
fattr = nfs_alloc_fattr();
- if (fattr == NULL)
+ if (fattr == NULL) {
+ error = -ENOMEM;
goto out;
+ }
+
/*
* Return any delegations if we're going to change ACLs
*/
@@ -759,11 +761,13 @@ EXPORT_SYMBOL_GPL(nfs_put_lock_context);
* @ctx: pointer to context
* @is_sync: is this a synchronous close
*
- * always ensure that the attributes are up to date if we're mounted
- * with close-to-open semantics
+ * Ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics and we have cached data that will
+ * need to be revalidated on open.
*/
void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
{
+ struct nfs_inode *nfsi;
struct inode *inode;
struct nfs_server *server;
@@ -772,7 +776,12 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
if (!is_sync)
return;
inode = d_inode(ctx->dentry);
- if (!list_empty(&NFS_I(inode)->open_files))
+ nfsi = NFS_I(inode);
+ if (inode->i_mapping->nrpages == 0)
+ return;
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ return;
+ if (!list_empty(&nfsi->open_files))
return;
server = NFS_SERVER(inode);
if (server->flags & NFS_MOUNT_NOCTO)
@@ -844,6 +853,11 @@ void put_nfs_open_context(struct nfs_open_context *ctx)
}
EXPORT_SYMBOL_GPL(put_nfs_open_context);
+static void put_nfs_open_context_sync(struct nfs_open_context *ctx)
+{
+ __put_nfs_open_context(ctx, 1);
+}
+
/*
* Ensure that mmap has a recent RPC credential for use when writing out
* shared pages
@@ -888,7 +902,7 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
return ctx;
}
-static void nfs_file_clear_open_context(struct file *filp)
+void nfs_file_clear_open_context(struct file *filp)
{
struct nfs_open_context *ctx = nfs_file_open_context(filp);
@@ -899,7 +913,7 @@ static void nfs_file_clear_open_context(struct file *filp)
spin_lock(&inode->i_lock);
list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
spin_unlock(&inode->i_lock);
- __put_nfs_open_context(ctx, filp->f_flags & O_DIRECT ? 0 : 1);
+ put_nfs_open_context_sync(ctx);
}
}
@@ -919,12 +933,6 @@ int nfs_open(struct inode *inode, struct file *filp)
return 0;
}
-int nfs_release(struct inode *inode, struct file *filp)
-{
- nfs_file_clear_open_context(filp);
- return 0;
-}
-
/*
* This function is called whenever some part of NFS notices that
* the cached attributes have to be refreshed.
@@ -1273,13 +1281,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
return 0;
}
-static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
- if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
- return 0;
- return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
-}
-
static atomic_long_t nfs_attr_generation_counter;
static unsigned long nfs_read_attr_generation_counter(void)
@@ -1428,7 +1429,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
const struct nfs_inode *nfsi = NFS_I(inode);
return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
- nfs_ctime_need_update(inode, fattr) ||
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
@@ -1491,6 +1491,13 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
{
unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ /*
+ * Don't revalidate the pagecache if we hold a delegation, but do
+ * force an attribute update
+ */
+ if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED;
+
if (S_ISDIR(inode->i_mode))
invalid |= NFS_INO_INVALID_DATA;
nfs_set_cache_invalid(inode, invalid);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9b372b845f6a..56cfde26fb9c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -219,10 +219,6 @@ static inline void nfs_fs_proc_exit(void)
}
#endif
-#ifdef CONFIG_NFS_V4_1
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
-#endif
-
/* callback_xdr.c */
extern struct svc_version nfs4_callback_version1;
extern struct svc_version nfs4_callback_version4;
@@ -364,7 +360,6 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
/* file.c */
int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
loff_t nfs_file_llseek(struct file *, loff_t, int);
-int nfs_file_flush(struct file *, fl_owner_t);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
size_t, unsigned int);
@@ -490,6 +485,9 @@ void nfs_retry_commit(struct list_head *page_list,
void nfs_commitdata_release(struct nfs_commit_data *data);
void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
struct nfs_commit_info *cinfo);
+void nfs_request_add_commit_list_locked(struct nfs_page *req,
+ struct list_head *dst,
+ struct nfs_commit_info *cinfo);
void nfs_request_remove_commit_list(struct nfs_page *req,
struct nfs_commit_info *cinfo);
void nfs_init_cinfo(struct nfs_commit_info *cinfo,
@@ -623,13 +621,15 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
* Record the page as unstable and mark its inode as dirty.
*/
static inline
-void nfs_mark_page_unstable(struct page *page)
+void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
{
- struct inode *inode = page_file_mapping(page)->host;
+ if (!cinfo->dreq) {
+ struct inode *inode = page_file_mapping(page)->host;
- inc_zone_page_state(page, NR_UNSTABLE_NFS);
- inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ inc_zone_page_state(page, NR_UNSTABLE_NFS);
+ inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ }
}
/*
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9b04c2e6fffc..267126d32ec0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1103,6 +1103,7 @@ static void nfs3_xdr_enc_symlink3args(struct rpc_rqst *req,
{
encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
encode_symlinkdata3(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
}
/*
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index ff66ae700b89..814c1255f1d2 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,7 +17,5 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
-/* nfs4.2xdr.h */
-extern struct rpc_procinfo nfs4_2_procedures[];
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index a6bd27da6286..0eb29e14070d 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -238,8 +238,7 @@ out_overflow:
return -EIO;
}
-static int decode_layoutstats(struct xdr_stream *xdr,
- struct nfs42_layoutstat_res *res)
+static int decode_layoutstats(struct xdr_stream *xdr)
{
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}
@@ -343,7 +342,7 @@ static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
goto out;
WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
for (i = 0; i < res->num_dev; i++) {
- status = decode_layoutstats(xdr, res);
+ status = decode_layoutstats(xdr);
if (status)
goto out;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea3bee919a76..50cfc4ca7a02 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -405,9 +405,7 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
int nfs41_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **, struct rpc_cred *);
extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
-extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
-extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
-
+extern void nfs41_notify_server(struct nfs_client *);
#else
static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
{
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3aa6a9ba5113..223bedda64ae 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -729,10 +729,7 @@ static bool nfs4_cb_match_client(const struct sockaddr *addr,
return false;
/* Match only the IP address, not the port number */
- if (!nfs_sockaddr_match_ipaddr(addr, clap))
- return false;
-
- return true;
+ return rpc_cmp_addr(addr, clap);
}
/*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index dcd39d4e2efe..b0dbe0abed53 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -6,7 +6,9 @@
#include <linux/fs.h>
#include <linux/falloc.h>
#include <linux/nfs_fs.h>
+#include "delegation.h"
#include "internal.h"
+#include "iostat.h"
#include "fscache.h"
#include "pnfs.h"
@@ -27,7 +29,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
struct inode *dir;
unsigned openflags = filp->f_flags;
struct iattr attr;
- int opened = 0;
int err;
/*
@@ -66,7 +67,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_sync_inode(inode);
}
- inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened);
+ inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
switch (err) {
@@ -100,6 +101,31 @@ out_drop:
goto out_put_ctx;
}
+/*
+ * Flush all dirty pages, and check for write errors.
+ */
+static int
+nfs4_file_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+
+ dprintk("NFS: flush(%pD2)\n", file);
+
+ nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
+ if ((file->f_mode & FMODE_WRITE) == 0)
+ return 0;
+
+ /*
+ * If we're holding a write delegation, then check if we're required
+ * to flush the i/o on close. If not, then just start the i/o now.
+ */
+ if (!nfs4_delegation_flush_on_close(inode))
+ return filemap_fdatawrite(file->f_mapping);
+
+ /* Flush writes to the server and return any errors */
+ return vfs_fsync(file, 0);
+}
+
static int
nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
@@ -178,7 +204,7 @@ const struct file_operations nfs4_file_operations = {
.write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
.open = nfs4_file_open,
- .flush = nfs_file_flush,
+ .flush = nfs4_file_flush,
.release = nfs_file_release,
.fsync = nfs4_file_fsync,
.lock = nfs_lock,
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 535dfc69c628..2e4902203c35 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -184,7 +184,7 @@ static struct key_type key_type_id_resolver = {
.read = user_read,
};
-static int nfs_idmap_init_keyring(void)
+int nfs_idmap_init(void)
{
struct cred *cred;
struct key *keyring;
@@ -230,7 +230,7 @@ failed_put_cred:
return ret;
}
-static void nfs_idmap_quit_keyring(void)
+void nfs_idmap_quit(void)
{
key_revoke(id_resolver_cache->thread_keyring);
unregister_key_type(&key_type_id_resolver);
@@ -492,16 +492,6 @@ nfs_idmap_delete(struct nfs_client *clp)
kfree(idmap);
}
-int nfs_idmap_init(void)
-{
- return nfs_idmap_init_keyring();
-}
-
-void nfs_idmap_quit(void)
-{
- nfs_idmap_quit_keyring();
-}
-
static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
struct idmap_msg *im,
struct rpc_pipe_msg *msg)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3acb1eb72930..693b903b48bd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -586,7 +586,7 @@ out_unlock:
spin_unlock(&tbl->slot_tbl_lock);
res->sr_slot = NULL;
if (send_new_highest_used_slotid)
- nfs41_server_notify_highest_slotid_update(session->clp);
+ nfs41_notify_server(session->clp);
}
int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -1150,7 +1150,8 @@ out:
return ret;
}
-static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
+ enum open_claim_type4 claim)
{
if (delegation == NULL)
return 0;
@@ -1158,6 +1159,16 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
return 0;
if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
return 0;
+ switch (claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ if (!test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
+ break;
+ default:
+ return 0;
+ }
nfs_mark_delegation_referenced(delegation);
return 1;
}
@@ -1220,6 +1231,7 @@ static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
}
static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+ nfs4_stateid *arg_stateid,
nfs4_stateid *stateid, fmode_t fmode)
{
clear_bit(NFS_O_RDWR_STATE, &state->flags);
@@ -1238,8 +1250,9 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
if (stateid == NULL)
return;
/* Handle races with OPEN */
- if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
- !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+ if (!nfs4_stateid_match_other(arg_stateid, &state->open_stateid) ||
+ (nfs4_stateid_match_other(stateid, &state->open_stateid) &&
+ !nfs4_stateid_is_newer(stateid, &state->open_stateid))) {
nfs_resync_open_stateid_locked(state);
return;
}
@@ -1248,10 +1261,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
nfs4_stateid_copy(&state->open_stateid, stateid);
}
-static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_clear_open_stateid(struct nfs4_state *state,
+ nfs4_stateid *arg_stateid,
+ nfs4_stateid *stateid, fmode_t fmode)
{
write_seqlock(&state->seqlock);
- nfs_clear_open_stateid_locked(state, stateid, fmode);
+ nfs_clear_open_stateid_locked(state, arg_stateid, stateid, fmode);
write_sequnlock(&state->seqlock);
if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
@@ -1376,6 +1391,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
struct nfs_delegation *delegation;
int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
+ enum open_claim_type4 claim = opendata->o_arg.claim;
nfs4_stateid stateid;
int ret = -EAGAIN;
@@ -1389,7 +1405,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
spin_unlock(&state->owner->so_lock);
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
- if (!can_open_delegated(delegation, fmode)) {
+ if (!can_open_delegated(delegation, fmode, claim)) {
rcu_read_unlock();
break;
}
@@ -1852,6 +1868,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
struct nfs4_opendata *data = calldata;
struct nfs4_state_owner *sp = data->owner;
struct nfs_client *clp = sp->so_server->nfs_client;
+ enum open_claim_type4 claim = data->o_arg.claim;
if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0)
goto out_wait;
@@ -1866,15 +1883,15 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
goto out_no_action;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
- if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR &&
- data->o_arg.claim != NFS4_OPEN_CLAIM_DELEG_CUR_FH &&
- can_open_delegated(delegation, data->o_arg.fmode))
+ if (can_open_delegated(delegation, data->o_arg.fmode, claim))
goto unlock_no_action;
rcu_read_unlock();
}
/* Update client id. */
data->o_arg.clientid = clp->cl_clientid;
- switch (data->o_arg.claim) {
+ switch (claim) {
+ default:
+ break;
case NFS4_OPEN_CLAIM_PREVIOUS:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
@@ -2294,15 +2311,25 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
* fields corresponding to attributes that were used to store the verifier.
* Make sure we clobber those fields in the later setattr call
*/
-static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct iattr *sattr)
+static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
+ struct iattr *sattr, struct nfs4_label **label)
{
- if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
+ const u32 *attrset = opendata->o_res.attrset;
+
+ if ((attrset[1] & FATTR4_WORD1_TIME_ACCESS) &&
!(sattr->ia_valid & ATTR_ATIME_SET))
sattr->ia_valid |= ATTR_ATIME;
- if ((opendata->o_res.attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
+ if ((attrset[1] & FATTR4_WORD1_TIME_MODIFY) &&
!(sattr->ia_valid & ATTR_MTIME_SET))
sattr->ia_valid |= ATTR_MTIME;
+
+ /* Except MODE, it seems harmless of setting twice. */
+ if ((attrset[1] & FATTR4_WORD1_MODE))
+ sattr->ia_valid &= ~ATTR_MODE;
+
+ if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
+ *label = NULL;
}
static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
@@ -2425,9 +2452,9 @@ static int _nfs4_do_open(struct inode *dir,
goto err_free_label;
state = ctx->state;
- if ((opendata->o_arg.open_flags & O_EXCL) &&
+ if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) &&
(opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) {
- nfs4_exclusive_attrset(opendata, sattr);
+ nfs4_exclusive_attrset(opendata, sattr, &label);
nfs_fattr_init(opendata->o_res.f_attr);
status = nfs4_do_setattr(state->inode, cred,
@@ -2439,7 +2466,7 @@ static int _nfs4_do_open(struct inode *dir,
nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
}
}
- if (opendata->file_created)
+ if (opened && opendata->file_created)
*opened |= FILE_CREATED;
if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
@@ -2661,7 +2688,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
switch (task->tk_status) {
case 0:
res_stateid = &calldata->res.stateid;
- if (calldata->arg.fmode == 0 && calldata->roc)
+ if (calldata->roc)
pnfs_roc_set_barrier(state->inode,
calldata->roc_barrier);
renew_lease(server, calldata->timestamp);
@@ -2684,7 +2711,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
goto out_release;
}
}
- nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
+ nfs_clear_open_stateid(state, &calldata->arg.stateid,
+ res_stateid, calldata->arg.fmode);
out_release:
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2735,14 +2763,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_no_action;
}
- if (calldata->arg.fmode == 0) {
+ if (calldata->arg.fmode == 0)
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
- if (calldata->roc &&
- pnfs_roc_drain(inode, &calldata->roc_barrier, task)) {
- nfs_release_seqid(calldata->arg.seqid);
- goto out_wait;
- }
- }
+ if (calldata->roc)
+ pnfs_roc_get_barrier(inode, &calldata->roc_barrier);
+
calldata->arg.share_access =
nfs4_map_atomic_open_share(NFS_SERVER(inode),
calldata->arg.fmode, 0);
@@ -2883,8 +2908,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
+ u32 bitmask[3] = {}, minorversion = server->nfs_client->cl_minorversion;
struct nfs4_server_caps_arg args = {
.fhandle = fhandle,
+ .bitmask = bitmask,
};
struct nfs4_server_caps_res res = {};
struct rpc_message msg = {
@@ -2894,10 +2921,18 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
};
int status;
+ bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS |
+ FATTR4_WORD0_FH_EXPIRE_TYPE |
+ FATTR4_WORD0_LINK_SUPPORT |
+ FATTR4_WORD0_SYMLINK_SUPPORT |
+ FATTR4_WORD0_ACLSUPPORT;
+ if (minorversion)
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {
/* Sanity check the server answers */
- switch (server->nfs_client->cl_minorversion) {
+ switch (minorversion) {
case 0:
res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
res.attr_bitmask[2] = 0;
@@ -2950,6 +2985,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
server->cache_consistency_bitmask[2] = 0;
+ memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask,
+ sizeof(server->exclcreat_bitmask));
server->acl_bitmask = res.acl_bitmask;
server->fh_expire_type = res.fh_expire_type;
}
@@ -3552,7 +3589,6 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
struct nfs4_label l, *ilabel = NULL;
struct nfs_open_context *ctx;
struct nfs4_state *state;
- int opened = 0;
int status = 0;
ctx = alloc_nfs_open_context(dentry, FMODE_READ);
@@ -3562,7 +3598,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
ilabel = nfs4_label_init_security(dir, dentry, sattr, &l);
sattr->ia_mode &= ~current_umask();
- state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, &opened);
+ state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL);
if (IS_ERR(state)) {
status = PTR_ERR(state);
goto out;
@@ -4978,13 +5014,12 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
int result;
size_t len;
char *str;
- bool retried = false;
if (clp->cl_owner_id != NULL)
return 0;
-retry:
+
rcu_read_lock();
- len = 10 + strlen(clp->cl_ipaddr) + 1 +
+ len = 14 + strlen(clp->cl_ipaddr) + 1 +
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
1 +
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
@@ -5010,14 +5045,6 @@ retry:
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
rcu_read_unlock();
- /* Did something change? */
- if (result >= len) {
- kfree(str);
- if (retried)
- return -EINVAL;
- retried = true;
- goto retry;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5049,10 +5076,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
clp->rpc_ops->version, clp->cl_minorversion,
nfs4_client_id_uniquifier,
clp->cl_rpcclient->cl_nodename);
- if (result >= len) {
- kfree(str);
- return -EINVAL;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5088,10 +5111,6 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
result = scnprintf(str, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
clp->cl_rpcclient->cl_nodename);
- if (result >= len) {
- kfree(str);
- return -EINVAL;
- }
clp->cl_owner_id = str;
return 0;
}
@@ -5289,9 +5308,8 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
- if (d_data->roc &&
- pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
- return;
+ if (d_data->roc)
+ pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
nfs4_setup_sequence(d_data->res.server,
&d_data->args.seq_args,
@@ -7746,10 +7764,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
case 0:
goto out;
/*
+ * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
+ * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
+ */
+ case -NFS4ERR_BADLAYOUT:
+ goto out_overflow;
+ /*
* NFS4ERR_LAYOUTTRYLATER is a conflict with another client
- * (or clients) writing to the same RAID stripe
+ * (or clients) writing to the same RAID stripe except when
+ * the minlength argument is 0 (see RFC5661 section 18.43.3).
*/
case -NFS4ERR_LAYOUTTRYLATER:
+ if (lgp->args.minlength == 0)
+ goto out_overflow;
/*
* NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
* existing layout before getting a new one).
@@ -7805,6 +7832,10 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
rpc_restart_call_prepare(task);
out:
dprintk("<-- %s\n", __func__);
+ return;
+out_overflow:
+ task->tk_status = -EOVERFLOW;
+ goto out;
}
static size_t max_response_pages(struct nfs_server *server)
@@ -8661,6 +8692,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
};
#endif
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f2e2ad894461..da73bc443238 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2152,23 +2152,13 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
}
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
-static void nfs41_ping_server(struct nfs_client *clp)
+void nfs41_notify_server(struct nfs_client *clp)
{
/* Use CHECK_LEASE to ping the server with a SEQUENCE */
set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
nfs4_schedule_state_manager(clp);
}
-void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
-{
- nfs41_ping_server(clp);
-}
-
-void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
-{
- nfs41_ping_server(clp);
-}
-
static void nfs4_reset_all_state(struct nfs_client *clp)
{
if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 470af1a78bec..28df12e525ba 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -884,6 +884,66 @@ DEFINE_NFS4_GETATTR_EVENT(nfs4_getattr);
DEFINE_NFS4_GETATTR_EVENT(nfs4_lookup_root);
DEFINE_NFS4_GETATTR_EVENT(nfs4_fsinfo);
+DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const struct nfs_fh *fhandle,
+ const struct inode *inode,
+ int error
+ ),
+
+ TP_ARGS(clp, fhandle, inode, error),
+
+ TP_STRUCT__entry(
+ __field(int, error)
+ __field(dev_t, dev)
+ __field(u32, fhandle)
+ __field(u64, fileid)
+ __string(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_fast_assign(
+ __entry->error = error;
+ __entry->fhandle = nfs_fhandle_hash(fhandle);
+ if (inode != NULL) {
+ __entry->fileid = NFS_FILEID(inode);
+ __entry->dev = inode->i_sb->s_dev;
+ } else {
+ __entry->fileid = 0;
+ __entry->dev = 0;
+ }
+ __assign_str(dstaddr, clp ?
+ rpc_peeraddr2str(clp->cl_rpcclient,
+ RPC_DISPLAY_ADDR) : "unknown")
+ ),
+
+ TP_printk(
+ "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+ "dstaddr=%s",
+ __entry->error,
+ show_nfsv4_errors(__entry->error),
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long long)__entry->fileid,
+ __entry->fhandle,
+ __get_str(dstaddr)
+ )
+);
+
+#define DEFINE_NFS4_INODE_CALLBACK_EVENT(name) \
+ DEFINE_EVENT(nfs4_inode_callback_event, name, \
+ TP_PROTO( \
+ const struct nfs_client *clp, \
+ const struct nfs_fh *fhandle, \
+ const struct inode *inode, \
+ int error \
+ ), \
+ TP_ARGS(clp, fhandle, inode, error))
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
+DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
+
+
DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_PROTO(
const char *name,
@@ -1136,6 +1196,7 @@ TRACE_EVENT(nfs4_layoutget,
DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65dbdb7..788adf3897c7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -400,7 +400,8 @@ static int nfs4_stat_to_errno(int);
#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
- 1 /* FIXME: opaque lrf_body always empty at the moment */)
+ 1 + \
+ XDR_QUADLEN(NFS4_OPAQUE_LIMIT))
#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
1 + decode_stateid_maxsz)
#define encode_secinfo_no_name_maxsz (op_encode_hdr_maxsz + 1)
@@ -1001,7 +1002,8 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
const struct nfs4_label *label,
- const struct nfs_server *server)
+ const struct nfs_server *server,
+ bool excl_check)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -1067,6 +1069,17 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap,
bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET;
len += 4;
}
+
+ if (excl_check) {
+ const u32 *excl_bmval = server->exclcreat_bitmask;
+ bmval[0] &= excl_bmval[0];
+ bmval[1] &= excl_bmval[1];
+ bmval[2] &= excl_bmval[2];
+
+ if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL))
+ label = NULL;
+ }
+
if (label) {
len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2);
bmval[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -1154,7 +1167,9 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
case NF4LNK:
p = reserve_space(xdr, 4);
*p = cpu_to_be32(create->u.symlink.len);
- xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+ xdr_write_pages(xdr, create->u.symlink.pages, 0,
+ create->u.symlink.len);
+ xdr->buf->flags |= XDRBUF_WRITE;
break;
case NF4BLK: case NF4CHR:
@@ -1168,7 +1183,7 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
}
encode_string(xdr, create->name->len, create->name->name);
- encode_attrs(xdr, create->attrs, create->label, create->server);
+ encode_attrs(xdr, create->attrs, create->label, create->server, false);
}
static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
@@ -1382,18 +1397,17 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
{
- struct iattr dummy;
__be32 *p;
p = reserve_space(xdr, 4);
switch(arg->createmode) {
case NFS4_CREATE_UNCHECKED:
*p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
break;
case NFS4_CREATE_GUARDED:
*p = cpu_to_be32(NFS4_CREATE_GUARDED);
- encode_attrs(xdr, arg->u.attrs, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false);
break;
case NFS4_CREATE_EXCLUSIVE:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
@@ -1402,8 +1416,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
case NFS4_CREATE_EXCLUSIVE4_1:
*p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1);
encode_nfs4_verifier(xdr, &arg->u.verifier);
- dummy.ia_valid = 0;
- encode_attrs(xdr, &dummy, arg->label, arg->server);
+ encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true);
}
}
@@ -1659,7 +1672,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
{
encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr);
encode_nfs4_stateid(xdr, &arg->stateid);
- encode_attrs(xdr, arg->iap, arg->label, server);
+ encode_attrs(xdr, arg->iap, arg->label, server, false);
}
static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
@@ -2580,6 +2593,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
struct xdr_stream *xdr,
struct nfs4_server_caps_arg *args)
{
+ const u32 *bitmask = args->bitmask;
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
};
@@ -2587,11 +2601,7 @@ static void nfs4_xdr_enc_server_caps(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
encode_putfh(xdr, args->fhandle, &hdr);
- encode_getattr_one(xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
- FATTR4_WORD0_FH_EXPIRE_TYPE|
- FATTR4_WORD0_LINK_SUPPORT|
- FATTR4_WORD0_SYMLINK_SUPPORT|
- FATTR4_WORD0_ACLSUPPORT, &hdr);
+ encode_getattr_three(xdr, bitmask[0], bitmask[1], bitmask[2], &hdr);
encode_nops(&hdr);
}
@@ -3368,6 +3378,22 @@ out_overflow:
return -EIO;
}
+static int decode_attr_exclcreat_supported(struct xdr_stream *xdr,
+ uint32_t *bitmap, uint32_t *bitmask)
+{
+ if (likely(bitmap[2] & FATTR4_WORD2_SUPPATTR_EXCLCREAT)) {
+ int ret;
+ ret = decode_attr_bitmap(xdr, bitmask);
+ if (unlikely(ret < 0))
+ return ret;
+ bitmap[2] &= ~FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+ } else
+ bitmask[0] = bitmask[1] = bitmask[2] = 0;
+ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__,
+ bitmask[0], bitmask[1], bitmask[2]);
+ return 0;
+}
+
static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fh *fh)
{
__be32 *p;
@@ -4321,6 +4347,9 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
goto xdr_error;
if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0)
goto xdr_error;
+ if ((status = decode_attr_exclcreat_supported(xdr, bitmap,
+ res->exclcreat_bitmask)) != 0)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
dprintk("%s: xdr returned %d!\n", __func__, -status);
@@ -4903,24 +4932,28 @@ static int decode_lookup(struct xdr_stream *xdr)
}
/* This is too sick! */
-static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
+static int decode_space_limit(struct xdr_stream *xdr,
+ unsigned long *pagemod_limit)
{
__be32 *p;
uint32_t limit_type, nblocks, blocksize;
+ u64 maxsize = 0;
p = xdr_inline_decode(xdr, 12);
if (unlikely(!p))
goto out_overflow;
limit_type = be32_to_cpup(p++);
switch (limit_type) {
- case 1:
- xdr_decode_hyper(p, maxsize);
+ case NFS4_LIMIT_SIZE:
+ xdr_decode_hyper(p, &maxsize);
break;
- case 2:
+ case NFS4_LIMIT_BLOCKS:
nblocks = be32_to_cpup(p++);
blocksize = be32_to_cpup(p);
- *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+ maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
}
+ maxsize >>= PAGE_CACHE_SHIFT;
+ *pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
return 0;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -4948,7 +4981,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr,
break;
case NFS4_OPEN_DELEGATE_WRITE:
res->delegation_type = FMODE_WRITE|FMODE_READ;
- if (decode_space_limit(xdr, &res->maxsize) < 0)
+ if (decode_space_limit(xdr, &res->pagemod_limit) < 0)
return -EIO;
}
return decode_ace(xdr, NULL, res->server->nfs_client);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 4984bbe55ff1..7c5718ba625e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(nfs_pgheader_init);
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
{
spin_lock(&hdr->lock);
- if (pos < hdr->io_start + hdr->good_bytes) {
- set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+ if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
+ || pos < hdr->io_start + hdr->good_bytes) {
clear_bit(NFS_IOHDR_EOF, &hdr->flags);
hdr->good_bytes = pos - hdr->io_start;
hdr->error = error;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70bf706b1090..ba1246433794 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -368,7 +368,6 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false;
lo->plh_return_iomode = 0;
- lo->plh_block_lgets++;
pnfs_get_layout_hdr(lo);
clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
return true;
@@ -817,25 +816,12 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}
-static bool
-pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range)
-{
- return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
- (lo->plh_return_iomode == IOMODE_ANY ||
- lo->plh_return_iomode == range->iomode);
-}
-
/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
-pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
- struct pnfs_layout_range *range, int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
return lo->plh_block_lgets ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
- (list_empty(&lo->plh_segs) &&
- (atomic_read(&lo->plh_outstanding) > lget)) ||
- pnfs_layout_returning(lo, range);
+ test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
int
@@ -847,7 +833,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- if (pnfs_layoutgets_blocked(lo, range, 1)) {
+ if (pnfs_layoutgets_blocked(lo)) {
status = -EAGAIN;
} else if (!nfs4_valid_open_stateid(open_state)) {
status = -EBADF;
@@ -882,6 +868,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
struct nfs_server *server = NFS_SERVER(ino);
struct nfs4_layoutget *lgp;
struct pnfs_layout_segment *lseg;
+ loff_t i_size;
dprintk("--> %s\n", __func__);
@@ -889,9 +876,17 @@ send_layoutget(struct pnfs_layout_hdr *lo,
if (lgp == NULL)
return NULL;
+ i_size = i_size_read(ino);
+
lgp->args.minlength = PAGE_CACHE_SIZE;
if (lgp->args.minlength > range->length)
lgp->args.minlength = range->length;
+ if (range->iomode == IOMODE_READ) {
+ if (range->offset >= i_size)
+ lgp->args.minlength = 0;
+ else if (i_size - range->offset < lgp->args.minlength)
+ lgp->args.minlength = i_size - range->offset;
+ }
lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
lgp->args.range = *range;
lgp->args.type = server->pnfs_curr_ld->id;
@@ -956,9 +951,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
- lo->plh_block_lgets--;
pnfs_clear_layoutreturn_waitbit(lo);
- rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
spin_unlock(&ino->i_lock);
pnfs_put_layout_hdr(lo);
goto out;
@@ -1080,15 +1073,14 @@ bool pnfs_roc(struct inode *ino)
struct pnfs_layout_segment *lseg, *tmp;
nfs4_stateid stateid;
LIST_HEAD(tmp_list);
- bool found = false, layoutreturn = false;
+ bool found = false, layoutreturn = false, roc = false;
spin_lock(&ino->i_lock);
lo = nfsi->layout;
- if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
- test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
+ if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
goto out_noroc;
- /* Don't return layout if we hold a delegation */
+ /* no roc if we hold a delegation */
if (nfs4_check_delegation(ino, FMODE_READ))
goto out_noroc;
@@ -1099,34 +1091,41 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
+ stateid = lo->plh_stateid;
+ /* always send layoutreturn if being marked so */
+ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags))
+ layoutreturn = pnfs_prepare_layoutreturn(lo);
+
pnfs_clear_retry_layoutget(lo);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
- if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
+ /* If we are sending layoutreturn, invalidate all valid lsegs */
+ if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
mark_lseg_invalid(lseg, &tmp_list);
found = true;
}
- if (!found)
- goto out_noroc;
- lo->plh_block_lgets++;
- pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
- spin_unlock(&ino->i_lock);
- pnfs_free_lseg_list(&tmp_list);
- pnfs_layoutcommit_inode(ino, true);
- return true;
+ /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put
+ * in pnfs_roc_release(). We don't really send a layoutreturn but
+ * still want others to view us like we are sending one!
+ *
+ * If pnfs_prepare_layoutreturn() fails, it means someone else is doing
+ * LAYOUTRETURN, so we proceed like there are no layouts to return.
+ *
+ * ROC in three conditions:
+ * 1. there are ROC lsegs
+ * 2. we don't send layoutreturn
+ * 3. no others are sending layoutreturn
+ */
+ if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo))
+ roc = true;
out_noroc:
- if (lo) {
- stateid = lo->plh_stateid;
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
- }
spin_unlock(&ino->i_lock);
- if (layoutreturn) {
- pnfs_layoutcommit_inode(ino, true);
+ pnfs_free_lseg_list(&tmp_list);
+ pnfs_layoutcommit_inode(ino, true);
+ if (layoutreturn)
pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
- }
- return false;
+ return roc;
}
void pnfs_roc_release(struct inode *ino)
@@ -1135,7 +1134,7 @@ void pnfs_roc_release(struct inode *ino)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
- lo->plh_block_lgets--;
+ pnfs_clear_layoutreturn_waitbit(lo);
if (atomic_dec_and_test(&lo->plh_refcount)) {
pnfs_detach_layout_hdr(lo);
spin_unlock(&ino->i_lock);
@@ -1153,27 +1152,16 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
+ trace_nfs4_layoutreturn_on_close(ino, 0);
}
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
struct nfs_inode *nfsi = NFS_I(ino);
struct pnfs_layout_hdr *lo;
- struct pnfs_layout_segment *lseg;
- nfs4_stateid stateid;
u32 current_seqid;
- bool layoutreturn = false;
spin_lock(&ino->i_lock);
- list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) {
- if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags))
- continue;
- if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
- continue;
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
- spin_unlock(&ino->i_lock);
- return true;
- }
lo = nfsi->layout;
current_seqid = be32_to_cpu(lo->plh_stateid.seqid);
@@ -1181,19 +1169,7 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
* a barrier, we choose the worst-case barrier.
*/
*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
- stateid = lo->plh_stateid;
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
- if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
- rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
-
spin_unlock(&ino->i_lock);
- if (layoutreturn) {
- pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
- return true;
- }
- return false;
}
/*
@@ -1221,16 +1197,41 @@ pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
}
-static void
-pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
- struct pnfs_layout_segment *lseg)
+static bool
+pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
+ const struct pnfs_layout_range *l2)
{
- struct pnfs_layout_segment *lp;
+ return pnfs_lseg_range_cmp(l1, l2) > 0;
+}
+
+static bool
+pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old)
+{
+ return false;
+}
+
+void
+pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *,
+ const struct pnfs_layout_range *),
+ bool (*do_merge)(struct pnfs_layout_segment *,
+ struct pnfs_layout_segment *),
+ struct list_head *free_me)
+{
+ struct pnfs_layout_segment *lp, *tmp;
dprintk("%s:Begin\n", __func__);
- list_for_each_entry(lp, &lo->plh_segs, pls_list) {
- if (pnfs_lseg_range_cmp(&lseg->pls_range, &lp->pls_range) > 0)
+ list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
+ if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
+ continue;
+ if (do_merge(lseg, lp)) {
+ mark_lseg_invalid(lp, free_me);
+ continue;
+ }
+ if (is_after(&lseg->pls_range, &lp->pls_range))
continue;
list_add_tail(&lseg->pls_list, &lp->pls_list);
dprintk("%s: inserted lseg %p "
@@ -1252,6 +1253,24 @@ out:
dprintk("%s:Return\n", __func__);
}
+EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
+
+static void
+pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me)
+{
+ struct inode *inode = lo->plh_inode;
+ struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+ if (ld->add_lseg != NULL)
+ ld->add_lseg(lo, lseg, free_me);
+ else
+ pnfs_generic_layout_insert_lseg(lo, lseg,
+ pnfs_lseg_range_is_after,
+ pnfs_lseg_no_merge,
+ free_me);
+}
static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode *ino,
@@ -1344,8 +1363,6 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
ret = pnfs_get_lseg(lseg);
break;
}
- if (lseg->pls_range.offset > range->offset)
- break;
}
dprintk("%s:Return lseg %p ref %d\n",
@@ -1438,6 +1455,8 @@ static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
+ if (!pnfs_should_retry_layoutget(lo))
+ return false;
/*
* send layoutcommit as it can hold up layoutreturn due to lseg
* reference
@@ -1484,6 +1503,9 @@ pnfs_update_layout(struct inode *ino,
if (!pnfs_enabled_sb(NFS_SERVER(ino)))
goto out;
+ if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+ goto out;
+
if (pnfs_within_mdsthreshold(ctx, ino, iomode))
goto out;
@@ -1533,8 +1555,7 @@ lookup_again:
* Because we free lsegs before sending LAYOUTRETURN, we need to wait
* for LAYOUTRETURN even if first is true.
*/
- if (!lseg && pnfs_should_retry_layoutget(lo) &&
- test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
spin_unlock(&ino->i_lock);
dprintk("%s wait for layoutreturn\n", __func__);
if (pnfs_prepare_to_retry_layoutget(lo)) {
@@ -1547,7 +1568,7 @@ lookup_again:
goto out_put_layout_hdr;
}
- if (pnfs_layoutgets_blocked(lo, &arg, 0))
+ if (pnfs_layoutgets_blocked(lo))
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
spin_unlock(&ino->i_lock);
@@ -1593,6 +1614,26 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(pnfs_update_layout);
+static bool
+pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
+{
+ switch (range->iomode) {
+ case IOMODE_READ:
+ case IOMODE_RW:
+ break;
+ default:
+ return false;
+ }
+ if (range->offset == NFS4_MAX_UINT64)
+ return false;
+ if (range->length == 0)
+ return false;
+ if (range->length != NFS4_MAX_UINT64 &&
+ range->length > NFS4_MAX_UINT64 - range->offset)
+ return false;
+ return true;
+}
+
struct pnfs_layout_segment *
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
@@ -1601,7 +1642,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
LIST_HEAD(free_me);
- int status = 0;
+ int status = -EINVAL;
+
+ if (!pnfs_sanity_check_layout_range(&res->range))
+ goto out;
/* Inject layout blob into I/O device driver */
lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
@@ -1619,12 +1663,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
lseg->pls_range = res->range;
spin_lock(&ino->i_lock);
- if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
- dprintk("%s forget reply due to recall\n", __func__);
- goto out_forget_reply;
- }
-
- if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
+ if (pnfs_layoutgets_blocked(lo)) {
dprintk("%s forget reply due to state\n", __func__);
goto out_forget_reply;
}
@@ -1651,12 +1690,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
pnfs_get_lseg(lseg);
- pnfs_layout_insert_lseg(lo, lseg);
+ pnfs_layout_insert_lseg(lo, lseg, &free_me);
- if (res->return_on_close) {
+ if (res->return_on_close)
set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
- set_bit(NFS_LAYOUT_ROC, &lo->plh_flags);
- }
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me);
@@ -1692,6 +1729,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg->pls_range.length);
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
mark_lseg_invalid(lseg, tmp_list);
+ set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags);
}
}
@@ -2267,7 +2306,7 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
#if IS_ENABLED(CONFIG_NFS_V4_2)
int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
{
struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
struct nfs_server *server = NFS_SERVER(inode);
@@ -2294,7 +2333,7 @@ pnfs_report_layoutstat(struct inode *inode)
pnfs_get_layout_hdr(hdr);
spin_unlock(&inode->i_lock);
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), gfp_flags);
if (!data) {
status = -ENOMEM;
goto out_put;
@@ -2324,3 +2363,7 @@ out_put:
}
EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
#endif
+
+unsigned int layoutstats_timer;
+module_param(layoutstats_timer, uint, 0644);
+EXPORT_SYMBOL_GPL(layoutstats_timer);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 3e6ab7bfbabd..78c9351ff117 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,7 +94,6 @@ enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
- NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_RETURN, /* Return this layout ASAP */
NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
@@ -129,6 +128,9 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
void (*free_lseg) (struct pnfs_layout_segment *lseg);
+ void (*add_lseg) (struct pnfs_layout_hdr *layoutid,
+ struct pnfs_layout_segment *lseg,
+ struct list_head *free_me);
void (*return_range) (struct pnfs_layout_hdr *lo,
struct pnfs_layout_range *range);
@@ -184,15 +186,15 @@ struct pnfs_layoutdriver_type {
struct pnfs_layout_hdr {
atomic_t plh_refcount;
+ atomic_t plh_outstanding; /* number of RPCs out */
struct list_head plh_layouts; /* other client layouts */
struct list_head plh_bulk_destroy;
struct list_head plh_segs; /* layout segments list */
- nfs4_stateid plh_stateid;
- atomic_t plh_outstanding; /* number of RPCs out */
unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */
- u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_retry_timestamp;
unsigned long plh_flags;
+ nfs4_stateid plh_stateid;
+ u32 plh_barrier; /* ignore lower seqids */
enum pnfs_iomode plh_return_iomode;
loff_t plh_lwb; /* last write byte for layoutcommit */
struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
@@ -267,7 +269,7 @@ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
-bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
+void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -286,6 +288,14 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
gfp_t gfp_flags);
void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
+void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg,
+ bool (*is_after)(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *old),
+ bool (*do_merge)(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_segment *old),
+ struct list_head *free_me);
+
void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
@@ -529,12 +539,31 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
nfss->pnfs_curr_ld->id == src->l_type);
}
+static inline u64
+pnfs_calc_offset_end(u64 offset, u64 len)
+{
+ if (len == NFS4_MAX_UINT64 || len >= NFS4_MAX_UINT64 - offset)
+ return NFS4_MAX_UINT64;
+ return offset + len - 1;
+}
+
+static inline u64
+pnfs_calc_offset_length(u64 offset, u64 end)
+{
+ if (end == NFS4_MAX_UINT64 || end <= offset)
+ return NFS4_MAX_UINT64;
+ return 1 + end - offset;
+}
+
+extern unsigned int layoutstats_timer;
+
#ifdef NFS_DEBUG
void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
#else
static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
{
}
+
#endif /* NFS_DEBUG */
#else /* CONFIG_NFS_V4_1 */
@@ -605,10 +634,9 @@ pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
{
}
-static inline bool
-pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
+static inline void
+pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
- return false;
}
static inline void set_pnfs_layoutdriver(struct nfs_server *s,
@@ -691,10 +719,10 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
#endif /* CONFIG_NFS_V4_1 */
#if IS_ENABLED(CONFIG_NFS_V4_2)
-int pnfs_report_layoutstat(struct inode *inode);
+int pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags);
#else
static inline int
-pnfs_report_layoutstat(struct inode *inode)
+pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
{
return 0;
}
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index f37e25b6311c..24655b807d44 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -124,11 +124,12 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
if (ret) {
cinfo->ds->nwritten -= ret;
cinfo->ds->ncommitting += ret;
- bucket->clseg = bucket->wlseg;
- if (list_empty(src))
+ if (bucket->clseg == NULL)
+ bucket->clseg = pnfs_get_lseg(bucket->wlseg);
+ if (list_empty(src)) {
+ pnfs_put_lseg_locked(bucket->wlseg);
bucket->wlseg = NULL;
- else
- pnfs_get_lseg(bucket->clseg);
+ }
}
return ret;
}
@@ -182,19 +183,23 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
struct pnfs_commit_bucket *bucket;
struct pnfs_layout_segment *freeme;
+ LIST_HEAD(pages);
int i;
+ spin_lock(cinfo->lock);
for (i = idx; i < fl_cinfo->nbuckets; i++) {
bucket = &fl_cinfo->buckets[i];
if (list_empty(&bucket->committing))
continue;
- nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
- spin_lock(cinfo->lock);
freeme = bucket->clseg;
bucket->clseg = NULL;
+ list_splice_init(&bucket->committing, &pages);
spin_unlock(cinfo->lock);
+ nfs_retry_commit(&pages, freeme, cinfo, i);
pnfs_put_lseg(freeme);
+ spin_lock(cinfo->lock);
}
+ spin_unlock(cinfo->lock);
}
static unsigned int
@@ -216,10 +221,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
if (!data)
break;
data->ds_commit_index = i;
- spin_lock(cinfo->lock);
- data->lseg = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
list_add(&data->pages, list);
nreq++;
}
@@ -229,6 +230,22 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
return nreq;
}
+static inline
+void pnfs_fetch_commit_bucket_list(struct list_head *pages,
+ struct nfs_commit_data *data,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *bucket;
+
+ bucket = &cinfo->ds->buckets[data->ds_commit_index];
+ spin_lock(cinfo->lock);
+ list_splice_init(&bucket->committing, pages);
+ data->lseg = bucket->clseg;
+ bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+
+}
+
/* This follows nfs_commit_list pretty closely */
int
pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
@@ -243,7 +260,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
if (!list_empty(mds_pages)) {
data = nfs_commitdata_alloc();
if (data != NULL) {
- data->lseg = NULL;
+ data->ds_commit_index = -1;
list_add(&data->pages, &list);
nreq++;
} else {
@@ -265,19 +282,16 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
list_for_each_entry_safe(data, tmp, &list, pages) {
list_del_init(&data->pages);
- if (!data->lseg) {
+ if (data->ds_commit_index < 0) {
nfs_init_commit(data, mds_pages, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
data->mds_ops, how, 0);
} else {
- struct pnfs_commit_bucket *buckets;
+ LIST_HEAD(pages);
- buckets = cinfo->ds->buckets;
- nfs_init_commit(data,
- &buckets[data->ds_commit_index].committing,
- data->lseg,
- cinfo);
+ pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
+ nfs_init_commit(data, &pages, data->lseg, cinfo);
initiate_commit(data, how);
}
}
@@ -359,26 +373,31 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
return false;
}
+/*
+ * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does,
+ * declare a match.
+ */
static bool
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
const struct list_head *dsaddrs2)
{
struct nfs4_pnfs_ds_addr *da1, *da2;
-
- /* step through both lists, comparing as we go */
- for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
- da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
- da1 != NULL && da2 != NULL;
- da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
- da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
- if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
- (struct sockaddr *)&da2->da_addr))
- return false;
+ struct sockaddr *sa1, *sa2;
+ bool match = false;
+
+ list_for_each_entry(da1, dsaddrs1, da_node) {
+ sa1 = (struct sockaddr *)&da1->da_addr;
+ match = false;
+ list_for_each_entry(da2, dsaddrs2, da_node) {
+ sa2 = (struct sockaddr *)&da2->da_addr;
+ match = same_sockaddr(sa1, sa2);
+ if (match)
+ break;
+ }
+ if (!match)
+ break;
}
- if (da1 == NULL && da2 == NULL)
- return true;
-
- return false;
+ return match;
}
/*
@@ -863,9 +882,10 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
}
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
- spin_unlock(cinfo->lock);
- nfs_request_add_commit_list(req, list, cinfo);
+ nfs_request_add_commit_list_locked(req, list, cinfo);
+ spin_unlock(cinfo->lock);
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index aa62004f1706..383a027de452 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -381,9 +381,12 @@ int __init register_nfs_fs(void)
ret = nfs_register_sysctl();
if (ret < 0)
goto error_2;
- register_shrinker(&acl_shrinker);
+ ret = register_shrinker(&acl_shrinker);
+ if (ret < 0)
+ goto error_3;
return 0;
-
+error_3:
+ nfs_unregister_sysctl();
error_2:
unregister_nfs4_fs();
error_1:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 75a35a1afa79..388f48079c43 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -768,6 +768,28 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
}
/**
+ * nfs_request_add_commit_list_locked - add request to a commit list
+ * @req: pointer to a struct nfs_page
+ * @dst: commit list head
+ * @cinfo: holds list lock and accounting info
+ *
+ * This sets the PG_CLEAN bit, updates the cinfo count of
+ * number of outstanding requests requiring a commit as well as
+ * the MM page stats.
+ *
+ * The caller must hold the cinfo->lock, and the nfs_page lock.
+ */
+void
+nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ set_bit(PG_CLEAN, &req->wb_flags);
+ nfs_list_add_request(req, dst);
+ cinfo->mds->ncommit++;
+}
+EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
+
+/**
* nfs_request_add_commit_list - add request to a commit list
* @req: pointer to a struct nfs_page
* @dst: commit list head
@@ -784,13 +806,10 @@ void
nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
struct nfs_commit_info *cinfo)
{
- set_bit(PG_CLEAN, &(req)->wb_flags);
spin_lock(cinfo->lock);
- nfs_list_add_request(req, dst);
- cinfo->mds->ncommit++;
+ nfs_request_add_commit_list_locked(req, dst, cinfo);
spin_unlock(cinfo->lock);
- if (!cinfo->dreq)
- nfs_mark_page_unstable(req->wb_page);
+ nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -1793,7 +1812,7 @@ out_mark_dirty:
return res;
}
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct nfs_inode *nfsi = NFS_I(inode);
int flags = FLUSH_SYNC;
@@ -1828,11 +1847,6 @@ out_mark_dirty:
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
-
-int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- return nfs_commit_unstable_pages(inode, wbc);
-}
EXPORT_SYMBOL_GPL(nfs_write_inode);
/*
diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c
index ae6e58ea4de5..fd8c9a5bcac4 100644
--- a/fs/nfs_common/grace.c
+++ b/fs/nfs_common/grace.c
@@ -63,14 +63,33 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
* lock reclaims.
*/
int
-locks_in_grace(struct net *net)
+__state_in_grace(struct net *net, bool open)
{
struct list_head *grace_list = net_generic(net, grace_net_id);
+ struct lock_manager *lm;
- return !list_empty(grace_list);
+ if (!open)
+ return !list_empty(grace_list);
+
+ list_for_each_entry(lm, grace_list, list) {
+ if (lm->block_opens)
+ return true;
+ }
+ return false;
+}
+
+int locks_in_grace(struct net *net)
+{
+ return __state_in_grace(net, 0);
}
EXPORT_SYMBOL_GPL(locks_in_grace);
+int opens_in_grace(struct net *net)
+{
+ return __state_in_grace(net, 1);
+}
+EXPORT_SYMBOL_GPL(opens_in_grace);
+
static int __net_init
grace_init_net(struct net *net)
{
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 9aa2796da90d..6d834dc9bbc8 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -101,7 +101,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
}
nr_iomaps = be32_to_cpup(p++);
- expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+ expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
if (len != expected) {
dprintk("%s: extent array size mismatch: %u/%u\n",
__func__, len, expected);
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index fdc79037c0e7..6de925fe8499 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -7,13 +7,6 @@
struct iomap;
struct xdr_stream;
-enum pnfs_block_extent_state {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2,
- PNFS_BLOCK_NONE_DATA = 3,
-};
-
struct pnfs_block_extent {
struct nfsd4_deviceid vol_id;
u64 foff;
@@ -21,14 +14,6 @@ struct pnfs_block_extent {
u64 soff;
enum pnfs_block_extent_state es;
};
-#define NFS4_BLOCK_EXTENT_SIZE 44
-
-enum pnfs_block_volume_type {
- PNFS_BLOCK_VOLUME_SIMPLE = 0,
- PNFS_BLOCK_VOLUME_SLICE = 1,
- PNFS_BLOCK_VOLUME_CONCAT = 2,
- PNFS_BLOCK_VOLUME_STRIPE = 3,
-};
/*
* Random upper cap for the uuid length to avoid unbounded allocation.
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index f79521a59747..b4d84b579f20 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1075,73 +1075,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
return rv;
}
-/* Iterator */
-
-static void *e_start(struct seq_file *m, loff_t *pos)
- __acquires(((struct cache_detail *)m->private)->hash_lock)
-{
- loff_t n = *pos;
- unsigned hash, export;
- struct cache_head *ch;
- struct cache_detail *cd = m->private;
- struct cache_head **export_table = cd->hash_table;
-
- read_lock(&cd->hash_lock);
- if (!n--)
- return SEQ_START_TOKEN;
- hash = n >> 32;
- export = n & ((1LL<<32) - 1);
-
-
- for (ch=export_table[hash]; ch; ch=ch->next)
- if (!export--)
- return ch;
- n &= ~((1LL<<32) - 1);
- do {
- hash++;
- n += 1LL<<32;
- } while(hash < EXPORT_HASHMAX && export_table[hash]==NULL);
- if (hash >= EXPORT_HASHMAX)
- return NULL;
- *pos = n+1;
- return export_table[hash];
-}
-
-static void *e_next(struct seq_file *m, void *p, loff_t *pos)
-{
- struct cache_head *ch = p;
- int hash = (*pos >> 32);
- struct cache_detail *cd = m->private;
- struct cache_head **export_table = cd->hash_table;
-
- if (p == SEQ_START_TOKEN)
- hash = 0;
- else if (ch->next == NULL) {
- hash++;
- *pos += 1LL<<32;
- } else {
- ++*pos;
- return ch->next;
- }
- *pos &= ~((1LL<<32) - 1);
- while (hash < EXPORT_HASHMAX && export_table[hash] == NULL) {
- hash++;
- *pos += 1LL<<32;
- }
- if (hash >= EXPORT_HASHMAX)
- return NULL;
- ++*pos;
- return export_table[hash];
-}
-
-static void e_stop(struct seq_file *m, void *p)
- __releases(((struct cache_detail *)m->private)->hash_lock)
-{
- struct cache_detail *cd = m->private;
-
- read_unlock(&cd->hash_lock);
-}
-
static struct flags {
int flag;
char *name[2];
@@ -1270,9 +1203,9 @@ static int e_show(struct seq_file *m, void *p)
}
const struct seq_operations nfs_exports_op = {
- .start = e_start,
- .next = e_next,
- .stop = e_stop,
+ .start = cache_seq_start,
+ .next = cache_seq_next,
+ .stop = cache_seq_stop,
.show = e_show,
};
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 1f52bfcc436f..2e315072bf3f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -6,6 +6,7 @@
#include <linux/sunrpc/cache.h>
#include <uapi/linux/nfsd/export.h>
+#include <linux/nfs4.h>
struct knfsd_fh;
struct svc_fh;
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index a3f34900091f..23cc85d1efdd 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -37,9 +37,7 @@
#include <linux/in.h>
#include <linux/sunrpc/svc.h>
-
-/* XXX from linux/nfs_idmap.h */
-#define IDMAP_NAMESZ 128
+#include <linux/nfs_idmap.h>
#ifdef CONFIG_NFSD_V4
int nfsd_idmap_init(struct net *);
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ea6749a32760..d8b16c2568f3 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -110,6 +110,7 @@ struct nfsd_net {
unsigned int max_connections;
u32 clientid_counter;
+ u32 clverifier_counter;
struct svc_serv *nfsd_serv;
};
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index d54701f6dc78..1580ea6fd64d 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -44,13 +44,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
inode = d_inode(fh->fh_dentry);
- if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+ if (argp->mask & ~NFS_ACL_MASK)
RETURN_STATUS(nfserr_inval);
resp->mask = argp->mask;
nfserr = fh_getattr(fh, &resp->stat);
if (nfserr)
- goto fail;
+ RETURN_STATUS(nfserr);
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
acl = get_acl(inode, ACL_TYPE_ACCESS);
@@ -202,7 +202,7 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
if (!p)
return 0;
argp->mask = ntohl(*p++);
- if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
+ if (argp->mask & ~NFS_ACL_MASK ||
!xdr_argsize_check(rqstp, p))
return 0;
@@ -293,9 +293,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
resp->acl_default,
resp->mask & NFS_DFACL,
NFS_ACL_DEFAULT);
- if (n <= 0)
- return 0;
- return 1;
+ return (n > 0);
}
static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 882b1a14bc3e..01df4cd7c753 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -41,7 +41,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
inode = d_inode(fh->fh_dentry);
- if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+ if (argp->mask & ~NFS_ACL_MASK)
RETURN_STATUS(nfserr_inval);
resp->mask = argp->mask;
@@ -148,7 +148,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
if (!p)
return 0;
args->mask = ntohl(*p++);
- if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
+ if (args->mask & ~NFS_ACL_MASK ||
!xdr_argsize_check(rqstp, p))
return 0;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index eb5accf1b37f..6adabd6049b7 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -34,8 +34,10 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/fs.h>
#include <linux/slab.h>
-#include <linux/nfs_fs.h>
+#include <linux/posix_acl.h>
+
#include "nfsfh.h"
#include "nfsd.h"
#include "acl.h"
@@ -100,7 +102,7 @@ deny_mask_from_posix(unsigned short perm, u32 flags)
/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the
* side of being more restrictive, so the mode bit mapping below is
* pessimistic. An optimistic version would be needed to handle DENY's,
- * but we espect to coalesce all ALLOWs and DENYs before mapping to mode
+ * but we expect to coalesce all ALLOWs and DENYs before mapping to mode
* bits. */
static void
@@ -458,7 +460,7 @@ init_state(struct posix_acl_state *state, int cnt)
state->empty = 1;
/*
* In the worst case, each individual acl could be for a distinct
- * named user or group, but we don't no which, so we allocate
+ * named user or group, but we don't know which, so we allocate
* enough space for either:
*/
alloc = sizeof(struct posix_ace_state_array)
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index a49201835a97..e7f50c4081d6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -435,12 +435,12 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
*/
status = 0;
out:
- if (status)
- nfsd4_mark_cb_fault(cb->cb_clp, status);
+ cb->cb_seq_status = status;
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
- return -EIO;
+ status = -EIO;
+ goto out;
}
static int decode_cb_sequence4res(struct xdr_stream *xdr,
@@ -451,11 +451,10 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
if (cb->cb_minorversion == 0)
return 0;
- status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status);
- if (unlikely(status || cb->cb_status))
+ status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status);
+ if (unlikely(status || cb->cb_seq_status))
return status;
- cb->cb_update_seq_nr = true;
return decode_cb_sequence4resok(xdr, cb);
}
@@ -527,7 +526,7 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
if (cb != NULL) {
status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status || cb->cb_status))
+ if (unlikely(status || cb->cb_seq_status))
return status;
}
@@ -617,7 +616,7 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
if (cb) {
status = decode_cb_sequence4res(xdr, cb);
- if (unlikely(status || cb->cb_status))
+ if (unlikely(status || cb->cb_seq_status))
return status;
}
return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status);
@@ -876,7 +875,11 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
u32 minorversion = clp->cl_minorversion;
cb->cb_minorversion = minorversion;
- cb->cb_update_seq_nr = false;
+ /*
+ * cb_seq_status is only set in decode_cb_sequence4res,
+ * and so will remain 1 if an rpc level failure occurs.
+ */
+ cb->cb_seq_status = 1;
cb->cb_status = 0;
if (minorversion) {
if (!nfsd41_cb_get_slot(clp, task))
@@ -885,15 +888,30 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
rpc_call_start(task);
}
-static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb)
{
- struct nfsd4_callback *cb = calldata;
struct nfs4_client *clp = cb->cb_clp;
+ struct nfsd4_session *session = clp->cl_cb_session;
+ bool ret = true;
- dprintk("%s: minorversion=%d\n", __func__,
- clp->cl_minorversion);
+ if (!clp->cl_minorversion) {
+ /*
+ * If the backchannel connection was shut down while this
+ * task was queued, we need to resubmit it after setting up
+ * a new backchannel connection.
+ *
+ * Note that if we lost our callback connection permanently
+ * the submission code will error out, so we don't need to
+ * handle that case here.
+ */
+ if (task->tk_flags & RPC_TASK_KILLED)
+ goto need_restart;
+
+ return true;
+ }
- if (clp->cl_minorversion) {
+ switch (cb->cb_seq_status) {
+ case 0:
/*
* No need for lock, access serialized in nfsd4_cb_prepare
*
@@ -901,29 +919,63 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
* If CB_SEQUENCE returns an error, then the state of the slot
* (sequence ID, cached reply) MUST NOT change.
*/
- if (cb->cb_update_seq_nr)
- ++clp->cl_cb_session->se_cb_seq_nr;
-
- clear_bit(0, &clp->cl_cb_slot_busy);
- rpc_wake_up_next(&clp->cl_cb_waitq);
- dprintk("%s: freed slot, new seqid=%d\n", __func__,
- clp->cl_cb_session->se_cb_seq_nr);
+ ++session->se_cb_seq_nr;
+ break;
+ case -ESERVERFAULT:
+ ++session->se_cb_seq_nr;
+ case 1:
+ case -NFS4ERR_BADSESSION:
+ nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+ ret = false;
+ break;
+ case -NFS4ERR_DELAY:
+ if (!rpc_restart_call(task))
+ goto out;
+
+ rpc_delay(task, 2 * HZ);
+ return false;
+ case -NFS4ERR_BADSLOT:
+ goto retry_nowait;
+ case -NFS4ERR_SEQ_MISORDERED:
+ if (session->se_cb_seq_nr != 1) {
+ session->se_cb_seq_nr = 1;
+ goto retry_nowait;
+ }
+ break;
+ default:
+ dprintk("%s: unprocessed error %d\n", __func__,
+ cb->cb_seq_status);
}
- /*
- * If the backchannel connection was shut down while this
- * task was queued, we need to resubmit it after setting up
- * a new backchannel connection.
- *
- * Note that if we lost our callback connection permanently
- * the submission code will error out, so we don't need to
- * handle that case here.
- */
- if (task->tk_flags & RPC_TASK_KILLED) {
- task->tk_status = 0;
- cb->cb_need_restart = true;
+ clear_bit(0, &clp->cl_cb_slot_busy);
+ rpc_wake_up_next(&clp->cl_cb_waitq);
+ dprintk("%s: freed slot, new seqid=%d\n", __func__,
+ clp->cl_cb_session->se_cb_seq_nr);
+
+ if (task->tk_flags & RPC_TASK_KILLED)
+ goto need_restart;
+out:
+ return ret;
+retry_nowait:
+ if (rpc_restart_call_prepare(task))
+ ret = false;
+ goto out;
+need_restart:
+ task->tk_status = 0;
+ cb->cb_need_restart = true;
+ return false;
+}
+
+static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+{
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_client *clp = cb->cb_clp;
+
+ dprintk("%s: minorversion=%d\n", __func__,
+ clp->cl_minorversion);
+
+ if (!nfsd4_cb_sequence_done(task, cb))
return;
- }
if (cb->cb_status) {
WARN_ON_ONCE(task->tk_status);
@@ -1099,8 +1151,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_msg.rpc_resp = cb;
cb->cb_ops = ops;
INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
+ cb->cb_seq_status = 1;
cb->cb_status = 0;
- cb->cb_update_seq_nr = false;
cb->cb_need_restart = false;
}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index e1b3d3d472da..5b20577dcdd2 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -59,9 +59,6 @@ MODULE_PARM_DESC(nfs4_disable_idmapping,
* that.
*/
-#define IDMAP_TYPE_USER 0
-#define IDMAP_TYPE_GROUP 1
-
struct ent {
struct cache_head h;
int type; /* User / Group */
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 90cfda75313c..4ce6b97b31ad 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -276,13 +276,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
/*
- * Following rfc 3530 14.2.16, use the returned bitmask
- * to indicate which attributes we used to store the
- * verifier:
+ * Following rfc 3530 14.2.16, and rfc 5661 18.16.4
+ * use the returned bitmask to indicate which attributes
+ * we used to store the verifier:
*/
- if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
- open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
- FATTR4_WORD1_TIME_MODIFY);
+ if (nfsd_create_is_exclusive(open->op_createmode) && status == 0)
+ open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+ FATTR4_WORD1_TIME_MODIFY);
} else
/*
* Note this may exit with the parent still locked.
@@ -362,7 +362,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
struct svc_fh *resfh = NULL;
- struct nfsd4_compoundres *resp;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -389,8 +388,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
copy_clientid(&open->op_clientid, cstate->session);
/* check seqid for replay. set nfs4_owner */
- resp = rqstp->rq_resp;
- status = nfsd4_process_open1(&resp->cstate, open, nn);
+ status = nfsd4_process_open1(cstate, open, nn);
if (status == nfserr_replay_me) {
struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
fh_put(&cstate->current_fh);
@@ -417,10 +415,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Openowner is now set, so sequence id will get bumped. Now we need
* these checks before we do any creates: */
status = nfserr_grace;
- if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ if (opens_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
status = nfserr_no_grace;
- if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ if (!opens_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
switch (open->op_claim_type) {
@@ -829,7 +827,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
- if (locks_in_grace(SVC_NET(rqstp)))
+ if (opens_in_grace(SVC_NET(rqstp)))
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
@@ -848,7 +846,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!cstate->save_fh.fh_dentry)
return status;
- if (locks_in_grace(SVC_NET(rqstp)) &&
+ if (opens_in_grace(SVC_NET(rqstp)) &&
!(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK))
return nfserr_grace;
status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
@@ -1364,10 +1362,6 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
goto out;
}
- nfserr = ops->proc_layoutcommit(inode, lcp);
- if (nfserr)
- goto out_put_stid;
-
if (new_size > i_size_read(inode)) {
lcp->lc_size_chg = 1;
lcp->lc_newsize = new_size;
@@ -1375,7 +1369,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
lcp->lc_size_chg = 0;
}
-out_put_stid:
+ nfserr = ops->proc_layoutcommit(inode, lcp);
nfs4_put_stid(&ls->ls_stid);
out:
return nfserr;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index d88ea7b9a85c..e3d47091b191 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -272,6 +272,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
.ctx.actor = nfsd4_build_namelist,
.names = LIST_HEAD_INIT(ctx.names)
};
+ struct name_list *entry, *tmp;
int status;
status = nfs4_save_creds(&original_cred);
@@ -286,9 +287,8 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
status = iterate_dir(nn->rec_file, &ctx.ctx);
mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
- while (!list_empty(&ctx.names)) {
- struct name_list *entry;
- entry = list_entry(ctx.names.next, struct name_list, list);
+
+ list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
if (!status) {
struct dentry *dentry;
dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
@@ -304,6 +304,12 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
}
mutex_unlock(&d_inode(dir)->i_mutex);
nfs4_reset_creds(original_cred);
+
+ list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
+ dprintk("NFSD: %s. Left entry %s\n", __func__, entry->name);
+ list_del(&entry->list);
+ kfree(entry);
+ }
return status;
}
@@ -541,8 +547,7 @@ nfsd4_legacy_tracking_init(struct net *net)
/* XXX: The legacy code won't work in a container */
if (net != &init_net) {
- WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
- "tracking in a container!\n");
+ pr_warn("NFSD: attempt to initialize legacy client tracking in a container ignored.\n");
return -EINVAL;
}
@@ -1254,8 +1259,7 @@ nfsd4_umh_cltrack_init(struct net *net)
/* XXX: The usermode helper s not working in container yet. */
if (net != &init_net) {
- WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
- "tracking in a container!\n");
+ pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
return -EINVAL;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 95202719a1fd..0f1d5691b795 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -777,13 +777,16 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
}
-static void
+static bool
unhash_delegation_locked(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
lockdep_assert_held(&state_lock);
+ if (list_empty(&dp->dl_perfile))
+ return false;
+
dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
/* Ensure that deleg break won't try to requeue it */
++dp->dl_time;
@@ -792,16 +795,21 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
list_del_init(&dp->dl_recall_lru);
list_del_init(&dp->dl_perfile);
spin_unlock(&fp->fi_lock);
+ return true;
}
static void destroy_delegation(struct nfs4_delegation *dp)
{
+ bool unhashed;
+
spin_lock(&state_lock);
- unhash_delegation_locked(dp);
+ unhashed = unhash_delegation_locked(dp);
spin_unlock(&state_lock);
- put_clnt_odstate(dp->dl_clnt_odstate);
- nfs4_put_deleg_lease(dp->dl_stid.sc_file);
- nfs4_put_stid(&dp->dl_stid);
+ if (unhashed) {
+ put_clnt_odstate(dp->dl_clnt_odstate);
+ nfs4_put_deleg_lease(dp->dl_stid.sc_file);
+ nfs4_put_stid(&dp->dl_stid);
+ }
}
static void revoke_delegation(struct nfs4_delegation *dp)
@@ -990,6 +998,12 @@ release_all_access(struct nfs4_ol_stateid *stp)
}
}
+static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop)
+{
+ kfree(sop->so_owner.data);
+ sop->so_ops->so_free(sop);
+}
+
static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
{
struct nfs4_client *clp = sop->so_client;
@@ -1000,20 +1014,23 @@ static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
return;
sop->so_ops->so_unhash(sop);
spin_unlock(&clp->cl_lock);
- kfree(sop->so_owner.data);
- sop->so_ops->so_free(sop);
+ nfs4_free_stateowner(sop);
}
-static void unhash_ol_stateid(struct nfs4_ol_stateid *stp)
+static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp)
{
struct nfs4_file *fp = stp->st_stid.sc_file;
lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock);
+ if (list_empty(&stp->st_perfile))
+ return false;
+
spin_lock(&fp->fi_lock);
- list_del(&stp->st_perfile);
+ list_del_init(&stp->st_perfile);
spin_unlock(&fp->fi_lock);
list_del(&stp->st_perstateowner);
+ return true;
}
static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
@@ -1063,25 +1080,27 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
list_add(&stp->st_locks, reaplist);
}
-static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
+static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
{
struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
list_del_init(&stp->st_locks);
- unhash_ol_stateid(stp);
nfs4_unhash_stid(&stp->st_stid);
+ return unhash_ol_stateid(stp);
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
{
struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
+ bool unhashed;
spin_lock(&oo->oo_owner.so_client->cl_lock);
- unhash_lock_stateid(stp);
+ unhashed = unhash_lock_stateid(stp);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
- nfs4_put_stid(&stp->st_stid);
+ if (unhashed)
+ nfs4_put_stid(&stp->st_stid);
}
static void unhash_lockowner_locked(struct nfs4_lockowner *lo)
@@ -1129,7 +1148,7 @@ static void release_lockowner(struct nfs4_lockowner *lo)
while (!list_empty(&lo->lo_owner.so_stateids)) {
stp = list_first_entry(&lo->lo_owner.so_stateids,
struct nfs4_ol_stateid, st_perstateowner);
- unhash_lock_stateid(stp);
+ WARN_ON(!unhash_lock_stateid(stp));
put_ol_stateid_locked(stp, &reaplist);
}
spin_unlock(&clp->cl_lock);
@@ -1142,21 +1161,26 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
{
struct nfs4_ol_stateid *stp;
+ lockdep_assert_held(&open_stp->st_stid.sc_client->cl_lock);
+
while (!list_empty(&open_stp->st_locks)) {
stp = list_entry(open_stp->st_locks.next,
struct nfs4_ol_stateid, st_locks);
- unhash_lock_stateid(stp);
+ WARN_ON(!unhash_lock_stateid(stp));
put_ol_stateid_locked(stp, reaplist);
}
}
-static void unhash_open_stateid(struct nfs4_ol_stateid *stp,
+static bool unhash_open_stateid(struct nfs4_ol_stateid *stp,
struct list_head *reaplist)
{
+ bool unhashed;
+
lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
- unhash_ol_stateid(stp);
+ unhashed = unhash_ol_stateid(stp);
release_open_stateid_locks(stp, reaplist);
+ return unhashed;
}
static void release_open_stateid(struct nfs4_ol_stateid *stp)
@@ -1164,8 +1188,8 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp)
LIST_HEAD(reaplist);
spin_lock(&stp->st_stid.sc_client->cl_lock);
- unhash_open_stateid(stp, &reaplist);
- put_ol_stateid_locked(stp, &reaplist);
+ if (unhash_open_stateid(stp, &reaplist))
+ put_ol_stateid_locked(stp, &reaplist);
spin_unlock(&stp->st_stid.sc_client->cl_lock);
free_ol_stateid_reaplist(&reaplist);
}
@@ -1210,8 +1234,8 @@ static void release_openowner(struct nfs4_openowner *oo)
while (!list_empty(&oo->oo_owner.so_stateids)) {
stp = list_first_entry(&oo->oo_owner.so_stateids,
struct nfs4_ol_stateid, st_perstateowner);
- unhash_open_stateid(stp, &reaplist);
- put_ol_stateid_locked(stp, &reaplist);
+ if (unhash_open_stateid(stp, &reaplist))
+ put_ol_stateid_locked(stp, &reaplist);
}
spin_unlock(&clp->cl_lock);
free_ol_stateid_reaplist(&reaplist);
@@ -1714,7 +1738,7 @@ __destroy_client(struct nfs4_client *clp)
spin_lock(&state_lock);
while (!list_empty(&clp->cl_delegations)) {
dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
- unhash_delegation_locked(dp);
+ WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
@@ -1894,7 +1918,7 @@ static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn)
* __force to keep sparse happy
*/
verf[0] = (__force __be32)get_seconds();
- verf[1] = (__force __be32)nn->clientid_counter;
+ verf[1] = (__force __be32)nn->clverifier_counter++;
memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
}
@@ -2241,6 +2265,9 @@ static bool client_has_state(struct nfs4_client *clp)
* Also note we should probably be using this in 4.0 case too.
*/
return !list_empty(&clp->cl_openowners)
+#ifdef CONFIG_NFSD_PNFS
+ || !list_empty(&clp->cl_lo_states)
+#endif
|| !list_empty(&clp->cl_delegations)
|| !list_empty(&clp->cl_sessions);
}
@@ -2547,11 +2574,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
cs_slot = &conf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
- if (status == nfserr_replay_cache) {
- status = nfsd4_replay_create_session(cr_ses, cs_slot);
- goto out_free_conn;
- } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
- status = nfserr_seq_misordered;
+ if (status) {
+ if (status == nfserr_replay_cache)
+ status = nfsd4_replay_create_session(cr_ses, cs_slot);
goto out_free_conn;
}
} else if (unconf) {
@@ -3041,10 +3066,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unconf = find_unconfirmed_client_by_name(&clname, nn);
if (unconf)
unhash_client_locked(unconf);
- if (conf && same_verf(&conf->cl_verifier, &clverifier))
+ if (conf && same_verf(&conf->cl_verifier, &clverifier)) {
/* case 1: probable callback update */
copy_clid(new, conf);
- else /* case 4 (new client) or cases 2, 3 (client reboot): */
+ gen_confirm(new, nn);
+ } else /* case 4 (new client) or cases 2, 3 (client reboot): */
gen_clid(new, nn);
new->cl_minorversion = 0;
gen_callback(new, setclid, rqstp);
@@ -3085,10 +3111,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
/*
* We try hard to give out unique clientid's, so if we get an
* attempt to confirm the same clientid with a different cred,
- * there's a bug somewhere. Let's charitably assume it's our
- * bug.
+ * the client may be buggy; this should never happen.
+ *
+ * Nevertheless, RFC 7530 recommends INUSE for this case:
*/
- status = nfserr_serverfault;
+ status = nfserr_clid_inuse;
if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
goto out;
if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
@@ -3315,7 +3342,8 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
hash_openowner(oo, clp, strhashval);
ret = oo;
} else
- nfs4_free_openowner(&oo->oo_owner);
+ nfs4_free_stateowner(&oo->oo_owner);
+
spin_unlock(&clp->cl_lock);
return ret;
}
@@ -3482,6 +3510,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
{
struct nfs4_delegation *dp = cb_to_delegation(cb);
+ if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID)
+ return 1;
+
switch (task->tk_status) {
case 0:
return 1;
@@ -3885,12 +3916,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
return status;
}
-static void
-nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session)
-{
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
-}
-
/* Should we give out recallable state?: */
static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
{
@@ -3923,7 +3948,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
static int nfs4_setlease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
- struct file_lock *fl, *ret;
+ struct file_lock *fl;
struct file *filp;
int status = 0;
@@ -3934,10 +3959,10 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
if (!filp) {
/* We should always have a readable file here */
WARN_ON_ONCE(1);
+ locks_free_lock(fl);
return -EBADF;
}
fl->fl_file = filp;
- ret = fl;
status = vfs_setlease(filp, fl->fl_type, &fl, NULL);
if (fl)
locks_free_lock(fl);
@@ -4063,7 +4088,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
case NFS4_OPEN_CLAIM_FH:
/*
* Let's not give out any delegations till everyone's
- * had the chance to reclaim theirs....
+ * had the chance to reclaim theirs, *and* until
+ * NLM locks have all been reclaimed:
*/
if (locks_in_grace(clp->net))
goto out_no_deleg;
@@ -4209,7 +4235,7 @@ out:
if (fp)
put_nfs4_file(fp);
if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
- nfs4_set_claim_prev(open, nfsd4_has_session(&resp->cstate));
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
/*
* To finish the open response, we just need to set the rflags.
*/
@@ -4338,14 +4364,12 @@ nfs4_laundromat(struct nfsd_net *nn)
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
- continue;
if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
t = dp->dl_time - cutoff;
new_timeo = min(new_timeo, t);
break;
}
- unhash_delegation_locked(dp);
+ WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
@@ -4440,7 +4464,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid,
{
if (ONE_STATEID(stateid) && (flags & RD_STATE))
return nfs_ok;
- else if (locks_in_grace(net)) {
+ else if (opens_in_grace(net)) {
/* Answer in remaining cases depends on existence of
* conflicting state; so we must wait out the grace period. */
return nfserr_grace;
@@ -4459,7 +4483,7 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid,
static inline int
grace_disallows_io(struct net *net, struct inode *inode)
{
- return locks_in_grace(net) && mandatory_lock(inode);
+ return opens_in_grace(net) && mandatory_lock(inode);
}
/* Returns true iff a is later than b: */
@@ -4751,7 +4775,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (check_for_locks(stp->st_stid.sc_file,
lockowner(stp->st_stateowner)))
break;
- unhash_lock_stateid(stp);
+ WARN_ON(!unhash_lock_stateid(stp));
spin_unlock(&cl->cl_lock);
nfs4_put_stid(s);
ret = nfs_ok;
@@ -4967,20 +4991,23 @@ out:
static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
{
struct nfs4_client *clp = s->st_stid.sc_client;
+ bool unhashed;
LIST_HEAD(reaplist);
s->st_stid.sc_type = NFS4_CLOSED_STID;
spin_lock(&clp->cl_lock);
- unhash_open_stateid(s, &reaplist);
+ unhashed = unhash_open_stateid(s, &reaplist);
if (clp->cl_minorversion) {
- put_ol_stateid_locked(s, &reaplist);
+ if (unhashed)
+ put_ol_stateid_locked(s, &reaplist);
spin_unlock(&clp->cl_lock);
free_ol_stateid_reaplist(&reaplist);
} else {
spin_unlock(&clp->cl_lock);
free_ol_stateid_reaplist(&reaplist);
- move_to_close_lru(s, clp->net);
+ if (unhashed)
+ move_to_close_lru(s, clp->net);
}
}
@@ -5045,9 +5072,6 @@ out:
return status;
}
-
-#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
-
static inline u64
end_offset(u64 start, u64 len)
{
@@ -5139,8 +5163,7 @@ nevermind:
}
static struct nfs4_lockowner *
-find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner,
- struct nfs4_client *clp)
+find_lockowner_str_locked(struct nfs4_client *clp, struct xdr_netobj *owner)
{
unsigned int strhashval = ownerstr_hashval(owner);
struct nfs4_stateowner *so;
@@ -5158,13 +5181,12 @@ find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner,
}
static struct nfs4_lockowner *
-find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner,
- struct nfs4_client *clp)
+find_lockowner_str(struct nfs4_client *clp, struct xdr_netobj *owner)
{
struct nfs4_lockowner *lo;
spin_lock(&clp->cl_lock);
- lo = find_lockowner_str_locked(clid, owner, clp);
+ lo = find_lockowner_str_locked(clp, owner);
spin_unlock(&clp->cl_lock);
return lo;
}
@@ -5208,14 +5230,14 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
lo->lo_owner.so_ops = &lockowner_ops;
spin_lock(&clp->cl_lock);
- ret = find_lockowner_str_locked(&clp->cl_clientid,
- &lock->lk_new_owner, clp);
+ ret = find_lockowner_str_locked(clp, &lock->lk_new_owner);
if (ret == NULL) {
list_add(&lo->lo_owner.so_strhash,
&clp->cl_ownerstr_hashtbl[strhashval]);
ret = lo;
} else
- nfs4_free_lockowner(&lo->lo_owner);
+ nfs4_free_stateowner(&lo->lo_owner);
+
spin_unlock(&clp->cl_lock);
return ret;
}
@@ -5298,8 +5320,8 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
static int
check_lock_length(u64 offset, u64 length)
{
- return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
- LOFF_OVERFLOW(offset, length)));
+ return ((length == 0) || ((length != NFS4_MAX_UINT64) &&
+ (length > ~offset)));
}
static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
@@ -5328,9 +5350,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
struct nfs4_lockowner *lo;
unsigned int strhashval;
- lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl);
+ lo = find_lockowner_str(cl, &lock->lk_new_owner);
if (!lo) {
- strhashval = ownerstr_hashval(&lock->v.new.owner);
+ strhashval = ownerstr_hashval(&lock->lk_new_owner);
lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
if (lo == NULL)
return nfserr_jukebox;
@@ -5391,7 +5413,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
/* See rfc 5661 18.10.3: given clientid is ignored: */
- memcpy(&lock->v.new.clientid,
+ memcpy(&lock->lk_new_clientid,
&cstate->session->se_client->cl_clientid,
sizeof(clientid_t));
@@ -5409,7 +5431,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
- &lock->v.new.clientid))
+ &lock->lk_new_clientid))
goto out;
status = lookup_or_create_lock_state(cstate, open_stp, lock,
&lock_stp, &new);
@@ -5603,8 +5625,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner,
- cstate->clp);
+ lo = find_lockowner_str(cstate->clp, &lockt->lt_owner);
if (lo)
file_lock->fl_owner = (fl_owner_t)lo;
file_lock->fl_pid = current->tgid;
@@ -6019,7 +6040,7 @@ nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst,
static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
struct list_head *collect,
- void (*func)(struct nfs4_ol_stateid *))
+ bool (*func)(struct nfs4_ol_stateid *))
{
struct nfs4_openowner *oop;
struct nfs4_ol_stateid *stp, *st_next;
@@ -6033,9 +6054,9 @@ static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
list_for_each_entry_safe(lst, lst_next,
&stp->st_locks, st_locks) {
if (func) {
- func(lst);
- nfsd_inject_add_lock_to_list(lst,
- collect);
+ if (func(lst))
+ nfsd_inject_add_lock_to_list(lst,
+ collect);
}
++count;
/*
@@ -6305,7 +6326,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
continue;
atomic_inc(&clp->cl_refcount);
- unhash_delegation_locked(dp);
+ WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, victims);
}
++count;
@@ -6584,6 +6605,7 @@ nfs4_state_start_net(struct net *net)
return ret;
nn->boot_time = get_seconds();
nn->grace_ended = false;
+ nn->nfsd4_manager.block_opens = true;
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
@@ -6602,7 +6624,7 @@ nfs4_state_start(void)
ret = set_callback_cred();
if (ret)
return -ENOMEM;
- laundry_wq = create_singlethread_workqueue("nfsd4");
+ laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
if (laundry_wq == NULL) {
ret = -ENOMEM;
goto out_recovery;
@@ -6635,7 +6657,7 @@ nfs4_state_shutdown_net(struct net *net)
spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
- unhash_delegation_locked(dp);
+ WARN_ON(!unhash_delegation_locked(dp));
list_add(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&state_lock);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 75e0563c09d1..51c9e9ca39a4 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2140,6 +2140,27 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
}
+static inline __be32
+nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type)
+{
+ __be32 *p;
+
+ if (layout_type) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(layout_type);
+ } else {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0);
+ }
+
+ return 0;
+}
+
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
FATTR4_WORD0_RDATTR_ERROR)
#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
@@ -2205,6 +2226,39 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
return err;
}
+static __be32
+nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2)
+{
+ __be32 *p;
+
+ if (bmval2) {
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(bmval0);
+ *p++ = cpu_to_be32(bmval1);
+ *p++ = cpu_to_be32(bmval2);
+ } else if (bmval1) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(bmval0);
+ *p++ = cpu_to_be32(bmval1);
+ } else {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(bmval0);
+ }
+
+ return 0;
+out_resource:
+ return nfserr_resource;
+}
+
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
@@ -2301,28 +2355,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
}
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
- if (bmval2) {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(bmval0);
- *p++ = cpu_to_be32(bmval1);
- *p++ = cpu_to_be32(bmval2);
- } else if (bmval1) {
- p = xdr_reserve_space(xdr, 12);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(bmval0);
- *p++ = cpu_to_be32(bmval1);
- } else {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- *p++ = cpu_to_be32(bmval0);
- }
+ status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2);
+ if (status)
+ goto out;
attrlen_offset = xdr->buf->len;
p = xdr_reserve_space(xdr, 4);
@@ -2675,6 +2710,9 @@ out_acl:
*p++ = cpu_to_be32(stat.mtime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
+ struct kstat parent_stat;
+ u64 ino = stat.ino;
+
p = xdr_reserve_space(xdr, 8);
if (!p)
goto out_resource;
@@ -2683,25 +2721,25 @@ out_acl:
* and this is the root of a cross-mounted filesystem.
*/
if (ignore_crossmnt == 0 &&
- dentry == exp->ex_path.mnt->mnt_root)
- get_parent_attributes(exp, &stat);
- p = xdr_encode_hyper(p, stat.ino);
+ dentry == exp->ex_path.mnt->mnt_root) {
+ err = get_parent_attributes(exp, &parent_stat);
+ if (err)
+ goto out_nfserr;
+ ino = parent_stat.ino;
+ }
+ p = xdr_encode_hyper(p, ino);
}
#ifdef CONFIG_NFSD_PNFS
- if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
- (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
- if (exp->ex_layout_type) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(1);
- *p++ = cpu_to_be32(exp->ex_layout_type);
- } else {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(0);
- }
+ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
+ status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+ if (status)
+ goto out;
+ }
+
+ if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
+ status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+ if (status)
+ goto out;
}
if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
@@ -2711,21 +2749,20 @@ out_acl:
*p++ = cpu_to_be32(stat.blksize);
}
#endif /* CONFIG_NFSD_PNFS */
+ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+ status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0,
+ NFSD_SUPPATTR_EXCLCREAT_WORD1,
+ NFSD_SUPPATTR_EXCLCREAT_WORD2);
+ if (status)
+ goto out;
+ }
+
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
status = nfsd4_encode_security_label(xdr, rqstp, context,
contextlen);
if (status)
goto out;
}
- if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- goto out_resource;
- *p++ = cpu_to_be32(3);
- *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
- *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
- *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
- }
attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
@@ -3044,13 +3081,12 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
__be32 *p;
if (!nfserr) {
- p = xdr_reserve_space(xdr, 32);
+ p = xdr_reserve_space(xdr, 20);
if (!p)
return nfserr_resource;
- p = encode_cinfo(p, &create->cr_cinfo);
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(create->cr_bmval[0]);
- *p++ = cpu_to_be32(create->cr_bmval[1]);
+ encode_cinfo(p, &create->cr_cinfo);
+ nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0],
+ create->cr_bmval[1], create->cr_bmval[2]);
}
return nfserr;
}
@@ -3190,16 +3226,22 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
if (nfserr)
goto out;
- p = xdr_reserve_space(xdr, 40);
+ p = xdr_reserve_space(xdr, 24);
if (!p)
return nfserr_resource;
p = encode_cinfo(p, &open->op_cinfo);
*p++ = cpu_to_be32(open->op_rflags);
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(open->op_bmval[0]);
- *p++ = cpu_to_be32(open->op_bmval[1]);
- *p++ = cpu_to_be32(open->op_delegate_type);
+ nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1],
+ open->op_bmval[2]);
+ if (nfserr)
+ goto out;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(open->op_delegate_type);
switch (open->op_delegate_type) {
case NFS4_OPEN_DELEGATE_NONE:
break;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9277cc91c21b..ad4e2377dd63 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -391,6 +391,14 @@ static int nfsd_get_default_max_blksize(void)
return ret;
}
+static struct svc_serv_ops nfsd_thread_sv_ops = {
+ .svo_shutdown = nfsd_last_thread,
+ .svo_function = nfsd,
+ .svo_enqueue_xprt = svc_xprt_do_enqueue,
+ .svo_setup = svc_set_num_threads,
+ .svo_module = THIS_MODULE,
+};
+
int nfsd_create_serv(struct net *net)
{
int error;
@@ -405,7 +413,7 @@ int nfsd_create_serv(struct net *net)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions();
nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
- nfsd_last_thread, nfsd, THIS_MODULE);
+ &nfsd_thread_sv_ops);
if (nn->nfsd_serv == NULL)
return -ENOMEM;
@@ -500,8 +508,8 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
/* apply the new numbers */
svc_get(nn->nfsd_serv);
for (i = 0; i < n; i++) {
- err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
- nthreads[i]);
+ err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+ &nn->nfsd_serv->sv_pools[i], nthreads[i]);
if (err)
break;
}
@@ -540,7 +548,8 @@ nfsd_svc(int nrservs, struct net *net)
error = nfsd_startup_net(nrservs, net);
if (error)
goto out_destroy;
- error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
+ error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+ NULL, nrservs);
if (error)
goto out_shutdown;
/* We are holding a reference to nn->nfsd_serv which
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4874ce515fc1..583ffc13cae2 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -67,8 +67,8 @@ struct nfsd4_callback {
struct rpc_message cb_msg;
struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
+ int cb_seq_status;
int cb_status;
- bool cb_update_seq_nr;
bool cb_need_restart;
};
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b5e077a6e7d4..45c04979e7b3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1249,12 +1249,6 @@ out_nfserr:
#ifdef CONFIG_NFSD_V3
-static inline int nfsd_create_is_exclusive(int createmode)
-{
- return createmode == NFS3_CREATE_EXCLUSIVE
- || createmode == NFS4_CREATE_EXCLUSIVE4_1;
-}
-
/*
* NFSv3 and NFSv4 version of nfsd_create
*/
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 5be875e3e638..fee2451ae248 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -131,4 +131,10 @@ static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
return nfserrno(vfs_getattr(&p, stat));
}
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+ return createmode == NFS3_CREATE_EXCLUSIVE
+ || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
+
#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 44523f4a6084..6faaf710e563 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct inode *inode;
+ bool free = false;
inode = file_inode(filp);
if (!S_ISDIR(inode->i_mode))
@@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
/* nothing else could have found us thanks to the dnotify_groups
mark_mutex */
- if (dn_mark->dn == NULL)
- fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
+ if (dn_mark->dn == NULL) {
+ fsnotify_detach_mark(fsn_mark);
+ free = true;
+ }
mutex_unlock(&dnotify_group->mark_mutex);
+ if (free)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
}
@@ -362,9 +367,10 @@ out:
spin_unlock(&fsn_mark->lock);
if (destroy)
- fsnotify_destroy_mark_locked(fsn_mark, dnotify_group);
-
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&dnotify_group->mark_mutex);
+ if (destroy)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
out_err:
if (new_fsn_mark)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index cf275500a665..8e8e6bcd1d43 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark_locked(fsn_mark, group);
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
+ if (destroy_mark)
+ fsnotify_free_mark(fsn_mark);
fsnotify_put_mark(fsn_mark);
if (removed & real_mount(mnt)->mnt_fsnotify_mask)
@@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
&destroy_mark);
if (destroy_mark)
- fsnotify_destroy_mark_locked(fsn_mark, group);
+ fsnotify_detach_mark(fsn_mark);
mutex_unlock(&group->mark_mutex);
+ if (destroy_mark)
+ fsnotify_free_mark(fsn_mark);
/* matches the fsnotify_find_inode_mark() */
fsnotify_put_mark(fsn_mark);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 58b7cdb63da9..6b6f0d472ae8 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
struct inotify_inode_mark *inode_mark;
struct inode *inode;
- if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
+ !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
return;
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index dd3fb0b17be7..db39de2dd4cb 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -26,7 +26,6 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
-#include "../mount.h"
/*
* Clear all of the marks on an inode when it is being evicted from core
@@ -205,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
mnt = NULL;
/*
+ * Optimization: srcu_read_lock() has a memory barrier which can
+ * be expensive. It protects walking the *_fsnotify_marks lists.
+ * However, if we do not walk the lists, we do not have to do
+ * SRCU because we have no references to any objects and do not
+ * need SRCU to keep them "alive".
+ */
+ if (hlist_empty(&to_tell->i_fsnotify_marks) &&
+ (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+ return 0;
+ /*
* if this is a modify event we may need to clear the ignored masks
* otherwise return if neither the inode nor the vfsmount care about
* this type of event.
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 13a00be516d2..b44c68a857e7 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -6,6 +6,8 @@
#include <linux/srcu.h>
#include <linux/types.h>
+#include "../mount.h"
+
/* destroy all events sitting in this groups notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);
@@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
/* inode specific destruction of a mark */
extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Destroy all marks in the given list */
-extern void fsnotify_destroy_marks(struct list_head *to_free);
/* Find mark belonging to given group in the list of marks */
extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
struct fsnotify_group *group);
-/* run the list of all marks associated with inode and flag them to be freed */
-extern void fsnotify_clear_marks_by_inode(struct inode *inode);
-/* run the list of all marks associated with vfsmount and flag them to be freed */
-extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt);
+/* Destroy all marks in the given list protected by 'lock' */
+extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* run the list of all marks associated with inode and destroy them */
+static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+ fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+}
+/* run the list of all marks associated with vfsmount and destroy them */
+static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
+{
+ fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
+ &mnt->mnt_root->d_lock);
+}
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 3daf513ee99e..e785fd954c30 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -65,26 +65,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
}
/*
- * Given an inode, destroy all of the marks associated with that inode.
- */
-void fsnotify_clear_marks_by_inode(struct inode *inode)
-{
- struct fsnotify_mark *mark;
- struct hlist_node *n;
- LIST_HEAD(free_list);
-
- spin_lock(&inode->i_lock);
- hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) {
- list_add(&mark->free_list, &free_list);
- hlist_del_init_rcu(&mark->obj_list);
- fsnotify_get_mark(mark);
- }
- spin_unlock(&inode->i_lock);
-
- fsnotify_destroy_marks(&free_list);
-}
-
-/*
* Given a group clear all of the inode marks associated with that group.
*/
void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
@@ -163,17 +143,17 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
/**
* fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
- * @list: list of inodes being unmounted (sb->s_inodes)
+ * @sb: superblock being unmounted.
*
* Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
+ * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
*/
-void fsnotify_unmount_inodes(struct list_head *list)
+void fsnotify_unmount_inodes(struct super_block *sb)
{
struct inode *inode, *next_i, *need_iput = NULL;
- spin_lock(&inode_sb_list_lock);
- list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
struct inode *need_iput_tmp;
/*
@@ -209,7 +189,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
spin_unlock(&inode->i_lock);
/* In case the dropping of a reference would nuke next_i. */
- while (&next_i->i_sb_list != list) {
+ while (&next_i->i_sb_list != &sb->s_inodes) {
spin_lock(&next_i->i_lock);
if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
atomic_read(&next_i->i_count)) {
@@ -224,12 +204,12 @@ void fsnotify_unmount_inodes(struct list_head *list)
}
/*
- * We can safely drop inode_sb_list_lock here because either
+ * We can safely drop s_inode_list_lock here because either
* we actually hold references on both inode and next_i or
* end of list. Also no new inodes will be added since the
* umount has begun.
*/
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
if (need_iput_tmp)
iput(need_iput_tmp);
@@ -241,7 +221,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
iput(inode);
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 39ddcaf0918f..fc0df4442f7b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head)
}
/*
- * Any time a mark is getting freed we end up here.
- * The caller had better be holding a reference to this mark so we don't actually
- * do the final put under the mark->lock
+ * Remove mark from inode / vfsmount list, group list, drop inode reference
+ * if we got one.
+ *
+ * Must be called with group->mark_mutex held.
*/
-void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
- struct fsnotify_group *group)
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
struct inode *inode = NULL;
+ struct fsnotify_group *group = mark->group;
BUG_ON(!mutex_is_locked(&group->mark_mutex));
spin_lock(&mark->lock);
/* something else already called this function on this mark */
- if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
spin_unlock(&mark->lock);
return;
}
- mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+ mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
inode = mark->inode;
@@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
fsnotify_destroy_vfsmount_mark(mark);
else
BUG();
+ /*
+ * Note that we didn't update flags telling whether inode cares about
+ * what's happening with children. We update these flags from
+ * __fsnotify_parent() lazily when next event happens on one of our
+ * children.
+ */
list_del_init(&mark->g_list);
@@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
iput(inode);
- /* release lock temporarily */
- mutex_unlock(&group->mark_mutex);
+
+ atomic_dec(&group->num_marks);
+}
+
+/*
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ struct fsnotify_group *group = mark->group;
+
+ spin_lock(&mark->lock);
+ /* something else already called this function on this mark */
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
+ spin_unlock(&mark->lock);
+ return;
+ }
+ mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+ spin_unlock(&mark->lock);
spin_lock(&destroy_lock);
list_add(&mark->g_list, &destroy_list);
spin_unlock(&destroy_lock);
wake_up(&destroy_waitq);
- /*
- * We don't necessarily have a ref on mark from caller so the above destroy
- * may have actually freed it, unless this group provides a 'freeing_mark'
- * function which must be holding a reference.
- */
/*
* Some groups like to know that marks are being freed. This is a
@@ -177,50 +198,45 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
*/
if (group->ops->freeing_mark)
group->ops->freeing_mark(mark, group);
-
- /*
- * __fsnotify_update_child_dentry_flags(inode);
- *
- * I really want to call that, but we can't, we have no idea if the inode
- * still exists the second we drop the mark->lock.
- *
- * The next time an event arrive to this inode from one of it's children
- * __fsnotify_parent will see that the inode doesn't care about it's
- * children and will update all of these flags then. So really this
- * is just a lazy update (and could be a perf win...)
- */
-
- atomic_dec(&group->num_marks);
-
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
}
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
- fsnotify_destroy_mark_locked(mark, group);
+ fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
+ fsnotify_free_mark(mark);
}
-/*
- * Destroy all marks in the given list. The marks must be already detached from
- * the original inode / vfsmount.
- */
-void fsnotify_destroy_marks(struct list_head *to_free)
+void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
{
- struct fsnotify_mark *mark, *lmark;
- struct fsnotify_group *group;
-
- list_for_each_entry_safe(mark, lmark, to_free, free_list) {
- spin_lock(&mark->lock);
- fsnotify_get_group(mark->group);
- group = mark->group;
- spin_unlock(&mark->lock);
+ struct fsnotify_mark *mark;
- fsnotify_destroy_mark(mark, group);
+ while (1) {
+ /*
+ * We have to be careful since we can race with e.g.
+ * fsnotify_clear_marks_by_group() and once we drop 'lock',
+ * mark can get removed from the obj_list and destroyed. But
+ * we are holding mark reference so mark cannot be freed and
+ * calling fsnotify_destroy_mark() more than once is fine.
+ */
+ spin_lock(lock);
+ if (hlist_empty(head)) {
+ spin_unlock(lock);
+ break;
+ }
+ mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
+ /*
+ * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
+ * since inode / mount is going away anyway. So just remove
+ * mark from the list.
+ */
+ hlist_del_init_rcu(&mark->obj_list);
+ fsnotify_get_mark(mark);
+ spin_unlock(lock);
+ fsnotify_destroy_mark(mark, mark->group);
fsnotify_put_mark(mark);
- fsnotify_put_group(group);
}
}
@@ -332,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
* inode->i_lock
*/
spin_lock(&mark->lock);
- mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE;
+ mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
fsnotify_get_group(group);
mark->group = group;
@@ -438,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
}
mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
fsnotify_get_mark(mark);
- fsnotify_destroy_mark_locked(mark, group);
+ fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
+ fsnotify_free_mark(mark);
fsnotify_put_mark(mark);
}
}
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 326b148e623c..a8fcab68faef 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -28,25 +28,6 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
-#include "../mount.h"
-
-void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
-{
- struct fsnotify_mark *mark;
- struct hlist_node *n;
- struct mount *m = real_mount(mnt);
- LIST_HEAD(free_list);
-
- spin_lock(&mnt->mnt_root->d_lock);
- hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) {
- list_add(&mark->free_list, &free_list);
- hlist_del_init_rcu(&mark->obj_list);
- fsnotify_get_mark(mark);
- }
- spin_unlock(&mnt->mnt_root->d_lock);
-
- fsnotify_destroy_marks(&free_list);
-}
void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
diff --git a/fs/nsfs.c b/fs/nsfs.c
index e4905fbf3396..8f20d6016e20 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -142,7 +142,8 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
struct inode *inode = d_inode(dentry);
const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
- return seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+ seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
+ return 0;
}
static const struct super_operations nsfs_ops = {
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index c1128bcbeb5e..d1a853585b53 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2204,17 +2204,12 @@ get_ctx_vol_failed:
return true;
#ifdef NTFS_RW
iput_usnjrnl_err_out:
- if (vol->usnjrnl_j_ino)
- iput(vol->usnjrnl_j_ino);
- if (vol->usnjrnl_max_ino)
- iput(vol->usnjrnl_max_ino);
- if (vol->usnjrnl_ino)
- iput(vol->usnjrnl_ino);
+ iput(vol->usnjrnl_j_ino);
+ iput(vol->usnjrnl_max_ino);
+ iput(vol->usnjrnl_ino);
iput_quota_err_out:
- if (vol->quota_q_ino)
- iput(vol->quota_q_ino);
- if (vol->quota_ino)
- iput(vol->quota_ino);
+ iput(vol->quota_q_ino);
+ iput(vol->quota_ino);
iput(vol->extend_ino);
#endif /* NTFS_RW */
iput_sec_err_out:
@@ -2223,8 +2218,7 @@ iput_root_err_out:
iput(vol->root_ino);
iput_logfile_err_out:
#ifdef NTFS_RW
- if (vol->logfile_ino)
- iput(vol->logfile_ino);
+ iput(vol->logfile_ino);
iput_vol_err_out:
#endif /* NTFS_RW */
iput(vol->vol_ino);
@@ -2254,8 +2248,7 @@ iput_mftbmp_err_out:
iput(vol->mftbmp_ino);
iput_mirr_err_out:
#ifdef NTFS_RW
- if (vol->mftmirr_ino)
- iput(vol->mftmirr_ino);
+ iput(vol->mftmirr_ino);
#endif /* NTFS_RW */
return false;
}
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index c58a1bcfda0f..0cdf497c91ef 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle,
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+ struct buffer_head *bh = NULL;
+ int status = 0;
+
+ status = ocfs2_inode_lock(inode, &bh, 1);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(bh);
+ return status;
}
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
@@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
struct ocfs2_super *osb;
struct buffer_head *di_bh = NULL;
struct posix_acl *acl;
- int ret = -EAGAIN;
+ int ret;
osb = OCFS2_SB(inode->i_sb);
if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
return NULL;
-
- ret = ocfs2_read_inode_block(inode, &di_bh);
- if (ret < 0)
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ mlog_errno(ret);
return ERR_PTR(ret);
+ }
acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+ ocfs2_inode_unlock(inode, 0);
brelse(di_bh);
-
return acl;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5997c00a1515..86181d6526dc 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -908,32 +908,30 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ocfs2_error(sb,
- "Extent block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- eb->h_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ eb->h_signature);
+ goto bail;
}
if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid h_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(eb->h_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ goto bail;
}
if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extent block #%llu has an invalid "
- "h_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(eb->h_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Extent block #%llu has an invalid h_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(eb->h_fs_generation));
+ goto bail;
}
-
- return 0;
+bail:
+ return rc;
}
int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
@@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty "
- "extent list (next_free_rec == 0)",
+ "Owner %llu has empty extent list (next_free_rec == 0)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
status = -EIO;
goto bail;
@@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (!blkno) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has extent "
- "list where extent # %d has no physical "
- "block start",
+ "Owner %llu has extent list where extent # %d has no physical block start\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
status = -EIO;
goto bail;
@@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
while (el->l_tree_depth) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has empty extent list at "
- "depth %u\n",
+ "Owner %llu has empty extent list at depth %u\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth));
ret = -EROFS;
@@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
blkno = le64_to_cpu(el->l_recs[i].e_blkno);
if (blkno == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad blkno in extent list "
- "at depth %u (index %d)\n",
+ "Owner %llu has bad blkno in extent list at depth %u (index %d)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
le16_to_cpu(el->l_tree_depth), i);
ret = -EROFS;
@@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
- "Owner %llu has bad count in extent list "
- "at block %llu (next free=%u, count=%u)\n",
+ "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)bh->b_blocknr,
le16_to_cpu(el->l_next_free_rec),
@@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle,
if (left_el->l_next_free_rec != left_el->l_count) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Inode %llu has non-full interior leaf node %llu"
- "(next free = %u)",
+ "Inode %llu has non-full interior leaf node %llu (next free = %u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)left_leaf_bh->b_blocknr,
le16_to_cpu(left_el->l_next_free_rec));
@@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
* If we got here, we never found a valid node where
* the tree indicated one should be.
*/
- ocfs2_error(sb,
- "Invalid extent tree at extent block %llu\n",
+ ocfs2_error(sb, "Invalid extent tree at extent block %llu\n",
(unsigned long long)blkno);
ret = -EROFS;
goto out;
@@ -3131,6 +3120,30 @@ out:
return ret;
}
+static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ handle_t *handle;
+ int ret;
+ int credits = path->p_tree_depth * 2 + 1;
+
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return ret;
+ }
+
+ ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+ return ret;
+}
+
/*
* Left rotation of btree records.
*
@@ -3200,7 +3213,7 @@ rightmost_no_delete:
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ret = -EIO;
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has empty extent block at %llu",
+ "Owner %llu has empty extent block at %llu\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
(unsigned long long)le64_to_cpu(eb->h_blkno));
goto out;
@@ -3930,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has a bad extent list",
+ "Owner %llu has a bad extent list\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
ret = -EIO;
return;
@@ -4355,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of "
- "%d. It should have "
- "matched the l_count of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d. It should have matched the l_count of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec),
le16_to_cpu(new_el->l_count));
@@ -4413,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
ocfs2_error(sb,
- "Extent block #%llu has an "
- "invalid l_next_free_rec of %d",
+ "Extent block #%llu has an invalid l_next_free_rec of %d\n",
(unsigned long long)le64_to_cpu(eb->h_blkno),
le16_to_cpu(new_el->l_next_free_rec));
status = -EINVAL;
@@ -4970,10 +4979,9 @@ leftright:
split_index = ocfs2_search_extent_list(el, cpos);
if (split_index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u "
- "which can no longer be found.\n",
- (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
- cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5158,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(sb,
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)
- ocfs2_metadata_cache_owner(et->et_ci), cpos);
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+ cpos);
ret = -EROFS;
goto out;
}
@@ -5228,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
cpos, len, phys);
if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
- "that are being written to, but the feature bit "
- "is not set in the super block.",
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
ret = -EROFS;
goto out;
@@ -5514,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu has an extent at cpos %u which can no "
- "longer be found.\n",
+ "Owner %llu has an extent at cpos %u which can no longer be found\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5580,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: split at cpos %u lost record.",
+ "Owner %llu: split at cpos %u lost record\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos);
ret = -EROFS;
@@ -5596,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle,
ocfs2_rec_clusters(el, rec);
if (rec_range != trunc_range) {
ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
- "Owner %llu: error after split at cpos %u"
- "trunc len %u, existing record is (%u,%u)",
+ "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n",
(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
cpos, len, le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
@@ -6175,7 +6178,7 @@ bail:
iput(tl_inode);
brelse(tl_bh);
- if (status < 0 && (*tl_copy)) {
+ if (status < 0) {
kfree(*tl_copy);
*tl_copy = NULL;
mlog_errno(status);
@@ -7108,15 +7111,23 @@ start:
* to check it up here before changing the tree.
*/
if (root_el->l_tree_depth && rec->e_int_clusters == 0) {
- ocfs2_error(inode->i_sb, "Inode %lu has an empty "
+ mlog(ML_ERROR, "Inode %lu has an empty "
"extent record, depth %u\n", inode->i_ino,
le16_to_cpu(root_el->l_tree_depth));
- status = -EROFS;
- goto bail;
+ status = ocfs2_remove_rightmost_empty_extent(osb,
+ &et, path, &dealloc);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ ocfs2_reinit_path(path, 1);
+ goto start;
+ } else {
+ trunc_cpos = le32_to_cpu(rec->e_cpos);
+ trunc_len = 0;
+ blkno = 0;
}
- trunc_cpos = le32_to_cpu(rec->e_cpos);
- trunc_len = 0;
- blkno = 0;
} else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) {
/*
* Truncate entire record.
@@ -7204,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
!ocfs2_supports_inline_data(osb)) {
ocfs2_error(inode->i_sb,
- "Inline data flags for inode %llu don't agree! "
- "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+ "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
le16_to_cpu(di->i_dyn_features),
OCFS2_I(inode)->ip_dyn_features,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 0f5fd9db8194..64b11d90eca6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
- ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+ ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
return -EROFS;
}
@@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
if (size > PAGE_CACHE_SIZE ||
size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
ocfs2_error(inode->i_sb,
- "Inode %llu has with inline data has bad size: %Lu",
+ "Inode %llu has with inline data has bad size: %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)size);
return -EROFS;
@@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
/* This figures out the size of the next contiguous block, and
* our logical offset */
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
if (ret) {
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
@@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
alloc_locked = 1;
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
/* fill hole, allocate blocks can't be larger than the size
* of the hole */
clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
@@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extend_allocation(inode, cpos,
clusters_to_alloc, 0);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog_errno(ret);
goto bail;
}
@@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
&contig_blocks, &ext_flags);
if (ret < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
(unsigned long long)iblock);
ret = -EIO;
goto bail;
}
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
}
/*
@@ -627,10 +636,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
- ocfs2_iocb_clear_rw_locked(iocb);
+ /* Let rw unlock to be done later to protect append direct io write */
+ if (offset + bytes <= i_size_read(inode)) {
+ ocfs2_iocb_clear_rw_locked(iocb);
- level = ocfs2_iocb_rw_locked_level(iocb);
- ocfs2_rw_unlock(inode, level);
+ level = ocfs2_iocb_rw_locked_level(iocb);
+ ocfs2_rw_unlock(inode, level);
+ }
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -832,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
/* zeroing out the previously allocated cluster tail
* that but not zeroed */
- if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_zero_extend(osb, inode, offset,
zero_len_tail, cluster_align_tail);
- else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
ret = ocfs2_direct_IO_extend_no_holes(osb, inode,
offset);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ }
if (ret < 0) {
mlog_errno(ret);
ocfs2_inode_unlock(inode, 1);
@@ -857,7 +874,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
offset, ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io, NULL, 0);
- if (unlikely(written < 0)) {
+ /* overwrite aio may return -EIOCBQUEUED, and it is not an error */
+ if ((written < 0) && (written != -EIOCBQUEUED)) {
loff_t i_size = i_size_read(inode);
if (offset + count > i_size) {
@@ -876,12 +894,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+ di_bh = NULL;
goto clean_orphan;
}
}
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
+ di_bh = NULL;
ret = jbd2_journal_force_commit(journal);
if (ret < 0)
@@ -936,10 +956,12 @@ clean_orphan:
if (tmp_ret < 0) {
ret = tmp_ret;
mlog_errno(ret);
+ brelse(di_bh);
goto out;
}
ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
tmp_ret = jbd2_journal_force_commit(journal);
if (tmp_ret < 0) {
@@ -2185,10 +2207,7 @@ try_again:
if (ret)
goto out_commit;
}
- /*
- * We don't want this to fail in ocfs2_write_end(), so do it
- * here.
- */
+
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
@@ -2345,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- int i;
+ int i, ret;
unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2354,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
handle_t *handle = wc->w_handle;
struct page *tmppage;
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ copied = ret;
+ mlog_errno(ret);
+ goto out;
+ }
+
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
goto out_write_size;
@@ -2409,6 +2436,7 @@ out_write_size:
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
+out:
/* unlock pages before dealloc since it needs acquiring j_trans_barrier
* lock, or it will cause a deadlock since journal commit threads holds
* this lock and will ask for the page lock when flushing the data.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1edcb141f639..fe50ded1b4ce 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
bh = bhs[i];
if (!(flags & OCFS2_BH_READAHEAD)) {
+ if (status) {
+ /* Clear the rest of the buffers on error */
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
/* We know this can't have changed as we hold the
* owner sem. Avoid doing any work on the bh if the
* journal has it. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 140de3c93d2e..fa15debcc02b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -36,7 +36,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/bitmap.h>
-
+#include <linux/ktime.h>
#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
@@ -1060,37 +1060,6 @@ bail:
return ret;
}
-/* Subtract b from a, storing the result in a. a *must* have a larger
- * value than b. */
-static void o2hb_tv_subtract(struct timeval *a,
- struct timeval *b)
-{
- /* just return 0 when a is after b */
- if (a->tv_sec < b->tv_sec ||
- (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
- a->tv_sec = 0;
- a->tv_usec = 0;
- return;
- }
-
- a->tv_sec -= b->tv_sec;
- a->tv_usec -= b->tv_usec;
- while ( a->tv_usec < 0 ) {
- a->tv_sec--;
- a->tv_usec += 1000000;
- }
-}
-
-static unsigned int o2hb_elapsed_msecs(struct timeval *start,
- struct timeval *end)
-{
- struct timeval res = *end;
-
- o2hb_tv_subtract(&res, start);
-
- return res.tv_sec * 1000 + res.tv_usec / 1000;
-}
-
/*
* we ride the region ref that the region dir holds. before the region
* dir is removed and drops it ref it will wait to tear down this
@@ -1101,7 +1070,7 @@ static int o2hb_thread(void *data)
int i, ret;
struct o2hb_region *reg = data;
struct o2hb_bio_wait_ctxt write_wc;
- struct timeval before_hb, after_hb;
+ ktime_t before_hb, after_hb;
unsigned int elapsed_msec;
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
@@ -1118,18 +1087,18 @@ static int o2hb_thread(void *data)
* hr_timeout_ms between disk writes. On busy systems
* this should result in a heartbeat which is less
* likely to time itself out. */
- do_gettimeofday(&before_hb);
+ before_hb = ktime_get_real();
ret = o2hb_do_disk_heartbeat(reg);
- do_gettimeofday(&after_hb);
- elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
+ after_hb = ktime_get_real();
+
+ elapsed_msec = (unsigned int)
+ ktime_ms_delta(after_hb, before_hb);
mlog(ML_HEARTBEAT,
- "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n",
- before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
- after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
- elapsed_msec, ret);
+ "start = %lld, end = %lld, msec = %u, ret = %d\n",
+ before_hb.tv64, after_hb.tv64, elapsed_msec, ret);
if (!kthread_should_stop() &&
elapsed_msec < reg->hr_timeout_ms) {
@@ -1619,17 +1588,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
struct o2hb_disk_slot *slot;
reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL);
- if (reg->hr_tmp_block == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_tmp_block == NULL)
return -ENOMEM;
- }
reg->hr_slots = kcalloc(reg->hr_blocks,
sizeof(struct o2hb_disk_slot), GFP_KERNEL);
- if (reg->hr_slots == NULL) {
- mlog_errno(-ENOMEM);
+ if (reg->hr_slots == NULL)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_blocks; i++) {
slot = &reg->hr_slots[i];
@@ -1645,17 +1610,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg)
reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *),
GFP_KERNEL);
- if (!reg->hr_slot_data) {
- mlog_errno(-ENOMEM);
+ if (!reg->hr_slot_data)
return -ENOMEM;
- }
for(i = 0; i < reg->hr_num_pages; i++) {
page = alloc_page(GFP_KERNEL);
- if (!page) {
- mlog_errno(-ENOMEM);
+ if (!page)
return -ENOMEM;
- }
reg->hr_slot_data[i] = page;
@@ -1687,10 +1648,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
struct o2hb_disk_heartbeat_block *hb_block;
ret = o2hb_read_slots(reg, reg->hr_blocks);
- if (ret) {
- mlog_errno(ret);
+ if (ret)
goto out;
- }
/* We only want to get an idea of the values initially in each
* slot, so we do no verification - o2hb_check_slot will
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 02878a83f0b4..ffecf89c8c1c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -480,33 +480,26 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh)
trailer = ocfs2_trailer_from_bh(bh, dir->i_sb);
if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Invalid dirblock #%llu: "
- "signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- trailer->db_signature);
+ rc = ocfs2_error(dir->i_sb,
+ "Invalid dirblock #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ trailer->db_signature);
goto out;
}
if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu has an invalid "
- "db_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu has an invalid db_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
if (le64_to_cpu(trailer->db_parent_dinode) !=
OCFS2_I(dir)->ip_blkno) {
- rc = -EINVAL;
- ocfs2_error(dir->i_sb,
- "Directory block #%llu on dinode "
- "#%llu has an invalid parent_dinode "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)OCFS2_I(dir)->ip_blkno,
- (unsigned long long)le64_to_cpu(trailer->db_blkno));
+ rc = ocfs2_error(dir->i_sb,
+ "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)OCFS2_I(dir)->ip_blkno,
+ (unsigned long long)le64_to_cpu(trailer->db_blkno));
goto out;
}
out:
@@ -604,14 +597,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) {
- ocfs2_error(sb,
- "Dir Index Root # %llu has bad signature %.*s",
- (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
- 7, dx_root->dr_signature);
- return -EINVAL;
+ ret = ocfs2_error(sb,
+ "Dir Index Root # %llu has bad signature %.*s\n",
+ (unsigned long long)le64_to_cpu(dx_root->dr_blkno),
+ 7, dx_root->dr_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
@@ -648,12 +640,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb,
}
if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) {
- ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s",
- 7, dx_leaf->dl_signature);
- return -EROFS;
+ ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n",
+ 7, dx_leaf->dl_signature);
}
- return 0;
+ return ret;
}
static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
@@ -812,11 +803,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "btree tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in btree tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -832,11 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
}
if (!found) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in btree", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in btree\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7df88a6dd626..6918f30d02cd 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
if (status == -ENOPROTOOPT) {
status = 0;
*response = JOIN_OK_NO_MAP;
- } else if (packet.code == JOIN_DISALLOW ||
- packet.code == JOIN_OK_NO_MAP) {
- *response = packet.code;
- } else if (packet.code == JOIN_PROTOCOL_MISMATCH) {
- mlog(ML_NOTICE,
- "This node requested DLM locking protocol %u.%u and "
- "filesystem locking protocol %u.%u. At least one of "
- "the protocol versions on node %d is not compatible, "
- "disconnecting\n",
- dlm->dlm_locking_proto.pv_major,
- dlm->dlm_locking_proto.pv_minor,
- dlm->fs_locking_proto.pv_major,
- dlm->fs_locking_proto.pv_minor,
- node);
- status = -EPROTO;
- *response = packet.code;
- } else if (packet.code == JOIN_OK) {
- *response = packet.code;
- /* Use the same locking protocol as the remote node */
- dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
- dlm->fs_locking_proto.pv_minor = packet.fs_minor;
- mlog(0,
- "Node %d responds JOIN_OK with DLM locking protocol "
- "%u.%u and fs locking protocol %u.%u\n",
- node,
- dlm->dlm_locking_proto.pv_major,
- dlm->dlm_locking_proto.pv_minor,
- dlm->fs_locking_proto.pv_major,
- dlm->fs_locking_proto.pv_minor);
} else {
- status = -EINVAL;
- mlog(ML_ERROR, "invalid response %d from node %u\n",
- packet.code, node);
+ *response = packet.code;
+ switch (packet.code) {
+ case JOIN_DISALLOW:
+ case JOIN_OK_NO_MAP:
+ break;
+ case JOIN_PROTOCOL_MISMATCH:
+ mlog(ML_NOTICE,
+ "This node requested DLM locking protocol %u.%u and "
+ "filesystem locking protocol %u.%u. At least one of "
+ "the protocol versions on node %d is not compatible, "
+ "disconnecting\n",
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor,
+ node);
+ status = -EPROTO;
+ break;
+ case JOIN_OK:
+ /* Use the same locking protocol as the remote node */
+ dlm->dlm_locking_proto.pv_minor = packet.dlm_minor;
+ dlm->fs_locking_proto.pv_minor = packet.fs_minor;
+ mlog(0,
+ "Node %d responds JOIN_OK with DLM locking protocol "
+ "%u.%u and fs locking protocol %u.%u\n",
+ node,
+ dlm->dlm_locking_proto.pv_major,
+ dlm->dlm_locking_proto.pv_minor,
+ dlm->fs_locking_proto.pv_major,
+ dlm->fs_locking_proto.pv_minor);
+ break;
+ default:
+ status = -EINVAL;
+ mlog(ML_ERROR, "invalid response %d from node %u\n",
+ packet.code, node);
+ /* Reset response to JOIN_DISALLOW */
+ *response = JOIN_DISALLOW;
+ break;
+ }
}
mlog(0, "status %d, node %d response is %d\n", status, node,
@@ -1725,12 +1732,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
+ o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
+ dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
+
status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
if (status)
goto bail;
- o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
- dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);
if (status)
goto bail;
@@ -1845,8 +1853,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
sizeof(struct dlm_exit_domain),
dlm_begin_exit_domain_handler,
dlm, NULL, &dlm->dlm_domain_handlers);
- if (status)
- goto bail;
bail:
if (status)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index fdf4b41d0609..46b8b2bbc95a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
- res->lockname.len, res->lockname.name);
- dlm_print_one_lock_resource(res);
- }
- spin_unlock(&dlm->track_lock);
-
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
@@ -795,8 +785,18 @@ lookup:
dlm_lockres_grab_inflight_ref(dlm, tmpres);
spin_unlock(&tmpres->spinlock);
- if (res)
+ if (res) {
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else
+ mlog(ML_ERROR, "Resource %.*s not "
+ "on the Tracking list\n",
+ res->lockname.len,
+ res->lockname.name);
+ spin_unlock(&dlm->track_lock);
dlm_lockres_put(res);
+ }
res = tmpres;
goto leave;
}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 69aac6f088ad..2e5e6d5fffe8 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
__dlm_unhash_lockres(dlm, res);
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
+ res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
/* lockres is not in the hash now. drop the flag and wake up
* any processes waiting in dlm_get_lock_resource. */
if (!master) {
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 23157e40dd74..1c91103c1333 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3035,8 +3035,6 @@ local:
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
-
- status = 0;
bail:
if (status < 0) {
ocfs2_dlm_shutdown_debug(osb);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 767370b656ca..e4719e0a3f99 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0)", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0)\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
@@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr leaf block %llu\n", inode->i_ino,
+ "Inode %lu has non zero tree depth in xattr leaf block %llu\n",
+ inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
ret = -EROFS;
goto out;
@@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7210583b472f..0e5b4515f92e 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1130,6 +1130,7 @@ out:
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
{
int status = 0, size_change;
+ int inode_locked = 0;
struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
struct ocfs2_super *osb = OCFS2_SB(sb);
@@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
mlog_errno(status);
goto bail_unlock_rw;
}
+ inode_locked = 1;
if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
@@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_inode_unlock(inode, 1);
+ if (status) {
+ ocfs2_inode_unlock(inode, 1);
+ inode_locked = 0;
+ }
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
@@ -1274,6 +1279,8 @@ bail:
if (status < 0)
mlog_errno(status);
}
+ if (inode_locked)
+ ocfs2_inode_unlock(inode, 1);
return status;
}
@@ -2262,8 +2269,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
ssize_t written = 0;
ssize_t ret;
size_t count = iov_iter_count(from), orig_count;
- loff_t old_size;
- u32 old_clusters;
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2271,6 +2276,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0;
int dropped_dio = 0;
+ int append_write = ((iocb->ki_pos + count) >=
+ i_size_read(inode) ? 1 : 0);
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2290,8 +2297,9 @@ relock:
/*
* Concurrent O_DIRECT writes are allowed with
* mount_option "coherency=buffered".
+ * For append write, we must take rw EX.
*/
- rw_level = (!direct_io || full_coherency);
+ rw_level = (!direct_io || full_coherency || append_write);
ret = ocfs2_rw_lock(inode, rw_level);
if (ret < 0) {
@@ -2364,13 +2372,6 @@ relock:
ocfs2_iocb_set_unaligned_aio(iocb);
}
- /*
- * To later detect whether a journal commit for sync writes is
- * necessary, we sample i_size, and cluster count here.
- */
- old_size = i_size_read(inode);
- old_clusters = OCFS2_I(inode)->ip_clusters;
-
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
@@ -2378,6 +2379,20 @@ relock:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
+ /*
+ * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
+ * function pointer which is called when o_direct io completes so that
+ * it can unlock our rw lock.
+ * Unfortunately there are error cases which call end_io and others
+ * that don't. so we don't have to unlock the rw_lock if either an
+ * async dio is going to do it in the future or an end_io after an
+ * error has already done it.
+ */
+ if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
+ rw_level = -1;
+ unaligned_dio = 0;
+ }
+
if (unlikely(written <= 0))
goto no_sync;
@@ -2402,21 +2417,7 @@ relock:
}
no_sync:
- /*
- * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
- * function pointer which is called when o_direct io completes so that
- * it can unlock our rw lock.
- * Unfortunately there are error cases which call end_io and others
- * that don't. so we don't have to unlock the rw_lock if either an
- * async dio is going to do it in the future or an end_io after an
- * error has already done it.
- */
- if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
- rw_level = -1;
- unaligned_dio = 0;
- }
-
- if (unaligned_dio) {
+ if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index b254416dc8d9..8f87e05ee25d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode)
int wipe, status;
sigset_t oldset;
struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
trace_ocfs2_delete_inode(inode->i_ino,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode)
goto bail_unlock_nfs_sync;
}
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ /* Skip inode deletion and wait for dio orphan entry recovered
+ * first */
+ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+ ocfs2_cleanup_delete_inode(inode, 0);
+ goto bail_unlock_inode;
+ }
+
/* Query the cluster. This will be the final decision made
* before we go ahead and wipe the inode. */
status = ocfs2_query_inode_wipe(inode, di_bh, &wipe);
@@ -1191,17 +1200,19 @@ void ocfs2_evict_inode(struct inode *inode)
int ocfs2_drop_inode(struct inode *inode)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- int res;
trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
inode->i_nlink, oi->ip_flags);
- if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
- res = 1;
- else
- res = generic_drop_inode(inode);
+ assert_spin_locked(&inode->i_lock);
+ inode->i_state |= I_WILL_FREE;
+ spin_unlock(&inode->i_lock);
+ write_inode_now(inode, 1);
+ spin_lock(&inode->i_lock);
+ WARN_ON(inode->i_state & I_NEW);
+ inode->i_state &= ~I_WILL_FREE;
- return res;
+ return 1;
}
/*
@@ -1350,32 +1361,32 @@ int ocfs2_validate_inode_block(struct super_block *sb,
rc = -EINVAL;
if (!OCFS2_IS_VALID_DINODE(di)) {
- ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)bh->b_blocknr, 7,
- di->i_signature);
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
goto bail;
}
if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
- ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(di->i_blkno));
+ rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
goto bail;
}
if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
- (unsigned long long)bh->b_blocknr);
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
goto bail;
}
if (le32_to_cpu(di->i_fs_generation) !=
OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Invalid dinode #%llu: fs_generation is %u\n",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(di->i_fs_generation));
+ rc = ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
goto bail;
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 5e86b247c821..ca3431ee7f24 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,8 +81,6 @@ struct ocfs2_inode_info
tid_t i_sync_tid;
tid_t i_datasync_tid;
- wait_queue_head_t append_dio_wq;
-
struct dquot *i_dquot[MAXQUOTAS];
};
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7c099f7032fd..ff82b28462a6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
mlog_errno(PTR_ERR(handle));
if (is_journal_aborted(journal)) {
- ocfs2_abort(osb->sb, "Detected aborted journal");
+ ocfs2_abort(osb->sb, "Detected aborted journal\n");
handle = ERR_PTR(-EROFS);
}
} else {
@@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle,
mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
mlog(ML_ERROR, "b_blocknr=%llu\n",
(unsigned long long)bh->b_blocknr);
- BUG();
+
+ lock_buffer(bh);
+ /*
+ * A previous attempt to write this buffer head failed.
+ * Nothing we can do but to retry the write and hope for
+ * the best.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
+ if (!buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ return -EIO;
+ }
+ unlock_buffer(bh);
}
/* Set the current transaction information on the ci so
@@ -2170,6 +2186,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
+ mutex_lock(&inode->i_mutex);
ret = ocfs2_rw_lock(inode, 1);
if (ret < 0) {
mlog_errno(ret);
@@ -2193,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
spin_unlock(&oi->ip_lock);
- } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
+ }
+
+ if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
ret = ocfs2_truncate_file(inode, di_bh,
i_size_read(inode));
@@ -2206,17 +2225,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
if (ret)
mlog_errno(ret);
-
- wake_up(&OCFS2_I(inode)->append_dio_wq);
} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
unlock_inode:
ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
unlock_rw:
ocfs2_rw_unlock(inode, 1);
next:
+ mutex_unlock(&inode->i_mutex);
iput(inode);
- brelse(di_bh);
- di_bh = NULL;
inode = iter;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 857bbbcd39f3..0a4457fb0711 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
- ocfs2_error(osb->sb, "local alloc inode %llu says it has "
- "%u used bits, but a count shows %u",
+ ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n",
(unsigned long long)le64_to_cpu(alloc->i_blkno),
le32_to_cpu(alloc->id1.bitmap1.i_used),
ocfs2_local_alloc_count_bits(alloc));
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 56a768d06aa6..124471d26a73 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -99,11 +99,9 @@ static int __ocfs2_move_extent(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(inode->i_sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 948681e37cfd..b7dfac226b1e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1035,11 +1035,6 @@ leave:
if (handle)
ocfs2_commit_trans(osb, handle);
- if (child_locked)
- ocfs2_inode_unlock(inode, 1);
-
- ocfs2_inode_unlock(dir, 1);
-
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
@@ -1047,6 +1042,11 @@ leave:
iput(orphan_dir);
}
+ if (child_locked)
+ ocfs2_inode_unlock(inode, 1);
+
+ ocfs2_inode_unlock(dir, 1);
+
brelse(fe_bh);
brelse(parent_node_bh);
@@ -1309,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir,
}
parents_locked = 1;
+ if (!new_dir->i_nlink) {
+ status = -EACCES;
+ goto bail;
+ }
+
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
if (!new_dir_bh) {
@@ -1569,12 +1574,25 @@ static int ocfs2_rename(struct inode *old_dir,
status = ocfs2_find_entry(old_dentry->d_name.name,
old_dentry->d_name.len, old_dir,
&old_entry_lookup);
- if (status)
+ if (status) {
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
+ }
status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup);
if (status < 0) {
mlog_errno(status);
+ if (!is_journal_aborted(osb->journal->j_journal)) {
+ ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s "
+ "is not deleted.",
+ new_dentry->d_name.len, new_dentry->d_name.name,
+ old_dentry->d_name.len, old_dentry->d_name.name);
+ }
goto bail;
}
@@ -1633,21 +1651,9 @@ static int ocfs2_rename(struct inode *old_dir,
ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir);
status = 0;
bail:
- if (rename_lock)
- ocfs2_rename_unlock(osb);
-
if (handle)
ocfs2_commit_trans(osb, handle);
- if (parents_locked)
- ocfs2_double_unlock(old_dir, new_dir);
-
- if (old_child_locked)
- ocfs2_inode_unlock(old_inode, 1);
-
- if (new_child_locked)
- ocfs2_inode_unlock(new_inode, 1);
-
if (orphan_dir) {
/* This was locked for us in ocfs2_prepare_orphan_dir() */
ocfs2_inode_unlock(orphan_dir, 1);
@@ -1655,6 +1661,18 @@ bail:
iput(orphan_dir);
}
+ if (new_child_locked)
+ ocfs2_inode_unlock(new_inode, 1);
+
+ if (old_child_locked)
+ ocfs2_inode_unlock(old_inode, 1);
+
+ if (parents_locked)
+ ocfs2_double_unlock(old_dir, new_dir);
+
+ if (rename_lock)
+ ocfs2_rename_unlock(osb);
+
if (new_inode)
sync_mapping_buffers(old_inode->i_mapping);
@@ -2601,27 +2619,6 @@ leave:
return status;
}
-static int ocfs2_dio_orphan_recovered(struct inode *inode)
-{
- int ret;
- struct buffer_head *di_bh = NULL;
- struct ocfs2_dinode *di = NULL;
-
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- return 0;
- }
-
- di = (struct ocfs2_dinode *) di_bh->b_data;
- ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
-
- return ret;
-}
-
-#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
struct inode *inode)
{
@@ -2633,7 +2630,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
handle_t *handle = NULL;
struct ocfs2_dinode *di = NULL;
-restart:
status = ocfs2_inode_lock(inode, &di_bh, 1);
if (status < 0) {
mlog_errno(status);
@@ -2643,15 +2639,21 @@ restart:
di = (struct ocfs2_dinode *) di_bh->b_data;
/*
* Another append dio crashed?
- * If so, wait for recovery first.
+ * If so, manually recover it first.
*/
if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
- ocfs2_dio_orphan_recovered(inode),
- msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
- goto restart;
+ status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode));
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
}
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 690ddc60189b..7a0126267847 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -286,6 +286,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
+ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
+ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
};
#define OCFS2_OSB_SOFT_RO 0x0001
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bb07004df72a..8a54fd8a4fa5 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) {
ocfs2_error(inode->i_sb,
- "Quota file %llu is probably corrupted! Requested "
- "to read block %Lu but file has size only %Lu\n",
+ "Quota file %llu is probably corrupted! Requested to read block %Lu but file has size only %Lu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)v_block,
(unsigned long long)i_size_read(inode));
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7dc818b87cd8..e5d57cd32505 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -102,32 +102,30 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
- ocfs2_error(sb,
- "Refcount block #%llu has bad signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- rb->rf_signature);
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ rb->rf_signature);
+ goto out;
}
if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid rf_blkno "
- "of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(rb->rf_blkno));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(rb->rf_blkno));
+ goto out;
}
if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Refcount block #%llu has an invalid "
- "rf_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(rb->rf_fs_generation));
- return -EINVAL;
+ rc = ocfs2_error(sb,
+ "Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(rb->rf_fs_generation));
+ goto out;
}
-
- return 0;
+out:
+ return rc;
}
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
@@ -1102,12 +1100,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(sb,
- "refcount tree %llu has non zero tree "
- "depth in leaf btree tree block %llu\n",
- (unsigned long long)ocfs2_metadata_cache_owner(ci),
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -2359,10 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2545,10 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
goto out;
}
@@ -2672,11 +2664,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "leaf block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in leaf block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3106,11 +3097,9 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
- ocfs2_error(sb,
- "Inode %llu has an extent at cpos %u which can no "
- "longer be found.\n",
- (unsigned long long)ino, cpos);
- ret = -EROFS;
+ ret = ocfs2_error(sb,
+ "Inode %llu has an extent at cpos %u which can no longer be found\n",
+ (unsigned long long)ino, cpos);
goto out;
}
@@ -3376,10 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
- ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
- "tree, but the feature bit is not set in the "
- "super block.", inode->i_ino);
- return -EROFS;
+ return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
+ inode->i_ino);
}
ocfs2_init_dealloc_ctxt(&context->dealloc);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4479029630bb..d83d2602cf2b 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
brelse(ac->ac_bh);
ac->ac_bh = NULL;
ac->ac_resv = NULL;
- if (ac->ac_find_loc_priv) {
- kfree(ac->ac_find_loc_priv);
- ac->ac_find_loc_priv = NULL;
- }
+ kfree(ac->ac_find_loc_priv);
+ ac->ac_find_loc_priv = NULL;
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -167,12 +165,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
}
#define do_error(fmt, ...) \
- do{ \
- if (resize) \
- mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
- else \
- ocfs2_error(sb, fmt, ##__VA_ARGS__); \
- } while (0)
+do { \
+ if (resize) \
+ mlog(ML_ERROR, fmt, ##__VA_ARGS__); \
+ else \
+ return ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+} while (0)
static int ocfs2_validate_gd_self(struct super_block *sb,
struct buffer_head *bh,
@@ -181,44 +179,35 @@ static int ocfs2_validate_gd_self(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- do_error("Group descriptor #%llu has bad signature %.*s",
+ do_error("Group descriptor #%llu has bad signature %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
gd->bg_signature);
- return -EINVAL;
}
if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
- do_error("Group descriptor #%llu has an invalid bg_blkno "
- "of %llu",
+ do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_blkno));
- return -EINVAL;
}
if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
- do_error("Group descriptor #%llu has an invalid "
- "fs_generation of #%u",
+ do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(gd->bg_generation));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- do_error("Group descriptor #%llu has bit count %u but "
- "claims that %u are free",
+ do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
le16_to_cpu(gd->bg_free_bits_count));
- return -EINVAL;
}
if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- do_error("Group descriptor #%llu has bit count %u but "
- "max bitmap bits of %u",
+ do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits),
8 * le16_to_cpu(gd->bg_size));
- return -EINVAL;
}
return 0;
@@ -233,20 +222,17 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (di->i_blkno != gd->bg_parent_dinode) {
- do_error("Group descriptor #%llu has bad parent "
- "pointer (%llu, expected %llu)",
+ do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
(unsigned long long)le64_to_cpu(di->i_blkno));
- return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- do_error("Group descriptor #%llu has bit count of %u",
+ do_error("Group descriptor #%llu has bit count of %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_bits));
- return -EINVAL;
}
/* In resize, we may meet the case bg_chain == cl_next_free_rec. */
@@ -254,10 +240,9 @@ static int ocfs2_validate_gd_parent(struct super_block *sb,
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) ||
((le16_to_cpu(gd->bg_chain) ==
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) {
- do_error("Group descriptor #%llu has bad chain %u",
+ do_error("Group descriptor #%llu has bad chain %u\n",
(unsigned long long)bh->b_blocknr,
le16_to_cpu(gd->bg_chain));
- return -EINVAL;
}
return 0;
@@ -384,11 +369,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct super_block * sb = alloc_inode->i_sb;
if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
- ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
- "b_blocknr (%llu)",
- (unsigned long long)group_blkno,
- (unsigned long long) bg_bh->b_blocknr);
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "group block (%llu) != b_blocknr (%llu)\n",
+ (unsigned long long)group_blkno,
+ (unsigned long long) bg_bh->b_blocknr);
goto bail;
}
@@ -834,9 +818,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
- ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
- (unsigned long long)le64_to_cpu(fe->i_blkno));
- status = -EIO;
+ status = ocfs2_error(alloc_inode->i_sb,
+ "Invalid chain allocator %llu\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno));
goto bail;
}
@@ -1370,12 +1354,11 @@ int ocfs2_block_group_set_bits(handle_t *handle,
le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
while(num_bits--)
ocfs2_set_bit(bit_off++, bitmap);
@@ -1905,13 +1888,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
- ocfs2_error(ac->ac_inode->i_sb,
- "Chain allocator dinode %llu has %u used "
- "bits but only %u total.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->id1.bitmap1.i_used),
- le32_to_cpu(fe->id1.bitmap1.i_total));
- status = -EIO;
+ status = ocfs2_error(ac->ac_inode->i_sb,
+ "Chain allocator dinode %llu has %u used bits but only %u total\n",
+ (unsigned long long)le64_to_cpu(fe->i_blkno),
+ le32_to_cpu(fe->id1.bitmap1.i_used),
+ le32_to_cpu(fe->id1.bitmap1.i_total));
goto bail;
}
@@ -2429,12 +2410,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
}
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
+ return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
+ (unsigned long long)le64_to_cpu(bg->bg_blkno),
+ le16_to_cpu(bg->bg_bits),
+ le16_to_cpu(bg->bg_free_bits_count),
+ num_bits);
}
if (undo_fn)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 403c5660b306..2de4c8a9340c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -192,6 +192,7 @@ enum {
Opt_resv_level,
Opt_dir_resv_level,
Opt_journal_async_commit,
+ Opt_err_cont,
Opt_err,
};
@@ -224,6 +225,7 @@ static const match_table_t tokens = {
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
+ {Opt_err_cont, "errors=continue"},
{Opt_err, NULL}
};
@@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb,
mopt->mount_opt |= OCFS2_MOUNT_NOINTR;
break;
case Opt_err_panic:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
break;
case Opt_err_ro:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT;
mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS;
+ break;
+ case Opt_err_cont:
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS;
+ mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC;
+ mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT;
break;
case Opt_data_ordered:
mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK;
@@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_ERRORS_PANIC)
seq_printf(s, ",errors=panic");
+ else if (opts & OCFS2_MOUNT_ERRORS_CONT)
+ seq_printf(s, ",errors=continue");
else
seq_printf(s, ",errors=remount-ro");
@@ -1550,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",localflocks,");
if (osb->osb_cluster_stack[0])
- seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
- osb->osb_cluster_stack);
+ seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
+ OCFS2_STACK_LABEL_LEN);
if (opts & OCFS2_MOUNT_USRQUOTA)
seq_printf(s, ",usrquota");
if (opts & OCFS2_MOUNT_GRPQUOTA)
@@ -1746,8 +1759,6 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
- init_waitqueue_head(&oi->append_dio_wq);
-
ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
&ocfs2_inode_caching_ops);
@@ -2541,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
memset(osb, 0, sizeof(struct ocfs2_super));
}
-/* Put OCFS2 into a readonly state, or (if the user specifies it),
- * panic(). We do not support continue-on-error operation. */
-static void ocfs2_handle_error(struct super_block *sb)
+/* Depending on the mount option passed, perform one of the following:
+ * Put OCFS2 into a readonly state (default)
+ * Return EIO so that only the process errs
+ * Fix the error as if fsck.ocfs2 -y
+ * panic
+ */
+static int ocfs2_handle_error(struct super_block *sb)
{
struct ocfs2_super *osb = OCFS2_SB(sb);
-
- if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC)
- panic("OCFS2: (device %s): panic forced after error\n",
- sb->s_id);
+ int rv = 0;
ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS);
+ pr_crit("On-disk corruption discovered. "
+ "Please run fsck.ocfs2 once the filesystem is unmounted.\n");
- if (sb->s_flags & MS_RDONLY &&
- (ocfs2_is_soft_readonly(osb) ||
- ocfs2_is_hard_readonly(osb)))
- return;
-
- printk(KERN_CRIT "File system is now read-only due to the potential "
- "of on-disk corruption. Please run fsck.ocfs2 once the file "
- "system is unmounted.\n");
- sb->s_flags |= MS_RDONLY;
- ocfs2_set_ro_flag(osb, 0);
+ if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) {
+ panic("OCFS2: (device %s): panic forced after error\n",
+ sb->s_id);
+ } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) {
+ pr_crit("OCFS2: Returning error to the calling process.\n");
+ rv = -EIO;
+ } else { /* default option */
+ rv = -EROFS;
+ if (sb->s_flags & MS_RDONLY &&
+ (ocfs2_is_soft_readonly(osb) ||
+ ocfs2_is_hard_readonly(osb)))
+ return rv;
+
+ pr_crit("OCFS2: File system is now read-only.\n");
+ sb->s_flags |= MS_RDONLY;
+ ocfs2_set_ro_flag(osb, 0);
+ }
+
+ return rv;
}
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...)
{
struct va_format vaf;
@@ -2577,12 +2600,12 @@ void __ocfs2_error(struct super_block *sb, const char *function,
/* Not using mlog here because we want to show the actual
* function the error came from. */
- printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
- ocfs2_handle_error(sb);
+ return ocfs2_handle_error(sb);
}
/* Handle critical errors. This is intentionally more drastic than
@@ -2599,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n",
+ printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV",
sb->s_id, function, &vaf);
va_end(args);
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 74ff74cf78fe..b477d0b1c7b6 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -32,16 +32,18 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb,
int node_num);
__printf(3, 4)
-void __ocfs2_error(struct super_block *sb, const char *function,
+int __ocfs2_error(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_error(sb, fmt, ...) \
+ __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
__printf(3, 4)
void __ocfs2_abort(struct super_block *sb, const char *function,
const char *fmt, ...);
-#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+#define ocfs2_abort(sb, fmt, ...) \
+ __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__)
/*
* Void signal blockers, because in-kernel sigprocmask() only fails
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 889f3796a0d7..ebfdea78659b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -499,30 +499,24 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
*/
if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has bad "
- "signature %.*s",
- (unsigned long long)bh->b_blocknr, 7,
- xb->xb_signature);
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has bad signature %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
}
if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an "
- "invalid xb_blkno of %llu",
- (unsigned long long)bh->b_blocknr,
- (unsigned long long)le64_to_cpu(xb->xb_blkno));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_blkno of %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
}
if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
- ocfs2_error(sb,
- "Extended attribute block #%llu has an invalid "
- "xb_fs_generation of #%u",
- (unsigned long long)bh->b_blocknr,
- le32_to_cpu(xb->xb_fs_generation));
- return -EINVAL;
+ return ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
}
return 0;
@@ -3694,11 +3688,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
el = &eb->h_list;
if (el->l_tree_depth) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has non zero tree depth in "
- "xattr tree block %llu\n", inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in xattr tree block %llu\n",
+ inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
@@ -3713,11 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
}
if (!e_blkno) {
- ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0) in xattr", inode->i_ino,
- le32_to_cpu(rec->e_cpos),
- ocfs2_rec_clusters(el, rec));
- ret = -EROFS;
+ ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n",
+ inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
goto out;
}
@@ -7334,6 +7326,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
if (list && total_len <= list_size) {
memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
memcpy(list + prefix_len, name, name_len);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7466ff339c66..79073d68b475 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ufs = sb->s_fs_info;
- seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
+ seq_show_option(m, "lowerdir", ufs->config.lowerdir);
if (ufs->config.upperdir) {
- seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
- seq_printf(m, ",workdir=%s", ufs->config.workdir);
+ seq_show_option(m, "upperdir", ufs->config.upperdir);
+ seq_show_option(m, "workdir", ufs->config.workdir);
}
return 0;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf3104f..f60f0121e331 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
static inline void task_cap(struct seq_file *m, struct task_struct *p)
{
const struct cred *cred;
- kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset;
+ kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
+ cap_bset, cap_ambient;
rcu_read_lock();
cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
cap_permitted = cred->cap_permitted;
cap_effective = cred->cap_effective;
cap_bset = cred->cap_bset;
+ cap_ambient = cred->cap_ambient;
rcu_read_unlock();
render_cap_t(m, "CapInh:\t", &cap_inheritable);
render_cap_t(m, "CapPrm:\t", &cap_permitted);
render_cap_t(m, "CapEff:\t", &cap_effective);
render_cap_t(m, "CapBnd:\t", &cap_bset);
+ render_cap_t(m, "CapAmb:\t", &cap_ambient);
}
static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa50d1ac28fc..b25eee4cead5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1230,10 +1230,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file_inode(file);
- char *page, *tmp;
- ssize_t length;
uid_t loginuid;
kuid_t kloginuid;
+ int rv;
rcu_read_lock();
if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1241,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
}
rcu_read_unlock();
- if (count >= PAGE_SIZE)
- count = PAGE_SIZE - 1;
-
if (*ppos != 0) {
/* No partial writes. */
return -EINVAL;
}
- page = (char*)__get_free_page(GFP_TEMPORARY);
- if (!page)
- return -ENOMEM;
- length = -EFAULT;
- if (copy_from_user(page, buf, count))
- goto out_free_page;
-
- page[count] = '\0';
- loginuid = simple_strtoul(page, &tmp, 10);
- if (tmp == page) {
- length = -EINVAL;
- goto out_free_page;
- }
+ rv = kstrtou32_from_user(buf, count, 10, &loginuid);
+ if (rv < 0)
+ return rv;
/* is userspace tring to explicitly UNSET the loginuid? */
if (loginuid == AUDIT_UID_UNSET) {
kloginuid = INVALID_UID;
} else {
kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
- if (!uid_valid(kloginuid)) {
- length = -EINVAL;
- goto out_free_page;
- }
+ if (!uid_valid(kloginuid))
+ return -EINVAL;
}
- length = audit_set_loginuid(kloginuid);
- if (likely(length == 0))
- length = count;
-
-out_free_page:
- free_page((unsigned long) page);
- return length;
+ rv = audit_set_loginuid(kloginuid);
+ if (rv < 0)
+ return rv;
+ return count;
}
static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1316,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
const char __user * buf, size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF], *end;
+ char buffer[PROC_NUMBUF];
int make_it_fail;
+ int rv;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -1345,9 +1327,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
- if (*end)
- return -EINVAL;
+ rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
+ if (rv < 0)
+ return rv;
if (make_it_fail < 0 || make_it_fail > 1)
return -EINVAL;
@@ -1836,8 +1818,6 @@ end_instantiate:
return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
-
/*
* dname_to_vma_addr - maps a dentry name into two unsigned longs
* which represent vma start and end addresses.
@@ -1864,11 +1844,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
- if (!capable(CAP_SYS_ADMIN)) {
- status = -EPERM;
- goto out_notask;
- }
-
inode = d_inode(dentry);
task = get_proc_task(inode);
if (!task)
@@ -1957,6 +1932,29 @@ struct map_files_info {
unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
};
+/*
+ * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
+ * symlinks may be used to bypass permissions on ancestor directories in the
+ * path to the file in question.
+ */
+static const char *
+proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return proc_pid_follow_link(dentry, NULL);
+}
+
+/*
+ * Identical to proc_pid_link_inode_operations except for follow_link()
+ */
+static const struct inode_operations proc_map_files_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_map_files_follow_link,
+ .setattr = proc_setattr,
+};
+
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
struct task_struct *task, const void *ptr)
@@ -1972,7 +1970,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
ei = PROC_I(inode);
ei->op.proc_get_link = proc_map_files_get_link;
- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
inode->i_mode = S_IFLNK;
@@ -1996,10 +1994,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
int result;
struct mm_struct *mm;
- result = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
result = -ENOENT;
task = get_proc_task(dir);
if (!task)
@@ -2053,10 +2047,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct map_files_info *p;
int ret;
- ret = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out;
-
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2245,7 +2235,6 @@ static const struct file_operations proc_timers_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
static int proc_pident_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2481,32 +2470,20 @@ static ssize_t proc_coredump_filter_write(struct file *file,
{
struct task_struct *task;
struct mm_struct *mm;
- char buffer[PROC_NUMBUF], *end;
unsigned int val;
int ret;
int i;
unsigned long mask;
- ret = -EFAULT;
- memset(buffer, 0, sizeof(buffer));
- if (count > sizeof(buffer) - 1)
- count = sizeof(buffer) - 1;
- if (copy_from_user(buffer, buf, count))
- goto out_no_task;
-
- ret = -EINVAL;
- val = (unsigned int)simple_strtoul(buffer, &end, 0);
- if (*end == '\n')
- end++;
- if (end - buffer == 0)
- goto out_no_task;
+ ret = kstrtouint_from_user(buf, count, 0, &val);
+ if (ret < 0)
+ return ret;
ret = -ESRCH;
task = get_proc_task(file_inode(file));
if (!task)
goto out_no_task;
- ret = end - buffer;
mm = get_task_mm(task);
if (!mm)
goto out_no_mm;
@@ -2522,7 +2499,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
out_no_mm:
put_task_struct(task);
out_no_task:
- return ret;
+ if (ret < 0)
+ return ret;
+ return count;
}
static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2723,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-#ifdef CONFIG_CHECKPOINT_RESTORE
DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
-#endif
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e5dee5c3188e..ff3ffc76a937 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -26,7 +26,7 @@
#include "internal.h"
-static DEFINE_SPINLOCK(proc_subdir_lock);
+static DEFINE_RWLOCK(proc_subdir_lock);
static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
{
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
{
int rv;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
rv = __xlate_proc_name(name, ret, residual);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return rv;
}
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
{
struct inode *inode;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
if (de) {
pde_get(de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
d_add(dentry, inode);
return NULL;
}
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
}
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
if (!dir_emit_dots(file, ctx))
return 0;
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
de = pde_subdir_first(de);
i = ctx->pos - 2;
for (;;) {
if (!de) {
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return 0;
}
if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
do {
struct proc_dir_entry *next;
pde_get(de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
if (!dir_emit(ctx, de->name, de->namelen,
de->low_ino, de->mode >> 12)) {
pde_put(de);
return 0;
}
- spin_lock(&proc_subdir_lock);
+ read_lock(&proc_subdir_lock);
ctx->pos++;
next = pde_subdir_next(de);
pde_put(de);
de = next;
} while (de);
- spin_unlock(&proc_subdir_lock);
+ read_unlock(&proc_subdir_lock);
return 1;
}
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
if (ret)
return ret;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
proc_free_inum(dp->low_ino);
return -EEXIST;
}
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return 0;
}
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
unsigned int len;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
if (__xlate_proc_name(name, &parent, &fn) != 0) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return;
}
len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
de = pde_subdir_find(parent, fn, len);
if (de)
rb_erase(&de->subdir_node, &parent->subdir);
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
if (!de) {
WARN(1, "name '%s'\n", name);
return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
const char *fn = name;
unsigned int len;
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
if (__xlate_proc_name(name, &parent, &fn) != 0) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return -ENOENT;
}
len = strlen(fn);
root = pde_subdir_find(parent, fn, len);
if (!root) {
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
return -ENOENT;
}
rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
de = next;
continue;
}
- spin_unlock(&proc_subdir_lock);
+ write_unlock(&proc_subdir_lock);
proc_entry_rundown(de);
next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
break;
pde_put(de);
- spin_lock(&proc_subdir_lock);
+ write_lock(&proc_subdir_lock);
de = next;
}
pde_put(root);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7eee2d8b97d9..93484034a03d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -9,12 +9,16 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
#include <linux/kernel-page-flags.h>
#include <asm/uaccess.h>
#include "internal.h"
#define KPMSIZE sizeof(u64)
#define KPMMASK (KPMSIZE - 1)
+#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
/* /proc/kpagecount - an array exposing page counts
*
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
+ if (page_is_idle(page))
+ u |= 1 << KPF_IDLE;
+
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
pfn++;
out++;
count -= KPMSIZE;
+
+ cond_resched();
}
*ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
.read = kpageflags_read,
};
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 __user *out = (u64 __user *)buf;
+ struct page *ppage;
+ unsigned long src = *ppos;
+ unsigned long pfn;
+ ssize_t ret = 0;
+ u64 ino;
+
+ pfn = src / KPMSIZE;
+ count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+ if (src & KPMMASK || count & KPMMASK)
+ return -EINVAL;
+
+ while (count > 0) {
+ if (pfn_valid(pfn))
+ ppage = pfn_to_page(pfn);
+ else
+ ppage = NULL;
+
+ if (ppage)
+ ino = page_cgroup_ino(ppage);
+ else
+ ino = 0;
+
+ if (put_user(ino, out)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ pfn++;
+ out++;
+ count -= KPMSIZE;
+
+ cond_resched();
+ }
+
+ *ppos += (char __user *)out - buf;
+ if (!ret)
+ ret = (char __user *)out - buf;
+ return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+ .llseek = mem_lseek,
+ .read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
static int __init proc_page_init(void)
{
proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+ proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
return 0;
}
fs_initcall(proc_page_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..e2d46adb54b4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
#include <asm/elf.h>
#include <asm/uaccess.h>
@@ -446,6 +447,7 @@ struct mem_size_stats {
unsigned long anonymous_thp;
unsigned long swap;
u64 pss;
+ u64 swap_pss;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -458,7 +460,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
- if (young || PageReferenced(page))
+ if (young || page_is_young(page) || PageReferenced(page))
mss->referenced += size;
mapcount = page_mapcount(page);
if (mapcount >= 2) {
@@ -492,9 +494,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (!non_swap_entry(swpent))
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
mss->swap += PAGE_SIZE;
- else if (is_migration_entry(swpent))
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
}
@@ -597,6 +610,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_HUGEPAGE)] = "hg",
[ilog2(VM_NOHUGEPAGE)] = "nh",
[ilog2(VM_MERGEABLE)] = "mg",
+ [ilog2(VM_UFFD_MISSING)]= "um",
+ [ilog2(VM_UFFD_WP)] = "uw",
};
size_t i;
@@ -638,6 +653,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
@@ -652,6 +668,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
mss.swap >> 10,
+ (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
(vma->vm_flags & VM_LOCKED) ?
@@ -710,23 +727,6 @@ const struct file_operations proc_tid_smaps_operations = {
.release = proc_map_release,
};
-/*
- * We do not want to have constant page-shift bits sitting in
- * pagemap entries and are about to reuse them some time soon.
- *
- * Here's the "migration strategy":
- * 1. when the system boots these bits remain what they are,
- * but a warning about future change is printed in log;
- * 2. once anyone clears soft-dirty bits via clear_refs file,
- * these flag is set to denote, that user is aware of the
- * new API and those page-shift bits change their meaning.
- * The respective warning is printed in dmesg;
- * 3. In a couple of releases we will remove all the mentions
- * of page-shift in pagemap entries.
- */
-
-static bool soft_dirty_cleared __read_mostly;
-
enum clear_refs_types {
CLEAR_REFS_ALL = 1,
CLEAR_REFS_ANON,
@@ -808,6 +808,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
/* Clear accessed and referenced bits. */
pmdp_test_and_clear_young(vma, addr, pmd);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
out:
spin_unlock(ptl);
@@ -835,6 +836,7 @@ out:
/* Clear accessed and referenced bits. */
ptep_test_and_clear_young(vma, addr, pte);
+ test_and_clear_page_young(page);
ClearPageReferenced(page);
}
pte_unmap_unlock(pte - 1, ptl);
@@ -887,13 +889,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
- if (type == CLEAR_REFS_SOFT_DIRTY) {
- soft_dirty_cleared = true;
- pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
- " See the linux/Documentation/vm/pagemap.txt for "
- "details.\n");
- }
-
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -961,36 +956,26 @@ typedef struct {
struct pagemapread {
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
pagemap_entry_t *buffer;
- bool v2;
+ bool show_pfn;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
-#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-/* in "new" pagemap pshift bits are occupied with more status bits */
-#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
-
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
+
#define PM_END_OF_BUFFER 1
-static inline pagemap_entry_t make_pme(u64 val)
+static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
- return (pagemap_entry_t) { .pme = val };
+ return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1011,7 +996,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
while (addr < end) {
struct vm_area_struct *vma = find_vma(walk->mm, addr);
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ pagemap_entry_t pme = make_pme(0, 0);
/* End of address space hole, which we mark as non-present. */
unsigned long hole_end;
@@ -1031,7 +1016,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
/* Addresses in the VMA. */
if (vma->vm_flags & VM_SOFTDIRTY)
- pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
if (err)
@@ -1042,67 +1027,42 @@ out:
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- u64 frame, flags;
+ u64 frame = 0, flags = 0;
struct page *page = NULL;
- int flags2 = 0;
if (pte_present(pte)) {
- frame = pte_pfn(pte);
- flags = PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte);
+ flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
- flags = PM_SWAP;
+ flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
- } else {
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
- return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if ((vma->vm_flags & VM_SOFTDIRTY))
- flags2 |= __PM_SOFT_DIRTY;
-
- *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
-}
+ if (page && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
- /*
- * Currently pmd for thp is always present because thp can not be
- * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
- if (pmd_present(pmd))
- *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
+ return make_pme(frame, flags);
}
-#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
-}
-#endif
-static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
@@ -1111,41 +1071,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte, *orig_pte;
int err = 0;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- int pmd_flags2;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
- if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
- pmd_flags2 = __PM_SOFT_DIRTY;
- else
- pmd_flags2 = 0;
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+
+ /*
+ * Currently pmd for thp is always present because thp
+ * can not be swapped-out, migrated, or HWPOISONed
+ * (split in such cases instead.)
+ * This if-check is just to prepare for future implementation.
+ */
+ if (pmd_present(pmd)) {
+ struct page *page = pmd_page(pmd);
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- unsigned long offset;
- pagemap_entry_t pme;
+ pagemap_entry_t pme = make_pme(frame, flags);
- offset = (addr & ~PAGEMAP_WALK_MASK) >>
- PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
spin_unlock(ptl);
return err;
}
- if (pmd_trans_unstable(pmd))
+ if (pmd_trans_unstable(pmdp))
return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* We can assume that @vma always points to a valid one and @end never
* goes beyond vma->vm_end.
*/
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -1158,40 +1135,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pte_t pte, int offset, int flags2)
-{
- if (pte_present(pte))
- *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
- PM_STATUS2(pm->v2, flags2) |
- PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
- PM_STATUS2(pm->v2, flags2));
-}
-
/* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
+ u64 flags = 0, frame = 0;
int err = 0;
- int flags2;
- pagemap_entry_t pme;
+ pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 = __PM_SOFT_DIRTY;
- else
- flags2 = 0;
+ flags |= PM_SOFT_DIRTY;
+
+ pte = huge_ptep_get(ptep);
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+
+ if (!PageAnon(page))
+ flags |= PM_FILE;
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte) +
+ ((addr & ~hmask) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+ pagemap_entry_t pme = make_pme(frame, flags);
+
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
cond_resched();
@@ -1209,7 +1190,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 56 page exclusively mapped
+ * Bits 57-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1227,42 +1210,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file_inode(file));
- struct mm_struct *mm;
+ struct mm_struct *mm = file->private_data;
struct pagemapread pm;
- int ret = -ESRCH;
struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
unsigned long end_vaddr;
- int copied = 0;
+ int ret = 0, copied = 0;
- if (!task)
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
goto out;
ret = -EINVAL;
/* file position must be aligned */
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
- goto out_task;
+ goto out_mm;
ret = 0;
if (!count)
- goto out_task;
+ goto out_mm;
+
+ /* do not disclose physical addresses: attack vector */
+ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
- pm.v2 = soft_dirty_cleared;
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
ret = -ENOMEM;
if (!pm.buffer)
- goto out_task;
-
- mm = mm_access(task, PTRACE_MODE_READ);
- ret = PTR_ERR(mm);
- if (!mm || IS_ERR(mm))
- goto out_free;
+ goto out_mm;
- pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pmd_entry = pagemap_pmd_range;
pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -1273,10 +1251,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
start_vaddr = svpfn << PAGE_SHIFT;
- end_vaddr = TASK_SIZE_OF(task);
+ end_vaddr = mm->task_size;
/* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ if (svpfn > mm->task_size >> PAGE_SHIFT)
start_vaddr = end_vaddr;
/*
@@ -1303,7 +1281,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
len = min(count, PM_ENTRY_BYTES * pm.pos);
if (copy_to_user(buf, pm.buffer, len)) {
ret = -EFAULT;
- goto out_mm;
+ goto out_free;
}
copied += len;
buf += len;
@@ -1313,24 +1291,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!ret || ret == PM_END_OF_BUFFER)
ret = copied;
-out_mm:
- mmput(mm);
out_free:
kfree(pm.buffer);
-out_task:
- put_task_struct(task);
+out_mm:
+ mmput(mm);
out:
return ret;
}
static int pagemap_open(struct inode *inode, struct file *file)
{
- /* do not disclose physical addresses: attack vector */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
- "to stop being page-shift some time soon. See the "
- "linux/Documentation/vm/pagemap.txt for details.\n");
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int pagemap_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
return 0;
}
@@ -1338,6 +1323,7 @@ const struct file_operations proc_pagemap_operations = {
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
+ .release = pagemap_release,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index fed66e2c9fe8..ef0d64b2a6d9 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -928,7 +928,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
int reserved = 0;
#endif
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
@@ -939,7 +939,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
}
__iget(inode);
spin_unlock(&inode->i_lock);
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
@@ -951,15 +951,15 @@ static void add_dquot_ref(struct super_block *sb, int type)
/*
* We hold a reference to 'inode' so it couldn't have been
* removed from s_inodes list while we dropped the
- * inode_sb_list_lock We cannot iput the inode now as we can be
+ * s_inode_list_lock. We cannot iput the inode now as we can be
* holding the last reference and we cannot iput it under
- * inode_sb_list_lock. So we keep the reference and iput it
+ * s_inode_list_lock. So we keep the reference and iput it
* later.
*/
old_inode = inode;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
iput(old_inode);
#ifdef CONFIG_QUOTA_DEBUG
@@ -1028,7 +1028,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
struct inode *inode;
int reserved = 0;
- spin_lock(&inode_sb_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
/*
* We have to scan also I_NEW inodes because they can already
@@ -1044,7 +1044,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
}
spin_unlock(&dq_data_lock);
}
- spin_unlock(&inode_sb_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
if (reserved) {
printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0e4cf728126f..4a62fe8cc3bf 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",acl");
if (REISERFS_SB(s)->s_jdev)
- seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev);
+ seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev);
if (journal->j_max_commit_age != journal->j_default_max_commit_age)
seq_printf(seq, ",commit=%d", journal->j_max_commit_age);
#ifdef CONFIG_QUOTA
if (REISERFS_SB(s)->s_qf_names[USRQUOTA])
- seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]);
+ seq_show_option(seq, "usrjquota",
+ REISERFS_SB(s)->s_qf_names[USRQUOTA]);
else if (opts & (1 << REISERFS_USRQUOTA))
seq_puts(seq, ",usrquota");
if (REISERFS_SB(s)->s_qf_names[GRPQUOTA])
- seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
+ seq_show_option(seq, "grpjquota",
+ REISERFS_SB(s)->s_qf_names[GRPQUOTA]);
else if (opts & (1 << REISERFS_GRPQUOTA))
seq_puts(seq, ",grpquota");
if (REISERFS_SB(s)->s_jquota_fmt) {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index ce9e39fd5daf..225586e141ca 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -12,6 +12,7 @@
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mm.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -371,16 +372,16 @@ EXPORT_SYMBOL(seq_release);
* @esc: set of characters that need escaping
*
* Puts string into buffer, replacing each occurrence of character from
- * @esc with usual octal escape. Returns 0 in case of success, -1 - in
- * case of overflow.
+ * @esc with usual octal escape.
+ * Use seq_has_overflowed() to check for errors.
*/
-int seq_escape(struct seq_file *m, const char *s, const char *esc)
+void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
char *end = m->buf + m->size;
- char *p;
+ char *p;
char c;
- for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
+ for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
if (!strchr(esc, c)) {
*p++ = c;
continue;
@@ -393,14 +394,13 @@ int seq_escape(struct seq_file *m, const char *s, const char *esc)
continue;
}
seq_set_overflow(m);
- return -1;
- }
+ return;
+ }
m->count = p - m->buf;
- return 0;
}
EXPORT_SYMBOL(seq_escape);
-int seq_vprintf(struct seq_file *m, const char *f, va_list args)
+void seq_vprintf(struct seq_file *m, const char *f, va_list args)
{
int len;
@@ -408,24 +408,20 @@ int seq_vprintf(struct seq_file *m, const char *f, va_list args)
len = vsnprintf(m->buf + m->count, m->size - m->count, f, args);
if (m->count + len < m->size) {
m->count += len;
- return 0;
+ return;
}
}
seq_set_overflow(m);
- return -1;
}
EXPORT_SYMBOL(seq_vprintf);
-int seq_printf(struct seq_file *m, const char *f, ...)
+void seq_printf(struct seq_file *m, const char *f, ...)
{
- int ret;
va_list args;
va_start(args, f);
- ret = seq_vprintf(m, f, args);
+ seq_vprintf(m, f, args);
va_end(args);
-
- return ret;
}
EXPORT_SYMBOL(seq_printf);
@@ -663,26 +659,25 @@ int seq_open_private(struct file *filp, const struct seq_operations *ops,
}
EXPORT_SYMBOL(seq_open_private);
-int seq_putc(struct seq_file *m, char c)
+void seq_putc(struct seq_file *m, char c)
{
- if (m->count < m->size) {
- m->buf[m->count++] = c;
- return 0;
- }
- return -1;
+ if (m->count >= m->size)
+ return;
+
+ m->buf[m->count++] = c;
}
EXPORT_SYMBOL(seq_putc);
-int seq_puts(struct seq_file *m, const char *s)
+void seq_puts(struct seq_file *m, const char *s)
{
int len = strlen(s);
- if (m->count + len < m->size) {
- memcpy(m->buf + m->count, s, len);
- m->count += len;
- return 0;
+
+ if (m->count + len >= m->size) {
+ seq_set_overflow(m);
+ return;
}
- seq_set_overflow(m);
- return -1;
+ memcpy(m->buf + m->count, s, len);
+ m->count += len;
}
EXPORT_SYMBOL(seq_puts);
@@ -693,8 +688,8 @@ EXPORT_SYMBOL(seq_puts);
* This routine is very quick when you show lots of numbers.
* In usual cases, it will be better to use seq_printf(). It's easier to read.
*/
-int seq_put_decimal_ull(struct seq_file *m, char delimiter,
- unsigned long long num)
+void seq_put_decimal_ull(struct seq_file *m, char delimiter,
+ unsigned long long num)
{
int len;
@@ -706,35 +701,33 @@ int seq_put_decimal_ull(struct seq_file *m, char delimiter,
if (num < 10) {
m->buf[m->count++] = num + '0';
- return 0;
+ return;
}
len = num_to_str(m->buf + m->count, m->size - m->count, num);
if (!len)
goto overflow;
m->count += len;
- return 0;
+ return;
+
overflow:
seq_set_overflow(m);
- return -1;
}
EXPORT_SYMBOL(seq_put_decimal_ull);
-int seq_put_decimal_ll(struct seq_file *m, char delimiter,
- long long num)
+void seq_put_decimal_ll(struct seq_file *m, char delimiter, long long num)
{
if (num < 0) {
if (m->count + 3 >= m->size) {
seq_set_overflow(m);
- return -1;
+ return;
}
if (delimiter)
m->buf[m->count++] = delimiter;
num = -num;
delimiter = '-';
}
- return seq_put_decimal_ull(m, delimiter, num);
-
+ seq_put_decimal_ull(m, delimiter, num);
}
EXPORT_SYMBOL(seq_put_decimal_ll);
@@ -773,6 +766,47 @@ void seq_pad(struct seq_file *m, char c)
}
EXPORT_SYMBOL(seq_pad);
+/* A complete analogue of print_hex_dump() */
+void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
+ int rowsize, int groupsize, const void *buf, size_t len,
+ bool ascii)
+{
+ const u8 *ptr = buf;
+ int i, linelen, remaining = len;
+ int ret;
+
+ if (rowsize != 16 && rowsize != 32)
+ rowsize = 16;
+
+ for (i = 0; i < len && !seq_has_overflowed(m); i += rowsize) {
+ linelen = min(remaining, rowsize);
+ remaining -= rowsize;
+
+ switch (prefix_type) {
+ case DUMP_PREFIX_ADDRESS:
+ seq_printf(m, "%s%p: ", prefix_str, ptr + i);
+ break;
+ case DUMP_PREFIX_OFFSET:
+ seq_printf(m, "%s%.8x: ", prefix_str, i);
+ break;
+ default:
+ seq_printf(m, "%s", prefix_str);
+ break;
+ }
+
+ ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
+ m->buf + m->count, m->size - m->count,
+ ascii);
+ if (ret >= m->size - m->count) {
+ seq_set_overflow(m);
+ } else {
+ m->count += ret;
+ seq_putc(m, '\n');
+ }
+ }
+}
+EXPORT_SYMBOL(seq_hex_dump);
+
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
struct list_head *lh;
diff --git a/fs/super.c b/fs/super.c
index b61372354f2b..954aeb80e202 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink,
return total_objects;
}
+static void destroy_super_work(struct work_struct *work)
+{
+ struct super_block *s = container_of(work, struct super_block,
+ destroy_work);
+ int i;
+
+ for (i = 0; i < SB_FREEZE_LEVELS; i++)
+ percpu_free_rwsem(&s->s_writers.rw_sem[i]);
+ kfree(s);
+}
+
+static void destroy_super_rcu(struct rcu_head *head)
+{
+ struct super_block *s = container_of(head, struct super_block, rcu);
+ INIT_WORK(&s->destroy_work, destroy_super_work);
+ schedule_work(&s->destroy_work);
+}
+
/**
* destroy_super - frees a superblock
* @s: superblock to free
@@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink,
*/
static void destroy_super(struct super_block *s)
{
- int i;
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
- for (i = 0; i < SB_FREEZE_LEVELS; i++)
- percpu_counter_destroy(&s->s_writers.counter[i]);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
kfree(s->s_subtype);
kfree(s->s_options);
- kfree_rcu(s, rcu);
+ call_rcu(&s->rcu, destroy_super_rcu);
}
/**
@@ -178,19 +193,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
goto fail;
for (i = 0; i < SB_FREEZE_LEVELS; i++) {
- if (percpu_counter_init(&s->s_writers.counter[i], 0,
- GFP_KERNEL) < 0)
+ if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
+ sb_writers_name[i],
+ &type->s_writers_key[i]))
goto fail;
- lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
- &type->s_writers_key[i], 0);
}
- init_waitqueue_head(&s->s_writers.wait);
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
+ mutex_init(&s->s_sync_lock);
INIT_LIST_HEAD(&s->s_inodes);
+ spin_lock_init(&s->s_inode_list_lock);
if (list_lru_init_memcg(&s->s_dentry_lru))
goto fail;
@@ -399,7 +414,7 @@ void generic_shutdown_super(struct super_block *sb)
sync_filesystem(sb);
sb->s_flags &= ~MS_ACTIVE;
- fsnotify_unmount_inodes(&sb->s_inodes);
+ fsnotify_unmount_inodes(sb);
evict_inodes(sb);
@@ -1146,72 +1161,46 @@ out:
*/
void __sb_end_write(struct super_block *sb, int level)
{
- percpu_counter_dec(&sb->s_writers.counter[level-1]);
- /*
- * Make sure s_writers are updated before we wake up waiters in
- * freeze_super().
- */
- smp_mb();
- if (waitqueue_active(&sb->s_writers.wait))
- wake_up(&sb->s_writers.wait);
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+ percpu_up_read(sb->s_writers.rw_sem + level-1);
}
EXPORT_SYMBOL(__sb_end_write);
-#ifdef CONFIG_LOCKDEP
-/*
- * We want lockdep to tell us about possible deadlocks with freezing but
- * it's it bit tricky to properly instrument it. Getting a freeze protection
- * works as getting a read lock but there are subtle problems. XFS for example
- * gets freeze protection on internal level twice in some cases, which is OK
- * only because we already hold a freeze protection also on higher level. Due
- * to these cases we have to tell lockdep we are doing trylock when we
- * already hold a freeze protection for a higher freeze level.
- */
-static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
- unsigned long ip)
-{
- int i;
-
- if (!trylock) {
- for (i = 0; i < level - 1; i++)
- if (lock_is_held(&sb->s_writers.lock_map[i])) {
- trylock = true;
- break;
- }
- }
- rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
-}
-#endif
-
/*
* This is an internal function, please use sb_start_{write,pagefault,intwrite}
* instead.
*/
int __sb_start_write(struct super_block *sb, int level, bool wait)
{
-retry:
- if (unlikely(sb->s_writers.frozen >= level)) {
- if (!wait)
- return 0;
- wait_event(sb->s_writers.wait_unfrozen,
- sb->s_writers.frozen < level);
- }
+ bool force_trylock = false;
+ int ret = 1;
#ifdef CONFIG_LOCKDEP
- acquire_freeze_lock(sb, level, !wait, _RET_IP_);
-#endif
- percpu_counter_inc(&sb->s_writers.counter[level-1]);
/*
- * Make sure counter is updated before we check for frozen.
- * freeze_super() first sets frozen and then checks the counter.
+ * We want lockdep to tell us about possible deadlocks with freezing
+ * but it's it bit tricky to properly instrument it. Getting a freeze
+ * protection works as getting a read lock but there are subtle
+ * problems. XFS for example gets freeze protection on internal level
+ * twice in some cases, which is OK only because we already hold a
+ * freeze protection also on higher level. Due to these cases we have
+ * to use wait == F (trylock mode) which must not fail.
*/
- smp_mb();
- if (unlikely(sb->s_writers.frozen >= level)) {
- __sb_end_write(sb, level);
- goto retry;
+ if (wait) {
+ int i;
+
+ for (i = 0; i < level - 1; i++)
+ if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) {
+ force_trylock = true;
+ break;
+ }
}
- return 1;
+#endif
+ if (wait && !force_trylock)
+ percpu_down_read(sb->s_writers.rw_sem + level-1);
+ else
+ ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
+
+ WARN_ON(force_trylock & !ret);
+ return ret;
}
EXPORT_SYMBOL(__sb_start_write);
@@ -1221,37 +1210,33 @@ EXPORT_SYMBOL(__sb_start_write);
* @level: type of writers we wait for (normal vs page fault)
*
* This function waits until there are no writers of given type to given file
- * system. Caller of this function should make sure there can be no new writers
- * of type @level before calling this function. Otherwise this function can
- * livelock.
+ * system.
*/
static void sb_wait_write(struct super_block *sb, int level)
{
- s64 writers;
-
+ percpu_down_write(sb->s_writers.rw_sem + level-1);
/*
- * We just cycle-through lockdep here so that it does not complain
- * about returning with lock to userspace
+ * We are going to return to userspace and forget about this lock, the
+ * ownership goes to the caller of thaw_super() which does unlock.
+ *
+ * FIXME: we should do this before return from freeze_super() after we
+ * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super()
+ * should re-acquire these locks before s_op->unfreeze_fs(sb). However
+ * this leads to lockdep false-positives, so currently we do the early
+ * release right after acquire.
*/
- rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
- rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
-
- do {
- DEFINE_WAIT(wait);
+ percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_);
+}
- /*
- * We use a barrier in prepare_to_wait() to separate setting
- * of frozen and checking of the counter
- */
- prepare_to_wait(&sb->s_writers.wait, &wait,
- TASK_UNINTERRUPTIBLE);
+static void sb_freeze_unlock(struct super_block *sb)
+{
+ int level;
- writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
- if (writers)
- schedule();
+ for (level = 0; level < SB_FREEZE_LEVELS; ++level)
+ percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
- finish_wait(&sb->s_writers.wait, &wait);
- } while (writers);
+ for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+ percpu_up_write(sb->s_writers.rw_sem + level);
}
/**
@@ -1310,20 +1295,14 @@ int freeze_super(struct super_block *sb)
return 0;
}
- /* From now on, no new normal writers can start */
sb->s_writers.frozen = SB_FREEZE_WRITE;
- smp_wmb();
-
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
up_write(&sb->s_umount);
-
sb_wait_write(sb, SB_FREEZE_WRITE);
+ down_write(&sb->s_umount);
/* Now we go and block page faults... */
- down_write(&sb->s_umount);
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
- smp_wmb();
-
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
/* All writers are done so after syncing there won't be dirty data */
@@ -1331,7 +1310,6 @@ int freeze_super(struct super_block *sb)
/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
- smp_wmb();
sb_wait_write(sb, SB_FREEZE_FS);
if (sb->s_op->freeze_fs) {
@@ -1340,7 +1318,7 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
+ sb_freeze_unlock(sb);
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
@@ -1372,8 +1350,10 @@ int thaw_super(struct super_block *sb)
return -EINVAL;
}
- if (sb->s_flags & MS_RDONLY)
+ if (sb->s_flags & MS_RDONLY) {
+ sb->s_writers.frozen = SB_UNFROZEN;
goto out;
+ }
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
@@ -1385,12 +1365,11 @@ int thaw_super(struct super_block *sb)
}
}
-out:
sb->s_writers.frozen = SB_UNFROZEN;
- smp_wmb();
+ sb_freeze_unlock(sb);
+out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
-
return 0;
}
EXPORT_SYMBOL(thaw_super);
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index 4d0e02b022b3..392db25c0b56 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -5,5 +5,5 @@
obj-$(CONFIG_UFS_FS) += ufs.o
ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
- namei.o super.o symlink.o truncate.o util.o
+ namei.o super.o symlink.o util.o
ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a7106eda5024..dc5fae601c24 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -417,12 +417,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
if (oldcount == 0) {
result = ufs_alloc_fragments (inode, cgno, goal, count, err);
if (result) {
+ ufs_clear_frags(inode, result + oldcount,
+ newcount - oldcount, locked_page != NULL);
+ write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
*err = 0;
UFS_I(inode)->i_lastfrag =
max(UFS_I(inode)->i_lastfrag, fragment + count);
- ufs_clear_frags(inode, result + oldcount,
- newcount - oldcount, locked_page != NULL);
}
mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
@@ -473,7 +475,9 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
ufs_change_blocknr(inode, fragment - oldcount, oldcount,
uspi->s_sbbase + tmp,
uspi->s_sbbase + result, locked_page);
+ write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
*err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index f913a6924b23..a064cf44b143 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -41,9 +41,7 @@
#include "swab.h"
#include "util.h"
-static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
-
-static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
+static int ufs_block_to_path(struct inode *inode, sector_t i_block, unsigned offsets[4])
{
struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi;
int ptrs = uspi->s_apb;
@@ -75,227 +73,232 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
return n;
}
+typedef struct {
+ void *p;
+ union {
+ __fs32 key32;
+ __fs64 key64;
+ };
+ struct buffer_head *bh;
+} Indirect;
+
+static inline int grow_chain32(struct ufs_inode_info *ufsi,
+ struct buffer_head *bh, __fs32 *v,
+ Indirect *from, Indirect *to)
+{
+ Indirect *p;
+ unsigned seq;
+ to->bh = bh;
+ do {
+ seq = read_seqbegin(&ufsi->meta_lock);
+ to->key32 = *(__fs32 *)(to->p = v);
+ for (p = from; p <= to && p->key32 == *(__fs32 *)p->p; p++)
+ ;
+ } while (read_seqretry(&ufsi->meta_lock, seq));
+ return (p > to);
+}
+
+static inline int grow_chain64(struct ufs_inode_info *ufsi,
+ struct buffer_head *bh, __fs64 *v,
+ Indirect *from, Indirect *to)
+{
+ Indirect *p;
+ unsigned seq;
+ to->bh = bh;
+ do {
+ seq = read_seqbegin(&ufsi->meta_lock);
+ to->key64 = *(__fs64 *)(to->p = v);
+ for (p = from; p <= to && p->key64 == *(__fs64 *)p->p; p++)
+ ;
+ } while (read_seqretry(&ufsi->meta_lock, seq));
+ return (p > to);
+}
+
/*
* Returns the location of the fragment from
* the beginning of the filesystem.
*/
-static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
+static u64 ufs_frag_map(struct inode *inode, unsigned offsets[4], int depth)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift;
int shift = uspi->s_apbshift-uspi->s_fpbshift;
- sector_t offsets[4], *p;
- int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets);
- u64 ret = 0L;
- __fs32 block;
- __fs64 u2_block = 0L;
+ Indirect chain[4], *q = chain;
+ unsigned *p;
unsigned flags = UFS_SB(sb)->s_flags;
- u64 temp = 0L;
+ u64 res = 0;
- UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth);
UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n",
uspi->s_fpbshift, uspi->s_apbmask,
(unsigned long long)mask);
if (depth == 0)
- return 0;
+ goto no_block;
+again:
p = offsets;
- if (needs_lock)
- lock_ufs(sb);
if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
goto ufs2;
- block = ufsi->i_u1.i_data[*p++];
- if (!block)
- goto out;
+ if (!grow_chain32(ufsi, NULL, &ufsi->i_u1.i_data[*p++], chain, q))
+ goto changed;
+ if (!q->key32)
+ goto no_block;
while (--depth) {
+ __fs32 *ptr;
struct buffer_head *bh;
- sector_t n = *p++;
+ unsigned n = *p++;
- bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift));
+ bh = sb_bread(sb, uspi->s_sbbase +
+ fs32_to_cpu(sb, q->key32) + (n>>shift));
if (!bh)
- goto out;
- block = ((__fs32 *) bh->b_data)[n & mask];
- brelse (bh);
- if (!block)
- goto out;
- }
- ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask));
- goto out;
-ufs2:
- u2_block = ufsi->i_u1.u2_i_data[*p++];
- if (!u2_block)
- goto out;
+ goto no_block;
+ ptr = (__fs32 *)bh->b_data + (n & mask);
+ if (!grow_chain32(ufsi, bh, ptr, chain, ++q))
+ goto changed;
+ if (!q->key32)
+ goto no_block;
+ }
+ res = fs32_to_cpu(sb, q->key32);
+ goto found;
+ufs2:
+ if (!grow_chain64(ufsi, NULL, &ufsi->i_u1.u2_i_data[*p++], chain, q))
+ goto changed;
+ if (!q->key64)
+ goto no_block;
while (--depth) {
+ __fs64 *ptr;
struct buffer_head *bh;
- sector_t n = *p++;
-
+ unsigned n = *p++;
- temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block);
- bh = sb_bread(sb, temp +(u64) (n>>shift));
+ bh = sb_bread(sb, uspi->s_sbbase +
+ fs64_to_cpu(sb, q->key64) + (n>>shift));
if (!bh)
- goto out;
- u2_block = ((__fs64 *)bh->b_data)[n & mask];
- brelse(bh);
- if (!u2_block)
- goto out;
+ goto no_block;
+ ptr = (__fs64 *)bh->b_data + (n & mask);
+ if (!grow_chain64(ufsi, bh, ptr, chain, ++q))
+ goto changed;
+ if (!q->key64)
+ goto no_block;
}
- temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block);
- ret = temp + (u64) (frag & uspi->s_fpbmask);
+ res = fs64_to_cpu(sb, q->key64);
+found:
+ res += uspi->s_sbbase;
+no_block:
+ while (q > chain) {
+ brelse(q->bh);
+ q--;
+ }
+ return res;
-out:
- if (needs_lock)
- unlock_ufs(sb);
- return ret;
+changed:
+ while (q > chain) {
+ brelse(q->bh);
+ q--;
+ }
+ goto again;
+}
+
+/*
+ * Unpacking tails: we have a file with partial final block and
+ * we had been asked to extend it. If the fragment being written
+ * is within the same block, we need to extend the tail just to cover
+ * that fragment. Otherwise the tail is extended to full block.
+ *
+ * Note that we might need to create a _new_ tail, but that will
+ * be handled elsewhere; this is strictly for resizing old
+ * ones.
+ */
+static bool
+ufs_extend_tail(struct inode *inode, u64 writes_to,
+ int *err, struct page *locked_page)
+{
+ struct ufs_inode_info *ufsi = UFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned lastfrag = ufsi->i_lastfrag; /* it's a short file, so unsigned is enough */
+ unsigned block = ufs_fragstoblks(lastfrag);
+ unsigned new_size;
+ void *p;
+ u64 tmp;
+
+ if (writes_to < (lastfrag | uspi->s_fpbmask))
+ new_size = (writes_to & uspi->s_fpbmask) + 1;
+ else
+ new_size = uspi->s_fpb;
+
+ p = ufs_get_direct_data_ptr(uspi, ufsi, block);
+ tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p),
+ new_size, err, locked_page);
+ return tmp != 0;
}
/**
* ufs_inode_getfrag() - allocate new fragment(s)
* @inode: pointer to inode
- * @fragment: number of `fragment' which hold pointer
- * to new allocated fragment(s)
+ * @index: number of block pointer within the inode's array.
* @new_fragment: number of new allocated fragment(s)
- * @required: how many fragment(s) we require
* @err: we set it if something wrong
- * @phys: pointer to where we save physical number of new allocated fragments,
- * NULL if we allocate not data(indirect blocks for example).
* @new: we set it if we allocate new block
* @locked_page: for ufs_new_fragments()
*/
-static struct buffer_head *
-ufs_inode_getfrag(struct inode *inode, u64 fragment,
- sector_t new_fragment, unsigned int required, int *err,
- long *phys, int *new, struct page *locked_page)
+static u64
+ufs_inode_getfrag(struct inode *inode, unsigned index,
+ sector_t new_fragment, int *err,
+ int *new, struct page *locked_page)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct buffer_head * result;
- unsigned blockoff, lastblockoff;
- u64 tmp, goal, lastfrag, block, lastblock;
- void *p, *p2;
-
- UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, "
- "metadata %d\n", inode->i_ino, (unsigned long long)fragment,
- (unsigned long long)new_fragment, required, !phys);
+ u64 tmp, goal, lastfrag;
+ unsigned nfrags = uspi->s_fpb;
+ void *p;
/* TODO : to be done for write support
if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
goto ufs2;
*/
- block = ufs_fragstoblks (fragment);
- blockoff = ufs_fragnum (fragment);
- p = ufs_get_direct_data_ptr(uspi, ufsi, block);
-
- goal = 0;
-
-repeat:
+ p = ufs_get_direct_data_ptr(uspi, ufsi, index);
tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (tmp)
+ goto out;
lastfrag = ufsi->i_lastfrag;
- if (tmp && fragment < lastfrag) {
- if (!phys) {
- result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
- if (tmp == ufs_data_ptr_to_cpu(sb, p)) {
- UFSD("EXIT, result %llu\n",
- (unsigned long long)tmp + blockoff);
- return result;
- }
- brelse (result);
- goto repeat;
- } else {
- *phys = uspi->s_sbbase + tmp + blockoff;
- return NULL;
- }
- }
- lastblock = ufs_fragstoblks (lastfrag);
- lastblockoff = ufs_fragnum (lastfrag);
- /*
- * We will extend file into new block beyond last allocated block
- */
- if (lastblock < block) {
- /*
- * We must reallocate last allocated block
- */
- if (lastblockoff) {
- p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock);
- tmp = ufs_new_fragments(inode, p2, lastfrag,
- ufs_data_ptr_to_cpu(sb, p2),
- uspi->s_fpb - lastblockoff,
- err, locked_page);
- if (!tmp) {
- if (lastfrag != ufsi->i_lastfrag)
- goto repeat;
- else
- return NULL;
- }
- lastfrag = ufsi->i_lastfrag;
-
- }
- tmp = ufs_data_ptr_to_cpu(sb,
- ufs_get_direct_data_ptr(uspi, ufsi,
- lastblock));
- if (tmp)
- goal = tmp + uspi->s_fpb;
- tmp = ufs_new_fragments (inode, p, fragment - blockoff,
- goal, required + blockoff,
- err,
- phys != NULL ? locked_page : NULL);
- } else if (lastblock == block) {
- /*
- * We will extend last allocated block
- */
- tmp = ufs_new_fragments(inode, p, fragment -
- (blockoff - lastblockoff),
- ufs_data_ptr_to_cpu(sb, p),
- required + (blockoff - lastblockoff),
- err, phys != NULL ? locked_page : NULL);
- } else /* (lastblock > block) */ {
- /*
- * We will allocate new block before last allocated block
- */
- if (block) {
- tmp = ufs_data_ptr_to_cpu(sb,
- ufs_get_direct_data_ptr(uspi, ufsi, block - 1));
- if (tmp)
- goal = tmp + uspi->s_fpb;
- }
- tmp = ufs_new_fragments(inode, p, fragment - blockoff,
- goal, uspi->s_fpb, err,
- phys != NULL ? locked_page : NULL);
+ /* will that be a new tail? */
+ if (new_fragment < UFS_NDIR_FRAGMENT && new_fragment >= lastfrag)
+ nfrags = (new_fragment & uspi->s_fpbmask) + 1;
+
+ goal = 0;
+ if (index) {
+ goal = ufs_data_ptr_to_cpu(sb,
+ ufs_get_direct_data_ptr(uspi, ufsi, index - 1));
+ if (goal)
+ goal += uspi->s_fpb;
}
+ tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment),
+ goal, uspi->s_fpb, err, locked_page);
+
if (!tmp) {
- if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) ||
- (blockoff && lastfrag != ufsi->i_lastfrag))
- goto repeat;
*err = -ENOSPC;
- return NULL;
+ return 0;
}
- if (!phys) {
- result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
- } else {
- *phys = uspi->s_sbbase + tmp + blockoff;
- result = NULL;
- *err = 0;
+ if (new)
*new = 1;
- }
-
inode->i_ctime = CURRENT_TIME_SEC;
if (IS_SYNC(inode))
ufs_sync_inode (inode);
mark_inode_dirty(inode);
- UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff);
- return result;
+out:
+ return tmp + uspi->s_sbbase;
/* This part : To be implemented ....
Required only for writing, not required for READ-ONLY.
@@ -316,95 +319,70 @@ repeat2:
/**
* ufs_inode_getblock() - allocate new block
* @inode: pointer to inode
- * @bh: pointer to block which hold "pointer" to new allocated block
- * @fragment: number of `fragment' which hold pointer
- * to new allocated block
+ * @ind_block: block number of the indirect block
+ * @index: number of pointer within the indirect block
* @new_fragment: number of new allocated fragment
* (block will hold this fragment and also uspi->s_fpb-1)
* @err: see ufs_inode_getfrag()
- * @phys: see ufs_inode_getfrag()
* @new: see ufs_inode_getfrag()
* @locked_page: see ufs_inode_getfrag()
*/
-static struct buffer_head *
-ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
- u64 fragment, sector_t new_fragment, int *err,
- long *phys, int *new, struct page *locked_page)
+static u64
+ufs_inode_getblock(struct inode *inode, u64 ind_block,
+ unsigned index, sector_t new_fragment, int *err,
+ int *new, struct page *locked_page)
{
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct buffer_head * result;
- unsigned blockoff;
- u64 tmp, goal, block;
+ int shift = uspi->s_apbshift - uspi->s_fpbshift;
+ u64 tmp = 0, goal;
+ struct buffer_head *bh;
void *p;
- block = ufs_fragstoblks (fragment);
- blockoff = ufs_fragnum (fragment);
-
- UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n",
- inode->i_ino, (unsigned long long)fragment,
- (unsigned long long)new_fragment, !phys);
+ if (!ind_block)
+ return 0;
- result = NULL;
- if (!bh)
- goto out;
- if (!buffer_uptodate(bh)) {
- ll_rw_block (READ, 1, &bh);
- wait_on_buffer (bh);
- if (!buffer_uptodate(bh))
- goto out;
+ bh = sb_bread(sb, ind_block + (index >> shift));
+ if (unlikely(!bh)) {
+ *err = -EIO;
+ return 0;
}
+
+ index &= uspi->s_apbmask >> uspi->s_fpbshift;
if (uspi->fs_magic == UFS2_MAGIC)
- p = (__fs64 *)bh->b_data + block;
+ p = (__fs64 *)bh->b_data + index;
else
- p = (__fs32 *)bh->b_data + block;
-repeat:
+ p = (__fs32 *)bh->b_data + index;
+
tmp = ufs_data_ptr_to_cpu(sb, p);
- if (tmp) {
- if (!phys) {
- result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
- if (tmp == ufs_data_ptr_to_cpu(sb, p))
- goto out;
- brelse (result);
- goto repeat;
- } else {
- *phys = uspi->s_sbbase + tmp + blockoff;
- goto out;
- }
- }
+ if (tmp)
+ goto out;
- if (block && (uspi->fs_magic == UFS2_MAGIC ?
- (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) :
- (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1]))))
+ if (index && (uspi->fs_magic == UFS2_MAGIC ?
+ (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[index-1])) :
+ (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[index-1]))))
goal = tmp + uspi->s_fpb;
else
goal = bh->b_blocknr + uspi->s_fpb;
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
uspi->s_fpb, err, locked_page);
- if (!tmp) {
- if (ufs_data_ptr_to_cpu(sb, p))
- goto repeat;
+ if (!tmp)
goto out;
- }
-
- if (!phys) {
- result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff);
- } else {
- *phys = uspi->s_sbbase + tmp + blockoff;
+ if (new)
*new = 1;
- }
mark_buffer_dirty(bh);
if (IS_SYNC(inode))
sync_dirty_buffer(bh);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- UFSD("result %llu\n", (unsigned long long)tmp + blockoff);
out:
brelse (bh);
UFSD("EXIT\n");
- return result;
+ if (tmp)
+ tmp += uspi->s_sbbase;
+ return tmp;
}
/**
@@ -412,103 +390,64 @@ out:
* readpage, writepage and so on
*/
-int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
+static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
{
- struct super_block * sb = inode->i_sb;
- struct ufs_sb_info * sbi = UFS_SB(sb);
- struct ufs_sb_private_info * uspi = sbi->s_uspi;
- struct buffer_head * bh;
- int ret, err, new;
- unsigned long ptr,phys;
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ int err = 0, new = 0;
+ unsigned offsets[4];
+ int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets);
u64 phys64 = 0;
- bool needs_lock = (sbi->mutex_owner != current);
-
+ unsigned frag = fragment & uspi->s_fpbmask;
+
if (!create) {
- phys64 = ufs_frag_map(inode, fragment, needs_lock);
- UFSD("phys64 = %llu\n", (unsigned long long)phys64);
- if (phys64)
- map_bh(bh_result, sb, phys64);
- return 0;
+ phys64 = ufs_frag_map(inode, offsets, depth);
+ goto out;
}
/* This code entered only while writing ....? */
- err = -EIO;
- new = 0;
- ret = 0;
- bh = NULL;
-
- if (needs_lock)
- lock_ufs(sb);
+ mutex_lock(&UFS_I(inode)->truncate_mutex);
UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
- if (fragment >
- ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb)
- << uspi->s_fpbshift))
- goto abort_too_big;
-
- err = 0;
- ptr = fragment;
-
- /*
- * ok, these macros clean the logic up a bit and make
- * it much more readable:
- */
-#define GET_INODE_DATABLOCK(x) \
- ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\
- bh_result->b_page)
-#define GET_INODE_PTR(x) \
- ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\
- bh_result->b_page)
-#define GET_INDIRECT_DATABLOCK(x) \
- ufs_inode_getblock(inode, bh, x, fragment, \
- &err, &phys, &new, bh_result->b_page)
-#define GET_INDIRECT_PTR(x) \
- ufs_inode_getblock(inode, bh, x, fragment, \
- &err, NULL, NULL, NULL)
-
- if (ptr < UFS_NDIR_FRAGMENT) {
- bh = GET_INODE_DATABLOCK(ptr);
+ if (unlikely(!depth)) {
+ ufs_warning(sb, "ufs_get_block", "block > big");
+ err = -EIO;
goto out;
}
- ptr -= UFS_NDIR_FRAGMENT;
- if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) {
- bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift));
- goto get_indirect;
- }
- ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift);
- if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) {
- bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift));
- goto get_double;
- }
- ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift);
- bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift));
- bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask);
-get_double:
- bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask);
-get_indirect:
- bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask);
-
-#undef GET_INODE_DATABLOCK
-#undef GET_INODE_PTR
-#undef GET_INDIRECT_DATABLOCK
-#undef GET_INDIRECT_PTR
-out:
- if (err)
- goto abort;
- if (new)
- set_buffer_new(bh_result);
- map_bh(bh_result, sb, phys);
-abort:
- if (needs_lock)
- unlock_ufs(sb);
+ if (UFS_I(inode)->i_lastfrag < UFS_NDIR_FRAGMENT) {
+ unsigned lastfrag = UFS_I(inode)->i_lastfrag;
+ unsigned tailfrags = lastfrag & uspi->s_fpbmask;
+ if (tailfrags && fragment >= lastfrag) {
+ if (!ufs_extend_tail(inode, fragment,
+ &err, bh_result->b_page))
+ goto out;
+ }
+ }
+ if (depth == 1) {
+ phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
+ &err, &new, bh_result->b_page);
+ } else {
+ int i;
+ phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
+ &err, NULL, NULL);
+ for (i = 1; i < depth - 1; i++)
+ phys64 = ufs_inode_getblock(inode, phys64, offsets[i],
+ fragment, &err, NULL, NULL);
+ phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1],
+ fragment, &err, &new, bh_result->b_page);
+ }
+out:
+ if (phys64) {
+ phys64 += frag;
+ map_bh(bh_result, sb, phys64);
+ if (new)
+ set_buffer_new(bh_result);
+ }
+ mutex_unlock(&UFS_I(inode)->truncate_mutex);
return err;
-
-abort_too_big:
- ufs_warning(sb, "ufs_get_block", "block > big");
- goto abort;
}
static int ufs_writepage(struct page *page, struct writeback_control *wbc)
@@ -526,12 +465,16 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, ufs_getfrag_block);
}
+static void ufs_truncate_blocks(struct inode *);
+
static void ufs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
- if (to > inode->i_size)
+ if (to > inode->i_size) {
truncate_pagecache(inode, inode->i_size);
+ ufs_truncate_blocks(inode);
+ }
}
static int ufs_write_begin(struct file *file, struct address_space *mapping,
@@ -548,6 +491,18 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
+static int ufs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ret < len)
+ ufs_write_failed(mapping, pos + len);
+ return ret;
+}
+
static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,ufs_getfrag_block);
@@ -557,7 +512,7 @@ const struct address_space_operations ufs_aops = {
.readpage = ufs_readpage,
.writepage = ufs_writepage,
.write_begin = ufs_write_begin,
- .write_end = generic_write_end,
+ .write_end = ufs_write_end,
.bmap = ufs_bmap
};
@@ -599,7 +554,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino);
return -1;
}
-
+
/*
* Linux now has 32-bit uid and gid, so we can support EFT.
*/
@@ -619,7 +574,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow);
ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag);
-
+
if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr,
sizeof(ufs_inode->ui_u2.ui_addr));
@@ -753,7 +708,7 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode));
ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode));
-
+
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
ufs_inode->ui_atime.tv_usec = 0;
@@ -855,23 +810,19 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
}
-
+
mark_buffer_dirty(bh);
if (do_sync)
sync_dirty_buffer(bh);
brelse (bh);
-
+
UFSD("EXIT\n");
return 0;
}
int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- int ret;
- lock_ufs(inode->i_sb);
- ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
- unlock_ufs(inode->i_sb);
- return ret;
+ return ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
}
int ufs_sync_inode (struct inode *inode)
@@ -888,24 +839,389 @@ void ufs_evict_inode(struct inode * inode)
truncate_inode_pages_final(&inode->i_data);
if (want_delete) {
- loff_t old_i_size;
- /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
- lock_ufs(inode->i_sb);
- mark_inode_dirty(inode);
- ufs_update_inode(inode, IS_SYNC(inode));
- old_i_size = inode->i_size;
inode->i_size = 0;
- if (inode->i_blocks && ufs_truncate(inode, old_i_size))
- ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
- unlock_ufs(inode->i_sb);
+ if (inode->i_blocks)
+ ufs_truncate_blocks(inode);
}
invalidate_inode_buffers(inode);
clear_inode(inode);
- if (want_delete) {
- lock_ufs(inode->i_sb);
+ if (want_delete)
ufs_free_inode(inode);
- unlock_ufs(inode->i_sb);
+}
+
+struct to_free {
+ struct inode *inode;
+ u64 to;
+ unsigned count;
+};
+
+static inline void free_data(struct to_free *ctx, u64 from, unsigned count)
+{
+ if (ctx->count && ctx->to != from) {
+ ufs_free_blocks(ctx->inode, ctx->to - ctx->count, ctx->count);
+ ctx->count = 0;
+ }
+ ctx->count += count;
+ ctx->to = from + count;
+}
+
+#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
+#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
+
+static void ufs_trunc_direct(struct inode *inode)
+{
+ struct ufs_inode_info *ufsi = UFS_I(inode);
+ struct super_block * sb;
+ struct ufs_sb_private_info * uspi;
+ void *p;
+ u64 frag1, frag2, frag3, frag4, block1, block2;
+ struct to_free ctx = {.inode = inode};
+ unsigned i, tmp;
+
+ UFSD("ENTER: ino %lu\n", inode->i_ino);
+
+ sb = inode->i_sb;
+ uspi = UFS_SB(sb)->s_uspi;
+
+ frag1 = DIRECT_FRAGMENT;
+ frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
+ frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
+ frag3 = frag4 & ~uspi->s_fpbmask;
+ block1 = block2 = 0;
+ if (frag2 > frag3) {
+ frag2 = frag4;
+ frag3 = frag4 = 0;
+ } else if (frag2 < frag3) {
+ block1 = ufs_fragstoblks (frag2);
+ block2 = ufs_fragstoblks (frag3);
+ }
+
+ UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
+ " frag3 %llu, frag4 %llu\n", inode->i_ino,
+ (unsigned long long)frag1, (unsigned long long)frag2,
+ (unsigned long long)block1, (unsigned long long)block2,
+ (unsigned long long)frag3, (unsigned long long)frag4);
+
+ if (frag1 >= frag2)
+ goto next1;
+
+ /*
+ * Free first free fragments
+ */
+ p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1));
+ tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp )
+ ufs_panic (sb, "ufs_trunc_direct", "internal error");
+ frag2 -= frag1;
+ frag1 = ufs_fragnum (frag1);
+
+ ufs_free_fragments(inode, tmp + frag1, frag2);
+
+next1:
+ /*
+ * Free whole blocks
+ */
+ for (i = block1 ; i < block2; i++) {
+ p = ufs_get_direct_data_ptr(uspi, ufsi, i);
+ tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ continue;
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+
+ free_data(&ctx, tmp, uspi->s_fpb);
+ }
+
+ free_data(&ctx, 0, 0);
+
+ if (frag3 >= frag4)
+ goto next3;
+
+ /*
+ * Free last free fragments
+ */
+ p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3));
+ tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp )
+ ufs_panic(sb, "ufs_truncate_direct", "internal error");
+ frag4 = ufs_fragnum (frag4);
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+
+ ufs_free_fragments (inode, tmp, frag4);
+ next3:
+
+ UFSD("EXIT: ino %lu\n", inode->i_ino);
+}
+
+static void free_full_branch(struct inode *inode, u64 ind_block, int depth)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ struct ufs_buffer_head *ubh = ubh_bread(sb, ind_block, uspi->s_bsize);
+ unsigned i;
+
+ if (!ubh)
+ return;
+
+ if (--depth) {
+ for (i = 0; i < uspi->s_apb; i++) {
+ void *p = ubh_get_data_ptr(uspi, ubh, i);
+ u64 block = ufs_data_ptr_to_cpu(sb, p);
+ if (block)
+ free_full_branch(inode, block, depth);
+ }
+ } else {
+ struct to_free ctx = {.inode = inode};
+
+ for (i = 0; i < uspi->s_apb; i++) {
+ void *p = ubh_get_data_ptr(uspi, ubh, i);
+ u64 block = ufs_data_ptr_to_cpu(sb, p);
+ if (block)
+ free_data(&ctx, block, uspi->s_fpb);
+ }
+ free_data(&ctx, 0, 0);
+ }
+
+ ubh_bforget(ubh);
+ ufs_free_blocks(inode, ind_block, uspi->s_fpb);
+}
+
+static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buffer_head *ubh, int depth)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned i;
+
+ if (--depth) {
+ for (i = from; i < uspi->s_apb ; i++) {
+ void *p = ubh_get_data_ptr(uspi, ubh, i);
+ u64 block = ufs_data_ptr_to_cpu(sb, p);
+ if (block) {
+ write_seqlock(&UFS_I(inode)->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
+ ubh_mark_buffer_dirty(ubh);
+ free_full_branch(inode, block, depth);
+ }
+ }
+ } else {
+ struct to_free ctx = {.inode = inode};
+
+ for (i = from; i < uspi->s_apb; i++) {
+ void *p = ubh_get_data_ptr(uspi, ubh, i);
+ u64 block = ufs_data_ptr_to_cpu(sb, p);
+ if (block) {
+ write_seqlock(&UFS_I(inode)->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&UFS_I(inode)->meta_lock);
+ ubh_mark_buffer_dirty(ubh);
+ free_data(&ctx, block, uspi->s_fpb);
+ }
+ }
+ free_data(&ctx, 0, 0);
+ }
+ if (IS_SYNC(inode) && ubh_buffer_dirty(ubh))
+ ubh_sync_block(ubh);
+ ubh_brelse(ubh);
+}
+
+static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
+{
+ int err = 0;
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned i, end;
+ sector_t lastfrag;
+ struct page *lastpage;
+ struct buffer_head *bh;
+ u64 phys64;
+
+ lastfrag = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
+
+ if (!lastfrag)
+ goto out;
+
+ lastfrag--;
+
+ lastpage = ufs_get_locked_page(mapping, lastfrag >>
+ (PAGE_CACHE_SHIFT - inode->i_blkbits));
+ if (IS_ERR(lastpage)) {
+ err = -EIO;
+ goto out;
+ }
+
+ end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
+ bh = page_buffers(lastpage);
+ for (i = 0; i < end; ++i)
+ bh = bh->b_this_page;
+
+
+ err = ufs_getfrag_block(inode, lastfrag, bh, 1);
+
+ if (unlikely(err))
+ goto out_unlock;
+
+ if (buffer_new(bh)) {
+ clear_buffer_new(bh);
+ unmap_underlying_metadata(bh->b_bdev,
+ bh->b_blocknr);
+ /*
+ * we do not zeroize fragment, because of
+ * if it maped to hole, it already contains zeroes
+ */
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ set_page_dirty(lastpage);
+ }
+
+ if (lastfrag >= UFS_IND_FRAGMENT) {
+ end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1;
+ phys64 = bh->b_blocknr + 1;
+ for (i = 0; i < end; ++i) {
+ bh = sb_getblk(sb, i + phys64);
+ lock_buffer(bh);
+ memset(bh->b_data, 0, sb->s_blocksize);
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+ }
+ }
+out_unlock:
+ ufs_put_locked_page(lastpage);
+out:
+ return err;
+}
+
+static void __ufs_truncate_blocks(struct inode *inode)
+{
+ struct ufs_inode_info *ufsi = UFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned offsets[4];
+ int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets);
+ int depth2;
+ unsigned i;
+ struct ufs_buffer_head *ubh[3];
+ void *p;
+ u64 block;
+
+ if (!depth)
+ return;
+
+ /* find the last non-zero in offsets[] */
+ for (depth2 = depth - 1; depth2; depth2--)
+ if (offsets[depth2])
+ break;
+
+ mutex_lock(&ufsi->truncate_mutex);
+ if (depth == 1) {
+ ufs_trunc_direct(inode);
+ offsets[0] = UFS_IND_BLOCK;
+ } else {
+ /* get the blocks that should be partially emptied */
+ p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]);
+ for (i = 0; i < depth2; i++) {
+ offsets[i]++; /* next branch is fully freed */
+ block = ufs_data_ptr_to_cpu(sb, p);
+ if (!block)
+ break;
+ ubh[i] = ubh_bread(sb, block, uspi->s_bsize);
+ if (!ubh[i]) {
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ break;
+ }
+ p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]);
+ }
+ while (i--)
+ free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1);
+ }
+ for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) {
+ p = ufs_get_direct_data_ptr(uspi, ufsi, i);
+ block = ufs_data_ptr_to_cpu(sb, p);
+ if (block) {
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ free_full_branch(inode, block, i - UFS_IND_BLOCK + 1);
+ }
}
+ ufsi->i_lastfrag = DIRECT_FRAGMENT;
+ mark_inode_dirty(inode);
+ mutex_unlock(&ufsi->truncate_mutex);
+}
+
+static int ufs_truncate(struct inode *inode, loff_t size)
+{
+ int err = 0;
+
+ UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n",
+ inode->i_ino, (unsigned long long)size,
+ (unsigned long long)i_size_read(inode));
+
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return -EINVAL;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return -EPERM;
+
+ err = ufs_alloc_lastblock(inode, size);
+
+ if (err)
+ goto out;
+
+ block_truncate_page(inode->i_mapping, size, ufs_getfrag_block);
+
+ truncate_setsize(inode, size);
+
+ __ufs_truncate_blocks(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+out:
+ UFSD("EXIT: err %d\n", err);
+ return err;
+}
+
+void ufs_truncate_blocks(struct inode *inode)
+{
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+ __ufs_truncate_blocks(inode);
+}
+
+int ufs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ unsigned int ia_valid = attr->ia_valid;
+ int error;
+
+ error = inode_change_ok(inode, attr);
+ if (error)
+ return error;
+
+ if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
+ error = ufs_truncate(inode, attr->ia_size);
+ if (error)
+ return error;
+ }
+
+ setattr_copy(inode, attr);
+ mark_inode_dirty(inode);
+ return 0;
}
+
+const struct inode_operations ufs_file_inode_operations = {
+ .setattr = ufs_setattr,
+};
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 250579a80d90..f6390eec02ca 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -94,22 +94,6 @@
#include "swab.h"
#include "util.h"
-void lock_ufs(struct super_block *sb)
-{
- struct ufs_sb_info *sbi = UFS_SB(sb);
-
- mutex_lock(&sbi->mutex);
- sbi->mutex_owner = current;
-}
-
-void unlock_ufs(struct super_block *sb)
-{
- struct ufs_sb_info *sbi = UFS_SB(sb);
-
- sbi->mutex_owner = NULL;
- mutex_unlock(&sbi->mutex);
-}
-
static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
{
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -694,7 +678,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
struct ufs_super_block_third * usb3;
unsigned flags;
- lock_ufs(sb);
mutex_lock(&UFS_SB(sb)->s_lock);
UFSD("ENTER\n");
@@ -714,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
UFSD("EXIT\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return 0;
}
@@ -758,7 +740,6 @@ static void ufs_put_super(struct super_block *sb)
ubh_brelse_uspi (sbi->s_uspi);
kfree (sbi->s_uspi);
- mutex_destroy(&sbi->mutex);
kfree (sbi);
sb->s_fs_info = NULL;
UFSD("EXIT\n");
@@ -801,7 +782,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
- mutex_init(&sbi->mutex);
mutex_init(&sbi->s_lock);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
@@ -1257,7 +1237,6 @@ magic_found:
return 0;
failed:
- mutex_destroy(&sbi->mutex);
if (ubh)
ubh_brelse_uspi (uspi);
kfree (uspi);
@@ -1280,7 +1259,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
unsigned flags;
sync_filesystem(sb);
- lock_ufs(sb);
mutex_lock(&UFS_SB(sb)->s_lock);
uspi = UFS_SB(sb)->s_uspi;
flags = UFS_SB(sb)->s_flags;
@@ -1296,7 +1274,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufs_set_opt (new_mount_opt, ONERROR_LOCK);
if (!ufs_parse_options (data, &new_mount_opt)) {
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EINVAL;
}
if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1304,14 +1281,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
pr_err("ufstype can't be changed during remount\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EINVAL;
}
if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
UFS_SB(sb)->s_mount_opt = new_mount_opt;
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return 0;
}
@@ -1335,7 +1310,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
#ifndef CONFIG_UFS_FS_WRITE
pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EINVAL;
#else
if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1345,13 +1319,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
pr_err("this ufstype is read-only supported\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EINVAL;
}
if (!ufs_read_cylinder_structures(sb)) {
pr_err("failed during remounting\n");
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return -EPERM;
}
sb->s_flags &= ~MS_RDONLY;
@@ -1359,7 +1331,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
}
UFS_SB(sb)->s_mount_opt = new_mount_opt;
mutex_unlock(&UFS_SB(sb)->s_lock);
- unlock_ufs(sb);
return 0;
}
@@ -1391,8 +1362,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ufs_super_block_third *usb3;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
- lock_ufs(sb);
-
+ mutex_lock(&UFS_SB(sb)->s_lock);
usb3 = ubh_get_usb_third(uspi);
if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
@@ -1413,7 +1383,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
- unlock_ufs(sb);
+ mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
}
@@ -1429,6 +1399,8 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
+ seqlock_init(&ei->meta_lock);
+ mutex_init(&ei->truncate_mutex);
return &ei->vfs_inode;
}
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
deleted file mode 100644
index 21154704c168..000000000000
--- a/fs/ufs/truncate.c
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
- * linux/fs/ufs/truncate.c
- *
- * Copyright (C) 1998
- * Daniel Pirkl <daniel.pirkl@email.cz>
- * Charles University, Faculty of Mathematics and Physics
- *
- * from
- *
- * linux/fs/ext2/truncate.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- * from
- *
- * linux/fs/minix/truncate.c
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * Big-endian to little-endian byte-swapping/bitmaps by
- * David S. Miller (davem@caip.rutgers.edu), 1995
- */
-
-/*
- * Real random numbers for secure rm added 94/02/18
- * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr>
- */
-
-/*
- * Adoptation to use page cache and UFS2 write support by
- * Evgeniy Dushistov <dushistov@mail.ru>, 2006-2007
- */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/fcntl.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include <linux/sched.h>
-
-#include "ufs_fs.h"
-#include "ufs.h"
-#include "swab.h"
-#include "util.h"
-
-/*
- * Secure deletion currently doesn't work. It interacts very badly
- * with buffers shared with memory mappings, and for that reason
- * can't be done in the truncate() routines. It should instead be
- * done separately in "release()" before calling the truncate routines
- * that will release the actual file blocks.
- *
- * Linus
- */
-
-#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift)
-#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
-
-
-static int ufs_trunc_direct(struct inode *inode)
-{
- struct ufs_inode_info *ufsi = UFS_I(inode);
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- void *p;
- u64 frag1, frag2, frag3, frag4, block1, block2;
- unsigned frag_to_free, free_count;
- unsigned i, tmp;
- int retry;
-
- UFSD("ENTER: ino %lu\n", inode->i_ino);
-
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
-
- frag_to_free = 0;
- free_count = 0;
- retry = 0;
-
- frag1 = DIRECT_FRAGMENT;
- frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
- frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
- frag3 = frag4 & ~uspi->s_fpbmask;
- block1 = block2 = 0;
- if (frag2 > frag3) {
- frag2 = frag4;
- frag3 = frag4 = 0;
- } else if (frag2 < frag3) {
- block1 = ufs_fragstoblks (frag2);
- block2 = ufs_fragstoblks (frag3);
- }
-
- UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
- " frag3 %llu, frag4 %llu\n", inode->i_ino,
- (unsigned long long)frag1, (unsigned long long)frag2,
- (unsigned long long)block1, (unsigned long long)block2,
- (unsigned long long)frag3, (unsigned long long)frag4);
-
- if (frag1 >= frag2)
- goto next1;
-
- /*
- * Free first free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic (sb, "ufs_trunc_direct", "internal error");
- frag2 -= frag1;
- frag1 = ufs_fragnum (frag1);
-
- ufs_free_fragments(inode, tmp + frag1, frag2);
- mark_inode_dirty(inode);
- frag_to_free = tmp + frag1;
-
-next1:
- /*
- * Free whole blocks
- */
- for (i = block1 ; i < block2; i++) {
- p = ufs_get_direct_data_ptr(uspi, ufsi, i);
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp)
- continue;
- ufs_data_ptr_clear(uspi, p);
-
- if (free_count == 0) {
- frag_to_free = tmp;
- free_count = uspi->s_fpb;
- } else if (free_count > 0 && frag_to_free == tmp - free_count)
- free_count += uspi->s_fpb;
- else {
- ufs_free_blocks (inode, frag_to_free, free_count);
- frag_to_free = tmp;
- free_count = uspi->s_fpb;
- }
- mark_inode_dirty(inode);
- }
-
- if (free_count > 0)
- ufs_free_blocks (inode, frag_to_free, free_count);
-
- if (frag3 >= frag4)
- goto next3;
-
- /*
- * Free last free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic(sb, "ufs_truncate_direct", "internal error");
- frag4 = ufs_fragnum (frag4);
- ufs_data_ptr_clear(uspi, p);
-
- ufs_free_fragments (inode, tmp, frag4);
- mark_inode_dirty(inode);
- next3:
-
- UFSD("EXIT: ino %lu\n", inode->i_ino);
- return retry;
-}
-
-
-static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
-{
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- struct ufs_buffer_head * ind_ubh;
- void *ind;
- u64 tmp, indirect_block, i, frag_to_free;
- unsigned free_count;
- int retry;
-
- UFSD("ENTER: ino %lu, offset %llu, p: %p\n",
- inode->i_ino, (unsigned long long)offset, p);
-
- BUG_ON(!p);
-
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
-
- frag_to_free = 0;
- free_count = 0;
- retry = 0;
-
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp)
- return 0;
- ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize);
- if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
- ubh_brelse (ind_ubh);
- return 1;
- }
- if (!ind_ubh) {
- ufs_data_ptr_clear(uspi, p);
- return 0;
- }
-
- indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0;
- for (i = indirect_block; i < uspi->s_apb; i++) {
- ind = ubh_get_data_ptr(uspi, ind_ubh, i);
- tmp = ufs_data_ptr_to_cpu(sb, ind);
- if (!tmp)
- continue;
-
- ufs_data_ptr_clear(uspi, ind);
- ubh_mark_buffer_dirty(ind_ubh);
- if (free_count == 0) {
- frag_to_free = tmp;
- free_count = uspi->s_fpb;
- } else if (free_count > 0 && frag_to_free == tmp - free_count)
- free_count += uspi->s_fpb;
- else {
- ufs_free_blocks (inode, frag_to_free, free_count);
- frag_to_free = tmp;
- free_count = uspi->s_fpb;
- }
-
- mark_inode_dirty(inode);
- }
-
- if (free_count > 0) {
- ufs_free_blocks (inode, frag_to_free, free_count);
- }
- for (i = 0; i < uspi->s_apb; i++)
- if (!ufs_is_data_ptr_zero(uspi,
- ubh_get_data_ptr(uspi, ind_ubh, i)))
- break;
- if (i >= uspi->s_apb) {
- tmp = ufs_data_ptr_to_cpu(sb, p);
- ufs_data_ptr_clear(uspi, p);
-
- ufs_free_blocks (inode, tmp, uspi->s_fpb);
- mark_inode_dirty(inode);
- ubh_bforget(ind_ubh);
- ind_ubh = NULL;
- }
- if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
- ubh_sync_block(ind_ubh);
- ubh_brelse (ind_ubh);
-
- UFSD("EXIT: ino %lu\n", inode->i_ino);
-
- return retry;
-}
-
-static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
-{
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- struct ufs_buffer_head *dind_bh;
- u64 i, tmp, dindirect_block;
- void *dind;
- int retry = 0;
-
- UFSD("ENTER: ino %lu\n", inode->i_ino);
-
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
-
- dindirect_block = (DIRECT_BLOCK > offset)
- ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0;
- retry = 0;
-
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp)
- return 0;
- dind_bh = ubh_bread(sb, tmp, uspi->s_bsize);
- if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
- ubh_brelse (dind_bh);
- return 1;
- }
- if (!dind_bh) {
- ufs_data_ptr_clear(uspi, p);
- return 0;
- }
-
- for (i = dindirect_block ; i < uspi->s_apb ; i++) {
- dind = ubh_get_data_ptr(uspi, dind_bh, i);
- tmp = ufs_data_ptr_to_cpu(sb, dind);
- if (!tmp)
- continue;
- retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind);
- ubh_mark_buffer_dirty(dind_bh);
- }
-
- for (i = 0; i < uspi->s_apb; i++)
- if (!ufs_is_data_ptr_zero(uspi,
- ubh_get_data_ptr(uspi, dind_bh, i)))
- break;
- if (i >= uspi->s_apb) {
- tmp = ufs_data_ptr_to_cpu(sb, p);
- ufs_data_ptr_clear(uspi, p);
-
- ufs_free_blocks(inode, tmp, uspi->s_fpb);
- mark_inode_dirty(inode);
- ubh_bforget(dind_bh);
- dind_bh = NULL;
- }
- if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
- ubh_sync_block(dind_bh);
- ubh_brelse (dind_bh);
-
- UFSD("EXIT: ino %lu\n", inode->i_ino);
-
- return retry;
-}
-
-static int ufs_trunc_tindirect(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
- struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct ufs_inode_info *ufsi = UFS_I(inode);
- struct ufs_buffer_head * tind_bh;
- u64 tindirect_block, tmp, i;
- void *tind, *p;
- int retry;
-
- UFSD("ENTER: ino %lu\n", inode->i_ino);
-
- retry = 0;
-
- tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb))
- ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0;
-
- p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK);
- if (!(tmp = ufs_data_ptr_to_cpu(sb, p)))
- return 0;
- tind_bh = ubh_bread (sb, tmp, uspi->s_bsize);
- if (tmp != ufs_data_ptr_to_cpu(sb, p)) {
- ubh_brelse (tind_bh);
- return 1;
- }
- if (!tind_bh) {
- ufs_data_ptr_clear(uspi, p);
- return 0;
- }
-
- for (i = tindirect_block ; i < uspi->s_apb ; i++) {
- tind = ubh_get_data_ptr(uspi, tind_bh, i);
- retry |= ufs_trunc_dindirect(inode, UFS_NDADDR +
- uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind);
- ubh_mark_buffer_dirty(tind_bh);
- }
- for (i = 0; i < uspi->s_apb; i++)
- if (!ufs_is_data_ptr_zero(uspi,
- ubh_get_data_ptr(uspi, tind_bh, i)))
- break;
- if (i >= uspi->s_apb) {
- tmp = ufs_data_ptr_to_cpu(sb, p);
- ufs_data_ptr_clear(uspi, p);
-
- ufs_free_blocks(inode, tmp, uspi->s_fpb);
- mark_inode_dirty(inode);
- ubh_bforget(tind_bh);
- tind_bh = NULL;
- }
- if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
- ubh_sync_block(tind_bh);
- ubh_brelse (tind_bh);
-
- UFSD("EXIT: ino %lu\n", inode->i_ino);
- return retry;
-}
-
-static int ufs_alloc_lastblock(struct inode *inode)
-{
- int err = 0;
- struct super_block *sb = inode->i_sb;
- struct address_space *mapping = inode->i_mapping;
- struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- unsigned i, end;
- sector_t lastfrag;
- struct page *lastpage;
- struct buffer_head *bh;
- u64 phys64;
-
- lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift;
-
- if (!lastfrag)
- goto out;
-
- lastfrag--;
-
- lastpage = ufs_get_locked_page(mapping, lastfrag >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits));
- if (IS_ERR(lastpage)) {
- err = -EIO;
- goto out;
- }
-
- end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
- bh = page_buffers(lastpage);
- for (i = 0; i < end; ++i)
- bh = bh->b_this_page;
-
-
- err = ufs_getfrag_block(inode, lastfrag, bh, 1);
-
- if (unlikely(err))
- goto out_unlock;
-
- if (buffer_new(bh)) {
- clear_buffer_new(bh);
- unmap_underlying_metadata(bh->b_bdev,
- bh->b_blocknr);
- /*
- * we do not zeroize fragment, because of
- * if it maped to hole, it already contains zeroes
- */
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- set_page_dirty(lastpage);
- }
-
- if (lastfrag >= UFS_IND_FRAGMENT) {
- end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1;
- phys64 = bh->b_blocknr + 1;
- for (i = 0; i < end; ++i) {
- bh = sb_getblk(sb, i + phys64);
- lock_buffer(bh);
- memset(bh->b_data, 0, sb->s_blocksize);
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- unlock_buffer(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
- }
- }
-out_unlock:
- ufs_put_locked_page(lastpage);
-out:
- return err;
-}
-
-int ufs_truncate(struct inode *inode, loff_t old_i_size)
-{
- struct ufs_inode_info *ufsi = UFS_I(inode);
- struct super_block *sb = inode->i_sb;
- struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- int retry, err = 0;
-
- UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n",
- inode->i_ino, (unsigned long long)i_size_read(inode),
- (unsigned long long)old_i_size);
-
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return -EINVAL;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return -EPERM;
-
- err = ufs_alloc_lastblock(inode);
-
- if (err) {
- i_size_write(inode, old_i_size);
- goto out;
- }
-
- block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
-
- while (1) {
- retry = ufs_trunc_direct(inode);
- retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
- ufs_get_direct_data_ptr(uspi, ufsi,
- UFS_IND_BLOCK));
- retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb,
- ufs_get_direct_data_ptr(uspi, ufsi,
- UFS_DIND_BLOCK));
- retry |= ufs_trunc_tindirect (inode);
- if (!retry)
- break;
- if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
- ufs_sync_inode (inode);
- yield();
- }
-
- inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
- ufsi->i_lastfrag = DIRECT_FRAGMENT;
- mark_inode_dirty(inode);
-out:
- UFSD("EXIT: err %d\n", err);
- return err;
-}
-
-int ufs_setattr(struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = d_inode(dentry);
- unsigned int ia_valid = attr->ia_valid;
- int error;
-
- error = inode_change_ok(inode, attr);
- if (error)
- return error;
-
- if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
- loff_t old_i_size = inode->i_size;
-
- /* XXX(truncate): truncate_setsize should be called last */
- truncate_setsize(inode, attr->ia_size);
-
- lock_ufs(inode->i_sb);
- error = ufs_truncate(inode, old_i_size);
- unlock_ufs(inode->i_sb);
- if (error)
- return error;
- }
-
- setattr_copy(inode, attr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-const struct inode_operations ufs_file_inode_operations = {
- .setattr = ufs_setattr,
-};
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 2e31ea2e35a3..7da4aca868c0 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -24,8 +24,6 @@ struct ufs_sb_info {
unsigned s_cgno[UFS_MAX_GROUP_LOADED];
unsigned short s_cg_loaded;
unsigned s_mount_opt;
- struct mutex mutex;
- struct task_struct *mutex_owner;
struct super_block *sb;
int work_queued; /* non-zero if the delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
@@ -46,6 +44,8 @@ struct ufs_inode_info {
__u32 i_oeftflag;
__u16 i_osync;
__u64 i_lastfrag;
+ seqlock_t meta_lock;
+ struct mutex truncate_mutex;
__u32 i_dir_start_lookup;
struct inode vfs_inode;
};
@@ -122,7 +122,7 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
extern int ufs_write_inode (struct inode *, struct writeback_control *);
extern int ufs_sync_inode (struct inode *);
extern void ufs_evict_inode (struct inode *);
-extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
+extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
/* namei.c */
extern const struct file_operations ufs_dir_operations;
@@ -140,10 +140,6 @@ void ufs_mark_sb_dirty(struct super_block *sb);
extern const struct inode_operations ufs_fast_symlink_inode_operations;
extern const struct inode_operations ufs_symlink_inode_operations;
-/* truncate.c */
-extern int ufs_truncate (struct inode *, loff_t);
-extern int ufs_setattr(struct dentry *dentry, struct iattr *attr);
-
static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
{
return sb->s_fs_info;
@@ -170,7 +166,4 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
return do_div(b, uspi->s_fpg);
}
-extern void lock_ufs(struct super_block *sb);
-extern void unlock_ufs(struct super_block *sb);
-
#endif /* _UFS_UFS_H */
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
new file mode 100644
index 000000000000..634e676072cb
--- /dev/null
+++ b/fs/userfaultfd.c
@@ -0,0 +1,1330 @@
+/*
+ * fs/userfaultfd.c
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2008-2009 Red Hat, Inc.
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Some part derived from fs/eventfd.c (anon inode setup) and
+ * mm/ksm.c (mm hashing).
+ */
+
+#include <linux/hashtable.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/file.h>
+#include <linux/bug.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
+#include <linux/ioctl.h>
+#include <linux/security.h>
+
+static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
+
+enum userfaultfd_state {
+ UFFD_STATE_WAIT_API,
+ UFFD_STATE_RUNNING,
+};
+
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ */
+struct userfaultfd_ctx {
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
+ wait_queue_head_t fault_wqh;
+ /* waitqueue head for the pseudo fd to wakeup poll/read */
+ wait_queue_head_t fd_wqh;
+ /* a refile sequence protected by fault_pending_wqh lock */
+ struct seqcount refile_seq;
+ /* pseudo fd refcounting */
+ atomic_t refcount;
+ /* userfaultfd syscall flags */
+ unsigned int flags;
+ /* state machine */
+ enum userfaultfd_state state;
+ /* released */
+ bool released;
+ /* mm with one ore more vmas attached to this userfaultfd_ctx */
+ struct mm_struct *mm;
+};
+
+struct userfaultfd_wait_queue {
+ struct uffd_msg msg;
+ wait_queue_t wq;
+ struct userfaultfd_ctx *ctx;
+};
+
+struct userfaultfd_wake_range {
+ unsigned long start;
+ unsigned long len;
+};
+
+static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
+ int wake_flags, void *key)
+{
+ struct userfaultfd_wake_range *range = key;
+ int ret;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long start, len;
+
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ ret = 0;
+ /* len == 0 means wake all */
+ start = range->start;
+ len = range->len;
+ if (len && (start > uwq->msg.arg.pagefault.address ||
+ start + len <= uwq->msg.arg.pagefault.address))
+ goto out;
+ ret = wake_up_state(wq->private, mode);
+ if (ret)
+ /*
+ * Wake only once, autoremove behavior.
+ *
+ * After the effect of list_del_init is visible to the
+ * other CPUs, the waitqueue may disappear from under
+ * us, see the !list_empty_careful() in
+ * handle_userfault(). try_to_wake_up() has an
+ * implicit smp_mb__before_spinlock, and the
+ * wq->private is read before calling the extern
+ * function "wake_up_state" (which in turns calls
+ * try_to_wake_up). While the spin_lock;spin_unlock;
+ * wouldn't be enough, the smp_mb__before_spinlock is
+ * enough to avoid an explicit smp_mb() here.
+ */
+ list_del_init(&wq->task_list);
+out:
+ return ret;
+}
+
+/**
+ * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to the userfaultfd context.
+ *
+ * Returns: In case of success, returns not zero.
+ */
+static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
+{
+ if (!atomic_inc_not_zero(&ctx->refcount))
+ BUG();
+}
+
+/**
+ * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
+ * context.
+ * @ctx: [in] Pointer to userfaultfd context.
+ *
+ * The userfaultfd context reference must have been previously acquired either
+ * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
+ */
+static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
+{
+ if (atomic_dec_and_test(&ctx->refcount)) {
+ VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+ VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ mmput(ctx->mm);
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ }
+}
+
+static inline void msg_init(struct uffd_msg *msg)
+{
+ BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
+ /*
+ * Must use memset to zero out the paddings or kernel data is
+ * leaked to userland.
+ */
+ memset(msg, 0, sizeof(struct uffd_msg));
+}
+
+static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ struct uffd_msg msg;
+ msg_init(&msg);
+ msg.event = UFFD_EVENT_PAGEFAULT;
+ msg.arg.pagefault.address = address;
+ if (flags & FAULT_FLAG_WRITE)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
+ * was not set in a UFFD_EVENT_PAGEFAULT, it means it
+ * was a read fault, otherwise if set it means it's
+ * a write fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
+ if (reason & VM_UFFD_WP)
+ /*
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
+ * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
+ * not set in a UFFD_EVENT_PAGEFAULT, it means it was
+ * a missing fault, otherwise if set it means it's a
+ * write protect fault.
+ */
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ return msg;
+}
+
+/*
+ * Verify the pagetables are still not ok after having reigstered into
+ * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
+ * userfault that has already been resolved, if userfaultfd_read and
+ * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
+ * threads.
+ */
+static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
+ unsigned long address,
+ unsigned long flags,
+ unsigned long reason)
+{
+ struct mm_struct *mm = ctx->mm;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ bool ret = true;
+
+ VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+ pmd = pmd_offset(pud, address);
+ /*
+ * READ_ONCE must function as a barrier with narrower scope
+ * and it must be equivalent to:
+ * _pmd = *pmd; barrier();
+ *
+ * This is to deal with the instability (as in
+ * pmd_trans_unstable) of the pmd.
+ */
+ _pmd = READ_ONCE(*pmd);
+ if (!pmd_present(_pmd))
+ goto out;
+
+ ret = false;
+ if (pmd_trans_huge(_pmd))
+ goto out;
+
+ /*
+ * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
+ * and use the standard pte_offset_map() instead of parsing _pmd.
+ */
+ pte = pte_offset_map(pmd, address);
+ /*
+ * Lockless access: we're in a wait_event so it's ok if it
+ * changes under us.
+ */
+ if (pte_none(*pte))
+ ret = true;
+ pte_unmap(pte);
+
+out:
+ return ret;
+}
+
+/*
+ * The locking rules involved in returning VM_FAULT_RETRY depending on
+ * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
+ * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
+ * recommendation in __lock_page_or_retry is not an understatement.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released
+ * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
+ * not set.
+ *
+ * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
+ * set, VM_FAULT_RETRY can still be returned if and only if there are
+ * fatal_signal_pending()s, and the mmap_sem must be released before
+ * returning it.
+ */
+int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct userfaultfd_ctx *ctx;
+ struct userfaultfd_wait_queue uwq;
+ int ret;
+ bool must_wait, return_to_userland;
+
+ BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ ret = VM_FAULT_SIGBUS;
+ ctx = vma->vm_userfaultfd_ctx.ctx;
+ if (!ctx)
+ goto out;
+
+ BUG_ON(ctx->mm != mm);
+
+ VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
+ VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+
+ /*
+ * If it's already released don't get it. This avoids to loop
+ * in __get_user_pages if userfaultfd_release waits on the
+ * caller of handle_userfault to release the mmap_sem.
+ */
+ if (unlikely(ACCESS_ONCE(ctx->released)))
+ goto out;
+
+ /*
+ * Check that we can return VM_FAULT_RETRY.
+ *
+ * NOTE: it should become possible to return VM_FAULT_RETRY
+ * even if FAULT_FLAG_TRIED is set without leading to gup()
+ * -EBUSY failures, if the userfaultfd is to be extended for
+ * VM_UFFD_WP tracking and we intend to arm the userfault
+ * without first stopping userland access to the memory. For
+ * VM_UFFD_MISSING userfaults this is enough for now.
+ */
+ if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) {
+ /*
+ * Validate the invariant that nowait must allow retry
+ * to be sure not to return SIGBUS erroneously on
+ * nowait invocations.
+ */
+ BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT);
+#ifdef CONFIG_DEBUG_VM
+ if (printk_ratelimit()) {
+ printk(KERN_WARNING
+ "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags);
+ dump_stack();
+ }
+#endif
+ goto out;
+ }
+
+ /*
+ * Handle nowait, not much to do other than tell it to retry
+ * and wait.
+ */
+ ret = VM_FAULT_RETRY;
+ if (flags & FAULT_FLAG_RETRY_NOWAIT)
+ goto out;
+
+ /* take the reference before dropping the mmap_sem */
+ userfaultfd_ctx_get(ctx);
+
+ init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
+ uwq.wq.private = current;
+ uwq.msg = userfault_msg(address, flags, reason);
+ uwq.ctx = ctx;
+
+ return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
+ (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * After the __add_wait_queue the uwq is visible to userland
+ * through poll/read().
+ */
+ __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+ /*
+ * The smp_mb() after __set_current_state prevents the reads
+ * following the spin_unlock to happen before the list_add in
+ * __add_wait_queue.
+ */
+ set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
+ TASK_KILLABLE);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ must_wait = userfaultfd_must_wait(ctx, address, flags, reason);
+ up_read(&mm->mmap_sem);
+
+ if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+ (return_to_userland ? !signal_pending(current) :
+ !fatal_signal_pending(current)))) {
+ wake_up_poll(&ctx->fd_wqh, POLLIN);
+ schedule();
+ ret |= VM_FAULT_MAJOR;
+ }
+
+ __set_current_state(TASK_RUNNING);
+
+ if (return_to_userland) {
+ if (signal_pending(current) &&
+ !fatal_signal_pending(current)) {
+ /*
+ * If we got a SIGSTOP or SIGCONT and this is
+ * a normal userland page fault, just let
+ * userland return so the signal will be
+ * handled and gdb debugging works. The page
+ * fault code immediately after we return from
+ * this function is going to release the
+ * mmap_sem and it's not depending on it
+ * (unlike gup would if we were not to return
+ * VM_FAULT_RETRY).
+ *
+ * If a fatal signal is pending we still take
+ * the streamlined VM_FAULT_RETRY failure path
+ * and there's no need to retake the mmap_sem
+ * in such case.
+ */
+ down_read(&mm->mmap_sem);
+ ret = 0;
+ }
+ }
+
+ /*
+ * Here we race with the list_del; list_add in
+ * userfaultfd_ctx_read(), however because we don't ever run
+ * list_del_init() to refile across the two lists, the prev
+ * and next pointers will never point to self. list_add also
+ * would never let any of the two pointers to point to
+ * self. So list_empty_careful won't risk to see both pointers
+ * pointing to self at any time during the list refile. The
+ * only case where list_del_init() is called is the full
+ * removal in the wake function and there we don't re-list_add
+ * and it's fine not to block on the spinlock. The uwq on this
+ * kernel stack can be released after the list_del_init.
+ */
+ if (!list_empty_careful(&uwq.wq.task_list)) {
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /*
+ * No need of list_del_init(), the uwq on the stack
+ * will be freed shortly anyway.
+ */
+ list_del(&uwq.wq.task_list);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ }
+
+ /*
+ * ctx may go away after this if the userfault pseudo fd is
+ * already released.
+ */
+ userfaultfd_ctx_put(ctx);
+
+out:
+ return ret;
+}
+
+static int userfaultfd_release(struct inode *inode, struct file *file)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev;
+ /* len == 0 means wake all */
+ struct userfaultfd_wake_range range = { .len = 0, };
+ unsigned long new_flags;
+
+ ACCESS_ONCE(ctx->released) = true;
+
+ /*
+ * Flush page faults out of all CPUs. NOTE: all page faults
+ * must be retried without returning VM_FAULT_SIGBUS if
+ * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
+ * changes while handle_userfault released the mmap_sem. So
+ * it's critical that released is set to true (above), before
+ * taking the mmap_sem for writing.
+ */
+ down_write(&mm->mmap_sem);
+ prev = NULL;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ cond_resched();
+ BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+ if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+ prev = vma;
+ continue;
+ }
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+ new_flags, vma->anon_vma,
+ vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev)
+ vma = prev;
+ else
+ prev = vma;
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
+ up_write(&mm->mmap_sem);
+
+ /*
+ * After no new page faults can wait on this fault_*wqh, flush
+ * the last page faults that may have been already waiting on
+ * the fault_*wqh.
+ */
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ wake_up_poll(&ctx->fd_wqh, POLLHUP);
+ userfaultfd_ctx_put(ctx);
+ return 0;
+}
+
+/* fault_pending_wqh.lock must be hold by the caller */
+static inline struct userfaultfd_wait_queue *find_userfault(
+ struct userfaultfd_ctx *ctx)
+{
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+
+ VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
+
+ uwq = NULL;
+ if (!waitqueue_active(&ctx->fault_pending_wqh))
+ goto out;
+ /* walk in reverse to provide FIFO behavior to read userfaults */
+ wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
+ typeof(*wq), task_list);
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+out:
+ return uwq;
+}
+
+static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ unsigned int ret;
+
+ poll_wait(file, &ctx->fd_wqh, wait);
+
+ switch (ctx->state) {
+ case UFFD_STATE_WAIT_API:
+ return POLLERR;
+ case UFFD_STATE_RUNNING:
+ /*
+ * poll() never guarantees that read won't block.
+ * userfaults can be waken before they're read().
+ */
+ if (unlikely(!(file->f_flags & O_NONBLOCK)))
+ return POLLERR;
+ /*
+ * lockless access to see if there are pending faults
+ * __pollwait last action is the add_wait_queue but
+ * the spin_unlock would allow the waitqueue_active to
+ * pass above the actual list_add inside
+ * add_wait_queue critical section. So use a full
+ * memory barrier to serialize the list_add write of
+ * add_wait_queue() with the waitqueue_active read
+ * below.
+ */
+ ret = 0;
+ smp_mb();
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ ret = POLLIN;
+ return ret;
+ default:
+ BUG();
+ }
+}
+
+static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
+ struct uffd_msg *msg)
+{
+ ssize_t ret;
+ DECLARE_WAITQUEUE(wait, current);
+ struct userfaultfd_wait_queue *uwq;
+
+ /* always take the fd_wqh lock before the fault_pending_wqh lock */
+ spin_lock(&ctx->fd_wqh.lock);
+ __add_wait_queue(&ctx->fd_wqh, &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ uwq = find_userfault(ctx);
+ if (uwq) {
+ /*
+ * Use a seqcount to repeat the lockless check
+ * in wake_userfault() to avoid missing
+ * wakeups because during the refile both
+ * waitqueue could become empty if this is the
+ * only userfault.
+ */
+ write_seqcount_begin(&ctx->refile_seq);
+
+ /*
+ * The fault_pending_wqh.lock prevents the uwq
+ * to disappear from under us.
+ *
+ * Refile this userfault from
+ * fault_pending_wqh to fault_wqh, it's not
+ * pending anymore after we read it.
+ *
+ * Use list_del() by hand (as
+ * userfaultfd_wake_function also uses
+ * list_del_init() by hand) to be sure nobody
+ * changes __remove_wait_queue() to use
+ * list_del_init() in turn breaking the
+ * !list_empty_careful() check in
+ * handle_userfault(). The uwq->wq.task_list
+ * must never be empty at any time during the
+ * refile, or the waitqueue could disappear
+ * from under us. The "wait_queue_head_t"
+ * parameter of __remove_wait_queue() is unused
+ * anyway.
+ */
+ list_del(&uwq->wq.task_list);
+ __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+
+ write_seqcount_end(&ctx->refile_seq);
+
+ /* careful to always initialize msg if ret == 0 */
+ *msg = uwq->msg;
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ ret = 0;
+ break;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (no_wait) {
+ ret = -EAGAIN;
+ break;
+ }
+ spin_unlock(&ctx->fd_wqh.lock);
+ schedule();
+ spin_lock(&ctx->fd_wqh.lock);
+ }
+ __remove_wait_queue(&ctx->fd_wqh, &wait);
+ __set_current_state(TASK_RUNNING);
+ spin_unlock(&ctx->fd_wqh.lock);
+
+ return ret;
+}
+
+static ssize_t userfaultfd_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct userfaultfd_ctx *ctx = file->private_data;
+ ssize_t _ret, ret = 0;
+ struct uffd_msg msg;
+ int no_wait = file->f_flags & O_NONBLOCK;
+
+ if (ctx->state == UFFD_STATE_WAIT_API)
+ return -EINVAL;
+
+ for (;;) {
+ if (count < sizeof(msg))
+ return ret ? ret : -EINVAL;
+ _ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
+ if (_ret < 0)
+ return ret ? ret : _ret;
+ if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
+ return ret ? ret : -EFAULT;
+ ret += sizeof(msg);
+ buf += sizeof(msg);
+ count -= sizeof(msg);
+ /*
+ * Allow to read more than one fault at time but only
+ * block if waiting for the very first one.
+ */
+ no_wait = O_NONBLOCK;
+ }
+}
+
+static void __wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned long start, end;
+
+ start = range->start;
+ end = range->start + range->len;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ /* wake all in the range and autoremove */
+ if (waitqueue_active(&ctx->fault_pending_wqh))
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
+ range);
+ if (waitqueue_active(&ctx->fault_wqh))
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+}
+
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+ struct userfaultfd_wake_range *range)
+{
+ unsigned seq;
+ bool need_wakeup;
+
+ /*
+ * To be sure waitqueue_active() is not reordered by the CPU
+ * before the pagetable update, use an explicit SMP memory
+ * barrier here. PT lock release or up_read(mmap_sem) still
+ * have release semantics that can allow the
+ * waitqueue_active() to be reordered before the pte update.
+ */
+ smp_mb();
+
+ /*
+ * Use waitqueue_active because it's very frequent to
+ * change the address space atomically even if there are no
+ * userfaults yet. So we take the spinlock only when we're
+ * sure we've userfaults to wake.
+ */
+ do {
+ seq = read_seqcount_begin(&ctx->refile_seq);
+ need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
+ waitqueue_active(&ctx->fault_wqh);
+ cond_resched();
+ } while (read_seqcount_retry(&ctx->refile_seq, seq));
+ if (need_wakeup)
+ __wake_userfault(ctx, range);
+}
+
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ __u64 task_size = mm->task_size;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ if (len & ~PAGE_MASK)
+ return -EINVAL;
+ if (!len)
+ return -EINVAL;
+ if (start < mmap_min_addr)
+ return -EINVAL;
+ if (start >= task_size)
+ return -EINVAL;
+ if (len > task_size - start)
+ return -EINVAL;
+ return 0;
+}
+
+static int userfaultfd_register(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_register uffdio_register;
+ struct uffdio_register __user *user_uffdio_register;
+ unsigned long vm_flags, new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+
+ user_uffdio_register = (struct uffdio_register __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_register, user_uffdio_register,
+ sizeof(uffdio_register)-sizeof(__u64)))
+ goto out;
+
+ ret = -EINVAL;
+ if (!uffdio_register.mode)
+ goto out;
+ if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING|
+ UFFDIO_REGISTER_MODE_WP))
+ goto out;
+ vm_flags = 0;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
+ vm_flags |= VM_UFFD_MISSING;
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
+ vm_flags |= VM_UFFD_WP;
+ /*
+ * FIXME: remove the below error constraint by
+ * implementing the wprotect tracking mode.
+ */
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = validate_range(mm, uffdio_register.range.start,
+ uffdio_register.range.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_register.range.start;
+ end = start + uffdio_register.range.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for not compatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /* check not compatible vmas */
+ ret = -EINVAL;
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ /*
+ * Check that this vma isn't already owned by a
+ * different userfaultfd. We can't allow more than one
+ * userfaultfd to own a single vma simultaneously or we
+ * wouldn't know which one to deliver the userfaults to.
+ */
+ ret = -EBUSY;
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+ BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+ (vma->vm_flags & vm_flags) == vm_flags)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = (vma->vm_flags & ~vm_flags) | vm_flags;
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ ((struct vm_userfaultfd_ctx){ ctx }));
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+ if (!ret) {
+ /*
+ * Now that we scanned all vmas we can already tell
+ * userland which ioctls methods are guaranteed to
+ * succeed on this range.
+ */
+ if (put_user(UFFD_API_RANGE_IOCTLS,
+ &user_uffdio_register->ioctls))
+ ret = -EFAULT;
+ }
+out:
+ return ret;
+}
+
+static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma, *prev, *cur;
+ int ret;
+ struct uffdio_range uffdio_unregister;
+ unsigned long new_flags;
+ bool found;
+ unsigned long start, end, vma_end;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+ goto out;
+
+ ret = validate_range(mm, uffdio_unregister.start,
+ uffdio_unregister.len);
+ if (ret)
+ goto out;
+
+ start = uffdio_unregister.start;
+ end = start + uffdio_unregister.len;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma_prev(mm, start, &prev);
+
+ ret = -ENOMEM;
+ if (!vma)
+ goto out_unlock;
+
+ /* check that there's at least one vma in the range */
+ ret = -EINVAL;
+ if (vma->vm_start >= end)
+ goto out_unlock;
+
+ /*
+ * Search for not compatible vmas.
+ *
+ * FIXME: this shall be relaxed later so that it doesn't fail
+ * on tmpfs backed vmas (in addition to the current allowance
+ * on anonymous vmas).
+ */
+ found = false;
+ ret = -EINVAL;
+ for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+ cond_resched();
+
+ BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
+
+ /*
+ * Check not compatible vmas, not strictly required
+ * here as not compatible vmas cannot have an
+ * userfaultfd_ctx registered on them, but this
+ * provides for more strict behavior to notice
+ * unregistration errors.
+ */
+ if (cur->vm_ops)
+ goto out_unlock;
+
+ found = true;
+ }
+ BUG_ON(!found);
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ ret = 0;
+ do {
+ cond_resched();
+
+ BUG_ON(vma->vm_ops);
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (!vma->vm_userfaultfd_ctx.ctx)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
+ prev = vma_merge(mm, prev, start, vma_end, new_flags,
+ vma->anon_vma, vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev) {
+ vma = prev;
+ goto next;
+ }
+ if (vma->vm_start < start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+ break;
+ }
+ if (vma->vm_end > end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+ break;
+ }
+ next:
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ vma->vm_flags = new_flags;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ skip:
+ prev = vma;
+ start = vma->vm_end;
+ vma = vma->vm_next;
+ } while (vma && vma->vm_start < end);
+out_unlock:
+ up_write(&mm->mmap_sem);
+out:
+ return ret;
+}
+
+/*
+ * userfaultfd_wake may be used in combination with the
+ * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
+ */
+static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_range uffdio_wake;
+ struct userfaultfd_wake_range range;
+ const void __user *buf = (void __user *)arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+ if (ret)
+ goto out;
+
+ range.start = uffdio_wake.start;
+ range.len = uffdio_wake.len;
+
+ /*
+ * len == 0 means wake all and we don't want to wake all here,
+ * so check it again to be sure.
+ */
+ VM_BUG_ON(!range.len);
+
+ wake_userfault(ctx, &range);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_copy uffdio_copy;
+ struct uffdio_copy __user *user_uffdio_copy;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_copy = (struct uffdio_copy __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_copy, user_uffdio_copy,
+ /* don't copy "copy" last field */
+ sizeof(uffdio_copy)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+ if (ret)
+ goto out;
+ /*
+ * double check for wraparound just in case. copy_from_user()
+ * will later check uffdio_copy.src + uffdio_copy.len to fit
+ * in the userland range.
+ */
+ ret = -EINVAL;
+ if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
+ goto out;
+ if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE)
+ goto out;
+
+ ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len);
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ BUG_ON(!ret);
+ /* len == 0 would wake all */
+ range.len = ret;
+ if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
+ range.start = uffdio_copy.dst;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+
+static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_zeropage uffdio_zeropage;
+ struct uffdio_zeropage __user *user_uffdio_zeropage;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
+ /* don't copy "zeropage" last field */
+ sizeof(uffdio_zeropage)-sizeof(__s64)))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (ret)
+ goto out;
+ ret = -EINVAL;
+ if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
+ goto out;
+
+ ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
+ range.start = uffdio_zeropage.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+out:
+ return ret;
+}
+
+/*
+ * userland asks for a certain API version and we return which bits
+ * and ioctl commands are implemented in this kernel for such API
+ * version or -EINVAL if unknown.
+ */
+static int userfaultfd_api(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct uffdio_api uffdio_api;
+ void __user *buf = (void __user *)arg;
+ int ret;
+
+ ret = -EINVAL;
+ if (ctx->state != UFFD_STATE_WAIT_API)
+ goto out;
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
+ goto out;
+ if (uffdio_api.api != UFFD_API || uffdio_api.features) {
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+ ret = -EINVAL;
+ goto out;
+ }
+ uffdio_api.features = UFFD_API_FEATURES;
+ uffdio_api.ioctls = UFFD_API_IOCTLS;
+ ret = -EFAULT;
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ goto out;
+ ctx->state = UFFD_STATE_RUNNING;
+ ret = 0;
+out:
+ return ret;
+}
+
+static long userfaultfd_ioctl(struct file *file, unsigned cmd,
+ unsigned long arg)
+{
+ int ret = -EINVAL;
+ struct userfaultfd_ctx *ctx = file->private_data;
+
+ if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+ return -EINVAL;
+
+ switch(cmd) {
+ case UFFDIO_API:
+ ret = userfaultfd_api(ctx, arg);
+ break;
+ case UFFDIO_REGISTER:
+ ret = userfaultfd_register(ctx, arg);
+ break;
+ case UFFDIO_UNREGISTER:
+ ret = userfaultfd_unregister(ctx, arg);
+ break;
+ case UFFDIO_WAKE:
+ ret = userfaultfd_wake(ctx, arg);
+ break;
+ case UFFDIO_COPY:
+ ret = userfaultfd_copy(ctx, arg);
+ break;
+ case UFFDIO_ZEROPAGE:
+ ret = userfaultfd_zeropage(ctx, arg);
+ break;
+ }
+ return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct userfaultfd_ctx *ctx = f->private_data;
+ wait_queue_t *wq;
+ struct userfaultfd_wait_queue *uwq;
+ unsigned long pending = 0, total = 0;
+
+ spin_lock(&ctx->fault_pending_wqh.lock);
+ list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ pending++;
+ total++;
+ }
+ list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
+ uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
+ total++;
+ }
+ spin_unlock(&ctx->fault_pending_wqh.lock);
+
+ /*
+ * If more protocols will be added, there will be all shown
+ * separated by a space. Like this:
+ * protocols: aa:... bb:...
+ */
+ seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
+ pending, total, UFFD_API, UFFD_API_FEATURES,
+ UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
+}
+#endif
+
+static const struct file_operations userfaultfd_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = userfaultfd_show_fdinfo,
+#endif
+ .release = userfaultfd_release,
+ .poll = userfaultfd_poll,
+ .read = userfaultfd_read,
+ .unlocked_ioctl = userfaultfd_ioctl,
+ .compat_ioctl = userfaultfd_ioctl,
+ .llseek = noop_llseek,
+};
+
+static void init_once_userfaultfd_ctx(void *mem)
+{
+ struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
+
+ init_waitqueue_head(&ctx->fault_pending_wqh);
+ init_waitqueue_head(&ctx->fault_wqh);
+ init_waitqueue_head(&ctx->fd_wqh);
+ seqcount_init(&ctx->refile_seq);
+}
+
+/**
+ * userfaultfd_file_create - Creates an userfaultfd file pointer.
+ * @flags: Flags for the userfaultfd file.
+ *
+ * This function creates an userfaultfd file pointer, w/out installing
+ * it into the fd table. This is useful when the userfaultfd file is
+ * used during the initialization of data structures that require
+ * extra setup after the userfaultfd creation. So the userfaultfd
+ * creation is split into the file pointer creation phase, and the
+ * file descriptor installation phase. In this way races with
+ * userspace closing the newly installed file descriptor can be
+ * avoided. Returns an userfaultfd file pointer, or a proper error
+ * pointer.
+ */
+static struct file *userfaultfd_file_create(int flags)
+{
+ struct file *file;
+ struct userfaultfd_ctx *ctx;
+
+ BUG_ON(!current->mm);
+
+ /* Check the UFFD_* constants for consistency. */
+ BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
+
+ file = ERR_PTR(-EINVAL);
+ if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
+ goto out;
+
+ file = ERR_PTR(-ENOMEM);
+ ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ goto out;
+
+ atomic_set(&ctx->refcount, 1);
+ ctx->flags = flags;
+ ctx->state = UFFD_STATE_WAIT_API;
+ ctx->released = false;
+ ctx->mm = current->mm;
+ /* prevent the mm struct to be freed */
+ atomic_inc(&ctx->mm->mm_users);
+
+ file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
+ O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+ if (IS_ERR(file))
+ kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+out:
+ return file;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+ int fd, error;
+ struct file *file;
+
+ error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
+ if (error < 0)
+ return error;
+ fd = error;
+
+ file = userfaultfd_file_create(flags);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_put_unused_fd;
+ }
+ fd_install(fd, file);
+
+ return fd;
+
+err_put_unused_fd:
+ put_unused_fd(fd);
+
+ return error;
+}
+
+static int __init userfaultfd_init(void)
+{
+ userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
+ sizeof(struct userfaultfd_ctx),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ init_once_userfaultfd_ctx);
+ return 0;
+}
+__initcall(userfaultfd_init);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index df6828570e87..a096841bd06c 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -33,6 +33,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_attr.o \
xfs_attr_leaf.o \
xfs_attr_remote.o \
+ xfs_bit.o \
xfs_bmap.o \
xfs_bmap_btree.o \
xfs_btree.o \
@@ -63,7 +64,6 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs-y += xfs_aops.o \
xfs_attr_inactive.o \
xfs_attr_list.o \
- xfs_bit.o \
xfs_bmap_util.o \
xfs_buf.o \
xfs_dir2_readdir.o \
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index f9e9ffe6fb46..ffad7f20342f 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -464,7 +464,7 @@ xfs_agfl_verify(
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
int i;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
return false;
@@ -1937,7 +1937,7 @@ xfs_alloc_fix_freelist(
struct xfs_alloc_arg targs; /* local allocation arguments */
xfs_agblock_t bno; /* freelist block */
xfs_extlen_t need; /* total blocks needed in freelist */
- int error;
+ int error = 0;
if (!pag->pagf_init) {
error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
@@ -2260,7 +2260,7 @@ xfs_agf_verify(
struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid))
+ !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 59d521c09a17..90de071dd4c2 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -295,7 +295,7 @@ xfs_allocbt_verify(
case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
return false;
@@ -313,7 +313,7 @@ xfs_allocbt_verify(
case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
return false;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 3349c9a1e845..ff065578969f 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -139,6 +139,8 @@ xfs_attr_get(
args.value = value;
args.valuelen = *valuelenp;
+ /* Entirely possible to look up a name which doesn't exist */
+ args.op_flags = XFS_DA_OP_OKNOENT;
lock_mode = xfs_ilock_attr_map_shared(ip);
if (!xfs_inode_hasattr(ip))
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index e9d401ce93bb..33df52d97ec7 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -262,7 +262,7 @@ xfs_attr3_leaf_verify(
if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
return false;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
@@ -1056,7 +1056,7 @@ xfs_attr3_leaf_create(
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr);
} else {
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index dd714037c322..f38f9bd81557 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -100,7 +100,7 @@ xfs_attr3_rmt_verify(
return false;
if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
return false;
- if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(rmt->rm_blkno) != bno)
return false;
@@ -222,7 +222,7 @@ xfs_attr3_rmt_hdr_set(
rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC);
rmt->rm_offset = cpu_to_be32(offset);
rmt->rm_bytes = cpu_to_be32(size);
- uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid);
rmt->rm_owner = cpu_to_be64(ino);
rmt->rm_blkno = cpu_to_be64(bno);
@@ -618,9 +618,8 @@ xfs_attr_rmtval_remove(
xfs_bmap_init(args->flist, args->firstblock);
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
- XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
- 1, args->firstblock, args->flist,
- &done);
+ XFS_BMAPI_ATTRFORK, 1, args->firstblock,
+ args->flist, &done);
if (!error) {
error = xfs_bmap_finish(&args->trans, args->flist,
&committed);
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 0e8885a59646..0e8885a59646 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 63e05b663380..8e2010d53b07 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5945,6 +5945,7 @@ xfs_bmap_split_extent(
return xfs_trans_commit(tp);
out:
+ xfs_bmap_cancel(&free_list);
xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 2c44c8e50782..6b0cf6546a82 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -349,7 +349,8 @@ xfs_bmbt_to_bmdr(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC));
- ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid,
+ &mp->m_sb.sb_meta_uuid));
ASSERT(rblock->bb_u.l.bb_blkno ==
cpu_to_be64(XFS_BUF_DADDR_NULL));
} else
@@ -647,7 +648,7 @@ xfs_bmbt_verify(
case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn)
return false;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c72283dd8d44..f7d7ee7a2607 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -65,7 +65,8 @@ xfs_btree_check_lblock(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
lblock_ok = lblock_ok &&
- uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
+ uuid_equal(&block->bb_u.l.bb_uuid,
+ &mp->m_sb.sb_meta_uuid) &&
block->bb_u.l.bb_blkno == cpu_to_be64(
bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
}
@@ -115,7 +116,8 @@ xfs_btree_check_sblock(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
sblock_ok = sblock_ok &&
- uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
+ uuid_equal(&block->bb_u.s.bb_uuid,
+ &mp->m_sb.sb_meta_uuid) &&
block->bb_u.s.bb_blkno == cpu_to_be64(
bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
}
@@ -1000,7 +1002,7 @@ xfs_btree_init_block_int(
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.l.bb_owner = cpu_to_be64(owner);
- uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.l.bb_pad = 0;
buf->bb_u.l.bb_lsn = 0;
}
@@ -1013,7 +1015,7 @@ xfs_btree_init_block_int(
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
- uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
buf->bb_u.s.bb_lsn = 0;
}
}
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 2385f8cd08ab..be43248a5822 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -146,7 +146,7 @@ xfs_da3_node_verify(
if (ichdr.magic != XFS_DA3_NODE_MAGIC)
return false;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
@@ -233,6 +233,7 @@ xfs_da3_node_read_verify(
bp->b_ops->verify_read(bp);
return;
default:
+ xfs_buf_ioerror(bp, -EFSCORRUPTED);
break;
}
@@ -324,7 +325,7 @@ xfs_da3_node_create(
ichdr.magic = XFS_DA3_NODE_MAGIC;
hdr3->info.blkno = cpu_to_be64(bp->b_bn);
hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
- uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
ichdr.magic = XFS_DA_NODE_MAGIC;
}
@@ -1822,6 +1823,7 @@ xfs_da3_path_shift(
struct xfs_da_args *args;
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr nodehdr;
+ struct xfs_buf *bp;
xfs_dablk_t blkno = 0;
int level;
int error;
@@ -1866,20 +1868,24 @@ xfs_da3_path_shift(
*/
for (blk++, level++; level < path->active; blk++, level++) {
/*
- * Release the old block.
- * (if it's dirty, trans won't actually let go)
+ * Read the next child block into a local buffer.
*/
- if (release)
- xfs_trans_brelse(args->trans, blk->bp);
+ error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp,
+ args->whichfork);
+ if (error)
+ return error;
/*
- * Read the next child block.
+ * Release the old block (if it's dirty, the trans doesn't
+ * actually let go) and swap the local buffer into the path
+ * structure. This ensures failure of the above read doesn't set
+ * a NULL buffer in an active slot in the path.
*/
+ if (release)
+ xfs_trans_brelse(args->trans, blk->bp);
blk->blkno = blkno;
- error = xfs_da3_node_read(args->trans, dp, blkno, -1,
- &blk->bp, args->whichfork);
- if (error)
- return error;
+ blk->bp = bp;
+
info = blk->bp->b_addr;
ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
@@ -2351,8 +2357,8 @@ xfs_da_shrink_inode(
* the last block to the place we want to kill.
*/
error = xfs_bunmapi(tp, dp, dead_blkno, count,
- xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
- 0, args->firstblock, args->flist, &done);
+ xfs_bmapi_aflag(w), 0, args->firstblock,
+ args->flist, &done);
if (error == -ENOSPC) {
if (w != XFS_DATA_FORK)
break;
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 74bcbabfa523..b14bbd6bb05f 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -680,8 +680,15 @@ typedef struct xfs_attr_leaf_name_remote {
typedef struct xfs_attr_leafblock {
xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
- xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
- xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced and definining them can actually make gcc optimize away
+ * accesses to the 'entries' array above index 0 so don't do that.
+ *
+ * xfs_attr_leaf_name_local_t namelist;
+ * xfs_attr_leaf_name_remote_t valuelist;
+ */
} xfs_attr_leafblock_t;
/*
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index a69fb3a1e161..9de401d297e5 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -362,6 +362,7 @@ xfs_dir_lookup(
struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
+ int lock_mode;
ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_lookup);
@@ -387,6 +388,7 @@ xfs_dir_lookup(
if (ci_name)
args->op_flags |= XFS_DA_OP_CILOOKUP;
+ lock_mode = xfs_ilock_data_map_shared(dp);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
rval = xfs_dir2_sf_lookup(args);
goto out_check_rval;
@@ -419,6 +421,7 @@ out_check_rval:
}
}
out_free:
+ xfs_iunlock(dp, lock_mode);
kmem_free(args);
return rval;
}
@@ -674,25 +677,22 @@ xfs_dir2_shrink_inode(
mp = dp->i_mount;
tp = args->trans;
da = xfs_dir2_db_to_da(args->geo, db);
- /*
- * Unmap the fsblock(s).
- */
- if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
- XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
- &done))) {
+
+ /* Unmap the fsblock(s). */
+ error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, 0, 0,
+ args->firstblock, args->flist, &done);
+ if (error) {
/*
- * ENOSPC actually can happen if we're in a removename with
- * no space reservation, and the resulting block removal
- * would cause a bmap btree split or conversion from extents
- * to btree. This can only happen for un-fragmented
- * directory blocks, since you need to be punching out
- * the middle of an extent.
- * In this case we need to leave the block in the file,
- * and not binval it.
- * So the block has to be in a consistent empty state
- * and appropriately logged.
- * We don't free up the buffer, the caller can tell it
- * hasn't happened since it got an error back.
+ * ENOSPC actually can happen if we're in a removename with no
+ * space reservation, and the resulting block removal would
+ * cause a bmap btree split or conversion from extents to btree.
+ * This can only happen for un-fragmented directory blocks,
+ * since you need to be punching out the middle of an extent.
+ * In this case we need to leave the block in the file, and not
+ * binval it. So the block has to be in a consistent empty
+ * state and appropriately logged. We don't free up the buffer,
+ * the caller can tell it hasn't happened since it got an error
+ * back.
*/
return error;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9354e190b82e..4778d1dd511a 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -67,7 +67,7 @@ xfs_dir3_block_verify(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
@@ -157,7 +157,7 @@ xfs_dir3_block_init(
hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
return;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index de1ea16f5748..824131e71bc5 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -220,7 +220,7 @@ xfs_dir3_data_verify(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
@@ -252,7 +252,8 @@ xfs_dir3_data_reada_verify(
return;
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
- xfs_dir3_data_verify(bp);
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ bp->b_ops->verify_read(bp);
return;
default:
xfs_buf_ioerror(bp, -EFSCORRUPTED);
@@ -604,7 +605,7 @@ xfs_dir3_data_init(
hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
hdr3->blkno = cpu_to_be64(bp->b_bn);
hdr3->owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 106119955400..f300240ebb8d 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -160,7 +160,7 @@ xfs_dir3_leaf_verify(
if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
return false;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
return false;
@@ -310,7 +310,7 @@ xfs_dir3_leaf_init(
: cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
leaf3->info.blkno = cpu_to_be64(bp->b_bn);
leaf3->info.owner = cpu_to_be64(owner);
- uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid);
} else {
memset(leaf, 0, sizeof(*leaf));
leaf->hdr.info.magic = cpu_to_be16(type);
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 41b80d3d3877..cc28e924545b 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -93,7 +93,7 @@ xfs_dir3_free_verify(
if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
return false;
- if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
@@ -226,7 +226,7 @@ xfs_dir3_free_get_buf(
hdr3->hdr.blkno = cpu_to_be64(bp->b_bn);
hdr3->hdr.owner = cpu_to_be64(dp->i_ino);
- uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid);
} else
hdr.magic = XFS_DIR2_FREE_MAGIC;
dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
@@ -1845,8 +1845,7 @@ xfs_dir2_node_addname_int(
if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
xfs_alert(mp,
- "%s: dir ino %llu needed freesp block %lld for\n"
- " data block %lld, got %lld ifbno %llu lastfbno %d",
+"%s: dir ino %llu needed freesp block %lld for data block %lld, got %lld ifbno %llu lastfbno %d",
__func__, (unsigned long long)dp->i_ino,
(long long)dp->d_ops->db_to_fdb(
args->geo, dbno),
@@ -2132,6 +2131,7 @@ xfs_dir2_node_replace(
int error; /* error return value */
int i; /* btree level */
xfs_ino_t inum; /* new inode number */
+ int ftype; /* new file type */
xfs_dir2_leaf_t *leaf; /* leaf structure */
xfs_dir2_leaf_entry_t *lep; /* leaf entry being changed */
int rval; /* internal return value */
@@ -2145,7 +2145,14 @@ xfs_dir2_node_replace(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
+
+ /*
+ * We have to save new inode number and ftype since
+ * xfs_da3_node_lookup_int() is going to overwrite them
+ */
inum = args->inumber;
+ ftype = args->filetype;
+
/*
* Lookup the entry to change in the btree.
*/
@@ -2183,7 +2190,7 @@ xfs_dir2_node_replace(
* Fill in the new inode number and log the entry.
*/
dep->inumber = cpu_to_be64(inum);
- args->dp->d_ops->data_put_ftype(dep, args->filetype);
+ args->dp->d_ops->data_put_ftype(dep, ftype);
xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
rval = 0;
}
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 6fbf2d853a54..5331b7f0460c 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -163,7 +163,7 @@ xfs_dqcheck(
d->dd_diskdq.d_id = cpu_to_be32(id);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
@@ -198,7 +198,7 @@ xfs_dquot_buf_verify_crc(
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF))
return false;
- if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid))
return false;
}
return true;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index a0ae572051de..9590a069e556 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -100,7 +100,7 @@ typedef struct xfs_sb {
xfs_rfsblock_t sb_dblocks; /* number of data blocks */
xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */
xfs_rtblock_t sb_rextents; /* number of realtime extents */
- uuid_t sb_uuid; /* file system unique id */
+ uuid_t sb_uuid; /* user-visible file system unique id */
xfs_fsblock_t sb_logstart; /* starting block of log if internal */
xfs_ino_t sb_rootino; /* root inode number */
xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */
@@ -174,6 +174,7 @@ typedef struct xfs_sb {
xfs_ino_t sb_pquotino; /* project quota inode */
xfs_lsn_t sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -190,7 +191,7 @@ typedef struct xfs_dsb {
__be64 sb_dblocks; /* number of data blocks */
__be64 sb_rblocks; /* number of realtime blocks */
__be64 sb_rextents; /* number of realtime extents */
- uuid_t sb_uuid; /* file system unique id */
+ uuid_t sb_uuid; /* user-visible file system unique id */
__be64 sb_logstart; /* starting block of log if internal */
__be64 sb_rootino; /* root inode number */
__be64 sb_rbmino; /* bitmap inode for realtime extents */
@@ -260,6 +261,7 @@ typedef struct xfs_dsb {
__be64 sb_pquotino; /* project quota inode */
__be64 sb_lsn; /* last write sequence */
+ uuid_t sb_meta_uuid; /* metadata file system unique id */
/* must be padded to 64 bit alignment */
} xfs_dsb_t;
@@ -458,9 +460,11 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */
+#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
- XFS_SB_FEAT_INCOMPAT_SPINODES)
+ XFS_SB_FEAT_INCOMPAT_SPINODES| \
+ XFS_SB_FEAT_INCOMPAT_META_UUID)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -515,6 +519,18 @@ static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp)
}
/*
+ * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID
+ * is stored separately from the user-visible UUID; this allows the
+ * user-visible UUID to be changed on V5 filesystems which have a
+ * filesystem UUID stamped into every piece of metadata.
+ */
+static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID);
+}
+
+/*
* end of superblock version macros
*/
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 66efc702452a..54deb2d12ac6 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -338,7 +338,8 @@ xfs_ialloc_inode_init(
if (version == 3) {
free->di_ino = cpu_to_be64(ino);
ino++;
- uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&free->di_uuid,
+ &mp->m_sb.sb_meta_uuid);
xfs_dinode_calc_crc(mp, free);
} else if (tp) {
/* just log the inode core */
@@ -2232,7 +2233,7 @@ xfs_imap_lookup(
}
xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
if (error)
return error;
@@ -2500,7 +2501,7 @@ xfs_agi_verify(
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid))
+ !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
return false;
/*
* Validate the magic number of the agi block.
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 674ad8f760be..f39b285beb19 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -239,7 +239,7 @@ xfs_inobt_verify(
case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
- if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
return false;
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 6526e7696184..268c00f4f83a 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -304,7 +304,7 @@ xfs_dinode_verify(
return false;
if (be64_to_cpu(dip->di_ino) != ip->i_ino)
return false;
- if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
return false;
return true;
}
@@ -366,7 +366,7 @@ xfs_iread(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
ip->i_d.di_version = 3;
ip->i_d.di_ino = ip->i_ino;
- uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
} else
ip->i_d.di_version = 2;
return 0;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index df9851c46b5c..47425140f343 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -131,10 +131,11 @@ xfs_mount_validate_sb(
if (xfs_sb_has_compat_feature(sbp,
XFS_SB_FEAT_COMPAT_UNKNOWN)) {
xfs_warn(mp,
-"Superblock has unknown compatible features (0x%x) enabled.\n"
-"Using a more recent kernel is recommended.",
+"Superblock has unknown compatible features (0x%x) enabled.",
(sbp->sb_features_compat &
XFS_SB_FEAT_COMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Using a more recent kernel is recommended.");
}
if (xfs_sb_has_ro_compat_feature(sbp,
@@ -145,18 +146,21 @@ xfs_mount_validate_sb(
XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
xfs_warn(mp,
-"Attempted to mount read-only compatible filesystem read-write.\n"
+"Attempted to mount read-only compatible filesystem read-write.");
+ xfs_warn(mp,
"Filesystem can only be safely mounted read only.");
+
return -EINVAL;
}
}
if (xfs_sb_has_incompat_feature(sbp,
XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
xfs_warn(mp,
-"Superblock has unknown incompatible features (0x%x) enabled.\n"
-"Filesystem can not be safely mounted by this kernel.",
+"Superblock has unknown incompatible features (0x%x) enabled.",
(sbp->sb_features_incompat &
XFS_SB_FEAT_INCOMPAT_UNKNOWN));
+ xfs_warn(mp,
+"Filesystem can not be safely mounted by this kernel.");
return -EINVAL;
}
}
@@ -182,9 +186,6 @@ xfs_mount_validate_sb(
if (xfs_sb_version_hassparseinodes(sbp)) {
uint32_t align;
- xfs_alert(mp,
- "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
-
align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize
>> sbp->sb_blocklog;
if (sbp->sb_inoalignmt != align) {
@@ -398,6 +399,14 @@ __xfs_sb_from_disk(
to->sb_spino_align = be32_to_cpu(from->sb_spino_align);
to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
to->sb_lsn = be64_to_cpu(from->sb_lsn);
+ /*
+ * sb_meta_uuid is only on disk if it differs from sb_uuid and the
+ * feature flag is set; if not set we keep it only in memory.
+ */
+ if (xfs_sb_version_hasmetauuid(to))
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
+ else
+ uuid_copy(&to->sb_meta_uuid, &from->sb_uuid);
/* Convert on-disk flags to in-memory flags? */
if (convert_xquota)
xfs_sb_quota_from_disk(to);
@@ -539,6 +548,8 @@ xfs_sb_to_disk(
cpu_to_be32(from->sb_features_log_incompat);
to->sb_spino_align = cpu_to_be32(from->sb_spino_align);
to->sb_lsn = cpu_to_be64(from->sb_lsn);
+ if (xfs_sb_version_hasmetauuid(from))
+ uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid);
}
}
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index e7e26bd6468f..8f8af05b3f13 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -63,7 +63,7 @@ xfs_symlink_hdr_set(
dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
dsl->sl_offset = cpu_to_be32(offset);
dsl->sl_bytes = cpu_to_be32(size);
- uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid);
dsl->sl_owner = cpu_to_be64(ino);
dsl->sl_blkno = cpu_to_be64(bp->b_bn);
bp->b_ops = &xfs_symlink_buf_ops;
@@ -107,7 +107,7 @@ xfs_symlink_verify(
return false;
if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
return false;
- if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
+ if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return false;
if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
return false;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index c77499bcbd7a..50ab2879b9da 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc(
* We may pass freeze protection with a transaction. So tell lockdep
* we released it.
*/
- rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 1, _THIS_IP_);
+ __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
/*
* We hand off the transaction to the completion thread now, so
* clear the flag here.
@@ -171,8 +170,7 @@ xfs_setfilesize_ioend(
* Similarly for freeze protection.
*/
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
- rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
+ __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
@@ -355,7 +353,8 @@ xfs_end_bio(
{
xfs_ioend_t *ioend = bio->bi_private;
- ioend->io_error = bio->bi_error;
+ if (!ioend->io_error)
+ ioend->io_error = bio->bi_error;
/* Toss bio and pass work off to an xfsdatad thread */
bio->bi_private = NULL;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0f34886cf726..3bf4ad0d19e4 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -67,16 +67,15 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
*/
int /* error */
xfs_bmap_finish(
- xfs_trans_t **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *committed) /* xact committed or not */
+ struct xfs_trans **tp, /* transaction pointer addr */
+ struct xfs_bmap_free *flist, /* i/o: list extents to free */
+ int *committed)/* xact committed or not */
{
- xfs_efd_log_item_t *efd; /* extent free data */
- xfs_efi_log_item_t *efi; /* extent free intention */
- int error; /* error return value */
- xfs_bmap_free_item_t *free; /* free extent item */
- xfs_mount_t *mp; /* filesystem mount structure */
- xfs_bmap_free_item_t *next; /* next item on free list */
+ struct xfs_efd_log_item *efd; /* extent free data */
+ struct xfs_efi_log_item *efi; /* extent free intention */
+ int error; /* error return value */
+ struct xfs_bmap_free_item *free; /* free extent item */
+ struct xfs_bmap_free_item *next; /* next item on free list */
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
if (flist->xbf_count == 0) {
@@ -88,40 +87,48 @@ xfs_bmap_finish(
xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
free->xbfi_blockcount);
- error = xfs_trans_roll(tp, NULL);
- *committed = 1;
- /*
- * We have a new transaction, so we should return committed=1,
- * even though we're returning an error.
- */
- if (error)
+ error = __xfs_trans_roll(tp, NULL, committed);
+ if (error) {
+ /*
+ * If the transaction was committed, drop the EFD reference
+ * since we're bailing out of here. The other reference is
+ * dropped when the EFI hits the AIL.
+ *
+ * If the transaction was not committed, the EFI is freed by the
+ * EFI item unlock handler on abort. Also, we have a new
+ * transaction so we should return committed=1 even though we're
+ * returning an error.
+ */
+ if (*committed) {
+ xfs_efi_release(efi);
+ xfs_force_shutdown((*tp)->t_mountp,
+ (error == -EFSCORRUPTED) ?
+ SHUTDOWN_CORRUPT_INCORE :
+ SHUTDOWN_META_IO_ERROR);
+ } else {
+ *committed = 1;
+ }
+
return error;
+ }
+ /*
+ * Get an EFD and free each extent in the list, logging to the EFD in
+ * the process. The remaining bmap free list is cleaned up by the caller
+ * on error.
+ */
efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
for (free = flist->xbf_first; free != NULL; free = next) {
next = free->xbfi_next;
- if ((error = xfs_free_extent(*tp, free->xbfi_startblock,
- free->xbfi_blockcount))) {
- /*
- * The bmap free list will be cleaned up at a
- * higher level. The EFI will be canceled when
- * this transaction is aborted.
- * Need to force shutdown here to make sure it
- * happens, since this transaction may not be
- * dirty yet.
- */
- mp = (*tp)->t_mountp;
- if (!XFS_FORCED_SHUTDOWN(mp))
- xfs_force_shutdown(mp,
- (error == -EFSCORRUPTED) ?
- SHUTDOWN_CORRUPT_INCORE :
- SHUTDOWN_META_IO_ERROR);
+
+ error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
+ free->xbfi_blockcount);
+ if (error)
return error;
- }
- xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock,
- free->xbfi_blockcount);
+
xfs_bmap_del_free(flist, NULL, free);
}
+
return 0;
}
@@ -1467,7 +1474,7 @@ xfs_shift_file_space(
XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
XFS_QMOPT_RES_REGBLKS);
if (error)
- goto out;
+ goto out_trans_cancel;
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -1481,18 +1488,20 @@ xfs_shift_file_space(
&done, stop_fsb, &first_block, &free_list,
direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
if (error)
- goto out;
+ goto out_bmap_cancel;
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
- goto out;
+ goto out_bmap_cancel;
error = xfs_trans_commit(tp);
}
return error;
-out:
+out_bmap_cancel:
+ xfs_bmap_cancel(&free_list);
+out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 01bd6781974e..8ecffb35935b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -438,7 +438,6 @@ _xfs_buf_find(
xfs_buf_flags_t flags,
xfs_buf_t *new_bp)
{
- size_t numbytes;
struct xfs_perag *pag;
struct rb_node **rbp;
struct rb_node *parent;
@@ -450,10 +449,9 @@ _xfs_buf_find(
for (i = 0; i < nmaps; i++)
numblks += map[i].bm_len;
- numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
/*
@@ -1532,9 +1530,10 @@ xfs_wait_buftarg(
list_del_init(&bp->b_lru);
if (bp->b_flags & XBF_WRITE_FAIL) {
xfs_alert(btp->bt_mount,
-"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
-"Please run xfs_repair to determine the extent of the problem.",
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!",
(long long)bp->b_bn);
+ xfs_alert(btp->bt_mount,
+"Please run xfs_repair to determine the extent of the problem.");
}
xfs_buf_rele(bp);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 331c1ccf8264..c79b717d9b88 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -23,6 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 092d652bc03d..7e986da34f6c 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -647,11 +647,7 @@ xfs_buf_item_unlock(
xfs_buf_item_relse(bp);
else if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
- if (lip->li_flags & XFS_LI_IN_AIL) {
- spin_lock(&lip->li_ailp->xa_lock);
- xfs_trans_ail_delete(lip->li_ailp, lip,
- SHUTDOWN_LOG_IO_ERROR);
- }
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
xfs_buf_item_relse(bp);
}
}
@@ -750,13 +746,13 @@ xfs_buf_item_free_format(
* buffer (see xfs_buf_attach_iodone() below), then put the
* buf log item at the front.
*/
-void
+int
xfs_buf_item_init(
- xfs_buf_t *bp,
- xfs_mount_t *mp)
+ struct xfs_buf *bp,
+ struct xfs_mount *mp)
{
- xfs_log_item_t *lip = bp->b_fspriv;
- xfs_buf_log_item_t *bip;
+ struct xfs_log_item *lip = bp->b_fspriv;
+ struct xfs_buf_log_item *bip;
int chunks;
int map_size;
int error;
@@ -770,12 +766,11 @@ xfs_buf_item_init(
*/
ASSERT(bp->b_target->bt_mount == mp);
if (lip != NULL && lip->li_type == XFS_LI_BUF)
- return;
+ return 0;
bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP);
xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops);
bip->bli_buf = bp;
- xfs_buf_hold(bp);
/*
* chunks is the number of XFS_BLF_CHUNK size pieces the buffer
@@ -788,6 +783,11 @@ xfs_buf_item_init(
*/
error = xfs_buf_item_get_format(bip, bp->b_map_count);
ASSERT(error == 0);
+ if (error) { /* to stop gcc throwing set-but-unused warnings */
+ kmem_zone_free(xfs_buf_item_zone, bip);
+ return error;
+ }
+
for (i = 0; i < bip->bli_format_count; i++) {
chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
@@ -807,6 +807,8 @@ xfs_buf_item_init(
if (bp->b_fspriv)
bip->bli_item.li_bio_list = bp->b_fspriv;
bp->b_fspriv = bip;
+ xfs_buf_hold(bp);
+ return 0;
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 3f3455a41510..f7eba99d19dd 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -61,7 +61,7 @@ typedef struct xfs_buf_log_item {
struct xfs_buf_log_format __bli_format; /* embedded in-log header */
} xfs_buf_log_item_t;
-void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 098cd78fe708..a989a9c7edb7 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -171,6 +171,7 @@ xfs_dir2_block_getdents(
int wantoff; /* starting block offset */
xfs_off_t cook;
struct xfs_da_geometry *geo = args->geo;
+ int lock_mode;
/*
* If the block number in the offset is out of range, we're done.
@@ -178,7 +179,9 @@ xfs_dir2_block_getdents(
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
+ lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir3_block_read(NULL, dp, &bp);
+ xfs_iunlock(dp, lock_mode);
if (error)
return error;
@@ -529,9 +532,12 @@ xfs_dir2_leaf_getdents(
* current buffer, need to get another one.
*/
if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
+ int lock_mode;
+ lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
&curoff, &bp);
+ xfs_iunlock(dp, lock_mode);
if (error || !map_info->map_valid)
break;
@@ -653,7 +659,6 @@ xfs_readdir(
struct xfs_da_args args = { NULL };
int rval;
int v;
- uint lock_mode;
trace_xfs_readdir(dp);
@@ -666,7 +671,7 @@ xfs_readdir(
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
- lock_mode = xfs_ilock_data_map_shared(dp);
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_getdents(&args, ctx);
else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -675,7 +680,7 @@ xfs_readdir(
rval = xfs_dir2_block_getdents(&args, ctx);
else
rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
- xfs_iunlock(dp, lock_mode);
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return rval;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 4143dc75dca4..30cb3afb67f0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -251,7 +251,7 @@ xfs_qm_init_dquot_blk(
d->dd_diskdq.d_id = cpu_to_be32(curid);
d->dd_diskdq.d_flags = type;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
@@ -954,12 +954,8 @@ xfs_qm_dqflush(
struct xfs_log_item *lip = &dqp->q_logitem.qli_item;
dqp->dq_flags &= ~XFS_DQ_DIRTY;
- spin_lock(&mp->m_ail->xa_lock);
- if (lip->li_flags & XFS_LI_IN_AIL)
- xfs_trans_ail_delete(mp->m_ail, lip,
- SHUTDOWN_CORRUPT_INCORE);
- else
- spin_unlock(&mp->m_ail->xa_lock);
+ xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE);
+
error = -EIO;
goto out_unlock;
}
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index adc8f8fdd145..4aa0153214f9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -47,28 +47,6 @@ xfs_efi_item_free(
}
/*
- * Freeing the efi requires that we remove it from the AIL if it has already
- * been placed there. However, the EFI may not yet have been placed in the AIL
- * when called by xfs_efi_release() from EFD processing due to the ordering of
- * committed vs unpin operations in bulk insert operations. Hence the reference
- * count to ensure only the last caller frees the EFI.
- */
-STATIC void
-__xfs_efi_release(
- struct xfs_efi_log_item *efip)
-{
- struct xfs_ail *ailp = efip->efi_item.li_ailp;
-
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- spin_lock(&ailp->xa_lock);
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, &efip->efi_item,
- SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
-}
-
-/*
* This returns the number of iovecs needed to log the given efi item.
* We only need 1 iovec for an efi item. It just logs the efi_log_format
* structure.
@@ -128,12 +106,12 @@ xfs_efi_item_pin(
}
/*
- * While EFIs cannot really be pinned, the unpin operation is the last place at
- * which the EFI is manipulated during a transaction. If we are being asked to
- * remove the EFI it's because the transaction has been cancelled and by
- * definition that means the EFI cannot be in the AIL so remove it from the
- * transaction and free it. Otherwise coordinate with xfs_efi_release()
- * to determine who gets to free the EFI.
+ * The unpin operation is the last place an EFI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the EFI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the EFI to either construct
+ * and commit the EFD or drop the EFD's reference in the event of error. Simply
+ * drop the log's EFI reference now that the log is done with it.
*/
STATIC void
xfs_efi_item_unpin(
@@ -141,15 +119,7 @@ xfs_efi_item_unpin(
int remove)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
-
- if (remove) {
- ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
- if (lip->li_desc)
- xfs_trans_del_item(lip);
- xfs_efi_item_free(efip);
- return;
- }
- __xfs_efi_release(efip);
+ xfs_efi_release(efip);
}
/*
@@ -167,6 +137,11 @@ xfs_efi_item_push(
return XFS_ITEM_PINNED;
}
+/*
+ * The EFI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an EFD isn't going to be
+ * constructed and thus we free the EFI here directly.
+ */
STATIC void
xfs_efi_item_unlock(
struct xfs_log_item *lip)
@@ -301,23 +276,19 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
/*
- * This is called by the efd item code below to release references to the given
- * efi item. Each efd calls this with the number of extents that it has
- * logged, and when the sum of these reaches the total number of extents logged
- * by this efi item we can free the efi item.
+ * Freeing the efi requires that we remove it from the AIL if it has already
+ * been placed there. However, the EFI may not yet have been placed in the AIL
+ * when called by xfs_efi_release() from EFD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the EFI.
*/
void
-xfs_efi_release(xfs_efi_log_item_t *efip,
- uint nextents)
+xfs_efi_release(
+ struct xfs_efi_log_item *efip)
{
- ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
- if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) {
- /* recovery needs us to drop the EFI reference, too */
- if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
- __xfs_efi_release(efip);
-
- __xfs_efi_release(efip);
- /* efip may now have been freed, do not reference it again. */
+ if (atomic_dec_and_test(&efip->efi_refcount)) {
+ xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_efi_item_free(efip);
}
}
@@ -415,20 +386,27 @@ xfs_efd_item_push(
return XFS_ITEM_PINNED;
}
+/*
+ * The EFD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the EFI and free the EFD.
+ */
STATIC void
xfs_efd_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
- xfs_efd_item_free(EFD_ITEM(lip));
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+ if (lip->li_flags & XFS_LI_ABORTED) {
+ xfs_efi_release(efdp->efd_efip);
+ xfs_efd_item_free(efdp);
+ }
}
/*
- * When the efd item is committed to disk, all we need to do
- * is delete our reference to our partner efi item and then
- * free ourselves. Since we're freeing ourselves we must
- * return -1 to keep the transaction code from further referencing
- * this item.
+ * When the efd item is committed to disk, all we need to do is delete our
+ * reference to our partner efi item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from further
+ * referencing this item.
*/
STATIC xfs_lsn_t
xfs_efd_item_committed(
@@ -438,13 +416,14 @@ xfs_efd_item_committed(
struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
/*
- * If we got a log I/O error, it's always the case that the LR with the
- * EFI got unpinned and freed before the EFD got aborted.
+ * Drop the EFI reference regardless of whether the EFD has been
+ * aborted. Once the EFD transaction is constructed, it is the sole
+ * responsibility of the EFD to release the EFI (even if the EFI is
+ * aborted due to log I/O error).
*/
- if (!(lip->li_flags & XFS_LI_ABORTED))
- xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents);
-
+ xfs_efi_release(efdp->efd_efip);
xfs_efd_item_free(efdp);
+
return (xfs_lsn_t)-1;
}
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0ffbce32d569..8fa8651705e1 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -39,9 +39,28 @@ struct kmem_zone;
* "extent free done" log item described below.
*
* The EFI is reference counted so that it is not freed prior to both the EFI
- * and EFD being committed and unpinned. This ensures that when the last
- * reference goes away the EFI will always be in the AIL as it has been
- * unpinned, regardless of whether the EFD is processed before or after the EFI.
+ * and EFD being committed and unpinned. This ensures the EFI is inserted into
+ * the AIL even in the event of out of order EFI/EFD processing. In other words,
+ * an EFI is born with two references:
+ *
+ * 1.) an EFI held reference to track EFI AIL insertion
+ * 2.) an EFD held reference to track EFD commit
+ *
+ * On allocation, both references are the responsibility of the caller. Once the
+ * EFI is added to and dirtied in a transaction, ownership of reference one
+ * transfers to the transaction. The reference is dropped once the EFI is
+ * inserted to the AIL or in the event of failure along the way (e.g., commit
+ * failure, log I/O error, etc.). Note that the caller remains responsible for
+ * the EFD reference under all circumstances to this point. The caller has no
+ * means to detect failure once the transaction is committed, however.
+ * Therefore, an EFD is required after this point, even in the event of
+ * unrelated failure.
+ *
+ * Once an EFD is allocated and dirtied in a transaction, reference two
+ * transfers to the transaction. The EFD reference is dropped once it reaches
+ * the unpin handler. Similar to the EFI, the reference also drops in the event
+ * of commit failure or log I/O errors. Note that the EFD is not inserted in the
+ * AIL, so at this point both the EFI and EFD are freed.
*/
typedef struct xfs_efi_log_item {
xfs_log_item_t efi_item;
@@ -77,5 +96,6 @@ xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
int xfs_efi_copy_format(xfs_log_iovec_t *buf,
xfs_efi_log_format_t *dst_efi_fmt);
void xfs_efi_item_free(xfs_efi_log_item_t *);
+void xfs_efi_release(struct xfs_efi_log_item *);
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index db4acc1c3e73..e78feb400e22 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -317,24 +317,33 @@ xfs_file_read_iter(
return -EIO;
/*
- * Locking is a bit tricky here. If we take an exclusive lock
- * for direct IO, we effectively serialise all new concurrent
- * read IO to this file and block it behind IO that is currently in
- * progress because IO in progress holds the IO lock shared. We only
- * need to hold the lock exclusive to blow away the page cache, so
- * only take lock exclusively if the page cache needs invalidation.
- * This allows the normal direct IO case of no page cache pages to
- * proceeed concurrently without serialisation.
+ * Locking is a bit tricky here. If we take an exclusive lock for direct
+ * IO, we effectively serialise all new concurrent read IO to this file
+ * and block it behind IO that is currently in progress because IO in
+ * progress holds the IO lock shared. We only need to hold the lock
+ * exclusive to blow away the page cache, so only take lock exclusively
+ * if the page cache needs invalidation. This allows the normal direct
+ * IO case of no page cache pages to proceeed concurrently without
+ * serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+ /*
+ * The generic dio code only flushes the range of the particular
+ * I/O. Because we take an exclusive lock here, this whole
+ * sequence is considerably more expensive for us. This has a
+ * noticeable performance impact for any file with cached pages,
+ * even when outside of the range of the particular I/O.
+ *
+ * Hence, amortize the cost of the lock against a full file
+ * flush and reduce the chances of repeated iolock cycles going
+ * forward.
+ */
if (inode->i_mapping->nrpages) {
- ret = filemap_write_and_wait_range(
- VFS_I(ip)->i_mapping,
- pos, pos + size - 1);
+ ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
@@ -345,9 +354,7 @@ xfs_file_read_iter(
* we fail to invalidate a page, but this should never
* happen on XFS. Warn if it does fail.
*/
- ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + size - 1) >> PAGE_CACHE_SHIFT);
+ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
@@ -733,19 +740,19 @@ xfs_file_dio_aio_write(
pos = iocb->ki_pos;
end = pos + count - 1;
+ /*
+ * See xfs_file_read_iter() for why we do a full-file flush here.
+ */
if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, end);
+ ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (ret)
goto out;
/*
- * Invalidate whole pages. This can return an error if
- * we fail to invalidate a page, but this should never
- * happen on XFS. Warn if it does fail.
+ * Invalidate whole pages. This can return an error if we fail
+ * to invalidate a page, but this should never happen on XFS.
+ * Warn if it does fail.
*/
- ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- end >> PAGE_CACHE_SHIFT);
+ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
@@ -1539,8 +1546,36 @@ xfs_filemap_fault(
return ret;
}
+STATIC int
+xfs_filemap_pmd_fault(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret;
+
+ if (!IS_DAX(inode))
+ return VM_FAULT_FALLBACK;
+
+ trace_xfs_filemap_pmd_fault(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
+ .pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
};
@@ -1553,7 +1588,7 @@ xfs_file_mmap(
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 9b3438a7680f..ee3aaa0a5317 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -250,7 +250,7 @@ xfs_growfs_data_private(
agf->agf_freeblks = cpu_to_be32(tmpsize);
agf->agf_longest = cpu_to_be32(tmpsize);
if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
@@ -273,7 +273,7 @@ xfs_growfs_data_private(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
agfl->agfl_seqno = cpu_to_be32(agno);
- uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
}
agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
@@ -309,7 +309,7 @@ xfs_growfs_data_private(
agi->agi_newino = cpu_to_be32(NULLAGINO);
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
agi->agi_free_level = cpu_to_be32(1);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 76a9f2783282..0a326bd64d4e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -412,6 +412,8 @@ xfs_iget(
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return -EINVAL;
+ XFS_STATS_INC(xs_ig_attempts);
+
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
agino = XFS_INO_TO_AGINO(mp, ino);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3da9f4da4f3d..dc40a6d5ae0d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -164,7 +164,7 @@ xfs_ilock(
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
if (lock_flags & XFS_IOLOCK_EXCL)
mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
@@ -212,7 +212,7 @@ xfs_ilock_nowait(
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
if (lock_flags & XFS_IOLOCK_EXCL) {
if (!mrtryupdate(&ip->i_iolock))
@@ -281,7 +281,7 @@ xfs_iunlock(
(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
ASSERT(lock_flags != 0);
if (lock_flags & XFS_IOLOCK_EXCL)
@@ -363,31 +363,57 @@ int xfs_lock_delays;
#endif
/*
+ * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
+ * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
+ * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
+ * errors and warnings.
+ */
+#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
+static bool
+xfs_lockdep_subclass_ok(
+ int subclass)
+{
+ return subclass < MAX_LOCKDEP_SUBCLASSES;
+}
+#else
+#define xfs_lockdep_subclass_ok(subclass) (true)
+#endif
+
+/*
* Bump the subclass so xfs_lock_inodes() acquires each lock with a different
- * value. This shouldn't be called for page fault locking, but we also need to
- * ensure we don't overrun the number of lockdep subclasses for the iolock or
- * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
+ * value. This can be called for any type of inode lock combination, including
+ * parent locking. Care must be taken to ensure we don't overrun the subclass
+ * storage fields in the class mask we build.
*/
static inline int
xfs_lock_inumorder(int lock_mode, int subclass)
{
+ int class = 0;
+
+ ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
+ XFS_ILOCK_RTSUM)));
+ ASSERT(xfs_lockdep_subclass_ok(subclass));
+
if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
- ASSERT(subclass + XFS_LOCK_INUMORDER <
- (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
- lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+ ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
+ ASSERT(xfs_lockdep_subclass_ok(subclass +
+ XFS_IOLOCK_PARENT_VAL));
+ class += subclass << XFS_IOLOCK_SHIFT;
+ if (lock_mode & XFS_IOLOCK_PARENT)
+ class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
}
if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
- ASSERT(subclass + XFS_LOCK_INUMORDER <
- (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
- lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
- XFS_MMAPLOCK_SHIFT;
+ ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
+ class += subclass << XFS_MMAPLOCK_SHIFT;
}
- if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
- lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
+ if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
+ ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
+ class += subclass << XFS_ILOCK_SHIFT;
+ }
- return lock_mode;
+ return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}
/*
@@ -399,6 +425,11 @@ xfs_lock_inumorder(int lock_mode, int subclass)
* transaction (such as truncate). This can result in deadlock since the long
* running trans might need to wait for the inode we just locked in order to
* push the tail and free space in the log.
+ *
+ * xfs_lock_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
*/
void
xfs_lock_inodes(
@@ -409,8 +440,29 @@ xfs_lock_inodes(
int attempts = 0, i, j, try_lock;
xfs_log_item_t *lp;
- /* currently supports between 2 and 5 inodes */
+ /*
+ * Currently supports between 2 and 5 inodes with exclusive locking. We
+ * support an arbitrary depth of locking here, but absolute limits on
+ * inodes depend on the the type of locking and the limits placed by
+ * lockdep annotations in xfs_lock_inumorder. These are all checked by
+ * the asserts.
+ */
ASSERT(ips && inodes >= 2 && inodes <= 5);
+ ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
+ XFS_ILOCK_EXCL));
+ ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
+ XFS_ILOCK_SHARED)));
+ ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
+ inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
+ ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
+ inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
+ ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
+ inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);
+
+ if (lock_mode & XFS_IOLOCK_EXCL) {
+ ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
+ } else if (lock_mode & XFS_MMAPLOCK_EXCL)
+ ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
try_lock = 0;
i = 0;
@@ -629,30 +681,29 @@ xfs_lookup(
{
xfs_ino_t inum;
int error;
- uint lock_mode;
trace_xfs_lookup(dp, name);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
- lock_mode = xfs_ilock_data_map_shared(dp);
+ xfs_ilock(dp, XFS_IOLOCK_SHARED);
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
- xfs_iunlock(dp, lock_mode);
-
if (error)
- goto out;
+ goto out_unlock;
error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
if (error)
goto out_free_name;
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return 0;
out_free_name:
if (ci_name)
kmem_free(ci_name->name);
-out:
+out_unlock:
+ xfs_iunlock(dp, XFS_IOLOCK_SHARED);
*ipp = NULL;
return error;
}
@@ -787,7 +838,7 @@ xfs_ialloc(
if (ip->i_d.di_version == 3) {
ASSERT(ip->i_d.di_ino == ino);
- ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
+ ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid));
ip->i_d.di_crc = 0;
ip->i_d.di_changecount = 1;
ip->i_d.di_lsn = 0;
@@ -1149,7 +1200,8 @@ xfs_create(
goto out_trans_cancel;
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+ XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
xfs_bmap_init(&free_list, &first_block);
@@ -1175,11 +1227,8 @@ xfs_create(
*/
error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
prid, resblks > 0, &ip, &committed);
- if (error) {
- if (error == -ENOSPC)
- goto out_trans_cancel;
+ if (error)
goto out_trans_cancel;
- }
/*
* Now we join the directory inode to the transaction. We do not do it
@@ -1188,7 +1237,7 @@ xfs_create(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1261,7 +1310,7 @@ xfs_create(
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return error;
}
@@ -1318,11 +1367,8 @@ xfs_create_tmpfile(
error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
prid, resblks > 0, &ip, NULL);
- if (error) {
- if (error == -ENOSPC)
- goto out_trans_cancel;
+ if (error)
goto out_trans_cancel;
- }
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -1409,10 +1455,11 @@ xfs_link(
if (error)
goto error_return;
+ xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
/*
* If we are using project inheritance, we only allow hard link
@@ -1791,14 +1838,15 @@ xfs_inactive_ifree(
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
/*
- * Just ignore errors at this point. There is nothing we can
- * do except to try to keep going. Make sure it's not a silent
- * error.
+ * Just ignore errors at this point. There is nothing we can do except
+ * to try to keep going. Make sure it's not a silent error.
*/
error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error)
+ if (error) {
xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
__func__, error);
+ xfs_bmap_cancel(&free_list);
+ }
error = xfs_trans_commit(tp);
if (error)
xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
@@ -2515,9 +2563,10 @@ xfs_remove(
goto out_trans_cancel;
}
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
@@ -2898,6 +2947,12 @@ xfs_rename(
* whether the target directory is the same as the source
* directory, we can lock from 2 to 4 inodes.
*/
+ if (!new_parent)
+ xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+ else
+ xfs_lock_two_inodes(src_dp, target_dp,
+ XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
+
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
@@ -2905,9 +2960,9 @@ xfs_rename(
* we can rely on either trans_commit or trans_cancel to unlock
* them.
*/
- xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
if (new_parent)
- xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
if (target_ip)
xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8f22d20368d8..ca9e11989cbd 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -284,9 +284,9 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
* Flags for lockdep annotations.
*
* XFS_LOCK_PARENT - for directory operations that require locking a
- * parent directory inode and a child entry inode. The parent gets locked
- * with this flag so it gets a lockdep subclass of 1 and the child entry
- * lock will have a lockdep subclass of 0.
+ * parent directory inode and a child entry inode. IOLOCK requires nesting,
+ * MMAPLOCK does not support this class, ILOCK requires a single subclass
+ * to differentiate parent from child.
*
* XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
* inodes do not participate in the normal lock order, and thus have their
@@ -295,30 +295,63 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
* XFS_LOCK_INUMORDER - for locking several inodes at the some time
* with xfs_lock_inodes(). This flag is used as the starting subclass
* and each subsequent lock acquired will increment the subclass by one.
- * So the first lock acquired will have a lockdep subclass of 4, the
- * second lock will have a lockdep subclass of 5, and so on. It is
- * the responsibility of the class builder to shift this to the correct
- * portion of the lock_mode lockdep mask.
+ * However, MAX_LOCKDEP_SUBCLASSES == 8, which means we are greatly
+ * limited to the subclasses we can represent via nesting. We need at least
+ * 5 inodes nest depth for the ILOCK through rename, and we also have to support
+ * XFS_ILOCK_PARENT, which gives 6 subclasses. Then we have XFS_ILOCK_RTBITMAP
+ * and XFS_ILOCK_RTSUM, which are another 2 unique subclasses, so that's all
+ * 8 subclasses supported by lockdep.
+ *
+ * This also means we have to number the sub-classes in the lowest bits of
+ * the mask we keep, and we have to ensure we never exceed 3 bits of lockdep
+ * mask and we can't use bit-masking to build the subclasses. What a mess.
+ *
+ * Bit layout:
+ *
+ * Bit Lock Region
+ * 16-19 XFS_IOLOCK_SHIFT dependencies
+ * 20-23 XFS_MMAPLOCK_SHIFT dependencies
+ * 24-31 XFS_ILOCK_SHIFT dependencies
+ *
+ * IOLOCK values
+ *
+ * 0-3 subclass value
+ * 4-7 PARENT subclass values
+ *
+ * MMAPLOCK values
+ *
+ * 0-3 subclass value
+ * 4-7 unused
+ *
+ * ILOCK values
+ * 0-4 subclass values
+ * 5 PARENT subclass (not nestable)
+ * 6 RTBITMAP subclass (not nestable)
+ * 7 RTSUM subclass (not nestable)
+ *
*/
-#define XFS_LOCK_PARENT 1
-#define XFS_LOCK_RTBITMAP 2
-#define XFS_LOCK_RTSUM 3
-#define XFS_LOCK_INUMORDER 4
-
-#define XFS_IOLOCK_SHIFT 16
-#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
-
-#define XFS_MMAPLOCK_SHIFT 20
-
-#define XFS_ILOCK_SHIFT 24
-#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
-#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
-#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
-
-#define XFS_IOLOCK_DEP_MASK 0x000f0000
-#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
-#define XFS_ILOCK_DEP_MASK 0xff000000
-#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \
+#define XFS_IOLOCK_SHIFT 16
+#define XFS_IOLOCK_PARENT_VAL 4
+#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1)
+#define XFS_IOLOCK_DEP_MASK 0x000f0000
+#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT)
+
+#define XFS_MMAPLOCK_SHIFT 20
+#define XFS_MMAPLOCK_NUMORDER 0
+#define XFS_MMAPLOCK_MAX_SUBCLASS 3
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
+
+#define XFS_ILOCK_SHIFT 24
+#define XFS_ILOCK_PARENT_VAL 5
+#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
+#define XFS_ILOCK_RTBITMAP_VAL 6
+#define XFS_ILOCK_RTSUM_VAL 7
+#define XFS_ILOCK_DEP_MASK 0xff000000
+#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
+#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
+#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
+
+#define XFS_LOCK_SUBCLASS_MASK (XFS_IOLOCK_DEP_MASK | \
XFS_MMAPLOCK_DEP_MASK | \
XFS_ILOCK_DEP_MASK)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index bf13a5a7e2f4..62bd80f4edd9 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -703,17 +703,10 @@ xfs_iflush_abort(
xfs_inode_log_item_t *iip = ip->i_itemp;
if (iip) {
- struct xfs_ail *ailp = iip->ili_item.li_ailp;
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
- spin_lock(&ailp->xa_lock);
- if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
- /* xfs_trans_ail_delete() drops the AIL lock. */
- xfs_trans_ail_delete(ailp, &iip->ili_item,
- stale ?
- SHUTDOWN_LOG_IO_ERROR :
+ xfs_trans_ail_remove(&iip->ili_item,
+ stale ? SHUTDOWN_LOG_IO_ERROR :
SHUTDOWN_CORRUPT_INCORE);
- } else
- spin_unlock(&ailp->xa_lock);
}
iip->ili_logged = 0;
/*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 766b23f86ce9..8294132e6a3c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -609,7 +609,7 @@ xfs_setattr_nonsize(
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
if (error)
- goto out_dqrele;
+ goto out_trans_cancel;
xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -640,7 +640,7 @@ xfs_setattr_nonsize(
NULL, capable(CAP_FOWNER) ?
XFS_QMOPT_FORCE_RES : 0);
if (error) /* out of quota */
- goto out_trans_cancel;
+ goto out_unlock;
}
}
@@ -729,10 +729,10 @@ xfs_setattr_nonsize(
return 0;
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
out_trans_cancel:
xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_dqrele:
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
return error;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f41b0c3fddab..930ebd86beba 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -473,7 +473,8 @@ xfs_bulkstat(
* pending error, then we are done.
*/
del_cursor:
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, error ?
+ XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
xfs_buf_relse(agbp);
if (error)
break;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 08d4fe46f0fa..aaadee0969c9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -668,9 +668,9 @@ xfs_log_mount(
ASSERT(0);
goto out_free_log;
}
+ xfs_crit(mp, "Log size out of supported range.");
xfs_crit(mp,
-"Log size out of supported range. Continuing onwards, but if log hangs are\n"
-"experienced then please report this message in the bug report.");
+"Continuing onwards, but if log hangs are experienced then please report this message in the bug report.");
}
/*
@@ -700,6 +700,7 @@ xfs_log_mount(
if (error) {
xfs_warn(mp, "log mount/recovery failed: error %d",
error);
+ xlog_recover_cancel(mp->m_log);
goto out_destroy_ail;
}
}
@@ -740,18 +741,35 @@ out:
* it.
*/
int
-xfs_log_mount_finish(xfs_mount_t *mp)
+xfs_log_mount_finish(
+ struct xfs_mount *mp)
{
int error = 0;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
- error = xlog_recover_finish(mp->m_log);
- if (!error)
- xfs_log_work_queue(mp);
- } else {
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ return 0;
}
+ error = xlog_recover_finish(mp->m_log);
+ if (!error)
+ xfs_log_work_queue(mp);
+
+ return error;
+}
+
+/*
+ * The mount has failed. Cancel the recovery if it hasn't completed and destroy
+ * the log.
+ */
+int
+xfs_log_mount_cancel(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = xlog_recover_cancel(mp->m_log);
+ xfs_log_unmount(mp);
return error;
}
@@ -1142,11 +1160,13 @@ xlog_space_left(
* In this case we just want to return the size of the
* log as the amount of space left.
*/
+ xfs_alert(log->l_mp, "xlog_space_left: head behind tail");
xfs_alert(log->l_mp,
- "xlog_space_left: head behind tail\n"
- " tail_cycle = %d, tail_bytes = %d\n"
- " GH cycle = %d, GH bytes = %d",
- tail_cycle, tail_bytes, head_cycle, head_bytes);
+ " tail_cycle = %d, tail_bytes = %d",
+ tail_cycle, tail_bytes);
+ xfs_alert(log->l_mp,
+ " GH cycle = %d, GH bytes = %d",
+ head_cycle, head_bytes);
ASSERT(0);
free_bytes = log->l_logsize;
}
@@ -1652,8 +1672,13 @@ xlog_cksum(
if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
int i;
+ int xheads;
+
+ xheads = size / XLOG_HEADER_CYCLE_SIZE;
+ if (size % XLOG_HEADER_CYCLE_SIZE)
+ xheads++;
- for (i = 1; i < log->l_iclog_heads; i++) {
+ for (i = 1; i < xheads; i++) {
crc = crc32c(crc, &xhdr[i].hic_xheader,
sizeof(struct xlog_rec_ext_header));
}
@@ -2028,26 +2053,24 @@ xlog_print_tic_res(
"SWAPEXT"
};
- xfs_warn(mp,
- "xlog_write: reservation summary:\n"
- " trans type = %s (%u)\n"
- " unit res = %d bytes\n"
- " current res = %d bytes\n"
- " total reg = %u bytes (o/flow = %u bytes)\n"
- " ophdrs = %u (ophdr space = %u bytes)\n"
- " ophdr + reg = %u bytes\n"
- " num regions = %u",
- ((ticket->t_trans_type <= 0 ||
- ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+ xfs_warn(mp, "xlog_write: reservation summary:");
+ xfs_warn(mp, " trans type = %s (%u)",
+ ((ticket->t_trans_type <= 0 ||
+ ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
"bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
- ticket->t_trans_type,
- ticket->t_unit_res,
- ticket->t_curr_res,
- ticket->t_res_arr_sum, ticket->t_res_o_flow,
- ticket->t_res_num_ophdrs, ophdr_spc,
- ticket->t_res_arr_sum +
- ticket->t_res_o_flow + ophdr_spc,
- ticket->t_res_num);
+ ticket->t_trans_type);
+ xfs_warn(mp, " unit res = %d bytes",
+ ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes",
+ ticket->t_curr_res);
+ xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
+ ticket->t_res_arr_sum, ticket->t_res_o_flow);
+ xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
+ ticket->t_res_num_ophdrs, ophdr_spc);
+ xfs_warn(mp, " ophdr + reg = %u bytes",
+ ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
+ xfs_warn(mp, " num regions = %u",
+ ticket->t_res_num);
for (i = 0; i < ticket->t_res_num; i++) {
uint r_type = ticket->t_res_arr[i].r_type;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index fa27aaec72cb..09d91d3166cd 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -147,6 +147,7 @@ int xfs_log_mount(struct xfs_mount *mp,
xfs_daddr_t start_block,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
+int xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index abc2ccbff739..4e7649351f5a 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -307,7 +307,13 @@ xlog_cil_insert_items(
if (!(lidp->lid_flags & XFS_LID_DIRTY))
continue;
- list_move_tail(&lip->li_cil, &cil->xc_cil);
+ /*
+ * Only move the item if it isn't already at the tail. This is
+ * to prevent a transient list_empty() state when reinserting
+ * an item that is already the only item in the CIL.
+ */
+ if (!list_is_last(&lip->li_cil, &cil->xc_cil))
+ list_move_tail(&lip->li_cil, &cil->xc_cil);
}
/* account for space used by new iovec headers */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1c87c8abfbed..950f3f94720c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -426,6 +426,8 @@ xlog_recover(
extern int
xlog_recover_finish(
struct xlog *log);
+extern int
+xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 480ebba8464f..512a0945d52a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1895,15 +1895,25 @@ xlog_recover_get_buf_lsn(
*/
goto recover_immediately;
case XFS_SB_MAGIC:
+ /*
+ * superblock uuids are magic. We may or may not have a
+ * sb_meta_uuid on disk, but it will be set in the in-core
+ * superblock. We set the uuid pointer for verification
+ * according to the superblock feature mask to ensure we check
+ * the relevant UUID in the superblock.
+ */
lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
- uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ if (xfs_sb_version_hasmetauuid(&mp->m_sb))
+ uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
+ else
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
break;
default:
break;
}
if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
goto recover_immediately;
return lsn;
}
@@ -2933,16 +2943,16 @@ xlog_recover_efi_pass2(
struct xlog_recover_item *item,
xfs_lsn_t lsn)
{
- int error;
- xfs_mount_t *mp = log->l_mp;
- xfs_efi_log_item_t *efip;
- xfs_efi_log_format_t *efi_formatp;
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_efi_log_format *efi_formatp;
efi_formatp = item->ri_buf[0].i_addr;
efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
- if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
- &(efip->efi_format)))) {
+ error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
+ if (error) {
xfs_efi_item_free(efip);
return error;
}
@@ -2950,20 +2960,23 @@ xlog_recover_efi_pass2(
spin_lock(&log->l_ailp->xa_lock);
/*
- * xfs_trans_ail_update() drops the AIL lock.
+ * The EFI has two references. One for the EFD and one for EFI to ensure
+ * it makes it into the AIL. Insert the EFI into the AIL directly and
+ * drop the EFI reference. Note that xfs_trans_ail_update() drops the
+ * AIL lock.
*/
xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
+ xfs_efi_release(efip);
return 0;
}
/*
- * This routine is called when an efd format structure is found in
- * a committed transaction in the log. It's purpose is to cancel
- * the corresponding efi if it was still in the log. To do this
- * it searches the AIL for the efi with an id equal to that in the
- * efd format structure. If we find it, we remove the efi from the
- * AIL and free it.
+ * This routine is called when an EFD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding EFI if it
+ * was still in the log. To do this it searches the AIL for the EFI with an id
+ * equal to that in the EFD format structure. If we find it we drop the EFD
+ * reference, which removes the EFI from the AIL and frees it.
*/
STATIC int
xlog_recover_efd_pass2(
@@ -2985,8 +2998,8 @@ xlog_recover_efd_pass2(
efi_id = efd_formatp->efd_efi_id;
/*
- * Search for the efi with the id in the efd format structure
- * in the AIL.
+ * Search for the EFI with the id in the EFD format structure in the
+ * AIL.
*/
spin_lock(&ailp->xa_lock);
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
@@ -2995,18 +3008,18 @@ xlog_recover_efd_pass2(
efip = (xfs_efi_log_item_t *)lip;
if (efip->efi_format.efi_id == efi_id) {
/*
- * xfs_trans_ail_delete() drops the
- * AIL lock.
+ * Drop the EFD reference to the EFI. This
+ * removes the EFI from the AIL and frees it.
*/
- xfs_trans_ail_delete(ailp, lip,
- SHUTDOWN_CORRUPT_INCORE);
- xfs_efi_item_free(efip);
+ spin_unlock(&ailp->xa_lock);
+ xfs_efi_release(efip);
spin_lock(&ailp->xa_lock);
break;
}
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
@@ -3034,6 +3047,11 @@ xlog_recover_do_icreate_pass2(
unsigned int count;
unsigned int isize;
xfs_agblock_t length;
+ int blks_per_cluster;
+ int bb_per_cluster;
+ int cancel_count;
+ int nbufs;
+ int i;
icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
if (icl->icl_type != XFS_LI_ICREATE) {
@@ -3092,22 +3110,45 @@ xlog_recover_do_icreate_pass2(
}
/*
- * Inode buffers can be freed. Do not replay the inode initialisation as
- * we could be overwriting something written after this inode buffer was
- * cancelled.
+ * The icreate transaction can cover multiple cluster buffers and these
+ * buffers could have been freed and reused. Check the individual
+ * buffers for cancellation so we don't overwrite anything written after
+ * a cancellation.
+ */
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ nbufs = length / blks_per_cluster;
+ for (i = 0, cancel_count = 0; i < nbufs; i++) {
+ xfs_daddr_t daddr;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno,
+ agbno + i * blks_per_cluster);
+ if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
+ cancel_count++;
+ }
+
+ /*
+ * We currently only use icreate for a single allocation at a time. This
+ * means we should expect either all or none of the buffers to be
+ * cancelled. Be conservative and skip replay if at least one buffer is
+ * cancelled, but warn the user that something is awry if the buffers
+ * are not consistent.
*
- * XXX: we need to iterate all buffers and only init those that are not
- * cancelled. I think that a more fine grained factoring of
- * xfs_ialloc_inode_init may be appropriate here to enable this to be
- * done easily.
+ * XXX: This must be refined to only skip cancelled clusters once we use
+ * icreate for multiple chunk allocations.
*/
- if (xlog_check_buffer_cancelled(log,
- XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ ASSERT(!cancel_count || cancel_count == nbufs);
+ if (cancel_count) {
+ if (cancel_count != nbufs)
+ xfs_warn(mp,
+ "WARNING: partial inode chunk cancellation, skipped icreate.");
+ trace_xfs_log_recover_icreate_cancel(log, icl);
return 0;
+ }
- xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length,
- be32_to_cpu(icl->icl_gen));
- return 0;
+ trace_xfs_log_recover_icreate_recover(log, icl);
+ return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
+ length, be32_to_cpu(icl->icl_gen));
}
STATIC void
@@ -3385,14 +3426,24 @@ xlog_recover_add_to_cont_trans(
char *ptr, *old_ptr;
int old_len;
+ /*
+ * If the transaction is empty, the header was split across this and the
+ * previous record. Copy the rest of the header.
+ */
if (list_empty(&trans->r_itemq)) {
- /* finish copying rest of trans header */
+ ASSERT(len < sizeof(struct xfs_trans_header));
+ if (len > sizeof(struct xfs_trans_header)) {
+ xfs_warn(log->l_mp, "%s: bad header length", __func__);
+ return -EIO;
+ }
+
xlog_recover_add_item(&trans->r_itemq);
ptr = (char *)&trans->r_theader +
- sizeof(xfs_trans_header_t) - len;
+ sizeof(struct xfs_trans_header) - len;
memcpy(ptr, dp, len);
return 0;
}
+
/* take the tail entry */
item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
@@ -3441,7 +3492,19 @@ xlog_recover_add_to_trans(
ASSERT(0);
return -EIO;
}
- if (len == sizeof(xfs_trans_header_t))
+
+ if (len > sizeof(struct xfs_trans_header)) {
+ xfs_warn(log->l_mp, "%s: bad header length", __func__);
+ ASSERT(0);
+ return -EIO;
+ }
+
+ /*
+ * The transaction header can be arbitrarily split across op
+ * records. If we don't have the whole thing here, copy what we
+ * do have and handle the rest in the next record.
+ */
+ if (len == sizeof(struct xfs_trans_header))
xlog_recover_add_item(&trans->r_itemq);
memcpy(&trans->r_theader, dp, len);
return 0;
@@ -3744,7 +3807,7 @@ xlog_recover_process_efi(
* free the memory associated with it.
*/
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- xfs_efi_release(efip, efip->efi_format.efi_nextents);
+ xfs_efi_release(efip);
return -EIO;
}
}
@@ -3757,11 +3820,11 @@ xlog_recover_process_efi(
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &(efip->efi_format.efi_extents[i]);
- error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
+ error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
+ extp->ext_len);
if (error)
goto abort_error;
- xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
- extp->ext_len);
+
}
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
@@ -3793,10 +3856,10 @@ abort_error:
*/
STATIC int
xlog_recover_process_efis(
- struct xlog *log)
+ struct xlog *log)
{
- xfs_log_item_t *lip;
- xfs_efi_log_item_t *efip;
+ struct xfs_log_item *lip;
+ struct xfs_efi_log_item *efip;
int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
@@ -3820,7 +3883,7 @@ xlog_recover_process_efis(
/*
* Skip EFIs that we've already processed.
*/
- efip = (xfs_efi_log_item_t *)lip;
+ efip = container_of(lip, struct xfs_efi_log_item, efi_item);
if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
lip = xfs_trans_ail_cursor_next(ailp, &cur);
continue;
@@ -3840,6 +3903,50 @@ out:
}
/*
+ * A cancel occurs when the mount has failed and we're bailing out. Release all
+ * pending EFIs so they don't pin the AIL.
+ */
+STATIC int
+xlog_recover_cancel_efis(
+ struct xlog *log)
+{
+ struct xfs_log_item *lip;
+ struct xfs_efi_log_item *efip;
+ int error = 0;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp;
+
+ ailp = log->l_ailp;
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ /*
+ * We're done when we see something other than an EFI.
+ * There should be no EFIs left in the AIL now.
+ */
+ if (lip->li_type != XFS_LI_EFI) {
+#ifdef DEBUG
+ for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+ ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
+ break;
+ }
+
+ efip = container_of(lip, struct xfs_efi_log_item, efi_item);
+
+ spin_unlock(&ailp->xa_lock);
+ xfs_efi_release(efip);
+ spin_lock(&ailp->xa_lock);
+
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ }
+
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->xa_lock);
+ return error;
+}
+
+/*
* This routine performs a transaction to null out a bad inode pointer
* in an agi unlinked inode hash bucket.
*/
@@ -4532,11 +4639,13 @@ xlog_recover(
xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
xfs_warn(log->l_mp,
-"Superblock has unknown incompatible log features (0x%x) enabled.\n"
-"The log can not be fully and/or safely recovered by this kernel.\n"
-"Please recover the log on a kernel that supports the unknown features.",
+"Superblock has unknown incompatible log features (0x%x) enabled.",
(log->l_mp->m_sb.sb_features_log_incompat &
XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ xfs_warn(log->l_mp,
+"The log can not be fully and/or safely recovered by this kernel.");
+ xfs_warn(log->l_mp,
+"Please recover the log on a kernel that supports the unknown features.");
return -EINVAL;
}
@@ -4612,6 +4721,17 @@ xlog_recover_finish(
return 0;
}
+int
+xlog_recover_cancel(
+ struct xlog *log)
+{
+ int error = 0;
+
+ if (log->l_flags & XLOG_RECOVERY_NEEDED)
+ error = xlog_recover_cancel_efis(log);
+
+ return error;
+}
#if defined(DEBUG)
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 461e791efad7..bf92e0c037c7 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -615,14 +615,14 @@ xfs_default_resblks(xfs_mount_t *mp)
*/
int
xfs_mountfs(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_sb_t *sbp = &(mp->m_sb);
- xfs_inode_t *rip;
- __uint64_t resblks;
- uint quotamount = 0;
- uint quotaflags = 0;
- int error = 0;
+ struct xfs_sb *sbp = &(mp->m_sb);
+ struct xfs_inode *rip;
+ __uint64_t resblks;
+ uint quotamount = 0;
+ uint quotaflags = 0;
+ int error = 0;
xfs_sb_mount_common(mp, sbp);
@@ -799,7 +799,9 @@ xfs_mountfs(
}
/*
- * log's mount-time initialization. Perform 1st part recovery if needed
+ * Log's mount-time initialization. The first part of recovery can place
+ * some items on the AIL, to be handled when recovery is finished or
+ * cancelled.
*/
error = xfs_log_mount(mp, mp->m_logdev_targp,
XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
@@ -910,9 +912,9 @@ xfs_mountfs(
}
/*
- * Finish recovering the file system. This part needed to be
- * delayed until after the root and real-time bitmap inodes
- * were consistently read in.
+ * Finish recovering the file system. This part needed to be delayed
+ * until after the root and real-time bitmap inodes were consistently
+ * read in.
*/
error = xfs_log_mount_finish(mp);
if (error) {
@@ -955,8 +957,10 @@ xfs_mountfs(
xfs_rtunmount_inodes(mp);
out_rele_rip:
IRELE(rip);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
out_log_dealloc:
- xfs_log_unmount(mp);
+ xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_wait_buftarg(mp->m_logdev_targp);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index f4e8c06eee26..ab1bac6a3a1c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -757,31 +757,30 @@ xfs_rtallocate_extent_size(
/*
* Allocate space to the bitmap or summary file, and zero it, for growfs.
*/
-STATIC int /* error */
+STATIC int
xfs_growfs_rt_alloc(
- xfs_mount_t *mp, /* file system mount point */
- xfs_extlen_t oblocks, /* old count of blocks */
- xfs_extlen_t nblocks, /* new count of blocks */
- xfs_inode_t *ip) /* inode (bitmap/summary) */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_extlen_t oblocks, /* old count of blocks */
+ xfs_extlen_t nblocks, /* new count of blocks */
+ struct xfs_inode *ip) /* inode (bitmap/summary) */
{
- xfs_fileoff_t bno; /* block number in file */
- xfs_buf_t *bp; /* temporary buffer for zeroing */
- int committed; /* transaction committed flag */
- xfs_daddr_t d; /* disk block address */
- int error; /* error return value */
- xfs_fsblock_t firstblock; /* first block allocated in xaction */
- xfs_bmap_free_t flist; /* list of freed blocks */
- xfs_fsblock_t fsbno; /* filesystem block for bno */
- xfs_bmbt_irec_t map; /* block map output */
- int nmap; /* number of block maps */
- int resblks; /* space reservation */
+ xfs_fileoff_t bno; /* block number in file */
+ struct xfs_buf *bp; /* temporary buffer for zeroing */
+ int committed; /* transaction committed flag */
+ xfs_daddr_t d; /* disk block address */
+ int error; /* error return value */
+ xfs_fsblock_t firstblock;/* first block allocated in xaction */
+ struct xfs_bmap_free flist; /* list of freed blocks */
+ xfs_fsblock_t fsbno; /* filesystem block for bno */
+ struct xfs_bmbt_irec map; /* block map output */
+ int nmap; /* number of block maps */
+ int resblks; /* space reservation */
+ struct xfs_trans *tp;
/*
* Allocate space to the file, as necessary.
*/
while (oblocks < nblocks) {
- xfs_trans_t *tp;
-
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
/*
@@ -790,7 +789,7 @@ xfs_growfs_rt_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
resblks, 0);
if (error)
- goto error_cancel;
+ goto out_trans_cancel;
/*
* Lock the inode.
*/
@@ -808,16 +807,16 @@ xfs_growfs_rt_alloc(
if (!error && nmap < 1)
error = -ENOSPC;
if (error)
- goto error_cancel;
+ goto out_bmap_cancel;
/*
* Free any blocks freed up in the transaction, then commit.
*/
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
- goto error_cancel;
+ goto out_bmap_cancel;
error = xfs_trans_commit(tp);
if (error)
- goto error;
+ return error;
/*
* Now we need to clear the allocated blocks.
* Do this one block per transaction, to keep it simple.
@@ -832,7 +831,7 @@ xfs_growfs_rt_alloc(
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
0, 0);
if (error)
- goto error_cancel;
+ goto out_trans_cancel;
/*
* Lock the bitmap inode.
*/
@@ -846,9 +845,7 @@ xfs_growfs_rt_alloc(
mp->m_bsize, 0);
if (bp == NULL) {
error = -EIO;
-error_cancel:
- xfs_trans_cancel(tp);
- goto error;
+ goto out_trans_cancel;
}
memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
@@ -857,16 +854,20 @@ error_cancel:
*/
error = xfs_trans_commit(tp);
if (error)
- goto error;
+ return error;
}
/*
* Go on to the next extent, if any.
*/
oblocks = map.br_startoff + map.br_blockcount;
}
+
return 0;
-error:
+out_bmap_cancel:
+ xfs_bmap_cancel(&flist);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1fb16562c159..904f637cfa5f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -261,16 +261,8 @@ xfs_parseargs(
mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
if (!mp->m_rtname)
return -ENOMEM;
- } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
- if (!value || !*value) {
- xfs_warn(mp, "%s option requires an argument",
- this_char);
- return -EINVAL;
- }
- if (kstrtoint(value, 10, &iosize))
- return -EINVAL;
- iosizelog = ffs(iosize) - 1;
- } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
+ } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE) ||
+ !strcmp(this_char, MNTOPT_BIOSIZE)) {
if (!value || !*value) {
xfs_warn(mp, "%s option requires an argument",
this_char);
@@ -511,9 +503,9 @@ xfs_showargs(
seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
if (mp->m_logname)
- seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+ seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname);
if (mp->m_rtname)
- seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+ seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname);
if (mp->m_dalign > 0)
seq_printf(m, "," MNTOPT_SUNIT "=%d",
@@ -1528,6 +1520,10 @@ xfs_fs_fill_super(
}
}
+ if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+ xfs_alert(mp,
+ "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 4be27b0210af..996481eeb491 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -240,7 +240,8 @@ xfs_symlink(
if (error)
goto out_trans_cancel;
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
+ xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
+ XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
/*
@@ -288,7 +289,7 @@ xfs_symlink(
* the transaction cancel unlocking dp so don't do it explicitly in the
* error path.
*/
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
unlock_dp_on_error = false;
/*
@@ -421,7 +422,7 @@ out_release_inode:
xfs_qm_dqrele(pdqp);
if (unlock_dp_on_error)
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
return error;
}
@@ -501,7 +502,7 @@ xfs_inactive_symlink_rmt(
/*
* Unmap the dead block(s) to the free_list.
*/
- error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+ error = xfs_bunmapi(tp, ip, 0, size, 0, nmaps,
&first_block, &free_list, &done);
if (error)
goto error_bmap_cancel;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8d916d33d93d..5ed36b1e04c1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,6 +687,7 @@ DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
@@ -2089,6 +2090,40 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class,
+ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f),
+ TP_ARGS(log, in_f),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(unsigned int, count)
+ __field(unsigned int, isize)
+ __field(xfs_agblock_t, length)
+ __field(unsigned int, gen)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->agno = be32_to_cpu(in_f->icl_ag);
+ __entry->agbno = be32_to_cpu(in_f->icl_agbno);
+ __entry->count = be32_to_cpu(in_f->icl_count);
+ __entry->isize = be32_to_cpu(in_f->icl_isize);
+ __entry->length = be32_to_cpu(in_f->icl_length);
+ __entry->gen = be32_to_cpu(in_f->icl_gen);
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u "
+ "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agbno, __entry->count, __entry->isize,
+ __entry->length, __entry->gen)
+)
+#define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \
+ TP_PROTO(struct xlog *log, struct xfs_icreate_log *in_f), \
+ TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_cancel);
+DEFINE_LOG_RECOVER_ICREATE_ITEM(xfs_log_recover_icreate_recover);
+
DECLARE_EVENT_CLASS(xfs_discard_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 0582a27107d4..a0ab1dae9c31 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1019,9 +1019,10 @@ xfs_trans_cancel(
* chunk we've been working on and get a new transaction to continue.
*/
int
-xfs_trans_roll(
+__xfs_trans_roll(
struct xfs_trans **tpp,
- struct xfs_inode *dp)
+ struct xfs_inode *dp,
+ int *committed)
{
struct xfs_trans *trans;
struct xfs_trans_res tres;
@@ -1052,6 +1053,7 @@ xfs_trans_roll(
if (error)
return error;
+ *committed = 1;
trans = *tpp;
/*
@@ -1074,3 +1076,12 @@ xfs_trans_roll(
xfs_trans_ijoin(trans, dp, 0);
return 0;
}
+
+int
+xfs_trans_roll(
+ struct xfs_trans **tpp,
+ struct xfs_inode *dp)
+{
+ int committed = 0;
+ return __xfs_trans_roll(tpp, dp, &committed);
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 3b21b4e5e467..4643070d7cae 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -213,7 +213,6 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint);
-void xfs_efi_release(struct xfs_efi_log_item *, uint);
void xfs_trans_log_efi_extent(xfs_trans_t *,
struct xfs_efi_log_item *,
xfs_fsblock_t,
@@ -221,11 +220,11 @@ void xfs_trans_log_efi_extent(xfs_trans_t *,
struct xfs_efd_log_item *xfs_trans_get_efd(xfs_trans_t *,
struct xfs_efi_log_item *,
uint);
-void xfs_trans_log_efd_extent(xfs_trans_t *,
- struct xfs_efd_log_item *,
- xfs_fsblock_t,
- xfs_extlen_t);
+int xfs_trans_free_extent(struct xfs_trans *,
+ struct xfs_efd_log_item *, xfs_fsblock_t,
+ xfs_extlen_t);
int xfs_trans_commit(struct xfs_trans *);
+int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_cancel(xfs_trans_t *);
int xfs_trans_ail_init(struct xfs_mount *);
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 284397dd7990..a96ae540eb62 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -25,6 +25,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_extfree_item.h"
+#include "xfs_alloc.h"
/*
* This routine is called to allocate an "extent free intention"
@@ -108,19 +109,30 @@ xfs_trans_get_efd(xfs_trans_t *tp,
}
/*
- * This routine is called to indicate that the described
- * extent is to be logged as having been freed. It should
- * be called once for each extent freed.
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails to support the
+ * EFI/EFD lifecycle rules.
*/
-void
-xfs_trans_log_efd_extent(xfs_trans_t *tp,
- xfs_efd_log_item_t *efdp,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len)
+int
+xfs_trans_free_extent(
+ struct xfs_trans *tp,
+ struct xfs_efd_log_item *efdp,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len)
{
uint next_extent;
- xfs_extent_t *extp;
+ struct xfs_extent *extp;
+ int error;
+ error = xfs_free_extent(tp, start_block, ext_len);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
tp->t_flags |= XFS_TRANS_DIRTY;
efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
@@ -130,4 +142,6 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp,
extp->ext_start = start_block;
extp->ext_len = ext_len;
efdp->efd_next_extent++;
+
+ return error;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 1b736294558a..49931b72da8a 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -119,6 +119,21 @@ xfs_trans_ail_delete(
xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
}
+static inline void
+xfs_trans_ail_remove(
+ struct xfs_log_item *lip,
+ int shutdown_type)
+{
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ spin_lock(&ailp->xa_lock);
+ /* xfs_trans_ail_delete() drops the AIL lock */
+ if (lip->li_flags & XFS_LI_IN_AIL)
+ xfs_trans_ail_delete(ailp, lip, shutdown_type);
+ else
+ spin_unlock(&ailp->xa_lock);
+}
+
void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_ail_push_all(struct xfs_ail *);
void xfs_ail_push_all_sync(struct xfs_ail *);