diff options
Diffstat (limited to 'fs')
90 files changed, 761 insertions, 1214 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 59c32c9b799f..c4a2dc41beac 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -121,10 +121,6 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); @@ -312,10 +308,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); @@ -327,7 +319,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) ret = v9fs_file_getlock(filp, fl); else ret = -EINVAL; -out_err: return ret; } @@ -348,10 +339,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; - if (!(fl->fl_flags & FL_FLOCK)) goto out_err; diff --git a/fs/Kconfig b/fs/Kconfig index a7749c126b8e..949128bf86c9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -101,16 +101,6 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. -config MANDATORY_FILE_LOCKING - bool "Enable Mandatory file locking" - depends on FILE_LOCKING - default y - help - This option enables files appropriately marked files on appropriely - mounted filesystems to support mandatory locking. - - To the best of my knowledge this is dead code that no one cares about. - source "fs/crypto/Kconfig" source "fs/verity/Kconfig" diff --git a/fs/afs/flock.c b/fs/afs/flock.c index cb3054c7843e..c4210a3964d8 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -772,10 +772,6 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) fl->fl_type, fl->fl_flags, (long long) fl->fl_start, (long long) fl->fl_end); - /* AFS doesn't support mandatory locks */ - if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if (IS_GETLK(cmd)) return afs_do_getlk(file, fl); @@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); req->done = true; - if (iocb->ki_eventfd && eventfd_signal_count()) { + if (iocb->ki_eventfd && eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); diff --git a/fs/block_dev.c b/fs/block_dev.c index 9ef4f1fc2cb0..1f21ac984253 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -35,6 +35,7 @@ #include <linux/uaccess.h> #include <linux/suspend.h> #include "internal.h" +#include "../block/blk.h" struct bdev_inode { struct block_device bdev; @@ -686,7 +687,8 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) return retval; } -int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); @@ -707,7 +709,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) return error; } -EXPORT_SYMBOL(blkdev_fsync); /** * bdev_read_page() - Start reading a page from a block device @@ -801,7 +802,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); - ei->bdev.bd_bdi = &noop_backing_dev_info; return &ei->vfs_inode; } @@ -812,8 +812,15 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); - if (!bdev_is_partition(bdev)) + if (!bdev_is_partition(bdev)) { + if (bdev->bd_disk && bdev->bd_disk->bdi) + bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); + } + + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(bdev->bd_dev)); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } @@ -826,16 +833,9 @@ static void init_once(void *data) static void bdev_evict_inode(struct inode *inode) { - struct block_device *bdev = &BDEV_I(inode)->bdev; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); - /* Detach inode from wb early as bdi_put() may free bdi->wb */ - inode_detach_wb(inode); - if (bdev->bd_bdi != &noop_backing_dev_info) { - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; - } } static const struct super_operations bdev_sops = { @@ -902,9 +902,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_disks); -#endif bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); @@ -921,31 +918,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -static struct block_device *bdget(dev_t dev) -{ - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - return &BDEV_I(inode)->bdev; -} - -/** - * bdgrab -- Grab a reference to an already referenced block device - * @bdev: Block device to grab a reference to. - * - * Returns the block_device with an additional reference when successful, - * or NULL if the inode is already beeing freed. - */ -struct block_device *bdgrab(struct block_device *bdev) -{ - if (!igrab(bdev->bd_inode)) - return NULL; - return bdev; -} -EXPORT_SYMBOL(bdgrab); - long nr_blockdev_pages(void) { struct inode *inode; @@ -959,12 +931,6 @@ long nr_blockdev_pages(void) return ret; } -void bdput(struct block_device *bdev) -{ - iput(bdev->bd_inode); -} -EXPORT_SYMBOL(bdput); - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest @@ -1094,148 +1060,6 @@ void bd_abort_claiming(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(bd_abort_claiming); -#ifdef CONFIG_SYSFS -struct bd_holder_disk { - struct list_head list; - struct gendisk *disk; - int refcnt; -}; - -static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, - struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) - return holder; - return NULL; -} - -static int add_symlink(struct kobject *from, struct kobject *to) -{ - return sysfs_create_link(from, to, kobject_name(to)); -} - -static void del_symlink(struct kobject *from, struct kobject *to) -{ - sysfs_remove_link(from, kobject_name(to)); -} - -/** - * bd_link_disk_holder - create symlinks between holding disk and slave bdev - * @bdev: the claimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * This functions creates the following sysfs symlinks. - * - * - from "slaves" directory of the holder @disk to the claimed @bdev - * - from "holders" directory of the @bdev to the holder @disk - * - * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is - * passed to bd_link_disk_holder(), then: - * - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - * - * The caller must have claimed @bdev before calling this function and - * ensure that both @bdev and @disk are valid during the creation and - * lifetime of these symlinks. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - int ret = 0; - - mutex_lock(&bdev->bd_disk->open_mutex); - - WARN_ON_ONCE(!bdev->bd_holder); - - /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) - goto out_unlock; - - holder = bd_find_holder_disk(bdev, disk); - if (holder) { - holder->refcnt++; - goto out_unlock; - } - - holder = kzalloc(sizeof(*holder), GFP_KERNEL); - if (!holder) { - ret = -ENOMEM; - goto out_unlock; - } - - INIT_LIST_HEAD(&holder->list); - holder->disk = disk; - holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it. - */ - kobject_get(bdev->bd_holder_dir); - - list_add(&holder->list, &bdev->bd_holder_disks); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); -out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(bd_link_disk_holder); - -/** - * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() - * @bdev: the calimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * CONTEXT: - * Might sleep. - */ -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - mutex_lock(&bdev->bd_disk->open_mutex); - - holder = bd_find_holder_disk(bdev, disk); - - if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); - list_del_init(&holder->list); - kfree(holder); - } - - mutex_unlock(&bdev->bd_disk->open_mutex); -} -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -#endif - static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); @@ -1260,11 +1084,8 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) { + if (!bdev->bd_openers) set_init_blocksize(bdev); - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); - } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; @@ -1282,16 +1103,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_part(struct block_device *part, fmode_t mode) { struct gendisk *disk = part->bd_disk; - struct block_device *whole; int ret; if (part->bd_openers) goto done; - whole = bdgrab(disk->part0); - ret = blkdev_get_whole(whole, mode); + ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) - goto out_put_whole; + return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1299,16 +1118,12 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); - if (part->bd_bdi == &noop_backing_dev_info) - part->bd_bdi = bdi_get(disk->queue->backing_dev_info); done: part->bd_openers++; return 0; out_blkdev_put: - blkdev_put_whole(whole, mode); -out_put_whole: - bdput(whole); + blkdev_put_whole(bdev_whole(part), mode); return ret; } @@ -1321,42 +1136,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); - bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; - struct gendisk *disk; + struct inode *inode; - bdev = bdget(dev); - if (!bdev) { + inode = ilookup(blockdev_superblock, dev); + if (!inode) { blk_request_module(dev); - bdev = bdget(dev); - if (!bdev) + inode = ilookup(blockdev_superblock, dev); + if (!inode) return NULL; } - disk = bdev->bd_disk; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) - goto bdput; - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) - goto put_disk; - if (!try_module_get(bdev->bd_disk->fops->owner)) - goto put_disk; + /* switch from the inode reference to a device mode one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + + if (!bdev) + return NULL; + if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || + !try_module_get(bdev->bd_disk->fops->owner)) { + put_device(&bdev->bd_device); + return NULL; + } + return bdev; -put_disk: - put_disk(disk); -bdput: - bdput(bdev); - return NULL; } void blkdev_put_no_open(struct block_device *bdev) { module_put(bdev->bd_disk->fops->owner); - put_disk(bdev->bd_disk); - bdput(bdev); + put_device(&bdev->bd_device); } /** @@ -1409,7 +1224,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) mutex_lock(&disk->open_mutex); ret = -ENXIO; - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06f9f167222b..bd5689fa290e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -629,7 +629,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) { + if (inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a1e2813731d1..7e7a897ae0d3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } else { struct address_space *mapping = inode->i_mapping; - struct page *page = find_or_create_page(mapping, 0, - mapping_gfp_constraint(mapping, - ~__GFP_FS)); + struct page *page; + + filemap_invalidate_lock_shared(mapping); + page = find_or_create_page(mapping, 0, + mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out_inline; @@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: + filemap_invalidate_unlock_shared(mapping); dout("filemap_fault %p %llu read inline data ret %x\n", inode, off, ret); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2a2900903f8c..39db97f149b9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1743,7 +1743,11 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush *ceph_alloc_cap_flush(void) { - return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + struct ceph_cap_flush *cf; + + cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + cf->is_capsnap = false; + return cf; } void ceph_free_cap_flush(struct ceph_cap_flush *cf) @@ -1778,7 +1782,7 @@ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, prev->wake = true; wake = false; } - list_del(&cf->g_list); + list_del_init(&cf->g_list); return wake; } @@ -1793,7 +1797,7 @@ static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, prev->wake = true; wake = false; } - list_del(&cf->i_list); + list_del_init(&cf->i_list); return wake; } @@ -2352,7 +2356,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { - if (!cf->caps) { + if (cf->is_capsnap) { last_snap_flush = cf->tid; break; } @@ -2371,7 +2375,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, first_tid = cf->tid + 1; - if (cf->caps) { + if (!cf->is_capsnap) { struct cap_msg_args arg; dout("kick_flushing_caps %p cap %p tid %llu %s\n", @@ -3516,7 +3520,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, cleaned = cf->caps; /* Is this a capsnap? */ - if (cf->caps == 0) + if (cf->is_capsnap) continue; if (cf->tid <= flush_tid) { @@ -3589,8 +3593,9 @@ out: while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); - list_del(&cf->i_list); - ceph_free_cap_flush(cf); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); } if (wake_ci) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d1755ac1d964..e1d605a02d4a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode, if (ret < 0) goto unlock; + filemap_invalidate_lock(inode->i_mapping); ceph_zero_pagecache_range(inode, offset, length); ret = ceph_zero_objects(inode, offset, length); @@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode, if (dirty) __mark_inode_dirty(inode, dirty); } + filemap_invalidate_unlock(inode->i_mapping); ceph_put_cap_refs(ci, got); unlock: diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index fa8a847743d0..bdeb271f47d9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -240,9 +240,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - /* No mandatory locks */ - if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) - return -ENOLCK; dout("ceph_lock, fl_owner: %p\n", fl->fl_owner); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index afdc20213876..0b69aec23e5c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1616,7 +1616,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&mdsc->cap_dirty_lock); list_for_each_entry(cf, &to_remove, i_list) - list_del(&cf->g_list); + list_del_init(&cf->g_list); if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited( @@ -1668,8 +1668,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); - list_del(&cf->i_list); - ceph_free_cap_flush(cf); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); } wake_up_all(&ci->i_cap_wq); diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index abd9af7727ad..3c444b9cb17b 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -394,9 +394,11 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->possible_max_rank; i++) - kfree(m->m_info[i].export_targets); - kfree(m->m_info); + if (m->m_info) { + for (i = 0; i < m->possible_max_rank; i++) + kfree(m->m_info[i].export_targets); + kfree(m->m_info); + } kfree(m->m_data_pg_pools); kfree(m); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4c6bd1042c94..15105f9da3fd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -487,6 +487,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); return; } + capsnap->cap_flush.is_capsnap = true; + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 9215a2f4535c..b1a363641beb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -182,8 +182,9 @@ struct ceph_cap { struct ceph_cap_flush { u64 tid; - int caps; /* 0 means capsnap */ + int caps; bool wake; /* wake up flush waiters when finish ? */ + bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode }; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 2dfd0d8297eb..ddc0e8f97872 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, return rc; } + filemap_invalidate_lock(inode->i_mapping); /* * We implement the punch hole through ioctl, so we need remove the page * caches first, otherwise the data may be inconsistent with the server. @@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); free_xid(xid); + filemap_invalidate_unlock(inode->i_mapping); return rc; } diff --git a/fs/eventfd.c b/fs/eventfd.c index e265b6dd4f34..3627dd7d25db 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -25,8 +25,6 @@ #include <linux/idr.h> #include <linux/uio.h> -DEFINE_PER_CPU(int, eventfd_wake_count); - static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { @@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) * Deadlock or stack overflow issues can happen if we recurse here * through waitqueue wakeup handlers. If the caller users potentially * nested waitqueues with custom wakeup handlers, then it should - * check eventfd_signal_count() before calling this function. If - * it returns true, the eventfd_signal() call should be deferred to a + * check eventfd_signal_allowed() before calling this function. If + * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) + if (WARN_ON_ONCE(current->in_eventfd_signal)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); - this_cpu_inc(eventfd_wake_count); + current->in_eventfd_signal = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); - this_cpu_dec(eventfd_wake_count); + current->in_eventfd_signal = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 54eec9185627..1248ff4ef562 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select FS_IOMAP help Ext2 is a standard Linux file system for hard disks. diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e512630cb63e..3be9dd6412b7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -667,9 +667,6 @@ struct ext2_inode_info { struct rw_semaphore xattr_sem; #endif rwlock_t i_meta_lock; -#ifdef CONFIG_FS_DAX - struct rw_semaphore dax_sem; -#endif /* * truncate_mutex is for serialising ext2_truncate() against @@ -685,14 +682,6 @@ struct ext2_inode_info { #endif }; -#ifdef CONFIG_FS_DAX -#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem) -#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem) -#else -#define dax_sem_down_write(ext2_inode) -#define dax_sem_up_write(ext2_inode) -#endif - /* * Inode dynamic state flags */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index f98466acc672..eb97aa3d700e 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -81,7 +81,7 @@ out_unlock: * * mmap_lock (MM) * sb_start_pagefault (vfs, freeze) - * ext2_inode_info->dax_sem + * address_space->invalidate_lock * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX) * ext2_inode_info->truncate_mutex * @@ -91,7 +91,6 @@ out_unlock: static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); vm_fault_t ret; bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); @@ -100,11 +99,11 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } - down_read(&ei->dax_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); - up_read(&ei->dax_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index dadb121beb22..333fa62661d5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -799,7 +799,6 @@ int ext2_get_block(struct inode *inode, sector_t iblock, } -#ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { @@ -852,16 +851,18 @@ const struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; -#else -/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ -const struct iomap_ops ext2_iomap_ops; -#endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, - ext2_get_block); + int ret; + + inode_lock(inode); + len = min_t(u64, len, i_size_read(inode)); + ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops); + inode_unlock(inode); + + return ret; } static int ext2_writepage(struct page *page, struct writeback_control *wbc) @@ -1177,7 +1178,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de ext2_free_data(inode, p, q); } -/* dax_sem must be held when calling this function */ +/* mapping->invalidate_lock must be held when calling this function */ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) { __le32 *i_data = EXT2_I(inode)->i_data; @@ -1194,7 +1195,7 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); #ifdef CONFIG_FS_DAX - WARN_ON(!rwsem_is_locked(&ei->dax_sem)); + WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)); #endif n = ext2_block_to_path(inode, iblock, offsets, NULL); @@ -1276,9 +1277,9 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset) if (ext2_inode_is_fast_symlink(inode)) return; - dax_sem_down_write(EXT2_I(inode)); + filemap_invalidate_lock(inode->i_mapping); __ext2_truncate_blocks(inode, offset); - dax_sem_up_write(EXT2_I(inode)); + filemap_invalidate_unlock(inode->i_mapping); } static int ext2_setsize(struct inode *inode, loff_t newsize) @@ -1308,10 +1309,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) if (error) return error; - dax_sem_down_write(EXT2_I(inode)); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, newsize); __ext2_truncate_blocks(inode, newsize); - dax_sem_up_write(EXT2_I(inode)); + filemap_invalidate_unlock(inode->i_mapping); inode->i_mtime = inode->i_ctime = current_time(inode); if (inode_needs_sync(inode)) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 21e09fbaa46f..987bcf32ed46 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -206,9 +206,6 @@ static void init_once(void *foo) init_rwsem(&ei->xattr_sem); #endif mutex_init(&ei->truncate_mutex); -#ifdef CONFIG_FS_DAX - init_rwsem(&ei->dax_sem); -#endif inode_init_once(&ei->vfs_inode); } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c51e243450d..7ebaf66b6e31 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1086,15 +1086,6 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; - /* - * i_mmap_sem is for serializing page faults with truncate / punch hole - * operations. We have to make sure that new page cannot be faulted in - * a section of the inode that is being punched. We cannot easily use - * i_data_sem for this since we need protection for the whole punch - * operation and i_data_sem ranks below transaction start so we have - * to occasionally drop it. - */ - struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -2972,7 +2963,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); -extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92ad64b89d9b..c33e0a2cb6c3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4474,6 +4474,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; unsigned int max_blocks; loff_t new_size = 0; @@ -4560,17 +4561,17 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Prevent page faults from reinstantiating pages we have * released from page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); goto out_mutex; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); goto out_mutex; } /* Now release the pages and zero block aligned part of pages */ @@ -4579,7 +4580,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) goto out_mutex; } @@ -5221,6 +5222,7 @@ out: static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; @@ -5274,7 +5276,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * Prevent page faults from reinstantiating pages we have released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -5289,15 +5291,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * Write tail of the last page before removed range since it will get * removed from the page cache below. */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + ret = filemap_write_and_wait_range(mapping, ioffset, offset); if (ret) goto out_mmap; /* * Write data that will be shifted to preserve them when discarding * page cache below. We are also protected from pages becoming dirty - * by i_mmap_sem. + * by i_rwsem and invalidate_lock. */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + ret = filemap_write_and_wait_range(mapping, offset + len, LLONG_MAX); if (ret) goto out_mmap; @@ -5350,7 +5352,7 @@ out_stop: ext4_journal_stop(handle); ext4_fc_stop_ineligible(sb); out_mmap: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; @@ -5367,6 +5369,7 @@ out_mutex: static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; @@ -5425,7 +5428,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) * Prevent page faults from reinstantiating pages we have released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -5526,7 +5529,7 @@ out_stop: ext4_journal_stop(handle); ext4_fc_stop_ineligible(sb); out_mmap: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 816dedcbd541..d3b4ed91aa68 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + struct address_space *mapping = vmf->vma->vm_file->f_mapping; pfn_t pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); if (write) { @@ -731,10 +732,10 @@ retry: /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); } return result; @@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = { #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = ext4_filemap_fault, + .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8de607849df..325c038e7b23 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3950,20 +3950,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } -static void ext4_wait_dax_page(struct ext4_inode_info *ei) +static void ext4_wait_dax_page(struct inode *inode) { - up_write(&ei->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&ei->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { - struct ext4_inode_info *ei = EXT4_I(inode); struct page *page; int error; - if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) + if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; do { @@ -3974,7 +3973,7 @@ int ext4_break_layouts(struct inode *inode) error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(ei)); + ext4_wait_dax_page(inode)); } while (error == 0); return error; @@ -4005,9 +4004,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); if (ext4_has_inline_data(inode)) { - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_convert_inline_data(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; } @@ -4058,7 +4057,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Prevent page faults from reinstantiating pages we have released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -4131,7 +4130,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) out_stop: ext4_journal_stop(handle); out_dio: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; @@ -5426,11 +5425,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode_dio_wait(inode); } - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); goto err_out; } @@ -5506,7 +5505,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, error = rc; } out_mmap_sem: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); } if (!error) { @@ -5983,10 +5982,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) * data (and journalled aops don't know how to handle these cases). */ if (val) { - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return err; } } @@ -6019,7 +6018,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) percpu_up_write(&sbi->s_writepages_rwsem); if (val) - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ @@ -6063,7 +6062,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) @@ -6176,7 +6175,7 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(err); out: - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: @@ -6184,15 +6183,3 @@ out_error: ext4_journal_stop(handle); goto out; } - -vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret; - - down_read(&EXT4_I(inode)->i_mmap_sem); - ret = filemap_fault(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - - return ret; -} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eed6170aded..4fb5fe083c2b 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb, goto journal_err_out; } - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err) goto err_out; @@ -256,7 +256,7 @@ err_out1: ext4_double_up_write_data_sem(inode, inode_bl); err_out: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfa09a277b56..d6df62fc810c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -90,12 +90,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, /* * Lock ordering * - * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and - * i_mmap_rwsem (inode->i_mmap_rwsem)! - * * page fault path: - * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> - * page lock -> i_data_sem (rw) + * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start + * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock @@ -103,8 +100,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * i_data_sem (rw) * * truncate: - * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock - * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start -> + * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> + * page lock + * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: @@ -1360,7 +1358,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); - init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index bcbe3668c1d4..ce84aa2786c7 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -11,14 +11,16 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + struct address_space *mapping = inode->i_mapping; + /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ - down_write(&EXT4_I(inode)->i_mmap_sem); - truncate_inode_pages(inode->i_mapping, inode->i_size); + filemap_invalidate_lock(mapping); + truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d2cf48c5a2e4..eb222b35edef 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3187,12 +3187,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3852,7 +3852,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, int ret = 0; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); @@ -3894,7 +3894,7 @@ done: clear_inode_flag(inode, FI_DO_DEFRAG); clear_inode_flag(inode, FI_ALIGNED_WRITE); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ee8eb33e2c25..906b2c4b50e7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -754,7 +754,6 @@ struct f2fs_inode_info { /* avoid racing between foreground op and gc */ struct rw_semaphore i_gc_rwsem[2]; - struct rw_semaphore i_mmap_sem; struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located in i_addr */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6afd4562335f..1ff333755721 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -38,10 +38,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); vm_fault_t ret; - down_read(&F2FS_I(inode)->i_mmap_sem); ret = filemap_fault(vmf); - up_read(&F2FS_I(inode)->i_mmap_sem); - if (!ret) f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, F2FS_BLKSIZE); @@ -101,7 +98,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); file_update_time(vmf->vma->vm_file); - down_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || @@ -159,7 +156,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: - up_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); err: @@ -940,7 +937,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, attr->ia_size); @@ -950,7 +947,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * do not trim all blocks after i_size if target size is * larger than i_size. */ - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -1095,7 +1092,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_end = (loff_t)pg_end << PAGE_SHIFT; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -1104,7 +1101,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1339,7 +1336,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1347,7 +1344,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1378,13 +1375,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); new_size = i_size_read(inode) - len; ret = f2fs_truncate_blocks(inode, new_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (!ret) f2fs_i_size_write(inode, new_size); return ret; @@ -1484,7 +1481,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, pgoff_t end; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1496,7 +1493,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1508,7 +1505,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1543,6 +1540,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1565,14 +1563,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; /* write out all dirty pages from offset */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); if (ret) return ret; @@ -1583,7 +1581,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1599,14 +1597,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); - filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + filemap_invalidate_lock(mapping); + filemap_write_and_wait_range(mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (!ret) f2fs_i_size_write(inode, new_size); @@ -3440,7 +3438,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) goto out; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3476,7 +3474,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); out: inode_unlock(inode); @@ -3593,7 +3591,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3629,7 +3627,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); @@ -3748,7 +3746,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto err; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = filemap_write_and_wait_range(mapping, range.start, to_end ? LLONG_MAX : end_addr - 1); @@ -3835,7 +3833,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) ret = f2fs_secure_erase(prev_bdev, inode, prev_index, prev_block, len, range.flags); out: - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); err: inode_unlock(inode); @@ -4313,9 +4311,9 @@ write: /* if we couldn't write data, we should deallocate blocks. */ if (preallocated && i_size_read(inode) < target_size) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); f2fs_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8fecd3050ccd..ce2ab1b85c11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1289,7 +1289,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->i_gc_rwsem[READ]); init_rwsem(&fi->i_gc_rwsem[WRITE]); - init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6642246206bd..daad532a4e2b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -378,7 +378,7 @@ out: ret = kstrtol(name, 10, &data); if (ret) return ret; - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 860e884e56e8..978ac6751aeb 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -5,6 +5,7 @@ #include <linux/blkdev.h> #include <linux/sched/signal.h> +#include <linux/backing-dev-defs.h> #include "fat.h" struct fatent_operations { diff --git a/fs/fcntl.c b/fs/fcntl.c index f946bec8f1f1..68added37c15 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -150,7 +150,8 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { pid_t pid = 0; - read_lock(&filp->f_owner.lock); + + read_lock_irq(&filp->f_owner.lock); rcu_read_lock(); if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { pid = pid_vnr(filp->f_owner.pid); @@ -158,7 +159,7 @@ pid_t f_getown(struct file *filp) pid = -pid; } rcu_read_unlock(); - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); return pid; } @@ -208,7 +209,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex owner = {}; int ret = 0; - read_lock(&filp->f_owner.lock); + read_lock_irq(&filp->f_owner.lock); rcu_read_lock(); if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) owner.pid = pid_vnr(filp->f_owner.pid); @@ -231,7 +232,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -249,10 +250,10 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) uid_t src[2]; int err; - read_lock(&filp->f_owner.lock); + read_lock_irq(&filp->f_owner.lock); src[0] = from_kuid(user_ns, filp->f_owner.uid); src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -1003,13 +1004,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; + unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - read_lock(&fa->fa_lock); + read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a @@ -1018,7 +1020,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } - read_unlock(&fa->fa_lock); + read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 9d58371d22c2..281d79f8b3d3 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, /* * Can't do inline reclaim in fault path. We call * dax_layout_busy_page() before we free a range. And - * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it. - * In fault path we enter with fi->i_mmap_sem held and can't drop - * it. Also in fault path we hold fi->i_mmap_sem shared and not - * exclusive, so that creates further issues with fuse_wait_dax_page(). - * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory - * range to become free and retry. + * fuse_wait_dax_page() drops mapping->invalidate_lock and requires it. + * In fault path we enter with mapping->invalidate_lock held and can't + * drop it. Also in fault path we hold mapping->invalidate_lock shared + * and not exclusive, so that creates further issues with + * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault() + * will wait for a memory range to become free and retry. */ if (flags & IOMAP_FAULT) { alloc_dmap = alloc_dax_mapping(fcd); @@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, down_write(&fi->dax->sem); node = interval_tree_iter_first(&fi->dax->tree, idx, idx); - /* We are holding either inode lock or i_mmap_sem, and that should + /* We are holding either inode lock or invalidate_lock, and that should * ensure that dmap can't be truncated. We are holding a reference * on dmap and that should make sure it can't be reclaimed. So dmap * should still be there in tree despite the fact we dropped and @@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = { static void fuse_wait_dax_page(struct inode *inode) { - struct fuse_inode *fi = get_fuse_inode(inode); - - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } -/* Should be called with fi->i_mmap_sem lock held exclusively */ +/* Should be called with mapping->invalidate_lock held exclusively */ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, loff_t start, loff_t end) { @@ -813,18 +811,18 @@ retry: * we do not want any read/write/mmap to make progress and try * to populate page cache or access memory we are trying to free. */ - down_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); goto retry; } if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(sb); @@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, int ret; struct interval_tree_node *node; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Lookup a dmap and corresponding file offset to reclaim. */ down_read(&fi->dax->sem); @@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, out_write_dmap_sem: up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return dmap; } @@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * had a reference or some other temporary failure, * Try again. We want to give up inline reclaim only * if there is no range assigned to this node. Otherwise - * if a deadlock is possible if we sleep with fi->i_mmap_sem - * held and worker to free memory can't make progress due - * to unavailability of fi->i_mmap_sem lock. So sleep - * only if fi->dax->nr=0 + * if a deadlock is possible if we sleep with + * mapping->invalidate_lock held and worker to free memory + * can't make progress due to unavailability of + * mapping->invalidate_lock. So sleep only if fi->dax->nr=0 */ if (retry) continue; @@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * There are no mappings which can be reclaimed. Wait for one. * We are not holding fi->dax->sem. So it is possible * that range gets added now. But as we are not holding - * fi->i_mmap_sem, worker should still be able to free up - * a range and wake us up. + * mapping->invalidate_lock, worker should still be able to + * free up a range and wake us up. */ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { if (wait_event_killable_exclusive(fcd->range_waitq, @@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, /* * Free a range of memory. * Locking: - * 1. Take fi->i_mmap_sem to block dax faults. + * 1. Take mapping->invalidate_lock to block dax faults. * 2. Take fi->dax->sem to protect interval tree and also to make sure * read/write can not reuse a dmap which we might be freeing. */ @@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); if (ret) { pr_debug("virtio_fs: fuse_dax_break_layouts() failed. err=%d\n", @@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index eade6f965b2e..d9b977c0f38d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = inode->i_mapping; FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } if (FUSE_IS_DAX(inode) && is_truncate) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(mapping); fault_blocked = true; err = fuse_dax_break_layouts(inode, 0, 0); if (err) { - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } } @@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); - invalidate_inode_pages2(inode->i_mapping); + invalidate_inode_pages2(mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); out: if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return 0; @@ -1711,7 +1712,7 @@ error: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 97f860cfc195..621a662c19fb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) } if (dax_truncate) { - down_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) out: if (dax_truncate) - up_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (is_wb_truncate | dax_truncate) { fuse_release_nowrite(inode); @@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (lock_inode) { inode_lock(inode); if (block_faults) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -2976,7 +2976,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (block_faults) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (lock_inode) inode_unlock(inode); @@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, * modifications. Yet this does give less guarantees than if the * copying was performed with write(2). * - * To fix this a i_mmap_sem style lock could be used to prevent new + * To fix this a mapping->invalidate_lock could be used to prevent new * faults while the copy is ongoing. */ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 07829ce78695..6fb639b97ea8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -149,13 +149,6 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; - /** - * Can't take inode lock in fault path (leads to circular dependency). - * Introduce another semaphore which can be taken in fault path and - * then other filesystem paths can take this to block faults. - */ - struct rw_semaphore i_mmap_sem; - #ifdef CONFIG_FUSE_DAX /* * Dax specific inode data diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b9beb39a4a18..e07e429f32e1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); - init_rwsem(&fi->i_mmap_sem); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 84ec053d43b4..c559827cb6f9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1237,9 +1237,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if (cmd == F_CANCELLK) { /* Hack: */ cmd = F_SETLK; diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 2b36dc6f0a10..ec975f466877 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS is the file system used for organizing files on OS/2 hard disk diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index c3a49aacf20a..fb37f57130aa 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include <linux/mpage.h> +#include <linux/iomap.h> #include <linux/fiemap.h> #define BLOCKS(size) (((size) + 511) >> 9) @@ -116,6 +117,47 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he return r; } +static int hpfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct super_block *sb = inode->i_sb; + unsigned int blkbits = inode->i_blkbits; + unsigned int n_secs; + secno s; + + if (WARN_ON_ONCE(flags & (IOMAP_WRITE | IOMAP_ZERO))) + return -EINVAL; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = offset; + + hpfs_lock(sb); + s = hpfs_bmap(inode, offset >> blkbits, &n_secs); + if (s) { + n_secs = hpfs_search_hotfix_map_for_range(sb, s, + min_t(loff_t, n_secs, length)); + if (unlikely(!n_secs)) { + s = hpfs_search_hotfix_map(sb, s); + n_secs = 1; + } + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_MERGED; + iomap->addr = (u64)s << blkbits; + iomap->length = (u64)n_secs << blkbits; + } else { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = 1 << blkbits; + } + + hpfs_unlock(sb); + return 0; +} + +static const struct iomap_ops hpfs_iomap_ops = { + .iomap_begin = hpfs_iomap_begin, +}; + static int hpfs_readpage(struct file *file, struct page *page) { return mpage_readpage(page, hpfs_get_block); @@ -192,7 +234,14 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block); + int ret; + + inode_lock(inode); + len = min_t(u64, len, i_size_read(inode)); + ret = iomap_fiemap(inode, fieinfo, start, len, &hpfs_iomap_ops); + inode_unlock(inode); + + return ret; } const struct address_space_operations hpfs_aops = { diff --git a/fs/inode.c b/fs/inode.c index c93500d84264..84c528cd1955 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -190,6 +190,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; + __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock", + &sb->s_type->invalidate_lock_key); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/ioctl.c b/fs/ioctl.c index 1e2204fa9963..eea8267ae1f2 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -263,209 +263,6 @@ static long ioctl_file_clone_range(struct file *file, args.src_length, args.dest_offset); } -#ifdef CONFIG_BLOCK - -static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) -{ - return (offset >> inode->i_blkbits); -} - -static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) -{ - return (blk << inode->i_blkbits); -} - -/** - * __generic_block_fiemap - FIEMAP for block based inodes (no locking) - * @inode: the inode to map - * @fieinfo: the fiemap info struct that will be passed back to userspace - * @start: where to start mapping in the inode - * @len: how much space to map - * @get_block: the fs's get_block function - * - * This does FIEMAP for block based inodes. Basically it will just loop - * through get_block until we hit the number of extents we want to map, or we - * go past the end of the file and hit a hole. - * - * If it is possible to have data blocks beyond a hole past @inode->i_size, then - * please do not use this function, it will stop at the first unmapped block - * beyond i_size. - * - * If you use this function directly, you need to do your own locking. Use - * generic_block_fiemap if you want the locking done for you. - */ -static int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, loff_t start, - loff_t len, get_block_t *get_block) -{ - struct buffer_head map_bh; - sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); - u64 logical = 0, phys = 0, size = 0; - u32 flags = FIEMAP_EXTENT_MERGED; - bool past_eof = false, whole_file = false; - int ret = 0; - - ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); - if (ret) - return ret; - - /* - * Either the i_mutex or other appropriate locking needs to be held - * since we expect isize to not change at all through the duration of - * this call. - */ - if (len >= isize) { - whole_file = true; - len = isize; - } - - /* - * Some filesystems can't deal with being asked to map less than - * blocksize, so make sure our len is at least block length. - */ - if (logical_to_blk(inode, len) == 0) - len = blk_to_logical(inode, 1); - - start_blk = logical_to_blk(inode, start); - last_blk = logical_to_blk(inode, start + len - 1); - - do { - /* - * we set b_size to the total size we want so it will map as - * many contiguous blocks as possible at once - */ - memset(&map_bh, 0, sizeof(struct buffer_head)); - map_bh.b_size = len; - - ret = get_block(inode, start_blk, &map_bh, 0); - if (ret) - break; - - /* HOLE */ - if (!buffer_mapped(&map_bh)) { - start_blk++; - - /* - * We want to handle the case where there is an - * allocated block at the front of the file, and then - * nothing but holes up to the end of the file properly, - * to make sure that extent at the front gets properly - * marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && - blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - /* - * First hole after going past the EOF, this is our - * last extent - */ - if (past_eof && size) { - flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } - - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - break; - } else { - /* - * We have gone over the length of what we wanted to - * map, and it wasn't the entire file, so add the extent - * we got last time and exit. - * - * This is for the case where say we want to map all the - * way up to the second to the last block in a file, but - * the last block is a hole, making the second to last - * block FIEMAP_EXTENT_LAST. In this case we want to - * see if there is a hole after the second to last block - * so we can mark it properly. If we found data after - * we exceeded the length we were requesting, then we - * are good to go, just add the extent to the fieinfo - * and break - */ - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - break; - } - - /* - * if size != 0 then we know we already have an extent - * to add, so add it. - */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - if (ret) - break; - } - - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = FIEMAP_EXTENT_MERGED; - - start_blk += logical_to_blk(inode, size); - - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; - } - cond_resched(); - if (fatal_signal_pending(current)) { - ret = -EINTR; - break; - } - - } while (1); - - /* If ret is 1 then we just hit the end of the extent array */ - if (ret == 1) - ret = 0; - - return ret; -} - -/** - * generic_block_fiemap - FIEMAP for block based inodes - * @inode: The inode to map - * @fieinfo: The mapping information - * @start: The initial block to map - * @len: The length of the extect to attempt to map - * @get_block: The block mapping function for the fs - * - * Calls __generic_block_fiemap to map the inode, after taking - * the inode's mutex lock. - */ - -int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block) -{ - int ret; - inode_lock(inode); - ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - inode_unlock(inode); - return ret; -} -EXPORT_SYMBOL(generic_block_fiemap); - -#endif /* CONFIG_BLOCK */ - /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 21edc423b79f..678e2c51b855 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -155,7 +155,6 @@ struct iso9660_options{ unsigned int overriderockperm:1; unsigned int uid_set:1; unsigned int gid_set:1; - unsigned int utf8:1; unsigned char map; unsigned char check; unsigned int blocksize; @@ -356,7 +355,6 @@ static int parse_options(char *options, struct iso9660_options *popt) popt->gid = GLOBAL_ROOT_GID; popt->uid = GLOBAL_ROOT_UID; popt->iocharset = NULL; - popt->utf8 = 0; popt->overriderockperm = 0; popt->session=-1; popt->sbsector=-1; @@ -389,10 +387,13 @@ static int parse_options(char *options, struct iso9660_options *popt) case Opt_cruft: popt->cruft = 1; break; +#ifdef CONFIG_JOLIET case Opt_utf8: - popt->utf8 = 1; + kfree(popt->iocharset); + popt->iocharset = kstrdup("utf8", GFP_KERNEL); + if (!popt->iocharset) + return 0; break; -#ifdef CONFIG_JOLIET case Opt_iocharset: kfree(popt->iocharset); popt->iocharset = match_strdup(&args[0]); @@ -495,7 +496,6 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) if (sbi->s_nocompress) seq_puts(m, ",nocompress"); if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm"); if (sbi->s_showassoc) seq_puts(m, ",showassoc"); - if (sbi->s_utf8) seq_puts(m, ",utf8"); if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check); if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping); @@ -518,9 +518,10 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",fmode=%o", sbi->s_fmode); #ifdef CONFIG_JOLIET - if (sbi->s_nls_iocharset && - strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0) + if (sbi->s_nls_iocharset) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); + else + seq_puts(m, ",iocharset=utf8"); #endif return 0; } @@ -863,14 +864,13 @@ root_found: sbi->s_nls_iocharset = NULL; #ifdef CONFIG_JOLIET - if (joliet_level && opt.utf8 == 0) { + if (joliet_level) { char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; - sbi->s_nls_iocharset = load_nls(p); - if (! sbi->s_nls_iocharset) { - /* Fail only if explicit charset specified */ - if (opt.iocharset) + if (strcmp(p, "utf8") != 0) { + sbi->s_nls_iocharset = opt.iocharset ? + load_nls(opt.iocharset) : load_nls_default(); + if (!sbi->s_nls_iocharset) goto out_freesbi; - sbi->s_nls_iocharset = load_nls_default(); } } #endif @@ -886,7 +886,6 @@ root_found: sbi->s_gid = opt.gid; sbi->s_uid_set = opt.uid_set; sbi->s_gid_set = opt.gid_set; - sbi->s_utf8 = opt.utf8; sbi->s_nocompress = opt.nocompress; sbi->s_overriderockperm = opt.overriderockperm; /* diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 055ec6c586f7..dcdc191ed183 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -44,7 +44,6 @@ struct isofs_sb_info { unsigned char s_session; unsigned int s_high_sierra:1; unsigned int s_rock:2; - unsigned int s_utf8:1; unsigned int s_cruft:1; /* Broken disks with high byte of length * containing junk */ unsigned int s_nocompress:1; diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index be8b6a9d0b92..c0f04a1e7f69 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -41,14 +41,12 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) int get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) { - unsigned char utf8; struct nls_table *nls; unsigned char len = 0; - utf8 = ISOFS_SB(inode->i_sb)->s_utf8; nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; - if (utf8) { + if (!nls) { len = utf16s_to_utf8s((const wchar_t *) de->name, de->name_len[0] >> 1, UTF16_BIG_ENDIAN, outname, PAGE_SIZE); diff --git a/fs/locks.c b/fs/locks.c index 74b2a1dfe8d8..3d6fb4ae847b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1397,103 +1397,6 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) return error; } -#ifdef CONFIG_MANDATORY_FILE_LOCKING -/** - * locks_mandatory_locked - Check for an active lock - * @file: the file to check - * - * Searches the inode's list of locks to find any POSIX locks which conflict. - * This function is called from locks_verify_locked() only. - */ -int locks_mandatory_locked(struct file *file) -{ - int ret; - struct inode *inode = locks_inode(file); - struct file_lock_context *ctx; - struct file_lock *fl; - - ctx = smp_load_acquire(&inode->i_flctx); - if (!ctx || list_empty_careful(&ctx->flc_posix)) - return 0; - - /* - * Search the lock list for this inode for any POSIX locks. - */ - spin_lock(&ctx->flc_lock); - ret = 0; - list_for_each_entry(fl, &ctx->flc_posix, fl_list) { - if (fl->fl_owner != current->files && - fl->fl_owner != file) { - ret = -EAGAIN; - break; - } - } - spin_unlock(&ctx->flc_lock); - return ret; -} - -/** - * locks_mandatory_area - Check for a conflicting lock - * @inode: the file to check - * @filp: how the file was opened (if it was) - * @start: first byte in the file to check - * @end: lastbyte in the file to check - * @type: %F_WRLCK for a write lock, else %F_RDLCK - * - * Searches the inode's list of locks to find any POSIX locks which conflict. - */ -int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, - loff_t end, unsigned char type) -{ - struct file_lock fl; - int error; - bool sleep = false; - - locks_init_lock(&fl); - fl.fl_pid = current->tgid; - fl.fl_file = filp; - fl.fl_flags = FL_POSIX | FL_ACCESS; - if (filp && !(filp->f_flags & O_NONBLOCK)) - sleep = true; - fl.fl_type = type; - fl.fl_start = start; - fl.fl_end = end; - - for (;;) { - if (filp) { - fl.fl_owner = filp; - fl.fl_flags &= ~FL_SLEEP; - error = posix_lock_inode(inode, &fl, NULL); - if (!error) - break; - } - - if (sleep) - fl.fl_flags |= FL_SLEEP; - fl.fl_owner = current->files; - error = posix_lock_inode(inode, &fl, NULL); - if (error != FILE_LOCK_DEFERRED) - break; - error = wait_event_interruptible(fl.fl_wait, - list_empty(&fl.fl_blocked_member)); - if (!error) { - /* - * If we've been sleeping someone might have - * changed the permissions behind our back. - */ - if (__mandatory_lock(inode)) - continue; - } - - break; - } - locks_delete_block(&fl); - - return error; -} -EXPORT_SYMBOL(locks_mandatory_area); -#endif /* CONFIG_MANDATORY_FILE_LOCKING */ - static void lease_clear_pending(struct file_lock *fl, int arg) { switch (arg) { @@ -2486,14 +2389,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; - /* Don't allow mandatory locks on files that may be memory mapped - * and shared. - */ - if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { - error = -EAGAIN; - goto out; - } - error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2611,21 +2506,12 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - /* Don't allow mandatory locks on files that may be memory mapped - * and shared. - */ - if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { - error = -EAGAIN; - goto out; - } - error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2857,8 +2743,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_puts(f, "POSIX "); seq_printf(f, " %s ", - (inode == NULL) ? "*NOINODE*" : - mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); + (inode == NULL) ? "*NOINODE*" : "ADVISORY "); } else if (IS_FLOCK(fl)) { if (fl->fl_type & LOCK_MAND) { seq_puts(f, "FLOCK MSNFS "); diff --git a/fs/namei.c b/fs/namei.c index bf6d8a738c59..471eb9fead6e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3023,9 +3023,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) /* * Refuse to truncate files with mandatory locks held on them. */ - error = locks_verify_locked(filp); - if (!error) - error = security_path_truncate(path); + error = security_path_truncate(path); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, diff --git a/fs/namespace.c b/fs/namespace.c index 97adcb5ab5d5..20caa4b4c539 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1715,22 +1715,14 @@ static inline bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } -#ifdef CONFIG_MANDATORY_FILE_LOCKING -static bool may_mandlock(void) +static void warn_mandlock(void) { - pr_warn_once("======================================================\n" - "WARNING: the mand mount option is being deprecated and\n" - " will be removed in v5.15!\n" - "======================================================\n"); - return capable(CAP_SYS_ADMIN); + pr_warn_once("=======================================================\n" + "WARNING: The mand mount option has been deprecated and\n" + " and is ignored by this kernel. Remove the mand\n" + " option from the mount to silence this warning.\n" + "=======================================================\n"); } -#else -static inline bool may_mandlock(void) -{ - pr_warn("VFS: \"mand\" mount option not supported"); - return false; -} -#endif static int can_umount(const struct path *path, int flags) { @@ -3197,8 +3189,8 @@ int path_mount(const char *dev_name, struct path *path, return ret; if (!may_mount()) return -EPERM; - if ((flags & SB_MANDLOCK) && !may_mandlock()) - return -EPERM; + if (flags & SB_MANDLOCK) + warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) @@ -3581,9 +3573,8 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) goto err_unlock; - ret = -EPERM; - if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) - goto err_unlock; + if (fc->sb_flags & SB_MANDLOCK) + warn_mandlock(); newmount.mnt = vfs_create_mount(fc); if (IS_ERR(newmount.mnt)) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1fef107961bc..514be5d28d70 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -806,10 +806,6 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) nfs_inc_stats(inode, NFSIOS_VFSLOCK); - /* No mandatory locks over NFS */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) is_local = 1; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fa67ecd5fe63..8313e1dbb5dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5735,16 +5735,6 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, NFS4_SHARE_DENY_READ); } -/* - * Allow READ/WRITE during grace period on recovered state only for files - * that are not able to provide mandatory locking. - */ -static inline int -grace_disallows_io(struct net *net, struct inode *inode) -{ - return opens_in_grace(net) && mandatory_lock(inode); -} - static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* @@ -6026,7 +6016,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, stateid_t *stateid, int flags, struct nfsd_file **nfp, struct nfs4_stid **cstid) { - struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfs4_stid *s = NULL; @@ -6035,9 +6024,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, if (nfp) *nfp = NULL; - if (grace_disallows_io(net, ino)) - return nfserr_grace; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { status = check_special_stateids(net, fhp, stateid, flags); goto done; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a224a5e23cc1..92e77f92268a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -333,7 +333,6 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap) { struct inode *inode = d_inode(fhp->fh_dentry); - int host_err; if (iap->ia_size < inode->i_size) { __be32 err; @@ -343,20 +342,7 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) return err; } - - host_err = get_write_access(inode); - if (host_err) - goto out_nfserrno; - - host_err = locks_verify_truncate(inode, NULL, iap->ia_size); - if (host_err) - goto out_put_write_access; - return 0; - -out_put_write_access: - put_write_access(inode); -out_nfserrno: - return nfserrno(host_err); + return nfserrno(get_write_access(inode)); } /* @@ -750,13 +736,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, err = nfserr_perm; if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) goto out; - /* - * We must ignore files (but only files) which might have mandatory - * locks on them because there is no way to know if the accesser has - * the lock. - */ - if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) - goto out; if (!inode->i_fop) goto out; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4abd928b0bc8..f6b2d280aab5 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1053,7 +1053,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); + sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 28b67cb9458d..6facdf476255 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> +#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/anon_inodes.h> @@ -109,8 +110,10 @@ struct kmem_cache *fanotify_path_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 -#define FANOTIFY_INFO_HDR_LEN \ +#define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ + sizeof(struct fanotify_event_info_pidfd) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -119,10 +122,11 @@ static int fanotify_fid_info_len(int fh_len, int name_len) if (name_len) info_len += name_len + 1; - return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); + return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, + FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(unsigned int fid_mode, +static int fanotify_event_info_len(unsigned int info_mode, struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); @@ -133,7 +137,8 @@ static int fanotify_event_info_len(unsigned int fid_mode, if (dir_fh_len) { info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); - } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { + } else if ((info_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not recorded in * event on a directory, we will report the name ".". @@ -141,6 +146,9 @@ static int fanotify_event_info_len(unsigned int fid_mode, dot_len = 1; } + if (info_mode & FAN_REPORT_PIDFD) + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + if (fh_len) info_len += fanotify_fid_info_len(fh_len, dot_len); @@ -176,7 +184,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t event_size = FAN_EVENT_METADATA_LEN; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -186,8 +194,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, goto out; event = FANOTIFY_E(fsn_event); - if (fid_mode) - event_size += fanotify_event_info_len(fid_mode, event); + if (info_mode) + event_size += fanotify_event_info_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); @@ -308,9 +316,10 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } -static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, - int info_type, const char *name, size_t name_len, - char __user *buf, size_t count) +static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, + int info_type, const char *name, + size_t name_len, + char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; @@ -403,6 +412,117 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, return info_len; } +static int copy_pidfd_info_to_user(int pidfd, + char __user *buf, + size_t count) +{ + struct fanotify_event_info_pidfd info = { }; + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + + if (WARN_ON_ONCE(info_len > count)) + return -EFAULT; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; + info.hdr.len = info_len; + info.pidfd = pidfd; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + +static int copy_info_records_to_user(struct fanotify_event *event, + struct fanotify_info *info, + unsigned int info_mode, int pidfd, + char __user *buf, size_t count) +{ + int ret, total_bytes = 0, info_type = 0; + unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; + + /* + * Event info records order is as follows: dir fid + name, child fid. + */ + if (fanotify_event_dir_fh_len(event)) { + info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : + FAN_EVENT_INFO_TYPE_DFID; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_info_dir_fh(info), + info_type, + fanotify_info_name(info), + info->name_len, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (fanotify_event_object_fh_len(event)) { + const char *dot = NULL; + int dot_len = 0; + + if (fid_mode == FAN_REPORT_FID || info_type) { + /* + * With only group flag FAN_REPORT_FID only type FID is + * reported. Second info record type is always FID. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((fid_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not + * recorded in an event on a directory, report the name + * "." with info type DFID_NAME. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; + dot = "."; + dot_len = 1; + } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_DIR_FID, a single info + * record has type DFID for directory entry modification + * event and for event on a directory. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID; + } else { + /* + * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, + * a single info record has type FID for event on a + * non-directory, when there is no directory to report. + * For example, on FAN_DELETE_SELF event. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } + + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_event_object_fh(event), + info_type, dot, dot_len, + buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (pidfd_mode) { + ret = copy_pidfd_info_to_user(pidfd, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + return total_bytes; +} + static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event *event, char __user *buf, size_t count) @@ -410,15 +530,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event_metadata metadata; struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; struct file *f = NULL; - int ret, fd = FAN_NOFD; - int info_type = 0; + int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(fid_mode, event); + fanotify_event_info_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; @@ -447,6 +567,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } metadata.fd = fd; + if (pidfd_mode) { + /* + * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual + * exclusion is ever lifted. At the time of incoporating pidfd + * support within fanotify, the pidfd API only supported the + * creation of pidfds for thread-group leaders. + */ + WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); + + /* + * The PIDTYPE_TGID check for an event->pid is performed + * preemptively in an attempt to catch out cases where the event + * listener reads events after the event generating process has + * already terminated. Report FAN_NOPIDFD to the event listener + * in those cases, with all other pidfd creation errors being + * reported as FAN_EPIDFD. + */ + if (metadata.pid == 0 || + !pid_has_task(event->pid, PIDTYPE_TGID)) { + pidfd = FAN_NOPIDFD; + } else { + pidfd = pidfd_create(event->pid, 0); + if (pidfd < 0) + pidfd = FAN_EPIDFD; + } + } + ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and @@ -467,67 +614,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (f) fd_install(fd, f); - /* Event info records order is: dir fid + name, child fid */ - if (fanotify_event_dir_fh_len(event)) { - info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : - FAN_EVENT_INFO_TYPE_DFID; - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_info_dir_fh(info), - info_type, fanotify_info_name(info), - info->name_len, buf, count); + if (info_mode) { + ret = copy_info_records_to_user(event, info, info_mode, pidfd, + buf, count); if (ret < 0) goto out_close_fd; - - buf += ret; - count -= ret; - } - - if (fanotify_event_object_fh_len(event)) { - const char *dot = NULL; - int dot_len = 0; - - if (fid_mode == FAN_REPORT_FID || info_type) { - /* - * With only group flag FAN_REPORT_FID only type FID is - * reported. Second info record type is always FID. - */ - info_type = FAN_EVENT_INFO_TYPE_FID; - } else if ((fid_mode & FAN_REPORT_NAME) && - (event->mask & FAN_ONDIR)) { - /* - * With group flag FAN_REPORT_NAME, if name was not - * recorded in an event on a directory, report the - * name "." with info type DFID_NAME. - */ - info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; - dot = "."; - dot_len = 1; - } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || - (event->mask & FAN_ONDIR)) { - /* - * With group flag FAN_REPORT_DIR_FID, a single info - * record has type DFID for directory entry modification - * event and for event on a directory. - */ - info_type = FAN_EVENT_INFO_TYPE_DFID; - } else { - /* - * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, - * a single info record has type FID for event on a - * non-directory, when there is no directory to report. - * For example, on FAN_DELETE_SELF event. - */ - info_type = FAN_EVENT_INFO_TYPE_FID; - } - - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_event_object_fh(event), - info_type, dot, dot_len, buf, count); - if (ret < 0) - goto out_close_fd; - - buf += ret; - count -= ret; } return metadata.event_len; @@ -537,6 +628,10 @@ out_close_fd: put_unused_fd(fd); fput(f); } + + if (pidfd >= 0) + close_fd(pidfd); + return ret; } @@ -1082,6 +1177,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) #endif return -EINVAL; + /* + * A pidfd can only be returned for a thread-group leader; thus + * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually + * exclusive. + */ + if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) + return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1483,7 +1586,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 30d422b8c0fc..963e6ce75b96 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -87,15 +87,15 @@ static void fsnotify_unmount_inodes(struct super_block *sb) if (iput_inode) iput(iput_inode); - /* Wait for outstanding inode references from connectors */ - wait_var_event(&sb->s_fsnotify_inode_refs, - !atomic_long_read(&sb->s_fsnotify_inode_refs)); } void fsnotify_sb_delete(struct super_block *sb) { fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); + /* Wait for outstanding object references from connectors */ + wait_var_event(&sb->s_fsnotify_connectors, + !atomic_long_read(&sb->s_fsnotify_connectors)); } /* diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index ff2063ec6b0f..87d8a50ee803 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb( return container_of(conn->obj, struct super_block, s_fsnotify_marks); } +static inline struct super_block *fsnotify_connector_sb( + struct fsnotify_mark_connector *conn) +{ + switch (conn->type) { + case FSNOTIFY_OBJ_TYPE_INODE: + return fsnotify_conn_inode(conn)->i_sb; + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + return fsnotify_conn_mount(conn)->mnt.mnt_sb; + case FSNOTIFY_OBJ_TYPE_SB: + return fsnotify_conn_sb(conn); + default: + return NULL; + } +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d32ab349db74..95006d1d29ab 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -169,6 +169,37 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } +static void fsnotify_get_inode_ref(struct inode *inode) +{ + ihold(inode); + atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); +} + +static void fsnotify_put_inode_ref(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + iput(inode); + if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) + wake_up_var(&sb->s_fsnotify_connectors); +} + +static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb) + atomic_long_inc(&sb->s_fsnotify_connectors); +} + +static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) + wake_up_var(&sb->s_fsnotify_connectors); +} + static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) @@ -182,13 +213,13 @@ static void *fsnotify_detach_connector_from_object( if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; - atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs); } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } + fsnotify_put_sb_connectors(conn); rcu_assign_pointer(*(conn->obj), NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; @@ -209,19 +240,12 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { - struct inode *inode; - struct super_block *sb; - if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; - inode = objp; - sb = inode->i_sb; - iput(inode); - if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs)) - wake_up_var(&sb->s_fsnotify_inode_refs); + fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) @@ -493,8 +517,12 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, conn->fsid.val[0] = conn->fsid.val[1] = 0; conn->flags = 0; } - if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - inode = igrab(fsnotify_conn_inode(conn)); + if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { + inode = fsnotify_conn_inode(conn); + fsnotify_get_inode_ref(inode); + } + fsnotify_get_sb_connectors(conn); + /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure @@ -502,7 +530,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ if (inode) - iput(inode); + fsnotify_put_inode_ref(inode); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index fab7c6a4a7d0..73a3854b2afb 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -101,8 +101,6 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if (__mandatory_lock(inode)) - return -ENOLCK; if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) @@ -121,8 +119,6 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); } diff --git a/fs/open.c b/fs/open.c index 94bef26ff1b6..daa324606a41 100644 --- a/fs/open.c +++ b/fs/open.c @@ -105,9 +105,7 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto put_write_and_out; - error = locks_verify_truncate(inode, NULL, length); - if (!error) - error = security_path_truncate(path); + error = security_path_truncate(path); if (!error) error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); @@ -189,9 +187,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); - error = locks_verify_truncate(inode, f.file, length); - if (!error) - error = security_path_truncate(&f.file->f_path); + error = security_path_truncate(&f.file->f_path); if (!error) error = do_truncate(file_mnt_user_ns(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); diff --git a/fs/pipe.c b/fs/pipe.c index 678dee2a8228..6d4342bad9f1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) * _very_ unlikely case that the pipe was full, but we got * no data. */ - if (unlikely(was_full)) { + if (unlikely(was_full)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); /* * But because we didn't read anything, at this point we can @@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) wake_next_reader = false; __pipe_unlock(pipe); - if (was_full) { + if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); if (ret > 0) file_accessed(filp); return ret; @@ -565,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * become empty while we dropped the lock. */ __pipe_unlock(pipe); - if (was_empty) { + if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); __pipe_lock(pipe); was_empty = pipe_empty(pipe->head, pipe->tail); @@ -591,10 +588,9 @@ out: * Epoll nonsensically wants a wakeup whether the pipe * was already empty or not. */ - if (was_empty || pipe->poll_usage) { + if (was_empty || pipe->poll_usage) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { diff --git a/fs/read_write.c b/fs/read_write.c index 9db7adf160d2..af057c57bdc6 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -365,12 +365,8 @@ out_putf: int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { - struct inode *inode; - int retval = -EINVAL; - - inode = file_inode(file); if (unlikely((ssize_t) count < 0)) - return retval; + return -EINVAL; /* * ranged mandatory locking does not apply to streams - it makes sense @@ -381,19 +377,12 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) - return retval; + return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) - return retval; - } - - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - retval = locks_mandatory_area(inode, file, pos, pos + count - 1, - read_write == READ ? F_RDLCK : F_WRLCK); - if (retval < 0) - return retval; + return -EINVAL; } } diff --git a/fs/remap_range.c b/fs/remap_range.c index e4a5fdd7ad7b..6d4a9beaa097 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -99,24 +99,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { - struct inode *inode = file_inode(file); - if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely((loff_t) (pos + len) < 0)) return -EINVAL; - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - loff_t end = len ? pos + len - 1 : OFFSET_MAX; - int retval; - - retval = locks_mandatory_area(inode, file, pos, end, - write ? F_WRLCK : F_RDLCK); - if (retval < 0) - return retval; - } - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 855f0e87066d..2db8bcf7ff85 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -49,8 +49,7 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, - page_address(bvec->bv_page) + bvec->bv_offset + offset, + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, bytes_to_copy); actor_offset += bytes_to_copy; @@ -177,7 +176,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto out_free_bio; } /* Extract the length of the metadata block */ - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length = data[offset]; if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; @@ -186,7 +185,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length |= data[0] << 8; } bio_free_pages(bio); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index 233d5582fbee..b685b6238316 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -101,7 +101,7 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 97bb7d92ddcd..cb510a631968 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -76,7 +76,7 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index e80419aed862..68f6d09bb3a2 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -146,7 +146,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->buf.in = data + offset; stream->buf.in_size = avail; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index bcb881ec47f2..a20e9042146b 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -76,7 +76,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->next_in = data + offset; stream->avail_in = avail; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b7cb1faa652d..0015cf8b5582 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -94,7 +94,7 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; in_buf.src = data + offset; in_buf.size = avail; diff --git a/fs/super.c b/fs/super.c index 91b7f156735b..bcef3a6f4c4b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1203,7 +1203,7 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) s->s_iflags |= SB_I_STABLE_WRITES; diff --git a/fs/timerfd.c b/fs/timerfd.c index c5509d2448e3..e9c96a0c79f1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -115,6 +115,22 @@ void timerfd_clock_was_set(void) rcu_read_unlock(); } +static void timerfd_resume_work(struct work_struct *work) +{ + timerfd_clock_was_set(); +} + +static DECLARE_WORK(timerfd_work, timerfd_resume_work); + +/* + * Invoked from timekeeping_resume(). Defer the actual update to work so + * timerfd_clock_was_set() runs in task context. + */ +void timerfd_resume(void) +{ + schedule_work(&timerfd_work); +} + static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { diff --git a/fs/udf/dir.c b/fs/udf/dir.c index c19dba45aa20..70abdfad2df1 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -35,7 +35,6 @@ #include "udf_i.h" #include "udf_sb.h" - static int udf_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); @@ -135,7 +134,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) lfi = cfi.lengthFileIdent; if (fibh.sbh == fibh.ebh) { - nameptr = fi->fileIdent + liu; + nameptr = udf_get_fi_ident(fi); } else { int poffset; /* Unpaded ending offset */ @@ -153,7 +152,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } } nameptr = copy_name; - memcpy(nameptr, fi->fileIdent + liu, + memcpy(nameptr, udf_get_fi_ident(fi), lfi - poffset); memcpy(nameptr + lfi - poffset, fibh.ebh->b_data, poffset); diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index 185c3e247648..de17a97e8667 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -307,14 +307,14 @@ struct logicalVolDesc { struct regid impIdent; uint8_t impUse[128]; struct extent_ad integritySeqExt; - uint8_t partitionMaps[0]; + uint8_t partitionMaps[]; } __packed; /* Generic Partition Map (ECMA 167r3 3/10.7.1) */ struct genericPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; - uint8_t partitionMapping[0]; + uint8_t partitionMapping[]; } __packed; /* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ @@ -342,7 +342,7 @@ struct unallocSpaceDesc { struct tag descTag; __le32 volDescSeqNum; __le32 numAllocDescs; - struct extent_ad allocDescs[0]; + struct extent_ad allocDescs[]; } __packed; /* Terminating Descriptor (ECMA 167r3 3/10.9) */ @@ -360,9 +360,9 @@ struct logicalVolIntegrityDesc { uint8_t logicalVolContentsUse[32]; __le32 numOfPartitions; __le32 lengthOfImpUse; - __le32 freeSpaceTable[0]; - __le32 sizeTable[0]; - uint8_t impUse[0]; + __le32 freeSpaceTable[]; + /* __le32 sizeTable[]; */ + /* uint8_t impUse[]; */ } __packed; /* Integrity Type (ECMA 167r3 3/10.10.3) */ @@ -471,9 +471,9 @@ struct fileIdentDesc { uint8_t lengthFileIdent; struct long_ad icb; __le16 lengthOfImpUse; - uint8_t impUse[0]; - uint8_t fileIdent[0]; - uint8_t padding[0]; + uint8_t impUse[]; + /* uint8_t fileIdent[]; */ + /* uint8_t padding[]; */ } __packed; /* File Characteristics (ECMA 167r3 4/14.4.3) */ @@ -578,8 +578,8 @@ struct fileEntry { __le64 uniqueID; __le32 lengthExtendedAttr; __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + uint8_t extendedAttr[]; + /* uint8_t allocDescs[]; */ } __packed; /* Permissions (ECMA 167r3 4/14.9.5) */ @@ -632,7 +632,7 @@ struct genericFormat { uint8_t attrSubtype; uint8_t reserved[3]; __le32 attrLength; - uint8_t attrData[0]; + uint8_t attrData[]; } __packed; /* Character Set Information (ECMA 167r3 4/14.10.3) */ @@ -643,7 +643,7 @@ struct charSetInfo { __le32 attrLength; __le32 escapeSeqLength; uint8_t charSetType; - uint8_t escapeSeq[0]; + uint8_t escapeSeq[]; } __packed; /* Alternate Permissions (ECMA 167r3 4/14.10.4) */ @@ -682,7 +682,7 @@ struct infoTimesExtAttr { __le32 attrLength; __le32 dataLength; __le32 infoTimeExistence; - uint8_t infoTimes[0]; + uint8_t infoTimes[]; } __packed; /* Device Specification (ECMA 167r3 4/14.10.7) */ @@ -694,7 +694,7 @@ struct deviceSpec { __le32 impUseLength; __le32 majorDeviceIdent; __le32 minorDeviceIdent; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */ @@ -705,7 +705,7 @@ struct impUseExtAttr { __le32 attrLength; __le32 impUseLength; struct regid impIdent; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */ @@ -716,7 +716,7 @@ struct appUseExtAttr { __le32 attrLength; __le32 appUseLength; struct regid appIdent; - uint8_t appUse[0]; + uint8_t appUse[]; } __packed; #define EXTATTR_CHAR_SET 1 @@ -733,7 +733,7 @@ struct unallocSpaceEntry { struct tag descTag; struct icbtag icbTag; __le32 lengthAllocDescs; - uint8_t allocDescs[0]; + uint8_t allocDescs[]; } __packed; /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ @@ -741,7 +741,7 @@ struct spaceBitmapDesc { struct tag descTag; __le32 numOfBits; __le32 numOfBytes; - uint8_t bitmap[0]; + uint8_t bitmap[]; } __packed; /* Partition Integrity Entry (ECMA 167r3 4/14.13) */ @@ -780,7 +780,7 @@ struct pathComponent { uint8_t componentType; uint8_t lengthComponentIdent; __le16 componentFileVersionNum; - dchars componentIdent[0]; + dchars componentIdent[]; } __packed; /* File Entry (ECMA 167r3 4/14.17) */ @@ -809,8 +809,8 @@ struct extendedFileEntry { __le64 uniqueID; __le32 lengthExtendedAttr; __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + uint8_t extendedAttr[]; + /* uint8_t allocDescs[]; */ } __packed; #endif /* _ECMA_167_H */ diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4917670860a0..1d6b7a50736b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -390,8 +390,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, - sfi->fileIdent + - le16_to_cpu(sfi->lengthOfImpUse))) { + udf_get_fi_ident(sfi))) { iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; brelse(dbh); return NULL; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index eab94527340d..1614d308d0f0 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -173,13 +173,22 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, else offset = le32_to_cpu(eahd->appAttrLocation); - while (offset < iinfo->i_lenEAttr) { + while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) { + uint32_t attrLength; + gaf = (struct genericFormat *)&ea[offset]; + attrLength = le32_to_cpu(gaf->attrLength); + + /* Detect undersized elements and buffer overflows */ + if ((attrLength < sizeof(*gaf)) || + (attrLength > (iinfo->i_lenEAttr - offset))) + break; + if (le32_to_cpu(gaf->attrType) == type && gaf->attrSubtype == subtype) return gaf; else - offset += le32_to_cpu(gaf->attrLength); + offset += attrLength; } } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c7c9bbbfa57..caeef08efed2 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -74,12 +74,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, if (fileident) { if (adinicb || (offset + lfi < 0)) { - memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi); + memcpy(udf_get_fi_ident(sfi), fileident, lfi); } else if (offset >= 0) { memcpy(fibh->ebh->b_data + offset, fileident, lfi); } else { - memcpy((uint8_t *)sfi->fileIdent + liu, fileident, - -offset); + memcpy(udf_get_fi_ident(sfi), fileident, -offset); memcpy(fibh->ebh->b_data, fileident - offset, lfi + offset); } @@ -88,11 +87,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, offset += lfi; if (adinicb || (offset + padlen < 0)) { - memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen); + memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen); } else if (offset >= 0) { memset(fibh->ebh->b_data + offset, 0x00, padlen); } else { - memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset); + memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset); memset(fibh->ebh->b_data, 0x00, padlen + offset); } @@ -226,7 +225,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, lfi = cfi->lengthFileIdent; if (fibh->sbh == fibh->ebh) { - nameptr = fi->fileIdent + liu; + nameptr = udf_get_fi_ident(fi); } else { int poffset; /* Unpaded ending offset */ @@ -246,7 +245,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } } nameptr = copy_name; - memcpy(nameptr, fi->fileIdent + liu, + memcpy(nameptr, udf_get_fi_ident(fi), lfi - poffset); memcpy(nameptr + lfi - poffset, fibh->ebh->b_data, poffset); diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index 22bc4fb2feb9..157de0ec0cd5 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -111,7 +111,7 @@ struct logicalVolIntegrityDescImpUse { __le16 minUDFReadRev; __le16 minUDFWriteRev; __le16 maxUDFWriteRev; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */ @@ -178,15 +178,6 @@ struct metadataPartitionMap { uint8_t reserved2[5]; } __packed; -/* Virtual Allocation Table (UDF 1.5 2.2.10) */ -struct virtualAllocationTable15 { - __le32 vatEntry[0]; - struct regid vatIdent; - __le32 previousVATICBLoc; -} __packed; - -#define ICBTAG_FILE_TYPE_VAT15 0x00U - /* Virtual Allocation Table (UDF 2.60 2.2.11) */ struct virtualAllocationTable20 { __le16 lengthHeader; @@ -199,8 +190,8 @@ struct virtualAllocationTable20 { __le16 minUDFWriteRev; __le16 maxUDFWriteRev; __le16 reserved; - uint8_t impUse[0]; - __le32 vatEntry[0]; + uint8_t impUse[]; + /* __le32 vatEntry[]; */ } __packed; #define ICBTAG_FILE_TYPE_VAT20 0xF8U @@ -217,8 +208,7 @@ struct sparingTable { __le16 reallocationTableLen; __le16 reserved; __le32 sequenceNum; - struct sparingEntry - mapEntry[0]; + struct sparingEntry mapEntry[]; } __packed; /* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */ @@ -241,7 +231,7 @@ struct allocDescImpUse { /* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */ struct freeEaSpace { __le16 headerChecksum; - uint8_t freeEASpace[0]; + uint8_t freeEASpace[]; } __packed; /* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */ @@ -265,7 +255,7 @@ struct LVExtensionEA { /* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */ struct freeAppEASpace { __le16 headerChecksum; - uint8_t freeEASpace[0]; + uint8_t freeEASpace[]; } __packed; /* UDF Defined System Stream (UDF 2.60 3.3.7) */ diff --git a/fs/udf/super.c b/fs/udf/super.c index 2f83c1204e20..b2d7c57d0688 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -108,16 +108,10 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb) return NULL; lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data; partnum = le32_to_cpu(lvid->numOfPartitions); - if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) - - offsetof(struct logicalVolIntegrityDesc, impUse)) / - (2 * sizeof(uint32_t)) < partnum) { - udf_err(sb, "Logical volume integrity descriptor corrupted " - "(numOfPartitions = %u)!\n", partnum); - return NULL; - } /* The offset is to skip freeSpaceTable and sizeTable arrays */ offset = partnum * 2 * sizeof(uint32_t); - return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]); + return (struct logicalVolIntegrityDescImpUse *) + (((uint8_t *)(lvid + 1)) + offset); } /* UDF filesystem type */ @@ -349,10 +343,10 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); if (sbi->s_anchor != 0) seq_printf(seq, ",anchor=%u", sbi->s_anchor); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) - seq_puts(seq, ",utf8"); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map) + if (sbi->s_nls_map) seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset); + else + seq_puts(seq, ",iocharset=utf8"); return 0; } @@ -558,19 +552,24 @@ static int udf_parse_options(char *options, struct udf_options *uopt, /* Ignored (never implemented properly) */ break; case Opt_utf8: - uopt->flags |= (1 << UDF_FLAG_UTF8); + if (!remount) { + unload_nls(uopt->nls_map); + uopt->nls_map = NULL; + } break; case Opt_iocharset: if (!remount) { - if (uopt->nls_map) - unload_nls(uopt->nls_map); - /* - * load_nls() failure is handled later in - * udf_fill_super() after all options are - * parsed. - */ + unload_nls(uopt->nls_map); + uopt->nls_map = NULL; + } + /* When nls_map is not loaded then UTF-8 is used */ + if (!remount && strcmp(args[0].from, "utf8") != 0) { uopt->nls_map = load_nls(args[0].from); - uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + if (!uopt->nls_map) { + pr_err("iocharset %s not found\n", + args[0].from); + return 0; + } } break; case Opt_uforget: @@ -1542,6 +1541,7 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; int indirections = 0; + u32 parts, impuselen; while (++indirections <= UDF_MAX_LVID_NESTING) { final_bh = NULL; @@ -1568,15 +1568,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data; if (lvid->nextIntegrityExt.extLength == 0) - return; + goto check; loc = leea_to_cpu(lvid->nextIntegrityExt); } udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n", UDF_MAX_LVID_NESTING); +out_err: brelse(sbi->s_lvid_bh); sbi->s_lvid_bh = NULL; + return; +check: + parts = le32_to_cpu(lvid->numOfPartitions); + impuselen = le32_to_cpu(lvid->lengthOfImpUse); + if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize || + sizeof(struct logicalVolIntegrityDesc) + impuselen + + 2 * parts * sizeof(u32) > sb->s_blocksize) { + udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), " + "ignoring.\n", parts, impuselen); + goto out_err; + } } /* @@ -2139,21 +2151,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (!udf_parse_options((char *)options, &uopt, false)) goto parse_options_failure; - if (uopt.flags & (1 << UDF_FLAG_UTF8) && - uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { - udf_err(sb, "utf8 cannot be combined with iocharset\n"); - goto parse_options_failure; - } - if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { - uopt.nls_map = load_nls_default(); - if (!uopt.nls_map) - uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP); - else - udf_debug("Using default NLS map\n"); - } - if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) - uopt.flags |= (1 << UDF_FLAG_UTF8); - fileset.logicalBlockNum = 0xFFFFFFFF; fileset.partitionReferenceNum = 0xFFFF; @@ -2308,8 +2305,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) error_out: iput(sbi->s_vat_inode); parse_options_failure: - if (uopt.nls_map) - unload_nls(uopt.nls_map); + unload_nls(uopt.nls_map); if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); @@ -2359,8 +2355,7 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); iput(sbi->s_vat_inode); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) - unload_nls(sbi->s_nls_map); + unload_nls(sbi->s_nls_map); if (!sb_rdonly(sb)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 758efe557a19..4fa620543d30 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -20,8 +20,6 @@ #define UDF_FLAG_UNDELETE 6 #define UDF_FLAG_UNHIDE 7 #define UDF_FLAG_VARCONV 8 -#define UDF_FLAG_NLS_MAP 9 -#define UDF_FLAG_UTF8 10 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ #define UDF_FLAG_GID_FORGET 12 #define UDF_FLAG_UID_SET 13 diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 9dd0814f1077..7e258f15b8ef 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -130,6 +130,10 @@ static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, UDF_NAME_PAD); } +static inline uint8_t *udf_get_fi_ident(struct fileIdentDesc *fi) +{ + return ((uint8_t *)(fi + 1)) + le16_to_cpu(fi->lengthOfImpUse); +} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 5fcfa96463eb..622569007b53 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -177,7 +177,7 @@ static int udf_name_from_CS0(struct super_block *sb, return 0; } - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + if (UDF_SB(sb)->s_nls_map) conv_f = UDF_SB(sb)->s_nls_map->uni2char; else conv_f = NULL; @@ -285,7 +285,7 @@ static int udf_name_to_CS0(struct super_block *sb, if (ocu_max_len <= 0) return 0; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + if (UDF_SB(sb)->s_nls_map) conv_f = UDF_SB(sb)->s_nls_map->char2uni; else conv_f = NULL; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 213a97a921bb..1cd3f940fa6a 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1626,7 +1626,6 @@ xfs_swap_extents( struct xfs_bstat *sbp = &sxp->sx_stat; int src_log_flags, target_log_flags; int error = 0; - int lock_flags; uint64_t f; int resblks = 0; unsigned int flags = 0; @@ -1638,8 +1637,8 @@ xfs_swap_extents( * do the rest of the checks. */ lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); - lock_flags = XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL); + filemap_invalidate_lock_two(VFS_I(ip)->i_mapping, + VFS_I(tip)->i_mapping); /* Verify that both files have the same format */ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { @@ -1711,7 +1710,6 @@ xfs_swap_extents( * or cancel will unlock the inodes from this point onwards. */ xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; xfs_trans_ijoin(tp, ip, 0); xfs_trans_ijoin(tp, tip, 0); @@ -1830,13 +1828,16 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock_ilock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); + filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping, + VFS_I(tip)->i_mapping); unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - goto out_unlock; + goto out_unlock_ilock; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ff42b3585e0..3ab73567a0f5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -844,7 +844,7 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_bdi)) + if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) return; xfs_buf_read_map(target, map, nmaps, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cc3cfb12df53..3dfbdcdb0d1c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1302,7 +1302,7 @@ xfs_file_llseek( * * mmap_lock (MM) * sb_start_pagefault(vfs, freeze) - * i_mmaplock (XFS - truncate serialisation) + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ @@ -1323,24 +1323,27 @@ __xfs_filemap_fault( file_update_time(vmf->vma->vm_file); } - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { pfn_t pfn; + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, (write_fault && !vmf->cow_page) ? &xfs_direct_write_iomap_ops : &xfs_read_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { - if (write_fault) + if (write_fault) { + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); - else + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else { ret = filemap_fault(vmf); + } } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (write_fault) sb_end_pagefault(inode->i_sb); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 990b72ae3635..f00145e1a976 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -132,7 +132,7 @@ xfs_ilock_attr_map_shared( /* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 - * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, @@ -140,23 +140,23 @@ xfs_ilock_attr_map_shared( * * Basic locking order: * - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> invalidate_lock -> page_lock -> i_ilock * * mmap_lock locking order: * * i_rwsem -> page lock -> mmap_lock - * mmap_lock -> i_mmap_lock -> page_lock + * mmap_lock -> invalidate_lock -> page_lock * * The difference in mmap_lock locking order mean that we cannot hold the - * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can - * fault in pages during copy in/out (for buffered IO) or require the mmap_lock - * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because - * page faults already hold the mmap_lock. + * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths + * can fault in pages during copy in/out (for buffered IO) or require the + * mmap_lock in get_user_pages() to map the user pages into the kernel address + * space for direct IO. Similarly the i_rwsem cannot be taken inside a page + * fault because page faults already hold the mmap_lock. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both - * taken in places where we need to invalidate the page cache in a race + * take both the i_rwsem and the invalidate_lock. These locks should *only* be + * both taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). */ @@ -188,10 +188,13 @@ xfs_ilock( XFS_IOLOCK_DEP(lock_flags)); } - if (lock_flags & XFS_MMAPLOCK_EXCL) - mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_MMAPLOCK_SHARED) - mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( } if (lock_flags & XFS_MMAPLOCK_EXCL) { - if (!mrtryupdate(&ip->i_mmaplock)) + if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } else if (lock_flags & XFS_MMAPLOCK_SHARED) { - if (!mrtryaccess(&ip->i_mmaplock)) + if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } @@ -258,9 +261,9 @@ xfs_ilock_nowait( out_undo_mmaplock: if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -307,9 +310,9 @@ xfs_iunlock( up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); @@ -335,7 +338,7 @@ xfs_ilock_demote( if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrdemote(&ip->i_mmaplock); + downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_IOLOCK_EXCL) downgrade_write(&VFS_I(ip)->i_rwsem); @@ -343,9 +346,29 @@ xfs_ilock_demote( } #if defined(DEBUG) || defined(XFS_WARN) -int +static inline bool +__xfs_rwsem_islocked( + struct rw_semaphore *rwsem, + bool shared) +{ + if (!debug_locks) + return rwsem_is_locked(rwsem); + + if (!shared) + return lockdep_is_held_type(rwsem, 0); + + /* + * We are checking that the lock is held at least in shared + * mode but don't care that it might be held exclusively + * (i.e. shared | excl). Hence we check if the lock is held + * in any mode rather than an explicit shared mode. + */ + return lockdep_is_held_type(rwsem, -1); +} + +bool xfs_isilocked( - xfs_inode_t *ip, + struct xfs_inode *ip, uint lock_flags) { if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { @@ -355,20 +378,17 @@ xfs_isilocked( } if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - if (!(lock_flags & XFS_MMAPLOCK_SHARED)) - return !!ip->i_mmaplock.mr_writer; - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, + (lock_flags & XFS_IOLOCK_SHARED)); } - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !debug_locks || - lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); - return rwsem_is_locked(&VFS_I(ip)->i_rwsem); + if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { + return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, + (lock_flags & XFS_IOLOCK_SHARED)); } ASSERT(0); - return 0; + return false; } #endif @@ -532,12 +552,10 @@ again: } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - - * the mmaplock or the ilock, but not more than one type at a time. If we lock - * more than one at a time, lockdep will report false positives saying we have - * violated locking orders. The iolock must be double-locked separately since - * we use i_rwsem for that. We now support taking one lock EXCL and the other - * SHARED. + * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and + * mmaplock must be double-locked separately since we use i_rwsem and + * invalidate_lock for that. We now support taking one lock EXCL and the + * other SHARED. */ void xfs_lock_two_inodes( @@ -555,15 +573,8 @@ xfs_lock_two_inodes( ASSERT(hweight32(ip1_mode) == 1); ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); - ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - + ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -3741,11 +3752,8 @@ xfs_ilock2_io_mmap( ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); if (ret) return ret; - if (ip1 == ip2) - xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); - else - xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, - ip2, XFS_MMAPLOCK_EXCL); + filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); return 0; } @@ -3755,12 +3763,9 @@ xfs_iunlock2_io_mmap( struct xfs_inode *ip1, struct xfs_inode *ip2) { - bool same_inode = (ip1 == ip2); - - xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); inode_unlock(VFS_I(ip2)); - if (!same_inode) + if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 4b6703dbffb8..e0ae905554e2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -40,7 +40,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ /* @@ -410,7 +409,7 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); -int xfs_isilocked(xfs_inode_t *, uint); +bool xfs_isilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2c9e26a44546..102cbd606633 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -709,8 +709,6 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 70055d486bf7..ddc346a9df9b 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -462,7 +462,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) inode_dio_wait(inode); /* Serialize against page faults */ - down_write(&zi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Serialize against zonefs_iomap_begin() */ mutex_lock(&zi->i_truncate_mutex); @@ -500,7 +500,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) unlock: mutex_unlock(&zi->i_truncate_mutex); - up_write(&zi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } @@ -575,18 +575,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, return ret; } -static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf) -{ - struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file)); - vm_fault_t ret; - - down_read(&zi->i_mmap_sem); - ret = filemap_fault(vmf); - up_read(&zi->i_mmap_sem); - - return ret; -} - static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -607,16 +595,16 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); /* Serialize against truncates */ - down_read(&zi->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); - up_read(&zi->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); return ret; } static const struct vm_operations_struct zonefs_file_vm_ops = { - .fault = zonefs_filemap_fault, + .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = zonefs_filemap_page_mkwrite, }; @@ -1155,7 +1143,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); - init_rwsem(&zi->i_mmap_sem); zi->i_wr_refcnt = 0; return &zi->i_vnode; diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 51141907097c..7b147907c328 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -70,12 +70,11 @@ struct zonefs_inode_info { * and changes to the inode private data, and in particular changes to * a sequential file size on completion of direct IO writes. * Serialization of mmap read IOs with truncate and syscall IO - * operations is done with i_mmap_sem in addition to i_truncate_mutex. - * Only zonefs_seq_file_truncate() takes both lock (i_mmap_sem first, - * i_truncate_mutex second). + * operations is done with invalidate_lock in addition to + * i_truncate_mutex. Only zonefs_seq_file_truncate() takes both lock + * (invalidate_lock first, i_truncate_mutex second). */ struct mutex i_truncate_mutex; - struct rw_semaphore i_mmap_sem; /* guarded by i_truncate_mutex */ unsigned int i_wr_refcnt; |